1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "bdev_nvme.h" 37 #include "bdev_ocssd.h" 38 39 #include "spdk/config.h" 40 #include "spdk/endian.h" 41 #include "spdk/bdev.h" 42 #include "spdk/json.h" 43 #include "spdk/nvme.h" 44 #include "spdk/nvme_ocssd.h" 45 #include "spdk/thread.h" 46 #include "spdk/string.h" 47 #include "spdk/util.h" 48 49 #include "spdk/bdev_module.h" 50 #include "spdk/log.h" 51 52 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 53 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 54 55 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 56 57 struct nvme_bdev_io { 58 /** array of iovecs to transfer. */ 59 struct iovec *iovs; 60 61 /** Number of iovecs in iovs array. */ 62 int iovcnt; 63 64 /** Current iovec position. */ 65 int iovpos; 66 67 /** Offset in current iovec. */ 68 uint32_t iov_offset; 69 70 /** array of iovecs to transfer. */ 71 struct iovec *fused_iovs; 72 73 /** Number of iovecs in iovs array. */ 74 int fused_iovcnt; 75 76 /** Current iovec position. */ 77 int fused_iovpos; 78 79 /** Offset in current iovec. */ 80 uint32_t fused_iov_offset; 81 82 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 83 struct spdk_nvme_cpl cpl; 84 85 /** Originating thread */ 86 struct spdk_thread *orig_thread; 87 88 /** Keeps track if first of fused commands was submitted */ 89 bool first_fused_submitted; 90 }; 91 92 struct nvme_probe_ctx { 93 size_t count; 94 struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS]; 95 struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS]; 96 const char *names[NVME_MAX_CONTROLLERS]; 97 uint32_t prchk_flags[NVME_MAX_CONTROLLERS]; 98 const char *hostnqn; 99 }; 100 101 struct nvme_probe_skip_entry { 102 struct spdk_nvme_transport_id trid; 103 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 104 }; 105 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 106 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 107 g_skipped_nvme_ctrlrs); 108 109 static struct spdk_bdev_nvme_opts g_opts = { 110 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 111 .timeout_us = 0, 112 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 113 .retry_count = 4, 114 .arbitration_burst = 0, 115 .low_priority_weight = 0, 116 .medium_priority_weight = 0, 117 .high_priority_weight = 0, 118 .nvme_adminq_poll_period_us = 10000ULL, 119 .nvme_ioq_poll_period_us = 0, 120 .io_queue_requests = 0, 121 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 122 }; 123 124 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 125 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 126 127 static int g_hot_insert_nvme_controller_index = 0; 128 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 129 static bool g_nvme_hotplug_enabled = false; 130 static struct spdk_thread *g_bdev_nvme_init_thread; 131 static struct spdk_poller *g_hotplug_poller; 132 static struct spdk_poller *g_hotplug_probe_poller; 133 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 134 135 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 136 struct nvme_async_probe_ctx *ctx); 137 static void nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 138 struct nvme_async_probe_ctx *ctx); 139 static int bdev_nvme_library_init(void); 140 static void bdev_nvme_library_fini(void); 141 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 142 struct nvme_bdev_io *bio, 143 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 144 uint32_t flags); 145 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 146 struct nvme_bdev_io *bio, 147 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba); 148 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 149 struct nvme_bdev_io *bio, 150 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 151 uint32_t flags); 152 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 153 struct nvme_bdev_io *bio, 154 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 155 uint32_t flags); 156 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, 157 struct spdk_nvme_qpair *qpair, 158 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 159 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 160 uint32_t flags); 161 static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, 162 struct nvme_bdev_io *bio, 163 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 164 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 165 struct nvme_bdev_io *bio, 166 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 167 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 168 struct nvme_bdev_io *bio, 169 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len); 170 static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch, 171 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 172 static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio); 173 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove); 174 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 175 176 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 177 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 178 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 179 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 180 181 static populate_namespace_fn g_populate_namespace_fn[] = { 182 NULL, 183 nvme_ctrlr_populate_standard_namespace, 184 bdev_ocssd_populate_namespace, 185 }; 186 187 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns); 188 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns); 189 190 static depopulate_namespace_fn g_depopulate_namespace_fn[] = { 191 NULL, 192 nvme_ctrlr_depopulate_standard_namespace, 193 bdev_ocssd_depopulate_namespace, 194 }; 195 196 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, 197 struct nvme_bdev_ns *nvme_ns); 198 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 199 struct nvme_bdev_ns *nvme_ns); 200 201 static config_json_namespace_fn g_config_json_namespace_fn[] = { 202 NULL, 203 nvme_ctrlr_config_json_standard_namespace, 204 bdev_ocssd_namespace_config_json, 205 }; 206 207 struct spdk_nvme_qpair * 208 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 209 { 210 struct nvme_io_channel *nvme_ch; 211 212 assert(ctrlr_io_ch != NULL); 213 214 nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 215 216 return nvme_ch->qpair; 217 } 218 219 static int 220 bdev_nvme_get_ctx_size(void) 221 { 222 return sizeof(struct nvme_bdev_io); 223 } 224 225 static struct spdk_bdev_module nvme_if = { 226 .name = "nvme", 227 .async_fini = true, 228 .module_init = bdev_nvme_library_init, 229 .module_fini = bdev_nvme_library_fini, 230 .config_json = bdev_nvme_config_json, 231 .get_ctx_size = bdev_nvme_get_ctx_size, 232 233 }; 234 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 235 236 static void 237 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 238 { 239 int rc; 240 241 SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair); 242 /* 243 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will 244 * reconnect a qpair and we will stop getting a callback for this one. 245 */ 246 rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair); 247 if (rc != 0) { 248 SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc); 249 } 250 } 251 252 static int 253 bdev_nvme_poll(void *arg) 254 { 255 struct nvme_bdev_poll_group *group = arg; 256 int64_t num_completions; 257 258 if (group->collect_spin_stat && group->start_ticks == 0) { 259 group->start_ticks = spdk_get_ticks(); 260 } 261 262 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 263 bdev_nvme_disconnected_qpair_cb); 264 if (group->collect_spin_stat) { 265 if (num_completions > 0) { 266 if (group->end_ticks != 0) { 267 group->spin_ticks += (group->end_ticks - group->start_ticks); 268 group->end_ticks = 0; 269 } 270 group->start_ticks = 0; 271 } else { 272 group->end_ticks = spdk_get_ticks(); 273 } 274 } 275 276 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 277 } 278 279 static int 280 bdev_nvme_poll_adminq(void *arg) 281 { 282 int32_t rc; 283 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 284 285 assert(nvme_bdev_ctrlr != NULL); 286 287 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr); 288 if (rc < 0) { 289 bdev_nvme_failover(nvme_bdev_ctrlr, false); 290 } 291 292 return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 293 } 294 295 static int 296 bdev_nvme_destruct(void *ctx) 297 { 298 struct nvme_bdev *nvme_disk = ctx; 299 struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns; 300 301 nvme_ns->bdev = NULL; 302 303 nvme_bdev_ns_detach(nvme_ns); 304 305 free(nvme_disk->disk.name); 306 free(nvme_disk); 307 308 return 0; 309 } 310 311 static int 312 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 313 struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes) 314 { 315 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS); 316 317 return 0; 318 } 319 320 static int 321 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch) 322 { 323 struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr; 324 struct spdk_nvme_io_qpair_opts opts; 325 int rc; 326 327 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 328 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 329 opts.create_only = true; 330 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 331 g_opts.io_queue_requests = opts.io_queue_requests; 332 333 nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts)); 334 if (nvme_ch->qpair == NULL) { 335 return -1; 336 } 337 338 assert(nvme_ch->group != NULL); 339 340 rc = spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair); 341 if (rc != 0) { 342 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 343 goto err; 344 } 345 346 rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, nvme_ch->qpair); 347 if (rc != 0) { 348 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 349 goto err; 350 } 351 352 return 0; 353 354 err: 355 spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 356 357 return rc; 358 } 359 360 static void 361 _bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status) 362 { 363 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 364 365 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 366 if (nvme_bdev_ctrlr->destruct_after_reset) { 367 assert(nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct); 368 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 369 370 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_do_destruct, 371 nvme_bdev_ctrlr); 372 } else { 373 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 374 } 375 } 376 377 static void 378 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 379 { 380 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 381 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); 382 struct spdk_bdev_io *bdev_io; 383 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 384 385 /* A NULL ctx means success. */ 386 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 387 status = SPDK_BDEV_IO_STATUS_FAILED; 388 } 389 390 while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) { 391 bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets); 392 TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link); 393 spdk_bdev_io_complete(bdev_io, status); 394 } 395 396 spdk_for_each_channel_continue(i, 0); 397 } 398 399 static void 400 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc) 401 { 402 /* we are using the for_each_channel cb_arg like a return code here. */ 403 /* If it's zero, we succeeded, otherwise, the reset failed. */ 404 void *cb_arg = NULL; 405 struct nvme_bdev_ctrlr_trid *curr_trid; 406 407 if (rc) { 408 cb_arg = (void *)0x1; 409 SPDK_ERRLOG("Resetting controller failed.\n"); 410 } else { 411 SPDK_NOTICELOG("Resetting controller successful.\n"); 412 } 413 414 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 415 nvme_bdev_ctrlr->resetting = false; 416 nvme_bdev_ctrlr->failover_in_progress = false; 417 418 curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 419 assert(curr_trid != NULL); 420 assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid); 421 422 curr_trid->is_failed = cb_arg != NULL ? true : false; 423 424 if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) { 425 /* Destruct ctrlr after clearing pending resets. */ 426 nvme_bdev_ctrlr->destruct_after_reset = true; 427 } 428 429 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 430 431 /* Make sure we clear any pending resets before returning. */ 432 spdk_for_each_channel(nvme_bdev_ctrlr, 433 _bdev_nvme_complete_pending_resets, 434 cb_arg, 435 _bdev_nvme_check_pending_destruct); 436 } 437 438 static void 439 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 440 { 441 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 442 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 443 int rc = SPDK_BDEV_IO_STATUS_SUCCESS; 444 445 if (status) { 446 rc = SPDK_BDEV_IO_STATUS_FAILED; 447 } 448 if (bio) { 449 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), rc); 450 } 451 _bdev_nvme_reset_complete(nvme_bdev_ctrlr, status); 452 } 453 454 static void 455 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 456 { 457 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 458 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); 459 int rc; 460 461 rc = bdev_nvme_create_qpair(nvme_ch); 462 463 spdk_for_each_channel_continue(i, rc); 464 } 465 466 static void 467 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 468 { 469 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 470 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 471 int rc; 472 473 if (status) { 474 rc = status; 475 goto err; 476 } 477 478 rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr); 479 if (rc != 0) { 480 goto err; 481 } 482 483 /* Recreate all of the I/O queue pairs */ 484 spdk_for_each_channel(nvme_bdev_ctrlr, 485 _bdev_nvme_reset_create_qpair, 486 bio, 487 _bdev_nvme_reset_create_qpairs_done); 488 return; 489 490 err: 491 if (bio) { 492 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED); 493 } 494 _bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc); 495 } 496 497 static void 498 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 499 { 500 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 501 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 502 int rc; 503 504 rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 505 if (!rc) { 506 nvme_ch->qpair = NULL; 507 } 508 509 spdk_for_each_channel_continue(i, rc); 510 } 511 512 static int 513 _bdev_nvme_reset_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 514 { 515 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 516 if (nvme_bdev_ctrlr->destruct) { 517 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 518 return -EBUSY; 519 } 520 521 if (nvme_bdev_ctrlr->resetting) { 522 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 523 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 524 return -EAGAIN; 525 } 526 527 nvme_bdev_ctrlr->resetting = true; 528 529 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 530 return 0; 531 } 532 533 static int 534 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 535 { 536 int rc; 537 538 rc = _bdev_nvme_reset_start(nvme_bdev_ctrlr); 539 if (rc == 0) { 540 /* First, delete all NVMe I/O queue pairs. */ 541 spdk_for_each_channel(nvme_bdev_ctrlr, 542 _bdev_nvme_reset_destroy_qpair, 543 NULL, 544 _bdev_nvme_reset_ctrlr); 545 } 546 547 return rc; 548 } 549 550 static int 551 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio) 552 { 553 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 554 int rc; 555 556 rc = _bdev_nvme_reset_start(nvme_ch->ctrlr); 557 if (rc == 0) { 558 /* First, delete all NVMe I/O queue pairs. */ 559 spdk_for_each_channel(nvme_ch->ctrlr, 560 _bdev_nvme_reset_destroy_qpair, 561 bio, 562 _bdev_nvme_reset_ctrlr); 563 } else if (rc == -EBUSY) { 564 /* Don't bother resetting if the controller is in the process of being destructed. */ 565 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 566 } else if (rc == -EAGAIN) { 567 /* 568 * Reset call is queued only if it is from the app framework. This is on purpose so that 569 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 570 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 571 */ 572 TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link); 573 } else { 574 return rc; 575 } 576 577 return 0; 578 } 579 580 static int 581 _bdev_nvme_failover_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove) 582 { 583 struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL; 584 int rc; 585 586 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 587 if (nvme_bdev_ctrlr->destruct) { 588 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 589 /* Don't bother resetting if the controller is in the process of being destructed. */ 590 return -EBUSY; 591 } 592 593 curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 594 assert(curr_trid); 595 assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid); 596 next_trid = TAILQ_NEXT(curr_trid, link); 597 598 if (nvme_bdev_ctrlr->resetting) { 599 if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) { 600 rc = -EAGAIN; 601 } else { 602 rc = -EBUSY; 603 } 604 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 605 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 606 return rc; 607 } 608 609 nvme_bdev_ctrlr->resetting = true; 610 curr_trid->is_failed = true; 611 612 if (next_trid) { 613 assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 614 615 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr, 616 curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid); 617 618 nvme_bdev_ctrlr->failover_in_progress = true; 619 spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr); 620 nvme_bdev_ctrlr->connected_trid = &next_trid->trid; 621 rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid); 622 assert(rc == 0); 623 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link); 624 if (!remove) { 625 /** Shuffle the old trid to the end of the list and use the new one. 626 * Allows for round robin through multiple connections. 627 */ 628 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link); 629 } else { 630 free(curr_trid); 631 } 632 } 633 634 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 635 return 0; 636 } 637 638 static int 639 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove) 640 { 641 int rc; 642 643 rc = _bdev_nvme_failover_start(nvme_bdev_ctrlr, remove); 644 if (rc == 0) { 645 /* First, delete all NVMe I/O queue pairs. */ 646 spdk_for_each_channel(nvme_bdev_ctrlr, 647 _bdev_nvme_reset_destroy_qpair, 648 NULL, 649 _bdev_nvme_reset_ctrlr); 650 } else if (rc != -EBUSY) { 651 return rc; 652 } 653 654 return 0; 655 } 656 657 static int 658 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 659 struct nvme_bdev_io *bio, 660 uint64_t offset_blocks, 661 uint64_t num_blocks); 662 663 static void 664 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 665 bool success) 666 { 667 struct spdk_bdev *bdev = bdev_io->bdev; 668 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 669 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 670 struct nvme_bdev_ns *nvme_ns; 671 struct spdk_nvme_qpair *qpair; 672 int ret; 673 674 if (!success) { 675 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 676 return; 677 } 678 679 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 680 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 681 return; 682 } 683 684 ret = bdev_nvme_readv(nvme_ns->ns, 685 qpair, 686 (struct nvme_bdev_io *)bdev_io->driver_ctx, 687 bdev_io->u.bdev.iovs, 688 bdev_io->u.bdev.iovcnt, 689 bdev_io->u.bdev.md_buf, 690 bdev_io->u.bdev.num_blocks, 691 bdev_io->u.bdev.offset_blocks, 692 bdev->dif_check_flags); 693 694 if (spdk_likely(ret == 0)) { 695 return; 696 } else if (ret == -ENOMEM) { 697 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); 698 } else { 699 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 700 } 701 } 702 703 static int 704 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 705 { 706 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 707 struct spdk_bdev *bdev = bdev_io->bdev; 708 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 709 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 710 struct nvme_bdev_io *nbdev_io_to_abort; 711 struct nvme_bdev_ns *nvme_ns; 712 struct spdk_nvme_qpair *qpair; 713 714 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 715 return -1; 716 } 717 718 switch (bdev_io->type) { 719 case SPDK_BDEV_IO_TYPE_READ: 720 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 721 return bdev_nvme_readv(nvme_ns->ns, 722 qpair, 723 nbdev_io, 724 bdev_io->u.bdev.iovs, 725 bdev_io->u.bdev.iovcnt, 726 bdev_io->u.bdev.md_buf, 727 bdev_io->u.bdev.num_blocks, 728 bdev_io->u.bdev.offset_blocks, 729 bdev->dif_check_flags); 730 } else { 731 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 732 bdev_io->u.bdev.num_blocks * bdev->blocklen); 733 return 0; 734 } 735 736 case SPDK_BDEV_IO_TYPE_WRITE: 737 return bdev_nvme_writev(nvme_ns->ns, 738 qpair, 739 nbdev_io, 740 bdev_io->u.bdev.iovs, 741 bdev_io->u.bdev.iovcnt, 742 bdev_io->u.bdev.md_buf, 743 bdev_io->u.bdev.num_blocks, 744 bdev_io->u.bdev.offset_blocks, 745 bdev->dif_check_flags); 746 747 case SPDK_BDEV_IO_TYPE_COMPARE: 748 return bdev_nvme_comparev(nvme_ns->ns, 749 qpair, 750 nbdev_io, 751 bdev_io->u.bdev.iovs, 752 bdev_io->u.bdev.iovcnt, 753 bdev_io->u.bdev.md_buf, 754 bdev_io->u.bdev.num_blocks, 755 bdev_io->u.bdev.offset_blocks, 756 bdev->dif_check_flags); 757 758 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 759 return bdev_nvme_comparev_and_writev(nvme_ns->ns, 760 qpair, 761 nbdev_io, 762 bdev_io->u.bdev.iovs, 763 bdev_io->u.bdev.iovcnt, 764 bdev_io->u.bdev.fused_iovs, 765 bdev_io->u.bdev.fused_iovcnt, 766 bdev_io->u.bdev.md_buf, 767 bdev_io->u.bdev.num_blocks, 768 bdev_io->u.bdev.offset_blocks, 769 bdev->dif_check_flags); 770 771 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 772 return bdev_nvme_unmap(nvme_ns->ns, 773 qpair, 774 nbdev_io, 775 bdev_io->u.bdev.offset_blocks, 776 bdev_io->u.bdev.num_blocks); 777 778 case SPDK_BDEV_IO_TYPE_UNMAP: 779 return bdev_nvme_unmap(nvme_ns->ns, 780 qpair, 781 nbdev_io, 782 bdev_io->u.bdev.offset_blocks, 783 bdev_io->u.bdev.num_blocks); 784 785 case SPDK_BDEV_IO_TYPE_RESET: 786 return bdev_nvme_reset(nvme_ch, nbdev_io); 787 788 case SPDK_BDEV_IO_TYPE_FLUSH: 789 return bdev_nvme_flush(nvme_ns->ns, 790 qpair, 791 nbdev_io, 792 bdev_io->u.bdev.offset_blocks, 793 bdev_io->u.bdev.num_blocks); 794 795 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 796 return bdev_nvme_admin_passthru(nvme_ch, 797 nbdev_io, 798 &bdev_io->u.nvme_passthru.cmd, 799 bdev_io->u.nvme_passthru.buf, 800 bdev_io->u.nvme_passthru.nbytes); 801 802 case SPDK_BDEV_IO_TYPE_NVME_IO: 803 return bdev_nvme_io_passthru(nvme_ns->ns, 804 qpair, 805 nbdev_io, 806 &bdev_io->u.nvme_passthru.cmd, 807 bdev_io->u.nvme_passthru.buf, 808 bdev_io->u.nvme_passthru.nbytes); 809 810 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 811 return bdev_nvme_io_passthru_md(nvme_ns->ns, 812 qpair, 813 nbdev_io, 814 &bdev_io->u.nvme_passthru.cmd, 815 bdev_io->u.nvme_passthru.buf, 816 bdev_io->u.nvme_passthru.nbytes, 817 bdev_io->u.nvme_passthru.md_buf, 818 bdev_io->u.nvme_passthru.md_len); 819 820 case SPDK_BDEV_IO_TYPE_ABORT: 821 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 822 return bdev_nvme_abort(nvme_ch, 823 nbdev_io, 824 nbdev_io_to_abort); 825 826 default: 827 return -EINVAL; 828 } 829 return 0; 830 } 831 832 static void 833 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 834 { 835 int rc = _bdev_nvme_submit_request(ch, bdev_io); 836 837 if (spdk_unlikely(rc != 0)) { 838 if (rc == -ENOMEM) { 839 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); 840 } else { 841 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 842 } 843 } 844 } 845 846 static bool 847 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 848 { 849 struct nvme_bdev *nbdev = ctx; 850 struct nvme_bdev_ns *nvme_ns; 851 struct spdk_nvme_ns *ns; 852 struct spdk_nvme_ctrlr *ctrlr; 853 const struct spdk_nvme_ctrlr_data *cdata; 854 855 nvme_ns = nvme_bdev_to_bdev_ns(nbdev); 856 assert(nvme_ns != NULL); 857 ns = nvme_ns->ns; 858 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 859 860 switch (io_type) { 861 case SPDK_BDEV_IO_TYPE_READ: 862 case SPDK_BDEV_IO_TYPE_WRITE: 863 case SPDK_BDEV_IO_TYPE_RESET: 864 case SPDK_BDEV_IO_TYPE_FLUSH: 865 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 866 case SPDK_BDEV_IO_TYPE_NVME_IO: 867 case SPDK_BDEV_IO_TYPE_ABORT: 868 return true; 869 870 case SPDK_BDEV_IO_TYPE_COMPARE: 871 return spdk_nvme_ns_supports_compare(ns); 872 873 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 874 return spdk_nvme_ns_get_md_size(ns) ? true : false; 875 876 case SPDK_BDEV_IO_TYPE_UNMAP: 877 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 878 return cdata->oncs.dsm; 879 880 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 881 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 882 /* 883 * If an NVMe controller guarantees reading unallocated blocks returns zero, 884 * we can implement WRITE_ZEROES as an NVMe deallocate command. 885 */ 886 if (cdata->oncs.dsm && 887 spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) == 888 SPDK_NVME_DEALLOC_READ_00) { 889 return true; 890 } 891 /* 892 * The NVMe controller write_zeroes function is currently not used by our driver. 893 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail. 894 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read. 895 */ 896 return false; 897 898 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 899 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 900 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 901 return true; 902 } 903 return false; 904 905 default: 906 return false; 907 } 908 } 909 910 static int 911 bdev_nvme_create_cb(void *io_device, void *ctx_buf) 912 { 913 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device; 914 struct nvme_io_channel *nvme_ch = ctx_buf; 915 struct spdk_io_channel *pg_ch = NULL; 916 int rc; 917 918 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 919 rc = bdev_ocssd_create_io_channel(nvme_ch); 920 if (rc != 0) { 921 return rc; 922 } 923 } 924 925 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 926 if (!pg_ch) { 927 rc = -1; 928 goto err_pg_ch; 929 } 930 931 nvme_ch->group = spdk_io_channel_get_ctx(pg_ch); 932 933 #ifdef SPDK_CONFIG_VTUNE 934 nvme_ch->group->collect_spin_stat = true; 935 #else 936 nvme_ch->group->collect_spin_stat = false; 937 #endif 938 939 TAILQ_INIT(&nvme_ch->pending_resets); 940 941 nvme_ch->ctrlr = nvme_bdev_ctrlr; 942 943 rc = bdev_nvme_create_qpair(nvme_ch); 944 if (rc != 0) { 945 goto err_qpair; 946 } 947 948 return 0; 949 950 err_qpair: 951 spdk_put_io_channel(pg_ch); 952 err_pg_ch: 953 if (nvme_ch->ocssd_ch) { 954 bdev_ocssd_destroy_io_channel(nvme_ch); 955 } 956 957 return rc; 958 } 959 960 static void 961 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf) 962 { 963 struct nvme_io_channel *nvme_ch = ctx_buf; 964 965 assert(nvme_ch->group != NULL); 966 967 if (nvme_ch->ocssd_ch != NULL) { 968 bdev_ocssd_destroy_io_channel(nvme_ch); 969 } 970 971 spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 972 973 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group)); 974 } 975 976 static int 977 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf) 978 { 979 struct nvme_bdev_poll_group *group = ctx_buf; 980 981 group->group = spdk_nvme_poll_group_create(group, NULL); 982 if (group->group == NULL) { 983 return -1; 984 } 985 986 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 987 988 if (group->poller == NULL) { 989 spdk_nvme_poll_group_destroy(group->group); 990 return -1; 991 } 992 993 return 0; 994 } 995 996 static void 997 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf) 998 { 999 struct nvme_bdev_poll_group *group = ctx_buf; 1000 1001 spdk_poller_unregister(&group->poller); 1002 if (spdk_nvme_poll_group_destroy(group->group)) { 1003 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module."); 1004 assert(false); 1005 } 1006 } 1007 1008 static struct spdk_io_channel * 1009 bdev_nvme_get_io_channel(void *ctx) 1010 { 1011 struct nvme_bdev *nvme_bdev = ctx; 1012 1013 return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr); 1014 } 1015 1016 static void * 1017 bdev_nvme_get_module_ctx(void *ctx) 1018 { 1019 struct nvme_bdev *nvme_bdev = ctx; 1020 1021 return bdev_nvme_get_ctrlr(&nvme_bdev->disk); 1022 } 1023 1024 static int 1025 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 1026 { 1027 struct nvme_bdev *nvme_bdev = ctx; 1028 struct nvme_bdev_ns *nvme_ns; 1029 struct spdk_nvme_ns *ns; 1030 struct spdk_nvme_ctrlr *ctrlr; 1031 const struct spdk_nvme_ctrlr_data *cdata; 1032 const struct spdk_nvme_transport_id *trid; 1033 union spdk_nvme_vs_register vs; 1034 union spdk_nvme_csts_register csts; 1035 char buf[128]; 1036 1037 nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev); 1038 assert(nvme_ns != NULL); 1039 ns = nvme_ns->ns; 1040 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 1041 1042 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1043 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 1044 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 1045 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1046 1047 spdk_json_write_named_object_begin(w, "nvme"); 1048 1049 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1050 spdk_json_write_named_string(w, "pci_address", trid->traddr); 1051 } 1052 1053 spdk_json_write_named_object_begin(w, "trid"); 1054 1055 nvme_bdev_dump_trid_json(trid, w); 1056 1057 spdk_json_write_object_end(w); 1058 1059 #ifdef SPDK_CONFIG_NVME_CUSE 1060 size_t cuse_name_size = 128; 1061 char cuse_name[cuse_name_size]; 1062 1063 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 1064 cuse_name, &cuse_name_size); 1065 if (rc == 0) { 1066 spdk_json_write_named_string(w, "cuse_device", cuse_name); 1067 } 1068 #endif 1069 1070 spdk_json_write_named_object_begin(w, "ctrlr_data"); 1071 1072 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 1073 1074 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 1075 spdk_str_trim(buf); 1076 spdk_json_write_named_string(w, "model_number", buf); 1077 1078 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 1079 spdk_str_trim(buf); 1080 spdk_json_write_named_string(w, "serial_number", buf); 1081 1082 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 1083 spdk_str_trim(buf); 1084 spdk_json_write_named_string(w, "firmware_revision", buf); 1085 1086 if (cdata->subnqn[0] != '\0') { 1087 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 1088 } 1089 1090 spdk_json_write_named_object_begin(w, "oacs"); 1091 1092 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 1093 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 1094 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 1095 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 1096 1097 spdk_json_write_object_end(w); 1098 1099 spdk_json_write_object_end(w); 1100 1101 spdk_json_write_named_object_begin(w, "vs"); 1102 1103 spdk_json_write_name(w, "nvme_version"); 1104 if (vs.bits.ter) { 1105 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 1106 } else { 1107 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 1108 } 1109 1110 spdk_json_write_object_end(w); 1111 1112 spdk_json_write_named_object_begin(w, "csts"); 1113 1114 spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy); 1115 spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs); 1116 1117 spdk_json_write_object_end(w); 1118 1119 spdk_json_write_named_object_begin(w, "ns_data"); 1120 1121 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 1122 1123 spdk_json_write_object_end(w); 1124 1125 if (cdata->oacs.security) { 1126 spdk_json_write_named_object_begin(w, "security"); 1127 1128 spdk_json_write_named_bool(w, "opal", nvme_bdev->opal); 1129 1130 spdk_json_write_object_end(w); 1131 } 1132 1133 spdk_json_write_object_end(w); 1134 1135 return 0; 1136 } 1137 1138 static void 1139 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1140 { 1141 /* No config per bdev needed */ 1142 } 1143 1144 static uint64_t 1145 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 1146 { 1147 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 1148 struct nvme_bdev_poll_group *group = nvme_ch->group; 1149 uint64_t spin_time; 1150 1151 if (!group || !group->collect_spin_stat) { 1152 return 0; 1153 } 1154 1155 if (group->end_ticks != 0) { 1156 group->spin_ticks += (group->end_ticks - group->start_ticks); 1157 group->end_ticks = 0; 1158 } 1159 1160 spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz(); 1161 group->start_ticks = 0; 1162 group->spin_ticks = 0; 1163 1164 return spin_time; 1165 } 1166 1167 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 1168 .destruct = bdev_nvme_destruct, 1169 .submit_request = bdev_nvme_submit_request, 1170 .io_type_supported = bdev_nvme_io_type_supported, 1171 .get_io_channel = bdev_nvme_get_io_channel, 1172 .dump_info_json = bdev_nvme_dump_info_json, 1173 .write_config_json = bdev_nvme_write_config_json, 1174 .get_spin_time = bdev_nvme_get_spin_time, 1175 .get_module_ctx = bdev_nvme_get_module_ctx, 1176 }; 1177 1178 static int 1179 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 1180 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 1181 uint32_t prchk_flags, void *ctx) 1182 { 1183 const struct spdk_uuid *uuid; 1184 const struct spdk_nvme_ctrlr_data *cdata; 1185 const struct spdk_nvme_ns_data *nsdata; 1186 int rc; 1187 1188 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1189 1190 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 1191 if (!disk->name) { 1192 return -ENOMEM; 1193 } 1194 disk->product_name = "NVMe disk"; 1195 1196 disk->write_cache = 0; 1197 if (cdata->vwc.present) { 1198 /* Enable if the Volatile Write Cache exists */ 1199 disk->write_cache = 1; 1200 } 1201 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 1202 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 1203 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 1204 1205 uuid = spdk_nvme_ns_get_uuid(ns); 1206 if (uuid != NULL) { 1207 disk->uuid = *uuid; 1208 } 1209 1210 nsdata = spdk_nvme_ns_get_data(ns); 1211 1212 disk->md_len = spdk_nvme_ns_get_md_size(ns); 1213 if (disk->md_len != 0) { 1214 disk->md_interleave = nsdata->flbas.extended; 1215 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 1216 if (disk->dif_type != SPDK_DIF_DISABLE) { 1217 disk->dif_is_head_of_md = nsdata->dps.md_start; 1218 disk->dif_check_flags = prchk_flags; 1219 } 1220 } 1221 1222 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 1223 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 1224 disk->acwu = 0; 1225 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 1226 disk->acwu = nsdata->nacwu; 1227 } else { 1228 disk->acwu = cdata->acwu; 1229 } 1230 1231 disk->ctxt = ctx; 1232 disk->fn_table = &nvmelib_fn_table; 1233 disk->module = &nvme_if; 1234 rc = spdk_bdev_register(disk); 1235 if (rc) { 1236 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 1237 free(disk->name); 1238 return rc; 1239 } 1240 1241 return 0; 1242 } 1243 1244 static int 1245 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns) 1246 { 1247 struct nvme_bdev *bdev; 1248 int rc; 1249 1250 bdev = calloc(1, sizeof(*bdev)); 1251 if (!bdev) { 1252 SPDK_ERRLOG("bdev calloc() failed\n"); 1253 return -ENOMEM; 1254 } 1255 1256 bdev->nvme_ns = nvme_ns; 1257 bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL; 1258 1259 rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr, 1260 nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev); 1261 if (rc != 0) { 1262 SPDK_ERRLOG("Failed to create NVMe disk\n"); 1263 free(bdev); 1264 return rc; 1265 } 1266 1267 nvme_ns->ref++; 1268 nvme_ns->bdev = bdev; 1269 1270 return 0; 1271 } 1272 1273 static void 1274 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1275 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx) 1276 { 1277 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1278 struct spdk_nvme_ns *ns; 1279 int rc = 0; 1280 1281 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 1282 if (!ns) { 1283 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 1284 rc = -EINVAL; 1285 goto done; 1286 } 1287 1288 nvme_ns->ns = ns; 1289 nvme_ns->ref = 1; 1290 1291 rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns); 1292 done: 1293 nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc); 1294 } 1295 1296 static bool 1297 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1298 struct spdk_nvme_ctrlr_opts *opts) 1299 { 1300 struct nvme_probe_skip_entry *entry; 1301 1302 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 1303 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 1304 return false; 1305 } 1306 } 1307 1308 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 1309 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 1310 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 1311 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 1312 1313 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 1314 1315 return true; 1316 } 1317 1318 static void 1319 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 1320 { 1321 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1322 1323 if (spdk_nvme_cpl_is_error(cpl)) { 1324 SPDK_WARNLOG("Abort failed. Resetting controller.\n"); 1325 _bdev_nvme_reset(nvme_bdev_ctrlr); 1326 } 1327 } 1328 1329 static void 1330 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 1331 struct spdk_nvme_qpair *qpair, uint16_t cid) 1332 { 1333 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg; 1334 union spdk_nvme_csts_register csts; 1335 int rc; 1336 1337 assert(nvme_bdev_ctrlr->ctrlr == ctrlr); 1338 1339 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 1340 1341 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 1342 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 1343 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 1344 * completion recursively. 1345 */ 1346 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 1347 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1348 if (csts.bits.cfs) { 1349 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 1350 _bdev_nvme_reset(nvme_bdev_ctrlr); 1351 return; 1352 } 1353 } 1354 1355 switch (g_opts.action_on_timeout) { 1356 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 1357 if (qpair) { 1358 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 1359 nvme_abort_cpl, nvme_bdev_ctrlr); 1360 if (rc == 0) { 1361 return; 1362 } 1363 1364 SPDK_ERRLOG("Unable to send abort. Resetting.\n"); 1365 } 1366 1367 /* FALLTHROUGH */ 1368 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 1369 _bdev_nvme_reset(nvme_bdev_ctrlr); 1370 break; 1371 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 1372 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 1373 break; 1374 default: 1375 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 1376 break; 1377 } 1378 } 1379 1380 void 1381 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ns *nvme_ns) 1382 { 1383 nvme_bdev_ns_detach(nvme_ns); 1384 } 1385 1386 static void 1387 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns) 1388 { 1389 struct nvme_bdev *bdev; 1390 1391 bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1392 if (bdev != NULL) { 1393 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 1394 } 1395 1396 nvme_ns->populated = false; 1397 1398 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 1399 } 1400 1401 static void 1402 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns, 1403 struct nvme_async_probe_ctx *ctx) 1404 { 1405 g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx); 1406 } 1407 1408 static void 1409 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns) 1410 { 1411 g_depopulate_namespace_fn[nvme_ns->type](nvme_ns); 1412 } 1413 1414 void 1415 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx, 1416 struct nvme_bdev_ns *nvme_ns, int rc) 1417 { 1418 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_ns->ctrlr; 1419 1420 assert(nvme_bdev_ctrlr != NULL); 1421 1422 if (rc == 0) { 1423 nvme_ns->populated = true; 1424 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1425 nvme_bdev_ctrlr->ref++; 1426 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1427 } else { 1428 memset(nvme_ns, 0, sizeof(*nvme_ns)); 1429 } 1430 1431 if (ctx) { 1432 ctx->populates_in_progress--; 1433 if (ctx->populates_in_progress == 0) { 1434 nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx); 1435 } 1436 } 1437 } 1438 1439 static void 1440 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1441 struct nvme_async_probe_ctx *ctx) 1442 { 1443 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1444 struct nvme_bdev_ns *nvme_ns; 1445 struct spdk_nvme_ns *ns; 1446 struct nvme_bdev *bdev; 1447 uint32_t i; 1448 int rc; 1449 uint64_t num_sectors; 1450 bool ns_is_active; 1451 1452 if (ctx) { 1453 /* Initialize this count to 1 to handle the populate functions 1454 * calling nvme_ctrlr_populate_namespace_done() immediately. 1455 */ 1456 ctx->populates_in_progress = 1; 1457 } 1458 1459 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1460 uint32_t nsid = i + 1; 1461 1462 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 1463 ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid); 1464 1465 if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) { 1466 /* NS is still there but attributes may have changed */ 1467 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 1468 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 1469 bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1470 assert(bdev != NULL); 1471 if (bdev->disk.blockcnt != num_sectors) { 1472 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 1473 nsid, 1474 bdev->disk.name, 1475 bdev->disk.blockcnt, 1476 num_sectors); 1477 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 1478 if (rc != 0) { 1479 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 1480 bdev->disk.name, rc); 1481 } 1482 } 1483 } 1484 1485 if (!nvme_ns->populated && ns_is_active) { 1486 nvme_ns->id = nsid; 1487 nvme_ns->ctrlr = nvme_bdev_ctrlr; 1488 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 1489 nvme_ns->type = NVME_BDEV_NS_OCSSD; 1490 } else { 1491 nvme_ns->type = NVME_BDEV_NS_STANDARD; 1492 } 1493 1494 nvme_ns->bdev = NULL; 1495 1496 if (ctx) { 1497 ctx->populates_in_progress++; 1498 } 1499 nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx); 1500 } 1501 1502 if (nvme_ns->populated && !ns_is_active) { 1503 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1504 } 1505 } 1506 1507 if (ctx) { 1508 /* Decrement this count now that the loop is over to account 1509 * for the one we started with. If the count is then 0, we 1510 * know any populate_namespace functions completed immediately, 1511 * so we'll kick the callback here. 1512 */ 1513 ctx->populates_in_progress--; 1514 if (ctx->populates_in_progress == 0) { 1515 nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx); 1516 } 1517 } 1518 1519 } 1520 1521 static void 1522 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 1523 { 1524 uint32_t i; 1525 struct nvme_bdev_ns *nvme_ns; 1526 1527 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1528 uint32_t nsid = i + 1; 1529 1530 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1531 if (nvme_ns->populated) { 1532 assert(nvme_ns->id == nsid); 1533 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1534 } 1535 } 1536 } 1537 1538 static void 1539 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 1540 { 1541 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 1542 union spdk_nvme_async_event_completion event; 1543 1544 if (spdk_nvme_cpl_is_error(cpl)) { 1545 SPDK_WARNLOG("AER request execute failed"); 1546 return; 1547 } 1548 1549 event.raw = cpl->cdw0; 1550 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 1551 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 1552 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1553 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) && 1554 (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) && 1555 spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1556 bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr); 1557 } 1558 } 1559 1560 static int 1561 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 1562 const char *name, 1563 const struct spdk_nvme_transport_id *trid, 1564 uint32_t prchk_flags, 1565 struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr) 1566 { 1567 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1568 struct nvme_bdev_ctrlr_trid *trid_entry; 1569 uint32_t i; 1570 int rc; 1571 1572 nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr)); 1573 if (nvme_bdev_ctrlr == NULL) { 1574 SPDK_ERRLOG("Failed to allocate device struct\n"); 1575 return -ENOMEM; 1576 } 1577 1578 rc = pthread_mutex_init(&nvme_bdev_ctrlr->mutex, NULL); 1579 if (rc != 0) { 1580 goto err_init_mutex; 1581 } 1582 1583 TAILQ_INIT(&nvme_bdev_ctrlr->trids); 1584 nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 1585 if (nvme_bdev_ctrlr->num_ns != 0) { 1586 nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *)); 1587 if (!nvme_bdev_ctrlr->namespaces) { 1588 SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); 1589 rc = -ENOMEM; 1590 goto err_alloc_namespaces; 1591 } 1592 } 1593 1594 trid_entry = calloc(1, sizeof(*trid_entry)); 1595 if (trid_entry == NULL) { 1596 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 1597 rc = -ENOMEM; 1598 goto err_alloc_trid; 1599 } 1600 1601 trid_entry->trid = *trid; 1602 1603 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1604 nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns)); 1605 if (nvme_bdev_ctrlr->namespaces[i] == NULL) { 1606 SPDK_ERRLOG("Failed to allocate block namespace struct\n"); 1607 rc = -ENOMEM; 1608 goto err_alloc_namespace; 1609 } 1610 } 1611 1612 nvme_bdev_ctrlr->thread = spdk_get_thread(); 1613 nvme_bdev_ctrlr->adminq_timer_poller = NULL; 1614 nvme_bdev_ctrlr->ctrlr = ctrlr; 1615 nvme_bdev_ctrlr->ref = 1; 1616 nvme_bdev_ctrlr->connected_trid = &trid_entry->trid; 1617 nvme_bdev_ctrlr->name = strdup(name); 1618 if (nvme_bdev_ctrlr->name == NULL) { 1619 rc = -ENOMEM; 1620 goto err_alloc_name; 1621 } 1622 1623 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1624 rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr); 1625 if (spdk_unlikely(rc != 0)) { 1626 SPDK_ERRLOG("Unable to initialize OCSSD controller\n"); 1627 goto err_init_ocssd; 1628 } 1629 } 1630 1631 nvme_bdev_ctrlr->prchk_flags = prchk_flags; 1632 1633 spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb, 1634 sizeof(struct nvme_io_channel), 1635 name); 1636 1637 nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr, 1638 g_opts.nvme_adminq_poll_period_us); 1639 1640 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); 1641 1642 if (g_opts.timeout_us > 0) { 1643 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 1644 timeout_cb, nvme_bdev_ctrlr); 1645 } 1646 1647 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr); 1648 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr); 1649 1650 if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) & 1651 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 1652 nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr); 1653 if (nvme_bdev_ctrlr->opal_dev == NULL) { 1654 SPDK_ERRLOG("Failed to initialize Opal\n"); 1655 } 1656 } 1657 1658 TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link); 1659 1660 if (_nvme_bdev_ctrlr != NULL) { 1661 *_nvme_bdev_ctrlr = nvme_bdev_ctrlr; 1662 } 1663 return 0; 1664 1665 err_init_ocssd: 1666 free(nvme_bdev_ctrlr->name); 1667 err_alloc_name: 1668 err_alloc_namespace: 1669 for (; i > 0; i--) { 1670 free(nvme_bdev_ctrlr->namespaces[i - 1]); 1671 } 1672 free(trid_entry); 1673 err_alloc_trid: 1674 free(nvme_bdev_ctrlr->namespaces); 1675 err_alloc_namespaces: 1676 pthread_mutex_destroy(&nvme_bdev_ctrlr->mutex); 1677 err_init_mutex: 1678 free(nvme_bdev_ctrlr); 1679 return rc; 1680 } 1681 1682 static void 1683 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1684 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 1685 { 1686 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1687 struct nvme_probe_ctx *ctx = cb_ctx; 1688 char *name = NULL; 1689 uint32_t prchk_flags = 0; 1690 size_t i; 1691 int rc; 1692 1693 if (ctx) { 1694 for (i = 0; i < ctx->count; i++) { 1695 if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { 1696 prchk_flags = ctx->prchk_flags[i]; 1697 name = strdup(ctx->names[i]); 1698 break; 1699 } 1700 } 1701 } else { 1702 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 1703 } 1704 if (!name) { 1705 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 1706 return; 1707 } 1708 1709 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 1710 1711 rc = nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, &nvme_bdev_ctrlr); 1712 if (rc != 0) { 1713 SPDK_ERRLOG("Failed to create new NVMe controller\n"); 1714 free(name); 1715 return; 1716 } 1717 1718 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1719 1720 free(name); 1721 } 1722 1723 static void 1724 _nvme_bdev_ctrlr_destruct(void *ctx) 1725 { 1726 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1727 1728 nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr); 1729 nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1730 } 1731 1732 static int 1733 _bdev_nvme_delete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool hotplug) 1734 { 1735 struct nvme_probe_skip_entry *entry; 1736 1737 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1738 1739 /* The controller's destruction was already started */ 1740 if (nvme_bdev_ctrlr->destruct) { 1741 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1742 return 0; 1743 } 1744 1745 if (!hotplug && 1746 nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1747 entry = calloc(1, sizeof(*entry)); 1748 if (!entry) { 1749 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1750 return -ENOMEM; 1751 } 1752 entry->trid = *nvme_bdev_ctrlr->connected_trid; 1753 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 1754 } 1755 1756 nvme_bdev_ctrlr->destruct = true; 1757 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1758 1759 _nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1760 1761 return 0; 1762 } 1763 1764 static void 1765 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 1766 { 1767 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx; 1768 1769 _bdev_nvme_delete(nvme_bdev_ctrlr, true); 1770 } 1771 1772 static int 1773 bdev_nvme_hotplug_probe(void *arg) 1774 { 1775 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 1776 g_hotplug_probe_ctx = NULL; 1777 spdk_poller_unregister(&g_hotplug_probe_poller); 1778 } 1779 1780 return SPDK_POLLER_BUSY; 1781 } 1782 1783 static int 1784 bdev_nvme_hotplug(void *arg) 1785 { 1786 struct spdk_nvme_transport_id trid_pcie; 1787 1788 if (g_hotplug_probe_ctx) { 1789 return SPDK_POLLER_BUSY; 1790 } 1791 1792 memset(&trid_pcie, 0, sizeof(trid_pcie)); 1793 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 1794 1795 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 1796 hotplug_probe_cb, attach_cb, NULL); 1797 1798 if (g_hotplug_probe_ctx) { 1799 assert(g_hotplug_probe_poller == NULL); 1800 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 1801 } 1802 1803 return SPDK_POLLER_BUSY; 1804 } 1805 1806 void 1807 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 1808 { 1809 *opts = g_opts; 1810 } 1811 1812 int 1813 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 1814 { 1815 if (g_bdev_nvme_init_thread != NULL) { 1816 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 1817 return -EPERM; 1818 } 1819 } 1820 1821 g_opts = *opts; 1822 1823 return 0; 1824 } 1825 1826 struct set_nvme_hotplug_ctx { 1827 uint64_t period_us; 1828 bool enabled; 1829 spdk_msg_fn fn; 1830 void *fn_ctx; 1831 }; 1832 1833 static void 1834 set_nvme_hotplug_period_cb(void *_ctx) 1835 { 1836 struct set_nvme_hotplug_ctx *ctx = _ctx; 1837 1838 spdk_poller_unregister(&g_hotplug_poller); 1839 if (ctx->enabled) { 1840 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 1841 } 1842 1843 g_nvme_hotplug_poll_period_us = ctx->period_us; 1844 g_nvme_hotplug_enabled = ctx->enabled; 1845 if (ctx->fn) { 1846 ctx->fn(ctx->fn_ctx); 1847 } 1848 1849 free(ctx); 1850 } 1851 1852 int 1853 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 1854 { 1855 struct set_nvme_hotplug_ctx *ctx; 1856 1857 if (enabled == true && !spdk_process_is_primary()) { 1858 return -EPERM; 1859 } 1860 1861 ctx = calloc(1, sizeof(*ctx)); 1862 if (ctx == NULL) { 1863 return -ENOMEM; 1864 } 1865 1866 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 1867 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 1868 ctx->enabled = enabled; 1869 ctx->fn = cb; 1870 ctx->fn_ctx = cb_ctx; 1871 1872 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 1873 return 0; 1874 } 1875 1876 static void 1877 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 1878 { 1879 if (ctx->cb_fn) { 1880 ctx->cb_fn(ctx->cb_ctx, count, rc); 1881 } 1882 1883 ctx->namespaces_populated = true; 1884 if (ctx->probe_done) { 1885 /* The probe was already completed, so we need to free the context 1886 * here. This can happen for cases like OCSSD, where we need to 1887 * send additional commands to the SSD after attach. 1888 */ 1889 free(ctx); 1890 } 1891 } 1892 1893 static void 1894 nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1895 struct nvme_async_probe_ctx *ctx) 1896 { 1897 struct nvme_bdev_ns *nvme_ns; 1898 struct nvme_bdev *nvme_bdev; 1899 uint32_t i, nsid; 1900 size_t j; 1901 1902 assert(nvme_bdev_ctrlr != NULL); 1903 1904 /* 1905 * Report the new bdevs that were created in this call. 1906 * There can be more than one bdev per NVMe controller. 1907 */ 1908 j = 0; 1909 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1910 nsid = i + 1; 1911 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1912 if (!nvme_ns->populated) { 1913 continue; 1914 } 1915 assert(nvme_ns->id == nsid); 1916 nvme_bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1917 if (nvme_bdev == NULL) { 1918 assert(nvme_ns->type == NVME_BDEV_NS_OCSSD); 1919 continue; 1920 } 1921 if (j < ctx->count) { 1922 ctx->names[j] = nvme_bdev->disk.name; 1923 j++; 1924 } else { 1925 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 1926 ctx->count); 1927 populate_namespaces_cb(ctx, 0, -ERANGE); 1928 return; 1929 } 1930 } 1931 1932 populate_namespaces_cb(ctx, j, 0); 1933 } 1934 1935 static bool 1936 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 1937 { 1938 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 1939 1940 nsdata1 = spdk_nvme_ns_get_data(ns1); 1941 nsdata2 = spdk_nvme_ns_get_data(ns2); 1942 1943 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)); 1944 } 1945 1946 static int 1947 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_ctrlr *new_ctrlr, 1948 struct spdk_nvme_transport_id *trid) 1949 { 1950 uint32_t i, nsid; 1951 struct nvme_bdev_ns *nvme_ns; 1952 struct spdk_nvme_ns *new_ns; 1953 struct nvme_bdev_ctrlr_trid *new_trid, *tmp_trid; 1954 int rc = 0; 1955 1956 assert(nvme_bdev_ctrlr != NULL); 1957 1958 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1959 SPDK_ERRLOG("PCIe failover is not supported.\n"); 1960 return -ENOTSUP; 1961 } 1962 1963 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1964 1965 /* Currently we only support failover to the same transport type. */ 1966 if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) { 1967 rc = -EINVAL; 1968 goto exit; 1969 } 1970 1971 /* Currently we only support failover to the same NQN. */ 1972 if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 1973 rc = -EINVAL; 1974 goto exit; 1975 } 1976 1977 /* Skip all the other checks if we've already registered this path. */ 1978 TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) { 1979 if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) { 1980 rc = -EEXIST; 1981 goto exit; 1982 } 1983 } 1984 1985 if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) { 1986 rc = -EINVAL; 1987 goto exit; 1988 } 1989 1990 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1991 nsid = i + 1; 1992 1993 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 1994 if (!nvme_ns->populated) { 1995 continue; 1996 } 1997 1998 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid); 1999 assert(new_ns != NULL); 2000 2001 if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) { 2002 rc = -EINVAL; 2003 goto exit; 2004 } 2005 } 2006 2007 new_trid = calloc(1, sizeof(*new_trid)); 2008 if (new_trid == NULL) { 2009 rc = -ENOMEM; 2010 goto exit; 2011 } 2012 new_trid->trid = *trid; 2013 new_trid->is_failed = false; 2014 2015 TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) { 2016 if (tmp_trid->is_failed) { 2017 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 2018 goto exit; 2019 } 2020 } 2021 2022 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link); 2023 2024 exit: 2025 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2026 return rc; 2027 } 2028 2029 static void 2030 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2031 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2032 { 2033 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 2034 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2035 struct nvme_async_probe_ctx *ctx; 2036 int rc; 2037 2038 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 2039 ctx->ctrlr_attached = true; 2040 2041 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name); 2042 if (nvme_bdev_ctrlr) { 2043 /* This is the case that a secondary path is added to an existing 2044 * nvme_bdev_ctrlr for failover. After checking if it can access the same 2045 * namespaces as the primary path, it is disconnected until failover occurs. 2046 */ 2047 rc = bdev_nvme_add_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid); 2048 2049 spdk_nvme_detach(ctrlr); 2050 goto exit; 2051 } 2052 2053 rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, 2054 &nvme_bdev_ctrlr); 2055 if (rc) { 2056 SPDK_ERRLOG("Failed to create new device\n"); 2057 goto exit; 2058 } 2059 2060 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx); 2061 return; 2062 2063 exit: 2064 populate_namespaces_cb(ctx, 0, rc); 2065 } 2066 2067 static int 2068 bdev_nvme_async_poll(void *arg) 2069 { 2070 struct nvme_async_probe_ctx *ctx = arg; 2071 int rc; 2072 2073 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 2074 if (spdk_unlikely(rc != -EAGAIN)) { 2075 ctx->probe_done = true; 2076 spdk_poller_unregister(&ctx->poller); 2077 if (!ctx->ctrlr_attached) { 2078 /* The probe is done, but no controller was attached. 2079 * That means we had a failure, so report -EIO back to 2080 * the caller (usually the RPC). populate_namespaces_cb() 2081 * will take care of freeing the nvme_async_probe_ctx. 2082 */ 2083 populate_namespaces_cb(ctx, 0, -EIO); 2084 } else if (ctx->namespaces_populated) { 2085 /* The namespaces for the attached controller were all 2086 * populated and the response was already sent to the 2087 * caller (usually the RPC). So free the context here. 2088 */ 2089 free(ctx); 2090 } 2091 } 2092 2093 return SPDK_POLLER_BUSY; 2094 } 2095 2096 int 2097 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 2098 struct spdk_nvme_host_id *hostid, 2099 const char *base_name, 2100 const char **names, 2101 uint32_t count, 2102 const char *hostnqn, 2103 uint32_t prchk_flags, 2104 spdk_bdev_create_nvme_fn cb_fn, 2105 void *cb_ctx, 2106 struct spdk_nvme_ctrlr_opts *opts) 2107 { 2108 struct nvme_probe_skip_entry *entry, *tmp; 2109 struct nvme_async_probe_ctx *ctx; 2110 2111 /* TODO expand this check to include both the host and target TRIDs. 2112 * Only if both are the same should we fail. 2113 */ 2114 if (nvme_bdev_ctrlr_get(trid) != NULL) { 2115 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 2116 return -EEXIST; 2117 } 2118 2119 ctx = calloc(1, sizeof(*ctx)); 2120 if (!ctx) { 2121 return -ENOMEM; 2122 } 2123 ctx->base_name = base_name; 2124 ctx->names = names; 2125 ctx->count = count; 2126 ctx->cb_fn = cb_fn; 2127 ctx->cb_ctx = cb_ctx; 2128 ctx->prchk_flags = prchk_flags; 2129 ctx->trid = *trid; 2130 2131 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2132 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 2133 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2134 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2135 free(entry); 2136 break; 2137 } 2138 } 2139 } 2140 2141 if (opts) { 2142 memcpy(&ctx->opts, opts, sizeof(*opts)); 2143 } else { 2144 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 2145 } 2146 2147 ctx->opts.transport_retry_count = g_opts.retry_count; 2148 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 2149 2150 if (hostnqn) { 2151 snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn); 2152 } 2153 2154 if (hostid->hostaddr[0] != '\0') { 2155 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 2156 } 2157 2158 if (hostid->hostsvcid[0] != '\0') { 2159 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 2160 } 2161 2162 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb); 2163 if (ctx->probe_ctx == NULL) { 2164 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 2165 free(ctx); 2166 return -ENODEV; 2167 } 2168 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 2169 2170 return 0; 2171 } 2172 2173 int 2174 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid) 2175 { 2176 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2177 struct nvme_bdev_ctrlr_trid *ctrlr_trid, *tmp_trid; 2178 2179 if (name == NULL) { 2180 return -EINVAL; 2181 } 2182 2183 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 2184 if (nvme_bdev_ctrlr == NULL) { 2185 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2186 return -ENODEV; 2187 } 2188 2189 /* case 1: remove the controller itself. */ 2190 if (trid == NULL) { 2191 return _bdev_nvme_delete(nvme_bdev_ctrlr, false); 2192 } 2193 2194 /* case 2: we are currently using the path to be removed. */ 2195 if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) { 2196 ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 2197 assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid); 2198 /* case 2A: the current path is the only path. */ 2199 if (!TAILQ_NEXT(ctrlr_trid, link)) { 2200 return _bdev_nvme_delete(nvme_bdev_ctrlr, false); 2201 } 2202 2203 /* case 1B: there is an alternative path. */ 2204 return bdev_nvme_failover(nvme_bdev_ctrlr, true); 2205 } 2206 /* case 3: We are not using the specified path. */ 2207 TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) { 2208 if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { 2209 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link); 2210 free(ctrlr_trid); 2211 return 0; 2212 } 2213 } 2214 2215 /* case 3A: The address isn't even in the registered list. */ 2216 return -ENXIO; 2217 } 2218 2219 static int 2220 bdev_nvme_library_init(void) 2221 { 2222 g_bdev_nvme_init_thread = spdk_get_thread(); 2223 2224 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb, 2225 bdev_nvme_poll_group_destroy_cb, 2226 sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups"); 2227 2228 return 0; 2229 } 2230 2231 static void 2232 bdev_nvme_library_fini(void) 2233 { 2234 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp; 2235 struct nvme_probe_skip_entry *entry, *entry_tmp; 2236 2237 spdk_poller_unregister(&g_hotplug_poller); 2238 free(g_hotplug_probe_ctx); 2239 2240 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2241 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2242 free(entry); 2243 } 2244 2245 pthread_mutex_lock(&g_bdev_nvme_mutex); 2246 TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) { 2247 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 2248 if (nvme_bdev_ctrlr->destruct) { 2249 /* This controller's destruction was already started 2250 * before the application started shutting down 2251 */ 2252 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2253 continue; 2254 } 2255 nvme_bdev_ctrlr->destruct = true; 2256 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2257 2258 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct, 2259 nvme_bdev_ctrlr); 2260 } 2261 2262 g_bdev_nvme_module_finish = true; 2263 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 2264 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2265 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 2266 spdk_bdev_module_finish_done(); 2267 return; 2268 } 2269 2270 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2271 } 2272 2273 static void 2274 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io) 2275 { 2276 struct spdk_bdev *bdev = bdev_io->bdev; 2277 struct spdk_dif_ctx dif_ctx; 2278 struct spdk_dif_error err_blk = {}; 2279 int rc; 2280 2281 rc = spdk_dif_ctx_init(&dif_ctx, 2282 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2283 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2284 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2285 if (rc != 0) { 2286 SPDK_ERRLOG("Initialization of DIF context failed\n"); 2287 return; 2288 } 2289 2290 if (bdev->md_interleave) { 2291 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2292 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2293 } else { 2294 struct iovec md_iov = { 2295 .iov_base = bdev_io->u.bdev.md_buf, 2296 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2297 }; 2298 2299 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2300 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2301 } 2302 2303 if (rc != 0) { 2304 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 2305 err_blk.err_type, err_blk.err_offset); 2306 } else { 2307 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2308 } 2309 } 2310 2311 static void 2312 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2313 { 2314 struct nvme_bdev_io *bio = ref; 2315 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2316 2317 if (spdk_nvme_cpl_is_success(cpl)) { 2318 /* Run PI verification for read data buffer. */ 2319 bdev_nvme_verify_pi_error(bdev_io); 2320 } 2321 2322 /* Return original completion status */ 2323 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, 2324 bio->cpl.status.sc); 2325 } 2326 2327 static void 2328 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2329 { 2330 struct nvme_bdev_io *bio = ref; 2331 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2332 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2333 struct nvme_io_channel *nvme_ch; 2334 struct nvme_bdev_ns *nvme_ns; 2335 struct spdk_nvme_qpair *qpair; 2336 int ret; 2337 2338 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2339 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2340 cpl->status.sct, cpl->status.sc); 2341 2342 /* Save completion status to use after verifying PI error. */ 2343 bio->cpl = *cpl; 2344 2345 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2346 2347 if (spdk_likely(bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 2348 /* Read without PI checking to verify PI error. */ 2349 ret = bdev_nvme_no_pi_readv(nvme_ns->ns, 2350 qpair, 2351 bio, 2352 bdev_io->u.bdev.iovs, 2353 bdev_io->u.bdev.iovcnt, 2354 bdev_io->u.bdev.md_buf, 2355 bdev_io->u.bdev.num_blocks, 2356 bdev_io->u.bdev.offset_blocks); 2357 if (ret == 0) { 2358 return; 2359 } 2360 } 2361 } 2362 2363 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2364 } 2365 2366 static void 2367 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2368 { 2369 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2370 2371 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2372 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2373 cpl->status.sct, cpl->status.sc); 2374 /* Run PI verification for write data buffer if PI error is detected. */ 2375 bdev_nvme_verify_pi_error(bdev_io); 2376 } 2377 2378 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2379 } 2380 2381 static void 2382 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2383 { 2384 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2385 2386 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2387 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2388 cpl->status.sct, cpl->status.sc); 2389 /* Run PI verification for compare data buffer if PI error is detected. */ 2390 bdev_nvme_verify_pi_error(bdev_io); 2391 } 2392 2393 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2394 } 2395 2396 static void 2397 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2398 { 2399 struct nvme_bdev_io *bio = ref; 2400 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2401 2402 /* Compare operation completion */ 2403 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2404 /* Save compare result for write callback */ 2405 bio->cpl = *cpl; 2406 return; 2407 } 2408 2409 /* Write operation completion */ 2410 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2411 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2412 * complete the IO with the compare operation's status. 2413 */ 2414 if (!spdk_nvme_cpl_is_error(cpl)) { 2415 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2416 } 2417 2418 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2419 } else { 2420 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2421 } 2422 } 2423 2424 static void 2425 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 2426 { 2427 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2428 2429 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2430 } 2431 2432 static void 2433 bdev_nvme_admin_passthru_completion(void *ctx) 2434 { 2435 struct nvme_bdev_io *bio = ctx; 2436 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2437 2438 spdk_bdev_io_complete_nvme_status(bdev_io, 2439 bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2440 } 2441 2442 static void 2443 bdev_nvme_abort_completion(void *ctx) 2444 { 2445 struct nvme_bdev_io *bio = ctx; 2446 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2447 2448 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 2449 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2450 } else { 2451 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2452 } 2453 } 2454 2455 static void 2456 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 2457 { 2458 struct nvme_bdev_io *bio = ref; 2459 2460 bio->cpl = *cpl; 2461 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2462 } 2463 2464 static void 2465 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 2466 { 2467 struct nvme_bdev_io *bio = ref; 2468 2469 bio->cpl = *cpl; 2470 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 2471 } 2472 2473 static void 2474 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 2475 { 2476 struct nvme_bdev_io *bio = ref; 2477 struct iovec *iov; 2478 2479 bio->iov_offset = sgl_offset; 2480 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 2481 iov = &bio->iovs[bio->iovpos]; 2482 if (bio->iov_offset < iov->iov_len) { 2483 break; 2484 } 2485 2486 bio->iov_offset -= iov->iov_len; 2487 } 2488 } 2489 2490 static int 2491 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 2492 { 2493 struct nvme_bdev_io *bio = ref; 2494 struct iovec *iov; 2495 2496 assert(bio->iovpos < bio->iovcnt); 2497 2498 iov = &bio->iovs[bio->iovpos]; 2499 2500 *address = iov->iov_base; 2501 *length = iov->iov_len; 2502 2503 if (bio->iov_offset) { 2504 assert(bio->iov_offset <= iov->iov_len); 2505 *address += bio->iov_offset; 2506 *length -= bio->iov_offset; 2507 } 2508 2509 bio->iov_offset += *length; 2510 if (bio->iov_offset == iov->iov_len) { 2511 bio->iovpos++; 2512 bio->iov_offset = 0; 2513 } 2514 2515 return 0; 2516 } 2517 2518 static void 2519 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 2520 { 2521 struct nvme_bdev_io *bio = ref; 2522 struct iovec *iov; 2523 2524 bio->fused_iov_offset = sgl_offset; 2525 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 2526 iov = &bio->fused_iovs[bio->fused_iovpos]; 2527 if (bio->fused_iov_offset < iov->iov_len) { 2528 break; 2529 } 2530 2531 bio->fused_iov_offset -= iov->iov_len; 2532 } 2533 } 2534 2535 static int 2536 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 2537 { 2538 struct nvme_bdev_io *bio = ref; 2539 struct iovec *iov; 2540 2541 assert(bio->fused_iovpos < bio->fused_iovcnt); 2542 2543 iov = &bio->fused_iovs[bio->fused_iovpos]; 2544 2545 *address = iov->iov_base; 2546 *length = iov->iov_len; 2547 2548 if (bio->fused_iov_offset) { 2549 assert(bio->fused_iov_offset <= iov->iov_len); 2550 *address += bio->fused_iov_offset; 2551 *length -= bio->fused_iov_offset; 2552 } 2553 2554 bio->fused_iov_offset += *length; 2555 if (bio->fused_iov_offset == iov->iov_len) { 2556 bio->fused_iovpos++; 2557 bio->fused_iov_offset = 0; 2558 } 2559 2560 return 0; 2561 } 2562 2563 static int 2564 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2565 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2566 void *md, uint64_t lba_count, uint64_t lba) 2567 { 2568 int rc; 2569 2570 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 2571 lba_count, lba); 2572 2573 bio->iovs = iov; 2574 bio->iovcnt = iovcnt; 2575 bio->iovpos = 0; 2576 bio->iov_offset = 0; 2577 2578 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2579 bdev_nvme_no_pi_readv_done, bio, 0, 2580 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2581 md, 0, 0); 2582 2583 if (rc != 0 && rc != -ENOMEM) { 2584 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 2585 } 2586 return rc; 2587 } 2588 2589 static int 2590 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2591 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2592 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2593 { 2594 int rc; 2595 2596 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2597 lba_count, lba); 2598 2599 bio->iovs = iov; 2600 bio->iovcnt = iovcnt; 2601 bio->iovpos = 0; 2602 bio->iov_offset = 0; 2603 2604 if (iovcnt == 1) { 2605 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 2606 lba_count, 2607 bdev_nvme_readv_done, bio, 2608 flags, 2609 0, 0); 2610 } else { 2611 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2612 bdev_nvme_readv_done, bio, flags, 2613 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2614 md, 0, 0); 2615 } 2616 2617 if (rc != 0 && rc != -ENOMEM) { 2618 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 2619 } 2620 return rc; 2621 } 2622 2623 static int 2624 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2625 struct nvme_bdev_io *bio, 2626 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 2627 uint32_t flags) 2628 { 2629 int rc; 2630 2631 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2632 lba_count, lba); 2633 2634 bio->iovs = iov; 2635 bio->iovcnt = iovcnt; 2636 bio->iovpos = 0; 2637 bio->iov_offset = 0; 2638 2639 if (iovcnt == 1) { 2640 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 2641 lba_count, 2642 bdev_nvme_writev_done, bio, 2643 flags, 2644 0, 0); 2645 } else { 2646 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 2647 bdev_nvme_writev_done, bio, flags, 2648 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2649 md, 0, 0); 2650 } 2651 2652 if (rc != 0 && rc != -ENOMEM) { 2653 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 2654 } 2655 return rc; 2656 } 2657 2658 static int 2659 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2660 struct nvme_bdev_io *bio, 2661 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 2662 uint32_t flags) 2663 { 2664 int rc; 2665 2666 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2667 lba_count, lba); 2668 2669 bio->iovs = iov; 2670 bio->iovcnt = iovcnt; 2671 bio->iovpos = 0; 2672 bio->iov_offset = 0; 2673 2674 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 2675 bdev_nvme_comparev_done, bio, flags, 2676 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2677 md, 0, 0); 2678 2679 if (rc != 0 && rc != -ENOMEM) { 2680 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 2681 } 2682 return rc; 2683 } 2684 2685 static int 2686 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2687 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 2688 struct iovec *write_iov, int write_iovcnt, 2689 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2690 { 2691 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2692 int rc; 2693 2694 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2695 lba_count, lba); 2696 2697 bio->iovs = cmp_iov; 2698 bio->iovcnt = cmp_iovcnt; 2699 bio->iovpos = 0; 2700 bio->iov_offset = 0; 2701 bio->fused_iovs = write_iov; 2702 bio->fused_iovcnt = write_iovcnt; 2703 bio->fused_iovpos = 0; 2704 bio->fused_iov_offset = 0; 2705 2706 if (bdev_io->num_retries == 0) { 2707 bio->first_fused_submitted = false; 2708 } 2709 2710 if (!bio->first_fused_submitted) { 2711 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 2712 memset(&bio->cpl, 0, sizeof(bio->cpl)); 2713 2714 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 2715 bdev_nvme_comparev_and_writev_done, bio, flags, 2716 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 2717 if (rc == 0) { 2718 bio->first_fused_submitted = true; 2719 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 2720 } else { 2721 if (rc != -ENOMEM) { 2722 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 2723 } 2724 return rc; 2725 } 2726 } 2727 2728 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 2729 2730 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 2731 bdev_nvme_comparev_and_writev_done, bio, flags, 2732 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 2733 if (rc != 0 && rc != -ENOMEM) { 2734 SPDK_ERRLOG("write failed: rc = %d\n", rc); 2735 rc = 0; 2736 } 2737 2738 return rc; 2739 } 2740 2741 static int 2742 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2743 struct nvme_bdev_io *bio, 2744 uint64_t offset_blocks, 2745 uint64_t num_blocks) 2746 { 2747 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 2748 struct spdk_nvme_dsm_range *range; 2749 uint64_t offset, remaining; 2750 uint64_t num_ranges_u64; 2751 uint16_t num_ranges; 2752 int rc; 2753 2754 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 2755 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2756 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 2757 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 2758 return -EINVAL; 2759 } 2760 num_ranges = (uint16_t)num_ranges_u64; 2761 2762 offset = offset_blocks; 2763 remaining = num_blocks; 2764 range = &dsm_ranges[0]; 2765 2766 /* Fill max-size ranges until the remaining blocks fit into one range */ 2767 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 2768 range->attributes.raw = 0; 2769 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2770 range->starting_lba = offset; 2771 2772 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2773 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2774 range++; 2775 } 2776 2777 /* Final range describes the remaining blocks */ 2778 range->attributes.raw = 0; 2779 range->length = remaining; 2780 range->starting_lba = offset; 2781 2782 rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair, 2783 SPDK_NVME_DSM_ATTR_DEALLOCATE, 2784 dsm_ranges, num_ranges, 2785 bdev_nvme_queued_done, bio); 2786 2787 return rc; 2788 } 2789 2790 static int 2791 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 2792 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2793 { 2794 uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ch->ctrlr->ctrlr); 2795 2796 if (nbytes > max_xfer_size) { 2797 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2798 return -EINVAL; 2799 } 2800 2801 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2802 2803 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ch->ctrlr->ctrlr, cmd, buf, 2804 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 2805 } 2806 2807 static int 2808 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2809 struct nvme_bdev_io *bio, 2810 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2811 { 2812 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 2813 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2814 2815 if (nbytes > max_xfer_size) { 2816 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2817 return -EINVAL; 2818 } 2819 2820 /* 2821 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2822 * so fill it out automatically. 2823 */ 2824 cmd->nsid = spdk_nvme_ns_get_id(ns); 2825 2826 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 2827 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 2828 } 2829 2830 static int 2831 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2832 struct nvme_bdev_io *bio, 2833 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 2834 { 2835 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 2836 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 2837 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2838 2839 if (nbytes > max_xfer_size) { 2840 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2841 return -EINVAL; 2842 } 2843 2844 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 2845 SPDK_ERRLOG("invalid meta data buffer size\n"); 2846 return -EINVAL; 2847 } 2848 2849 /* 2850 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2851 * so fill it out automatically. 2852 */ 2853 cmd->nsid = spdk_nvme_ns_get_id(ns); 2854 2855 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 2856 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 2857 } 2858 2859 static void 2860 bdev_nvme_abort_admin_cmd(void *ctx) 2861 { 2862 struct nvme_bdev_io *bio = ctx; 2863 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2864 struct nvme_io_channel *nvme_ch; 2865 struct nvme_bdev_io *bio_to_abort; 2866 int rc; 2867 2868 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2869 bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2870 2871 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 2872 NULL, 2873 bio_to_abort, 2874 bdev_nvme_abort_done, bio); 2875 if (rc == -ENOENT) { 2876 /* If no admin command was found in admin qpair, complete the abort 2877 * request with failure. 2878 */ 2879 bio->cpl.cdw0 |= 1U; 2880 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 2881 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 2882 2883 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2884 } 2885 } 2886 2887 static int 2888 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 2889 struct nvme_bdev_io *bio_to_abort) 2890 { 2891 int rc; 2892 2893 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2894 2895 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 2896 nvme_ch->qpair, 2897 bio_to_abort, 2898 bdev_nvme_abort_done, bio); 2899 if (rc == -ENOENT) { 2900 /* If no command was found in I/O qpair, the target command may be 2901 * admin command. Only a single thread tries aborting admin command 2902 * to clean I/O flow. 2903 */ 2904 spdk_thread_send_msg(nvme_ch->ctrlr->thread, 2905 bdev_nvme_abort_admin_cmd, bio); 2906 rc = 0; 2907 } 2908 2909 return rc; 2910 } 2911 2912 static void 2913 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 2914 struct nvme_bdev_ns *nvme_ns) 2915 { 2916 /* nop */ 2917 } 2918 2919 static void 2920 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns) 2921 { 2922 g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns); 2923 } 2924 2925 static void 2926 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 2927 { 2928 const char *action; 2929 2930 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 2931 action = "reset"; 2932 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 2933 action = "abort"; 2934 } else { 2935 action = "none"; 2936 } 2937 2938 spdk_json_write_object_begin(w); 2939 2940 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 2941 2942 spdk_json_write_named_object_begin(w, "params"); 2943 spdk_json_write_named_string(w, "action_on_timeout", action); 2944 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 2945 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 2946 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 2947 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 2948 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 2949 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 2950 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 2951 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 2952 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 2953 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 2954 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 2955 spdk_json_write_object_end(w); 2956 2957 spdk_json_write_object_end(w); 2958 } 2959 2960 static void 2961 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w, 2962 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 2963 { 2964 struct spdk_nvme_transport_id *trid; 2965 2966 trid = nvme_bdev_ctrlr->connected_trid; 2967 2968 spdk_json_write_object_begin(w); 2969 2970 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 2971 2972 spdk_json_write_named_object_begin(w, "params"); 2973 spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); 2974 nvme_bdev_dump_trid_json(trid, w); 2975 spdk_json_write_named_bool(w, "prchk_reftag", 2976 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 2977 spdk_json_write_named_bool(w, "prchk_guard", 2978 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 2979 2980 spdk_json_write_object_end(w); 2981 2982 spdk_json_write_object_end(w); 2983 } 2984 2985 static void 2986 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 2987 { 2988 spdk_json_write_object_begin(w); 2989 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 2990 2991 spdk_json_write_named_object_begin(w, "params"); 2992 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 2993 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 2994 spdk_json_write_object_end(w); 2995 2996 spdk_json_write_object_end(w); 2997 } 2998 2999 static int 3000 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 3001 { 3002 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 3003 uint32_t nsid; 3004 3005 bdev_nvme_opts_config_json(w); 3006 3007 pthread_mutex_lock(&g_bdev_nvme_mutex); 3008 3009 TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 3010 nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr); 3011 3012 for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { 3013 if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) { 3014 continue; 3015 } 3016 3017 nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]); 3018 } 3019 } 3020 3021 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 3022 * before enabling hotplug poller. 3023 */ 3024 bdev_nvme_hotplug_config_json(w); 3025 3026 pthread_mutex_unlock(&g_bdev_nvme_mutex); 3027 return 0; 3028 } 3029 3030 struct spdk_nvme_ctrlr * 3031 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 3032 { 3033 if (!bdev || bdev->module != &nvme_if) { 3034 return NULL; 3035 } 3036 3037 return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr; 3038 } 3039 3040 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 3041