1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "bdev_nvme.h" 37 #include "bdev_ocssd.h" 38 39 #include "spdk/accel_engine.h" 40 #include "spdk/config.h" 41 #include "spdk/endian.h" 42 #include "spdk/bdev.h" 43 #include "spdk/json.h" 44 #include "spdk/nvme.h" 45 #include "spdk/nvme_ocssd.h" 46 #include "spdk/thread.h" 47 #include "spdk/string.h" 48 #include "spdk/util.h" 49 50 #include "spdk/bdev_module.h" 51 #include "spdk/log.h" 52 53 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 54 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 55 56 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 57 58 struct nvme_bdev_io { 59 /** array of iovecs to transfer. */ 60 struct iovec *iovs; 61 62 /** Number of iovecs in iovs array. */ 63 int iovcnt; 64 65 /** Current iovec position. */ 66 int iovpos; 67 68 /** Offset in current iovec. */ 69 uint32_t iov_offset; 70 71 /** array of iovecs to transfer. */ 72 struct iovec *fused_iovs; 73 74 /** Number of iovecs in iovs array. */ 75 int fused_iovcnt; 76 77 /** Current iovec position. */ 78 int fused_iovpos; 79 80 /** Offset in current iovec. 
*/ 81 uint32_t fused_iov_offset; 82 83 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 84 struct spdk_nvme_cpl cpl; 85 86 /** Originating thread */ 87 struct spdk_thread *orig_thread; 88 89 /** Keeps track if first of fused commands was submitted */ 90 bool first_fused_submitted; 91 }; 92 93 struct nvme_probe_ctx { 94 size_t count; 95 struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS]; 96 struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS]; 97 const char *names[NVME_MAX_CONTROLLERS]; 98 uint32_t prchk_flags[NVME_MAX_CONTROLLERS]; 99 const char *hostnqn; 100 }; 101 102 struct nvme_probe_skip_entry { 103 struct spdk_nvme_transport_id trid; 104 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 105 }; 106 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 107 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 108 g_skipped_nvme_ctrlrs); 109 110 static struct spdk_bdev_nvme_opts g_opts = { 111 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 112 .timeout_us = 0, 113 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 114 .retry_count = 4, 115 .arbitration_burst = 0, 116 .low_priority_weight = 0, 117 .medium_priority_weight = 0, 118 .high_priority_weight = 0, 119 .nvme_adminq_poll_period_us = 10000ULL, 120 .nvme_ioq_poll_period_us = 0, 121 .io_queue_requests = 0, 122 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 123 }; 124 125 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 126 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 127 128 static int g_hot_insert_nvme_controller_index = 0; 129 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 130 static bool g_nvme_hotplug_enabled = false; 131 static struct spdk_thread *g_bdev_nvme_init_thread; 132 static struct spdk_poller *g_hotplug_poller; 133 static struct spdk_poller *g_hotplug_probe_poller; 134 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 135 136 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 137 struct nvme_async_probe_ctx *ctx); 138 static void nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 139 struct nvme_async_probe_ctx *ctx); 140 static int bdev_nvme_library_init(void); 141 static void bdev_nvme_library_fini(void); 142 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 143 struct nvme_bdev_io *bio, 144 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 145 uint32_t flags); 146 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 147 struct nvme_bdev_io *bio, 148 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba); 149 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 150 struct nvme_bdev_io *bio, 151 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 152 uint32_t flags); 153 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 154 struct nvme_bdev_io *bio, 155 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 156 uint32_t flags); 157 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, 158 struct spdk_nvme_qpair *qpair, 159 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 160 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 161 uint32_t flags); 162 
static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, 163 struct nvme_bdev_io *bio, 164 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 165 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 166 struct nvme_bdev_io *bio, 167 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 168 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 169 struct nvme_bdev_io *bio, 170 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len); 171 static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch, 172 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 173 static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio); 174 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove); 175 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 176 177 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 178 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 179 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 180 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 181 182 static populate_namespace_fn g_populate_namespace_fn[] = { 183 NULL, 184 nvme_ctrlr_populate_standard_namespace, 185 bdev_ocssd_populate_namespace, 186 }; 187 188 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns); 189 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns); 190 191 static depopulate_namespace_fn g_depopulate_namespace_fn[] = { 192 NULL, 193 nvme_ctrlr_depopulate_standard_namespace, 194 bdev_ocssd_depopulate_namespace, 195 }; 196 197 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, 198 struct nvme_bdev_ns *nvme_ns); 199 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 200 struct nvme_bdev_ns *nvme_ns); 201 202 static config_json_namespace_fn g_config_json_namespace_fn[] = { 203 NULL, 204 nvme_ctrlr_config_json_standard_namespace, 205 bdev_ocssd_namespace_config_json, 206 }; 207 208 struct spdk_nvme_qpair * 209 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 210 { 211 struct nvme_io_channel *nvme_ch; 212 213 assert(ctrlr_io_ch != NULL); 214 215 nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 216 217 return nvme_ch->qpair; 218 } 219 220 static int 221 bdev_nvme_get_ctx_size(void) 222 { 223 return sizeof(struct nvme_bdev_io); 224 } 225 226 static struct spdk_bdev_module nvme_if = { 227 .name = "nvme", 228 .async_fini = true, 229 .module_init = bdev_nvme_library_init, 230 .module_fini = bdev_nvme_library_fini, 231 .config_json = bdev_nvme_config_json, 232 .get_ctx_size = bdev_nvme_get_ctx_size, 233 234 }; 235 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 236 237 static void 238 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 239 { 240 int rc; 241 242 SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair); 243 /* 244 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will 245 * reconnect a qpair and we will stop getting a callback for this one. 
246 */ 247 rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair); 248 if (rc != 0) { 249 SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc); 250 } 251 } 252 253 static int 254 bdev_nvme_poll(void *arg) 255 { 256 struct nvme_bdev_poll_group *group = arg; 257 int64_t num_completions; 258 259 if (group->collect_spin_stat && group->start_ticks == 0) { 260 group->start_ticks = spdk_get_ticks(); 261 } 262 263 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 264 bdev_nvme_disconnected_qpair_cb); 265 if (group->collect_spin_stat) { 266 if (num_completions > 0) { 267 if (group->end_ticks != 0) { 268 group->spin_ticks += (group->end_ticks - group->start_ticks); 269 group->end_ticks = 0; 270 } 271 group->start_ticks = 0; 272 } else { 273 group->end_ticks = spdk_get_ticks(); 274 } 275 } 276 277 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 278 } 279 280 static int 281 bdev_nvme_poll_adminq(void *arg) 282 { 283 int32_t rc; 284 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 285 286 assert(nvme_bdev_ctrlr != NULL); 287 288 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr); 289 if (rc < 0) { 290 bdev_nvme_failover(nvme_bdev_ctrlr, false); 291 } 292 293 return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 294 } 295 296 static int 297 bdev_nvme_destruct(void *ctx) 298 { 299 struct nvme_bdev *nvme_disk = ctx; 300 struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns; 301 302 nvme_ns->bdev = NULL; 303 304 nvme_bdev_ns_detach(nvme_ns); 305 306 free(nvme_disk->disk.name); 307 free(nvme_disk); 308 309 return 0; 310 } 311 312 static int 313 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 314 struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes) 315 { 316 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS); 317 318 return 0; 319 } 320 321 static int 322 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch) 323 { 324 struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr; 325 struct spdk_nvme_io_qpair_opts opts; 326 int rc; 327 328 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 329 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 330 opts.create_only = true; 331 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 332 g_opts.io_queue_requests = opts.io_queue_requests; 333 334 nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts)); 335 if (nvme_ch->qpair == NULL) { 336 return -1; 337 } 338 339 assert(nvme_ch->group != NULL); 340 341 rc = spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair); 342 if (rc != 0) { 343 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 344 goto err; 345 } 346 347 rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, nvme_ch->qpair); 348 if (rc != 0) { 349 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 350 goto err; 351 } 352 353 return 0; 354 355 err: 356 spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 357 358 return rc; 359 } 360 361 static void 362 _bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status) 363 { 364 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 365 366 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 367 if (nvme_bdev_ctrlr->destruct_after_reset) { 368 assert(nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct); 369 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 370 371 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_do_destruct, 372 nvme_bdev_ctrlr); 373 } else { 374 
pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 375 } 376 } 377 378 static void 379 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 380 { 381 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 382 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); 383 struct spdk_bdev_io *bdev_io; 384 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 385 386 /* A NULL ctx means success. */ 387 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 388 status = SPDK_BDEV_IO_STATUS_FAILED; 389 } 390 391 while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) { 392 bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets); 393 TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link); 394 spdk_bdev_io_complete(bdev_io, status); 395 } 396 397 spdk_for_each_channel_continue(i, 0); 398 } 399 400 static void 401 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc) 402 { 403 /* we are using the for_each_channel cb_arg like a return code here. */ 404 /* If it's zero, we succeeded, otherwise, the reset failed. */ 405 void *cb_arg = NULL; 406 struct nvme_bdev_ctrlr_trid *curr_trid; 407 408 if (rc) { 409 cb_arg = (void *)0x1; 410 SPDK_ERRLOG("Resetting controller failed.\n"); 411 } else { 412 SPDK_NOTICELOG("Resetting controller successful.\n"); 413 } 414 415 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 416 nvme_bdev_ctrlr->resetting = false; 417 nvme_bdev_ctrlr->failover_in_progress = false; 418 419 curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 420 assert(curr_trid != NULL); 421 assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid); 422 423 curr_trid->is_failed = cb_arg != NULL ? true : false; 424 425 if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) { 426 /* Destruct ctrlr after clearing pending resets. */ 427 nvme_bdev_ctrlr->destruct_after_reset = true; 428 } 429 430 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 431 432 /* Make sure we clear any pending resets before returning. 
*/ 433 spdk_for_each_channel(nvme_bdev_ctrlr, 434 _bdev_nvme_complete_pending_resets, 435 cb_arg, 436 _bdev_nvme_check_pending_destruct); 437 } 438 439 static void 440 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 441 { 442 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 443 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 444 int rc = SPDK_BDEV_IO_STATUS_SUCCESS; 445 446 if (status) { 447 rc = SPDK_BDEV_IO_STATUS_FAILED; 448 } 449 if (bio) { 450 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), rc); 451 } 452 _bdev_nvme_reset_complete(nvme_bdev_ctrlr, status); 453 } 454 455 static void 456 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 457 { 458 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 459 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); 460 int rc; 461 462 rc = bdev_nvme_create_qpair(nvme_ch); 463 464 spdk_for_each_channel_continue(i, rc); 465 } 466 467 static void 468 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 469 { 470 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 471 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 472 int rc; 473 474 if (status) { 475 rc = status; 476 goto err; 477 } 478 479 rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr); 480 if (rc != 0) { 481 goto err; 482 } 483 484 /* Recreate all of the I/O queue pairs */ 485 spdk_for_each_channel(nvme_bdev_ctrlr, 486 _bdev_nvme_reset_create_qpair, 487 bio, 488 _bdev_nvme_reset_create_qpairs_done); 489 return; 490 491 err: 492 if (bio) { 493 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED); 494 } 495 _bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc); 496 } 497 498 static void 499 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 500 { 501 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 502 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 503 int rc; 504 505 rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 506 if (!rc) { 507 nvme_ch->qpair = NULL; 508 } 509 510 spdk_for_each_channel_continue(i, rc); 511 } 512 513 static int 514 _bdev_nvme_reset_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 515 { 516 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 517 if (nvme_bdev_ctrlr->destruct) { 518 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 519 return -EBUSY; 520 } 521 522 if (nvme_bdev_ctrlr->resetting) { 523 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 524 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 525 return -EAGAIN; 526 } 527 528 nvme_bdev_ctrlr->resetting = true; 529 530 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 531 return 0; 532 } 533 534 static int 535 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 536 { 537 int rc; 538 539 rc = _bdev_nvme_reset_start(nvme_bdev_ctrlr); 540 if (rc == 0) { 541 /* First, delete all NVMe I/O queue pairs. */ 542 spdk_for_each_channel(nvme_bdev_ctrlr, 543 _bdev_nvme_reset_destroy_qpair, 544 NULL, 545 _bdev_nvme_reset_ctrlr); 546 } 547 548 return rc; 549 } 550 551 static int 552 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio) 553 { 554 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 555 int rc; 556 557 rc = _bdev_nvme_reset_start(nvme_ch->ctrlr); 558 if (rc == 0) { 559 /* First, delete all NVMe I/O queue pairs. 
*/ 560 spdk_for_each_channel(nvme_ch->ctrlr, 561 _bdev_nvme_reset_destroy_qpair, 562 bio, 563 _bdev_nvme_reset_ctrlr); 564 } else if (rc == -EBUSY) { 565 /* Don't bother resetting if the controller is in the process of being destructed. */ 566 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 567 } else if (rc == -EAGAIN) { 568 /* 569 * Reset call is queued only if it is from the app framework. This is on purpose so that 570 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 571 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 572 */ 573 TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link); 574 } else { 575 return rc; 576 } 577 578 return 0; 579 } 580 581 static int 582 _bdev_nvme_failover_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove) 583 { 584 struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL; 585 int rc; 586 587 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 588 if (nvme_bdev_ctrlr->destruct) { 589 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 590 /* Don't bother resetting if the controller is in the process of being destructed. */ 591 return -EBUSY; 592 } 593 594 curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 595 assert(curr_trid); 596 assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid); 597 next_trid = TAILQ_NEXT(curr_trid, link); 598 599 if (nvme_bdev_ctrlr->resetting) { 600 if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) { 601 rc = -EAGAIN; 602 } else { 603 rc = -EBUSY; 604 } 605 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 606 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 607 return rc; 608 } 609 610 nvme_bdev_ctrlr->resetting = true; 611 curr_trid->is_failed = true; 612 613 if (next_trid) { 614 assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 615 616 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr, 617 curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid); 618 619 nvme_bdev_ctrlr->failover_in_progress = true; 620 spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr); 621 nvme_bdev_ctrlr->connected_trid = &next_trid->trid; 622 rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid); 623 assert(rc == 0); 624 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link); 625 if (!remove) { 626 /** Shuffle the old trid to the end of the list and use the new one. 627 * Allows for round robin through multiple connections. 628 */ 629 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link); 630 } else { 631 free(curr_trid); 632 } 633 } 634 635 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 636 return 0; 637 } 638 639 static int 640 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove) 641 { 642 int rc; 643 644 rc = _bdev_nvme_failover_start(nvme_bdev_ctrlr, remove); 645 if (rc == 0) { 646 /* First, delete all NVMe I/O queue pairs. 
*/ 647 spdk_for_each_channel(nvme_bdev_ctrlr, 648 _bdev_nvme_reset_destroy_qpair, 649 NULL, 650 _bdev_nvme_reset_ctrlr); 651 } else if (rc != -EBUSY) { 652 return rc; 653 } 654 655 return 0; 656 } 657 658 static int 659 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 660 struct nvme_bdev_io *bio, 661 uint64_t offset_blocks, 662 uint64_t num_blocks); 663 664 static void 665 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 666 bool success) 667 { 668 struct spdk_bdev *bdev = bdev_io->bdev; 669 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 670 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 671 struct nvme_bdev_ns *nvme_ns; 672 struct spdk_nvme_qpair *qpair; 673 int ret; 674 675 if (!success) { 676 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 677 return; 678 } 679 680 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 681 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 682 return; 683 } 684 685 ret = bdev_nvme_readv(nvme_ns->ns, 686 qpair, 687 (struct nvme_bdev_io *)bdev_io->driver_ctx, 688 bdev_io->u.bdev.iovs, 689 bdev_io->u.bdev.iovcnt, 690 bdev_io->u.bdev.md_buf, 691 bdev_io->u.bdev.num_blocks, 692 bdev_io->u.bdev.offset_blocks, 693 bdev->dif_check_flags); 694 695 if (spdk_likely(ret == 0)) { 696 return; 697 } else if (ret == -ENOMEM) { 698 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); 699 } else { 700 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 701 } 702 } 703 704 static int 705 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 706 { 707 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 708 struct spdk_bdev *bdev = bdev_io->bdev; 709 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 710 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 711 struct nvme_bdev_io *nbdev_io_to_abort; 712 struct nvme_bdev_ns *nvme_ns; 713 struct spdk_nvme_qpair *qpair; 714 715 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 716 return -1; 717 } 718 719 switch (bdev_io->type) { 720 case SPDK_BDEV_IO_TYPE_READ: 721 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 722 return bdev_nvme_readv(nvme_ns->ns, 723 qpair, 724 nbdev_io, 725 bdev_io->u.bdev.iovs, 726 bdev_io->u.bdev.iovcnt, 727 bdev_io->u.bdev.md_buf, 728 bdev_io->u.bdev.num_blocks, 729 bdev_io->u.bdev.offset_blocks, 730 bdev->dif_check_flags); 731 } else { 732 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 733 bdev_io->u.bdev.num_blocks * bdev->blocklen); 734 return 0; 735 } 736 737 case SPDK_BDEV_IO_TYPE_WRITE: 738 return bdev_nvme_writev(nvme_ns->ns, 739 qpair, 740 nbdev_io, 741 bdev_io->u.bdev.iovs, 742 bdev_io->u.bdev.iovcnt, 743 bdev_io->u.bdev.md_buf, 744 bdev_io->u.bdev.num_blocks, 745 bdev_io->u.bdev.offset_blocks, 746 bdev->dif_check_flags); 747 748 case SPDK_BDEV_IO_TYPE_COMPARE: 749 return bdev_nvme_comparev(nvme_ns->ns, 750 qpair, 751 nbdev_io, 752 bdev_io->u.bdev.iovs, 753 bdev_io->u.bdev.iovcnt, 754 bdev_io->u.bdev.md_buf, 755 bdev_io->u.bdev.num_blocks, 756 bdev_io->u.bdev.offset_blocks, 757 bdev->dif_check_flags); 758 759 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 760 return bdev_nvme_comparev_and_writev(nvme_ns->ns, 761 qpair, 762 nbdev_io, 763 bdev_io->u.bdev.iovs, 764 bdev_io->u.bdev.iovcnt, 765 bdev_io->u.bdev.fused_iovs, 766 bdev_io->u.bdev.fused_iovcnt, 767 bdev_io->u.bdev.md_buf, 768 bdev_io->u.bdev.num_blocks, 769 
bdev_io->u.bdev.offset_blocks, 770 bdev->dif_check_flags); 771 772 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 773 return bdev_nvme_unmap(nvme_ns->ns, 774 qpair, 775 nbdev_io, 776 bdev_io->u.bdev.offset_blocks, 777 bdev_io->u.bdev.num_blocks); 778 779 case SPDK_BDEV_IO_TYPE_UNMAP: 780 return bdev_nvme_unmap(nvme_ns->ns, 781 qpair, 782 nbdev_io, 783 bdev_io->u.bdev.offset_blocks, 784 bdev_io->u.bdev.num_blocks); 785 786 case SPDK_BDEV_IO_TYPE_RESET: 787 return bdev_nvme_reset(nvme_ch, nbdev_io); 788 789 case SPDK_BDEV_IO_TYPE_FLUSH: 790 return bdev_nvme_flush(nvme_ns->ns, 791 qpair, 792 nbdev_io, 793 bdev_io->u.bdev.offset_blocks, 794 bdev_io->u.bdev.num_blocks); 795 796 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 797 return bdev_nvme_admin_passthru(nvme_ch, 798 nbdev_io, 799 &bdev_io->u.nvme_passthru.cmd, 800 bdev_io->u.nvme_passthru.buf, 801 bdev_io->u.nvme_passthru.nbytes); 802 803 case SPDK_BDEV_IO_TYPE_NVME_IO: 804 return bdev_nvme_io_passthru(nvme_ns->ns, 805 qpair, 806 nbdev_io, 807 &bdev_io->u.nvme_passthru.cmd, 808 bdev_io->u.nvme_passthru.buf, 809 bdev_io->u.nvme_passthru.nbytes); 810 811 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 812 return bdev_nvme_io_passthru_md(nvme_ns->ns, 813 qpair, 814 nbdev_io, 815 &bdev_io->u.nvme_passthru.cmd, 816 bdev_io->u.nvme_passthru.buf, 817 bdev_io->u.nvme_passthru.nbytes, 818 bdev_io->u.nvme_passthru.md_buf, 819 bdev_io->u.nvme_passthru.md_len); 820 821 case SPDK_BDEV_IO_TYPE_ABORT: 822 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 823 return bdev_nvme_abort(nvme_ch, 824 nbdev_io, 825 nbdev_io_to_abort); 826 827 default: 828 return -EINVAL; 829 } 830 return 0; 831 } 832 833 static void 834 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 835 { 836 int rc = _bdev_nvme_submit_request(ch, bdev_io); 837 838 if (spdk_unlikely(rc != 0)) { 839 if (rc == -ENOMEM) { 840 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); 841 } else { 842 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 843 } 844 } 845 } 846 847 static bool 848 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 849 { 850 struct nvme_bdev *nbdev = ctx; 851 struct nvme_bdev_ns *nvme_ns; 852 struct spdk_nvme_ns *ns; 853 struct spdk_nvme_ctrlr *ctrlr; 854 const struct spdk_nvme_ctrlr_data *cdata; 855 856 nvme_ns = nvme_bdev_to_bdev_ns(nbdev); 857 assert(nvme_ns != NULL); 858 ns = nvme_ns->ns; 859 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 860 861 switch (io_type) { 862 case SPDK_BDEV_IO_TYPE_READ: 863 case SPDK_BDEV_IO_TYPE_WRITE: 864 case SPDK_BDEV_IO_TYPE_RESET: 865 case SPDK_BDEV_IO_TYPE_FLUSH: 866 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 867 case SPDK_BDEV_IO_TYPE_NVME_IO: 868 case SPDK_BDEV_IO_TYPE_ABORT: 869 return true; 870 871 case SPDK_BDEV_IO_TYPE_COMPARE: 872 return spdk_nvme_ns_supports_compare(ns); 873 874 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 875 return spdk_nvme_ns_get_md_size(ns) ? true : false; 876 877 case SPDK_BDEV_IO_TYPE_UNMAP: 878 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 879 return cdata->oncs.dsm; 880 881 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 882 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 883 /* 884 * If an NVMe controller guarantees reading unallocated blocks returns zero, 885 * we can implement WRITE_ZEROES as an NVMe deallocate command. 886 */ 887 if (cdata->oncs.dsm && 888 spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) == 889 SPDK_NVME_DEALLOC_READ_00) { 890 return true; 891 } 892 /* 893 * The NVMe controller write_zeroes function is currently not used by our driver. 
894 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail. 895 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read. 896 */ 897 return false; 898 899 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 900 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 901 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 902 return true; 903 } 904 return false; 905 906 default: 907 return false; 908 } 909 } 910 911 static int 912 bdev_nvme_create_cb(void *io_device, void *ctx_buf) 913 { 914 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device; 915 struct nvme_io_channel *nvme_ch = ctx_buf; 916 struct spdk_io_channel *pg_ch = NULL; 917 int rc; 918 919 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 920 rc = bdev_ocssd_create_io_channel(nvme_ch); 921 if (rc != 0) { 922 return rc; 923 } 924 } 925 926 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 927 if (!pg_ch) { 928 rc = -1; 929 goto err_pg_ch; 930 } 931 932 nvme_ch->group = spdk_io_channel_get_ctx(pg_ch); 933 934 #ifdef SPDK_CONFIG_VTUNE 935 nvme_ch->group->collect_spin_stat = true; 936 #else 937 nvme_ch->group->collect_spin_stat = false; 938 #endif 939 940 TAILQ_INIT(&nvme_ch->pending_resets); 941 942 nvme_ch->ctrlr = nvme_bdev_ctrlr; 943 944 rc = bdev_nvme_create_qpair(nvme_ch); 945 if (rc != 0) { 946 goto err_qpair; 947 } 948 949 return 0; 950 951 err_qpair: 952 spdk_put_io_channel(pg_ch); 953 err_pg_ch: 954 if (nvme_ch->ocssd_ch) { 955 bdev_ocssd_destroy_io_channel(nvme_ch); 956 } 957 958 return rc; 959 } 960 961 static void 962 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf) 963 { 964 struct nvme_io_channel *nvme_ch = ctx_buf; 965 966 assert(nvme_ch->group != NULL); 967 968 if (nvme_ch->ocssd_ch != NULL) { 969 bdev_ocssd_destroy_io_channel(nvme_ch); 970 } 971 972 spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 973 974 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group)); 975 } 976 977 static void 978 bdev_nvme_poll_group_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 979 uint32_t iov_cnt, uint32_t seed, 980 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 981 { 982 struct nvme_bdev_poll_group *group = ctx; 983 int rc; 984 985 assert(group->accel_channel != NULL); 986 assert(cb_fn != NULL); 987 988 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 989 if (rc) { 990 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 991 if (rc == -ENOMEM || rc == -EINVAL) { 992 cb_fn(cb_arg, rc); 993 } 994 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 995 } 996 } 997 998 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 999 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 1000 .submit_accel_crc32c = bdev_nvme_poll_group_submit_accel_crc32c, 1001 }; 1002 1003 static int 1004 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf) 1005 { 1006 struct nvme_bdev_poll_group *group = ctx_buf; 1007 1008 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 1009 if (group->group == NULL) { 1010 return -1; 1011 } 1012 1013 group->accel_channel = spdk_accel_engine_get_io_channel(); 1014 if (!group->accel_channel) { 1015 spdk_nvme_poll_group_destroy(group->group); 1016 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 1017 group); 1018 return -1; 1019 } 1020 1021 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, 
g_opts.nvme_ioq_poll_period_us); 1022 1023 if (group->poller == NULL) { 1024 spdk_put_io_channel(group->accel_channel); 1025 spdk_nvme_poll_group_destroy(group->group); 1026 return -1; 1027 } 1028 1029 return 0; 1030 } 1031 1032 static void 1033 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf) 1034 { 1035 struct nvme_bdev_poll_group *group = ctx_buf; 1036 1037 if (group->accel_channel) { 1038 spdk_put_io_channel(group->accel_channel); 1039 } 1040 1041 spdk_poller_unregister(&group->poller); 1042 if (spdk_nvme_poll_group_destroy(group->group)) { 1043 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module."); 1044 assert(false); 1045 } 1046 } 1047 1048 static struct spdk_io_channel * 1049 bdev_nvme_get_io_channel(void *ctx) 1050 { 1051 struct nvme_bdev *nvme_bdev = ctx; 1052 1053 return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr); 1054 } 1055 1056 static void * 1057 bdev_nvme_get_module_ctx(void *ctx) 1058 { 1059 struct nvme_bdev *nvme_bdev = ctx; 1060 1061 return bdev_nvme_get_ctrlr(&nvme_bdev->disk); 1062 } 1063 1064 static int 1065 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 1066 { 1067 struct nvme_bdev *nvme_bdev = ctx; 1068 struct nvme_bdev_ns *nvme_ns; 1069 struct spdk_nvme_ns *ns; 1070 struct spdk_nvme_ctrlr *ctrlr; 1071 const struct spdk_nvme_ctrlr_data *cdata; 1072 const struct spdk_nvme_transport_id *trid; 1073 union spdk_nvme_vs_register vs; 1074 union spdk_nvme_csts_register csts; 1075 char buf[128]; 1076 1077 nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev); 1078 assert(nvme_ns != NULL); 1079 ns = nvme_ns->ns; 1080 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 1081 1082 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1083 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 1084 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 1085 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1086 1087 spdk_json_write_named_object_begin(w, "nvme"); 1088 1089 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1090 spdk_json_write_named_string(w, "pci_address", trid->traddr); 1091 } 1092 1093 spdk_json_write_named_object_begin(w, "trid"); 1094 1095 nvme_bdev_dump_trid_json(trid, w); 1096 1097 spdk_json_write_object_end(w); 1098 1099 #ifdef SPDK_CONFIG_NVME_CUSE 1100 size_t cuse_name_size = 128; 1101 char cuse_name[cuse_name_size]; 1102 1103 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 1104 cuse_name, &cuse_name_size); 1105 if (rc == 0) { 1106 spdk_json_write_named_string(w, "cuse_device", cuse_name); 1107 } 1108 #endif 1109 1110 spdk_json_write_named_object_begin(w, "ctrlr_data"); 1111 1112 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 1113 1114 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 1115 spdk_str_trim(buf); 1116 spdk_json_write_named_string(w, "model_number", buf); 1117 1118 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 1119 spdk_str_trim(buf); 1120 spdk_json_write_named_string(w, "serial_number", buf); 1121 1122 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 1123 spdk_str_trim(buf); 1124 spdk_json_write_named_string(w, "firmware_revision", buf); 1125 1126 if (cdata->subnqn[0] != '\0') { 1127 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 1128 } 1129 1130 spdk_json_write_named_object_begin(w, "oacs"); 1131 1132 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 1133 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 1134 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 1135 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 
1136 1137 spdk_json_write_object_end(w); 1138 1139 spdk_json_write_object_end(w); 1140 1141 spdk_json_write_named_object_begin(w, "vs"); 1142 1143 spdk_json_write_name(w, "nvme_version"); 1144 if (vs.bits.ter) { 1145 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 1146 } else { 1147 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 1148 } 1149 1150 spdk_json_write_object_end(w); 1151 1152 spdk_json_write_named_object_begin(w, "csts"); 1153 1154 spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy); 1155 spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs); 1156 1157 spdk_json_write_object_end(w); 1158 1159 spdk_json_write_named_object_begin(w, "ns_data"); 1160 1161 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 1162 1163 spdk_json_write_object_end(w); 1164 1165 if (cdata->oacs.security) { 1166 spdk_json_write_named_object_begin(w, "security"); 1167 1168 spdk_json_write_named_bool(w, "opal", nvme_bdev->opal); 1169 1170 spdk_json_write_object_end(w); 1171 } 1172 1173 spdk_json_write_object_end(w); 1174 1175 return 0; 1176 } 1177 1178 static void 1179 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1180 { 1181 /* No config per bdev needed */ 1182 } 1183 1184 static uint64_t 1185 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 1186 { 1187 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 1188 struct nvme_bdev_poll_group *group = nvme_ch->group; 1189 uint64_t spin_time; 1190 1191 if (!group || !group->collect_spin_stat) { 1192 return 0; 1193 } 1194 1195 if (group->end_ticks != 0) { 1196 group->spin_ticks += (group->end_ticks - group->start_ticks); 1197 group->end_ticks = 0; 1198 } 1199 1200 spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz(); 1201 group->start_ticks = 0; 1202 group->spin_ticks = 0; 1203 1204 return spin_time; 1205 } 1206 1207 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 1208 .destruct = bdev_nvme_destruct, 1209 .submit_request = bdev_nvme_submit_request, 1210 .io_type_supported = bdev_nvme_io_type_supported, 1211 .get_io_channel = bdev_nvme_get_io_channel, 1212 .dump_info_json = bdev_nvme_dump_info_json, 1213 .write_config_json = bdev_nvme_write_config_json, 1214 .get_spin_time = bdev_nvme_get_spin_time, 1215 .get_module_ctx = bdev_nvme_get_module_ctx, 1216 }; 1217 1218 static int 1219 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 1220 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 1221 uint32_t prchk_flags, void *ctx) 1222 { 1223 const struct spdk_uuid *uuid; 1224 const struct spdk_nvme_ctrlr_data *cdata; 1225 const struct spdk_nvme_ns_data *nsdata; 1226 int rc; 1227 1228 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1229 1230 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 1231 if (!disk->name) { 1232 return -ENOMEM; 1233 } 1234 disk->product_name = "NVMe disk"; 1235 1236 disk->write_cache = 0; 1237 if (cdata->vwc.present) { 1238 /* Enable if the Volatile Write Cache exists */ 1239 disk->write_cache = 1; 1240 } 1241 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 1242 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 1243 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 1244 1245 uuid = spdk_nvme_ns_get_uuid(ns); 1246 if (uuid != NULL) { 1247 disk->uuid = *uuid; 1248 } 1249 1250 nsdata = spdk_nvme_ns_get_data(ns); 1251 1252 disk->md_len = spdk_nvme_ns_get_md_size(ns); 1253 if (disk->md_len != 0) { 1254 disk->md_interleave = 
nsdata->flbas.extended; 1255 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 1256 if (disk->dif_type != SPDK_DIF_DISABLE) { 1257 disk->dif_is_head_of_md = nsdata->dps.md_start; 1258 disk->dif_check_flags = prchk_flags; 1259 } 1260 } 1261 1262 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 1263 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 1264 disk->acwu = 0; 1265 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 1266 disk->acwu = nsdata->nacwu; 1267 } else { 1268 disk->acwu = cdata->acwu; 1269 } 1270 1271 disk->ctxt = ctx; 1272 disk->fn_table = &nvmelib_fn_table; 1273 disk->module = &nvme_if; 1274 rc = spdk_bdev_register(disk); 1275 if (rc) { 1276 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 1277 free(disk->name); 1278 return rc; 1279 } 1280 1281 return 0; 1282 } 1283 1284 static int 1285 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns) 1286 { 1287 struct nvme_bdev *bdev; 1288 int rc; 1289 1290 bdev = calloc(1, sizeof(*bdev)); 1291 if (!bdev) { 1292 SPDK_ERRLOG("bdev calloc() failed\n"); 1293 return -ENOMEM; 1294 } 1295 1296 bdev->nvme_ns = nvme_ns; 1297 bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL; 1298 1299 rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr, 1300 nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev); 1301 if (rc != 0) { 1302 SPDK_ERRLOG("Failed to create NVMe disk\n"); 1303 free(bdev); 1304 return rc; 1305 } 1306 1307 nvme_ns->ref++; 1308 nvme_ns->bdev = bdev; 1309 1310 return 0; 1311 } 1312 1313 static void 1314 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1315 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx) 1316 { 1317 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1318 struct spdk_nvme_ns *ns; 1319 int rc = 0; 1320 1321 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 1322 if (!ns) { 1323 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 1324 rc = -EINVAL; 1325 goto done; 1326 } 1327 1328 nvme_ns->ns = ns; 1329 nvme_ns->ref = 1; 1330 1331 rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns); 1332 done: 1333 nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc); 1334 } 1335 1336 static bool 1337 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1338 struct spdk_nvme_ctrlr_opts *opts) 1339 { 1340 struct nvme_probe_skip_entry *entry; 1341 1342 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 1343 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 1344 return false; 1345 } 1346 } 1347 1348 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 1349 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 1350 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 1351 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 1352 1353 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 1354 1355 return true; 1356 } 1357 1358 static void 1359 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 1360 { 1361 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1362 1363 if (spdk_nvme_cpl_is_error(cpl)) { 1364 SPDK_WARNLOG("Abort failed. 
Resetting controller.\n"); 1365 _bdev_nvme_reset(nvme_bdev_ctrlr); 1366 } 1367 } 1368 1369 static void 1370 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 1371 struct spdk_nvme_qpair *qpair, uint16_t cid) 1372 { 1373 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg; 1374 union spdk_nvme_csts_register csts; 1375 int rc; 1376 1377 assert(nvme_bdev_ctrlr->ctrlr == ctrlr); 1378 1379 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 1380 1381 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 1382 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 1383 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 1384 * completion recursively. 1385 */ 1386 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 1387 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1388 if (csts.bits.cfs) { 1389 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 1390 _bdev_nvme_reset(nvme_bdev_ctrlr); 1391 return; 1392 } 1393 } 1394 1395 switch (g_opts.action_on_timeout) { 1396 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 1397 if (qpair) { 1398 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 1399 nvme_abort_cpl, nvme_bdev_ctrlr); 1400 if (rc == 0) { 1401 return; 1402 } 1403 1404 SPDK_ERRLOG("Unable to send abort. Resetting.\n"); 1405 } 1406 1407 /* FALLTHROUGH */ 1408 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 1409 _bdev_nvme_reset(nvme_bdev_ctrlr); 1410 break; 1411 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 1412 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 1413 break; 1414 default: 1415 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 1416 break; 1417 } 1418 } 1419 1420 void 1421 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ns *nvme_ns) 1422 { 1423 nvme_bdev_ns_detach(nvme_ns); 1424 } 1425 1426 static void 1427 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns) 1428 { 1429 struct nvme_bdev *bdev; 1430 1431 bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1432 if (bdev != NULL) { 1433 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 1434 } 1435 1436 nvme_ns->populated = false; 1437 1438 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 1439 } 1440 1441 static void 1442 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns, 1443 struct nvme_async_probe_ctx *ctx) 1444 { 1445 g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx); 1446 } 1447 1448 static void 1449 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns) 1450 { 1451 g_depopulate_namespace_fn[nvme_ns->type](nvme_ns); 1452 } 1453 1454 void 1455 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx, 1456 struct nvme_bdev_ns *nvme_ns, int rc) 1457 { 1458 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_ns->ctrlr; 1459 1460 assert(nvme_bdev_ctrlr != NULL); 1461 1462 if (rc == 0) { 1463 nvme_ns->populated = true; 1464 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1465 nvme_bdev_ctrlr->ref++; 1466 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1467 } else { 1468 memset(nvme_ns, 0, sizeof(*nvme_ns)); 1469 } 1470 1471 if (ctx) { 1472 ctx->populates_in_progress--; 1473 if (ctx->populates_in_progress == 0) { 1474 nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx); 1475 } 1476 } 1477 } 1478 1479 static void 1480 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1481 struct nvme_async_probe_ctx *ctx) 1482 { 1483 struct spdk_nvme_ctrlr 
*ctrlr = nvme_bdev_ctrlr->ctrlr; 1484 struct nvme_bdev_ns *nvme_ns; 1485 struct spdk_nvme_ns *ns; 1486 struct nvme_bdev *bdev; 1487 uint32_t i; 1488 int rc; 1489 uint64_t num_sectors; 1490 bool ns_is_active; 1491 1492 if (ctx) { 1493 /* Initialize this count to 1 to handle the populate functions 1494 * calling nvme_ctrlr_populate_namespace_done() immediately. 1495 */ 1496 ctx->populates_in_progress = 1; 1497 } 1498 1499 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1500 uint32_t nsid = i + 1; 1501 1502 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 1503 ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid); 1504 1505 if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) { 1506 /* NS is still there but attributes may have changed */ 1507 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 1508 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 1509 bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1510 assert(bdev != NULL); 1511 if (bdev->disk.blockcnt != num_sectors) { 1512 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 1513 nsid, 1514 bdev->disk.name, 1515 bdev->disk.blockcnt, 1516 num_sectors); 1517 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 1518 if (rc != 0) { 1519 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 1520 bdev->disk.name, rc); 1521 } 1522 } 1523 } 1524 1525 if (!nvme_ns->populated && ns_is_active) { 1526 nvme_ns->id = nsid; 1527 nvme_ns->ctrlr = nvme_bdev_ctrlr; 1528 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 1529 nvme_ns->type = NVME_BDEV_NS_OCSSD; 1530 } else { 1531 nvme_ns->type = NVME_BDEV_NS_STANDARD; 1532 } 1533 1534 nvme_ns->bdev = NULL; 1535 1536 if (ctx) { 1537 ctx->populates_in_progress++; 1538 } 1539 nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx); 1540 } 1541 1542 if (nvme_ns->populated && !ns_is_active) { 1543 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1544 } 1545 } 1546 1547 if (ctx) { 1548 /* Decrement this count now that the loop is over to account 1549 * for the one we started with. If the count is then 0, we 1550 * know any populate_namespace functions completed immediately, 1551 * so we'll kick the callback here. 
1552 */ 1553 ctx->populates_in_progress--; 1554 if (ctx->populates_in_progress == 0) { 1555 nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx); 1556 } 1557 } 1558 1559 } 1560 1561 static void 1562 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 1563 { 1564 uint32_t i; 1565 struct nvme_bdev_ns *nvme_ns; 1566 1567 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1568 uint32_t nsid = i + 1; 1569 1570 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1571 if (nvme_ns->populated) { 1572 assert(nvme_ns->id == nsid); 1573 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1574 } 1575 } 1576 } 1577 1578 static void 1579 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 1580 { 1581 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 1582 union spdk_nvme_async_event_completion event; 1583 1584 if (spdk_nvme_cpl_is_error(cpl)) { 1585 SPDK_WARNLOG("AER request execute failed"); 1586 return; 1587 } 1588 1589 event.raw = cpl->cdw0; 1590 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 1591 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 1592 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1593 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) && 1594 (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) && 1595 spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1596 bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr); 1597 } 1598 } 1599 1600 static int 1601 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 1602 const char *name, 1603 const struct spdk_nvme_transport_id *trid, 1604 uint32_t prchk_flags, 1605 struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr) 1606 { 1607 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1608 struct nvme_bdev_ctrlr_trid *trid_entry; 1609 uint32_t i; 1610 int rc; 1611 1612 nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr)); 1613 if (nvme_bdev_ctrlr == NULL) { 1614 SPDK_ERRLOG("Failed to allocate device struct\n"); 1615 return -ENOMEM; 1616 } 1617 1618 rc = pthread_mutex_init(&nvme_bdev_ctrlr->mutex, NULL); 1619 if (rc != 0) { 1620 goto err_init_mutex; 1621 } 1622 1623 TAILQ_INIT(&nvme_bdev_ctrlr->trids); 1624 nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 1625 if (nvme_bdev_ctrlr->num_ns != 0) { 1626 nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *)); 1627 if (!nvme_bdev_ctrlr->namespaces) { 1628 SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); 1629 rc = -ENOMEM; 1630 goto err_alloc_namespaces; 1631 } 1632 } 1633 1634 trid_entry = calloc(1, sizeof(*trid_entry)); 1635 if (trid_entry == NULL) { 1636 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 1637 rc = -ENOMEM; 1638 goto err_alloc_trid; 1639 } 1640 1641 trid_entry->trid = *trid; 1642 1643 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1644 nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns)); 1645 if (nvme_bdev_ctrlr->namespaces[i] == NULL) { 1646 SPDK_ERRLOG("Failed to allocate block namespace struct\n"); 1647 rc = -ENOMEM; 1648 goto err_alloc_namespace; 1649 } 1650 } 1651 1652 nvme_bdev_ctrlr->thread = spdk_get_thread(); 1653 nvme_bdev_ctrlr->adminq_timer_poller = NULL; 1654 nvme_bdev_ctrlr->ctrlr = ctrlr; 1655 nvme_bdev_ctrlr->ref = 1; 1656 nvme_bdev_ctrlr->connected_trid = &trid_entry->trid; 1657 nvme_bdev_ctrlr->name = strdup(name); 1658 if (nvme_bdev_ctrlr->name == NULL) { 1659 rc = -ENOMEM; 1660 goto err_alloc_name; 1661 } 1662 1663 if 
(spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1664 rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr); 1665 if (spdk_unlikely(rc != 0)) { 1666 SPDK_ERRLOG("Unable to initialize OCSSD controller\n"); 1667 goto err_init_ocssd; 1668 } 1669 } 1670 1671 nvme_bdev_ctrlr->prchk_flags = prchk_flags; 1672 1673 spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb, 1674 sizeof(struct nvme_io_channel), 1675 name); 1676 1677 nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr, 1678 g_opts.nvme_adminq_poll_period_us); 1679 1680 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); 1681 1682 if (g_opts.timeout_us > 0) { 1683 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 1684 timeout_cb, nvme_bdev_ctrlr); 1685 } 1686 1687 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr); 1688 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr); 1689 1690 if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) & 1691 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 1692 nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr); 1693 if (nvme_bdev_ctrlr->opal_dev == NULL) { 1694 SPDK_ERRLOG("Failed to initialize Opal\n"); 1695 } 1696 } 1697 1698 TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link); 1699 1700 if (_nvme_bdev_ctrlr != NULL) { 1701 *_nvme_bdev_ctrlr = nvme_bdev_ctrlr; 1702 } 1703 return 0; 1704 1705 err_init_ocssd: 1706 free(nvme_bdev_ctrlr->name); 1707 err_alloc_name: 1708 err_alloc_namespace: 1709 for (; i > 0; i--) { 1710 free(nvme_bdev_ctrlr->namespaces[i - 1]); 1711 } 1712 free(trid_entry); 1713 err_alloc_trid: 1714 free(nvme_bdev_ctrlr->namespaces); 1715 err_alloc_namespaces: 1716 pthread_mutex_destroy(&nvme_bdev_ctrlr->mutex); 1717 err_init_mutex: 1718 free(nvme_bdev_ctrlr); 1719 return rc; 1720 } 1721 1722 static void 1723 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1724 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 1725 { 1726 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1727 struct nvme_probe_ctx *ctx = cb_ctx; 1728 char *name = NULL; 1729 uint32_t prchk_flags = 0; 1730 size_t i; 1731 int rc; 1732 1733 if (ctx) { 1734 for (i = 0; i < ctx->count; i++) { 1735 if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { 1736 prchk_flags = ctx->prchk_flags[i]; 1737 name = strdup(ctx->names[i]); 1738 break; 1739 } 1740 } 1741 } else { 1742 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 1743 } 1744 if (!name) { 1745 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 1746 return; 1747 } 1748 1749 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 1750 1751 rc = nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, &nvme_bdev_ctrlr); 1752 if (rc != 0) { 1753 SPDK_ERRLOG("Failed to create new NVMe controller\n"); 1754 free(name); 1755 return; 1756 } 1757 1758 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1759 1760 free(name); 1761 } 1762 1763 static void 1764 _nvme_bdev_ctrlr_destruct(void *ctx) 1765 { 1766 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1767 1768 nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr); 1769 nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1770 } 1771 1772 static int 1773 _bdev_nvme_delete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool hotplug) 1774 { 1775 struct nvme_probe_skip_entry *entry; 1776 1777 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1778 1779 /* The controller's destruction was already 
started */ 1780 if (nvme_bdev_ctrlr->destruct) { 1781 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1782 return 0; 1783 } 1784 1785 if (!hotplug && 1786 nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1787 entry = calloc(1, sizeof(*entry)); 1788 if (!entry) { 1789 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1790 return -ENOMEM; 1791 } 1792 entry->trid = *nvme_bdev_ctrlr->connected_trid; 1793 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 1794 } 1795 1796 nvme_bdev_ctrlr->destruct = true; 1797 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1798 1799 _nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1800 1801 return 0; 1802 } 1803 1804 static void 1805 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 1806 { 1807 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx; 1808 1809 _bdev_nvme_delete(nvme_bdev_ctrlr, true); 1810 } 1811 1812 static int 1813 bdev_nvme_hotplug_probe(void *arg) 1814 { 1815 if (g_hotplug_probe_ctx == NULL) { 1816 spdk_poller_unregister(&g_hotplug_probe_poller); 1817 return SPDK_POLLER_IDLE; 1818 } 1819 1820 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 1821 g_hotplug_probe_ctx = NULL; 1822 spdk_poller_unregister(&g_hotplug_probe_poller); 1823 } 1824 1825 return SPDK_POLLER_BUSY; 1826 } 1827 1828 static int 1829 bdev_nvme_hotplug(void *arg) 1830 { 1831 struct spdk_nvme_transport_id trid_pcie; 1832 1833 if (g_hotplug_probe_ctx) { 1834 return SPDK_POLLER_BUSY; 1835 } 1836 1837 memset(&trid_pcie, 0, sizeof(trid_pcie)); 1838 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 1839 1840 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 1841 hotplug_probe_cb, attach_cb, NULL); 1842 1843 if (g_hotplug_probe_ctx) { 1844 assert(g_hotplug_probe_poller == NULL); 1845 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 1846 } 1847 1848 return SPDK_POLLER_BUSY; 1849 } 1850 1851 void 1852 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 1853 { 1854 *opts = g_opts; 1855 } 1856 1857 int 1858 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 1859 { 1860 if (g_bdev_nvme_init_thread != NULL) { 1861 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 1862 return -EPERM; 1863 } 1864 } 1865 1866 g_opts = *opts; 1867 1868 return 0; 1869 } 1870 1871 struct set_nvme_hotplug_ctx { 1872 uint64_t period_us; 1873 bool enabled; 1874 spdk_msg_fn fn; 1875 void *fn_ctx; 1876 }; 1877 1878 static void 1879 set_nvme_hotplug_period_cb(void *_ctx) 1880 { 1881 struct set_nvme_hotplug_ctx *ctx = _ctx; 1882 1883 spdk_poller_unregister(&g_hotplug_poller); 1884 if (ctx->enabled) { 1885 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 1886 } 1887 1888 g_nvme_hotplug_poll_period_us = ctx->period_us; 1889 g_nvme_hotplug_enabled = ctx->enabled; 1890 if (ctx->fn) { 1891 ctx->fn(ctx->fn_ctx); 1892 } 1893 1894 free(ctx); 1895 } 1896 1897 int 1898 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 1899 { 1900 struct set_nvme_hotplug_ctx *ctx; 1901 1902 if (enabled == true && !spdk_process_is_primary()) { 1903 return -EPERM; 1904 } 1905 1906 ctx = calloc(1, sizeof(*ctx)); 1907 if (ctx == NULL) { 1908 return -ENOMEM; 1909 } 1910 1911 period_us = period_us == 0 ? 
NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 1912 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 1913 ctx->enabled = enabled; 1914 ctx->fn = cb; 1915 ctx->fn_ctx = cb_ctx; 1916 1917 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 1918 return 0; 1919 } 1920 1921 static void 1922 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 1923 { 1924 if (ctx->cb_fn) { 1925 ctx->cb_fn(ctx->cb_ctx, count, rc); 1926 } 1927 1928 ctx->namespaces_populated = true; 1929 if (ctx->probe_done) { 1930 /* The probe was already completed, so we need to free the context 1931 * here. This can happen for cases like OCSSD, where we need to 1932 * send additional commands to the SSD after attach. 1933 */ 1934 free(ctx); 1935 } 1936 } 1937 1938 static void 1939 nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1940 struct nvme_async_probe_ctx *ctx) 1941 { 1942 struct nvme_bdev_ns *nvme_ns; 1943 struct nvme_bdev *nvme_bdev; 1944 uint32_t i, nsid; 1945 size_t j; 1946 1947 assert(nvme_bdev_ctrlr != NULL); 1948 1949 /* 1950 * Report the new bdevs that were created in this call. 1951 * There can be more than one bdev per NVMe controller. 1952 */ 1953 j = 0; 1954 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1955 nsid = i + 1; 1956 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1957 if (!nvme_ns->populated) { 1958 continue; 1959 } 1960 assert(nvme_ns->id == nsid); 1961 nvme_bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1962 if (nvme_bdev == NULL) { 1963 assert(nvme_ns->type == NVME_BDEV_NS_OCSSD); 1964 continue; 1965 } 1966 if (j < ctx->count) { 1967 ctx->names[j] = nvme_bdev->disk.name; 1968 j++; 1969 } else { 1970 SPDK_ERRLOG("Unable to return all names of created bdevs: the caller provided only %u name entries\n", 1971 ctx->count); 1972 populate_namespaces_cb(ctx, 0, -ERANGE); 1973 return; 1974 } 1975 } 1976 1977 populate_namespaces_cb(ctx, j, 0); 1978 } 1979 1980 static bool 1981 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 1982 { 1983 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 1984 1985 nsdata1 = spdk_nvme_ns_get_data(ns1); 1986 nsdata2 = spdk_nvme_ns_get_data(ns2); 1987 1988 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) != 0; 1989 } 1990 1991 static int 1992 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_ctrlr *new_ctrlr, 1993 struct spdk_nvme_transport_id *trid) 1994 { 1995 uint32_t i, nsid; 1996 struct nvme_bdev_ns *nvme_ns; 1997 struct spdk_nvme_ns *new_ns; 1998 struct nvme_bdev_ctrlr_trid *new_trid, *tmp_trid; 1999 int rc = 0; 2000 2001 assert(nvme_bdev_ctrlr != NULL); 2002 2003 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2004 SPDK_ERRLOG("PCIe failover is not supported.\n"); 2005 return -ENOTSUP; 2006 } 2007 2008 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 2009 2010 /* Currently we only support failover to the same transport type. */ 2011 if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) { 2012 rc = -EINVAL; 2013 goto exit; 2014 } 2015 2016 /* Currently we only support failover to the same NQN. */ 2017 if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 2018 rc = -EINVAL; 2019 goto exit; 2020 } 2021 2022 /* Skip all the other checks if we've already registered this path.
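* A duplicate transport ID is reported back to the caller as -EEXIST.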
*/ 2023 TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) { 2024 if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) { 2025 rc = -EEXIST; 2026 goto exit; 2027 } 2028 } 2029 2030 if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) { 2031 rc = -EINVAL; 2032 goto exit; 2033 } 2034 2035 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 2036 nsid = i + 1; 2037 2038 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 2039 if (!nvme_ns->populated) { 2040 continue; 2041 } 2042 2043 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid); 2044 assert(new_ns != NULL); 2045 2046 if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) { 2047 rc = -EINVAL; 2048 goto exit; 2049 } 2050 } 2051 2052 new_trid = calloc(1, sizeof(*new_trid)); 2053 if (new_trid == NULL) { 2054 rc = -ENOMEM; 2055 goto exit; 2056 } 2057 new_trid->trid = *trid; 2058 new_trid->is_failed = false; 2059 2060 TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) { 2061 if (tmp_trid->is_failed) { 2062 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 2063 goto exit; 2064 } 2065 } 2066 2067 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link); 2068 2069 exit: 2070 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2071 return rc; 2072 } 2073 2074 static void 2075 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2076 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2077 { 2078 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 2079 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2080 struct nvme_async_probe_ctx *ctx; 2081 int rc; 2082 2083 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 2084 ctx->ctrlr_attached = true; 2085 2086 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name); 2087 if (nvme_bdev_ctrlr) { 2088 /* This is the case that a secondary path is added to an existing 2089 * nvme_bdev_ctrlr for failover. After checking if it can access the same 2090 * namespaces as the primary path, it is disconnected until failover occurs. 2091 */ 2092 rc = bdev_nvme_add_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid); 2093 2094 spdk_nvme_detach(ctrlr); 2095 goto exit; 2096 } 2097 2098 rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, 2099 &nvme_bdev_ctrlr); 2100 if (rc) { 2101 SPDK_ERRLOG("Failed to create new device\n"); 2102 goto exit; 2103 } 2104 2105 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx); 2106 return; 2107 2108 exit: 2109 populate_namespaces_cb(ctx, 0, rc); 2110 } 2111 2112 static int 2113 bdev_nvme_async_poll(void *arg) 2114 { 2115 struct nvme_async_probe_ctx *ctx = arg; 2116 int rc; 2117 2118 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 2119 if (spdk_unlikely(rc != -EAGAIN)) { 2120 ctx->probe_done = true; 2121 spdk_poller_unregister(&ctx->poller); 2122 if (!ctx->ctrlr_attached) { 2123 /* The probe is done, but no controller was attached. 2124 * That means we had a failure, so report -EIO back to 2125 * the caller (usually the RPC). populate_namespaces_cb() 2126 * will take care of freeing the nvme_async_probe_ctx. 2127 */ 2128 populate_namespaces_cb(ctx, 0, -EIO); 2129 } else if (ctx->namespaces_populated) { 2130 /* The namespaces for the attached controller were all 2131 * populated and the response was already sent to the 2132 * caller (usually the RPC). So free the context here. 
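* Otherwise namespace population is still in progress; since probe_done is now set,
* populate_namespaces_cb() will free the context when population completes.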
2133 */ 2134 free(ctx); 2135 } 2136 } 2137 2138 return SPDK_POLLER_BUSY; 2139 } 2140 2141 int 2142 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 2143 struct spdk_nvme_host_id *hostid, 2144 const char *base_name, 2145 const char **names, 2146 uint32_t count, 2147 const char *hostnqn, 2148 uint32_t prchk_flags, 2149 spdk_bdev_create_nvme_fn cb_fn, 2150 void *cb_ctx, 2151 struct spdk_nvme_ctrlr_opts *opts) 2152 { 2153 struct nvme_probe_skip_entry *entry, *tmp; 2154 struct nvme_async_probe_ctx *ctx; 2155 2156 /* TODO expand this check to include both the host and target TRIDs. 2157 * Only if both are the same should we fail. 2158 */ 2159 if (nvme_bdev_ctrlr_get(trid) != NULL) { 2160 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 2161 return -EEXIST; 2162 } 2163 2164 ctx = calloc(1, sizeof(*ctx)); 2165 if (!ctx) { 2166 return -ENOMEM; 2167 } 2168 ctx->base_name = base_name; 2169 ctx->names = names; 2170 ctx->count = count; 2171 ctx->cb_fn = cb_fn; 2172 ctx->cb_ctx = cb_ctx; 2173 ctx->prchk_flags = prchk_flags; 2174 ctx->trid = *trid; 2175 2176 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2177 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 2178 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2179 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2180 free(entry); 2181 break; 2182 } 2183 } 2184 } 2185 2186 if (opts) { 2187 memcpy(&ctx->opts, opts, sizeof(*opts)); 2188 } else { 2189 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 2190 } 2191 2192 ctx->opts.transport_retry_count = g_opts.retry_count; 2193 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 2194 2195 if (hostnqn) { 2196 snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn); 2197 } 2198 2199 if (hostid->hostaddr[0] != '\0') { 2200 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 2201 } 2202 2203 if (hostid->hostsvcid[0] != '\0') { 2204 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 2205 } 2206 2207 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb); 2208 if (ctx->probe_ctx == NULL) { 2209 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 2210 free(ctx); 2211 return -ENODEV; 2212 } 2213 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 2214 2215 return 0; 2216 } 2217 2218 int 2219 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid) 2220 { 2221 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2222 struct nvme_bdev_ctrlr_trid *ctrlr_trid, *tmp_trid; 2223 2224 if (name == NULL) { 2225 return -EINVAL; 2226 } 2227 2228 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 2229 if (nvme_bdev_ctrlr == NULL) { 2230 SPDK_ERRLOG("Failed to find NVMe controller named %s\n", name); 2231 return -ENODEV; 2232 } 2233 2234 /* case 1: remove the controller itself. */ 2235 if (trid == NULL) { 2236 return _bdev_nvme_delete(nvme_bdev_ctrlr, false); 2237 } 2238 2239 /* case 2: we are currently using the path to be removed. */ 2240 if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) { 2241 ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 2242 assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid); 2243 /* case 2A: the current path is the only path. */ 2244 if (!TAILQ_NEXT(ctrlr_trid, link)) { 2245 return _bdev_nvme_delete(nvme_bdev_ctrlr, false); 2246 } 2247 2248 /* case 2B: there is an alternative path.
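* Fail over to one of the other registered paths instead of deleting the controller.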
*/ 2249 return bdev_nvme_failover(nvme_bdev_ctrlr, true); 2250 } 2251 /* case 3: We are not using the specified path. */ 2252 TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) { 2253 if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { 2254 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link); 2255 free(ctrlr_trid); 2256 return 0; 2257 } 2258 } 2259 2260 /* case 3A: The address isn't even in the registered list. */ 2261 return -ENXIO; 2262 } 2263 2264 static int 2265 bdev_nvme_library_init(void) 2266 { 2267 g_bdev_nvme_init_thread = spdk_get_thread(); 2268 2269 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb, 2270 bdev_nvme_poll_group_destroy_cb, 2271 sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups"); 2272 2273 return 0; 2274 } 2275 2276 static void 2277 bdev_nvme_library_fini(void) 2278 { 2279 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp; 2280 struct nvme_probe_skip_entry *entry, *entry_tmp; 2281 2282 spdk_poller_unregister(&g_hotplug_poller); 2283 free(g_hotplug_probe_ctx); 2284 g_hotplug_probe_ctx = NULL; 2285 2286 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2287 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2288 free(entry); 2289 } 2290 2291 pthread_mutex_lock(&g_bdev_nvme_mutex); 2292 TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) { 2293 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 2294 if (nvme_bdev_ctrlr->destruct) { 2295 /* This controller's destruction was already started 2296 * before the application started shutting down 2297 */ 2298 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2299 continue; 2300 } 2301 nvme_bdev_ctrlr->destruct = true; 2302 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2303 2304 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct, 2305 nvme_bdev_ctrlr); 2306 } 2307 2308 g_bdev_nvme_module_finish = true; 2309 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 2310 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2311 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 2312 spdk_bdev_module_finish_done(); 2313 return; 2314 } 2315 2316 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2317 } 2318 2319 static void 2320 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io) 2321 { 2322 struct spdk_bdev *bdev = bdev_io->bdev; 2323 struct spdk_dif_ctx dif_ctx; 2324 struct spdk_dif_error err_blk = {}; 2325 int rc; 2326 2327 rc = spdk_dif_ctx_init(&dif_ctx, 2328 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2329 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2330 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2331 if (rc != 0) { 2332 SPDK_ERRLOG("Initialization of DIF context failed\n"); 2333 return; 2334 } 2335 2336 if (bdev->md_interleave) { 2337 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2338 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2339 } else { 2340 struct iovec md_iov = { 2341 .iov_base = bdev_io->u.bdev.md_buf, 2342 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2343 }; 2344 2345 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2346 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2347 } 2348 2349 if (rc != 0) { 2350 SPDK_ERRLOG("DIF error detected. 
type=%d, offset=%" PRIu32 "\n", 2351 err_blk.err_type, err_blk.err_offset); 2352 } else { 2353 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2354 } 2355 } 2356 2357 static void 2358 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2359 { 2360 struct nvme_bdev_io *bio = ref; 2361 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2362 2363 if (spdk_nvme_cpl_is_success(cpl)) { 2364 /* Run PI verification for read data buffer. */ 2365 bdev_nvme_verify_pi_error(bdev_io); 2366 } 2367 2368 /* Return original completion status */ 2369 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, 2370 bio->cpl.status.sc); 2371 } 2372 2373 static void 2374 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2375 { 2376 struct nvme_bdev_io *bio = ref; 2377 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2378 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2379 struct nvme_io_channel *nvme_ch; 2380 struct nvme_bdev_ns *nvme_ns; 2381 struct spdk_nvme_qpair *qpair; 2382 int ret; 2383 2384 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2385 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2386 cpl->status.sct, cpl->status.sc); 2387 2388 /* Save completion status to use after verifying PI error. */ 2389 bio->cpl = *cpl; 2390 2391 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2392 2393 if (spdk_likely(bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 2394 /* Read without PI checking to verify PI error. */ 2395 ret = bdev_nvme_no_pi_readv(nvme_ns->ns, 2396 qpair, 2397 bio, 2398 bdev_io->u.bdev.iovs, 2399 bdev_io->u.bdev.iovcnt, 2400 bdev_io->u.bdev.md_buf, 2401 bdev_io->u.bdev.num_blocks, 2402 bdev_io->u.bdev.offset_blocks); 2403 if (ret == 0) { 2404 return; 2405 } 2406 } 2407 } 2408 2409 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2410 } 2411 2412 static void 2413 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2414 { 2415 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2416 2417 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2418 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2419 cpl->status.sct, cpl->status.sc); 2420 /* Run PI verification for write data buffer if PI error is detected. */ 2421 bdev_nvme_verify_pi_error(bdev_io); 2422 } 2423 2424 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2425 } 2426 2427 static void 2428 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2429 { 2430 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2431 2432 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2433 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2434 cpl->status.sct, cpl->status.sc); 2435 /* Run PI verification for compare data buffer if PI error is detected. 
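* The original completion status is still propagated to the bdev layer below.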
*/ 2436 bdev_nvme_verify_pi_error(bdev_io); 2437 } 2438 2439 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2440 } 2441 2442 static void 2443 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2444 { 2445 struct nvme_bdev_io *bio = ref; 2446 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2447 2448 /* Compare operation completion */ 2449 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2450 /* Save compare result for write callback */ 2451 bio->cpl = *cpl; 2452 return; 2453 } 2454 2455 /* Write operation completion */ 2456 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2457 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2458 * complete the IO with the compare operation's status. 2459 */ 2460 if (!spdk_nvme_cpl_is_error(cpl)) { 2461 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2462 } 2463 2464 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2465 } else { 2466 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2467 } 2468 } 2469 2470 static void 2471 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 2472 { 2473 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2474 2475 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2476 } 2477 2478 static void 2479 bdev_nvme_admin_passthru_completion(void *ctx) 2480 { 2481 struct nvme_bdev_io *bio = ctx; 2482 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2483 2484 spdk_bdev_io_complete_nvme_status(bdev_io, 2485 bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2486 } 2487 2488 static void 2489 bdev_nvme_abort_completion(void *ctx) 2490 { 2491 struct nvme_bdev_io *bio = ctx; 2492 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2493 2494 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 2495 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2496 } else { 2497 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2498 } 2499 } 2500 2501 static void 2502 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 2503 { 2504 struct nvme_bdev_io *bio = ref; 2505 2506 bio->cpl = *cpl; 2507 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2508 } 2509 2510 static void 2511 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 2512 { 2513 struct nvme_bdev_io *bio = ref; 2514 2515 bio->cpl = *cpl; 2516 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 2517 } 2518 2519 static void 2520 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 2521 { 2522 struct nvme_bdev_io *bio = ref; 2523 struct iovec *iov; 2524 2525 bio->iov_offset = sgl_offset; 2526 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 2527 iov = &bio->iovs[bio->iovpos]; 2528 if (bio->iov_offset < iov->iov_len) { 2529 break; 2530 } 2531 2532 bio->iov_offset -= iov->iov_len; 2533 } 2534 } 2535 2536 static int 2537 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 2538 { 2539 struct nvme_bdev_io *bio = ref; 2540 struct iovec *iov; 2541 2542 assert(bio->iovpos < bio->iovcnt); 2543 2544 iov = &bio->iovs[bio->iovpos]; 2545 2546 *address = iov->iov_base; 2547 *length = iov->iov_len; 2548 2549 if (bio->iov_offset) { 2550 assert(bio->iov_offset <= iov->iov_len); 2551 *address += bio->iov_offset; 2552 *length -= bio->iov_offset; 2553 } 
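/* Advance the offset by the number of bytes being returned; once the current iovec is fully consumed, step to the next one below. */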
2554 2555 bio->iov_offset += *length; 2556 if (bio->iov_offset == iov->iov_len) { 2557 bio->iovpos++; 2558 bio->iov_offset = 0; 2559 } 2560 2561 return 0; 2562 } 2563 2564 static void 2565 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 2566 { 2567 struct nvme_bdev_io *bio = ref; 2568 struct iovec *iov; 2569 2570 bio->fused_iov_offset = sgl_offset; 2571 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 2572 iov = &bio->fused_iovs[bio->fused_iovpos]; 2573 if (bio->fused_iov_offset < iov->iov_len) { 2574 break; 2575 } 2576 2577 bio->fused_iov_offset -= iov->iov_len; 2578 } 2579 } 2580 2581 static int 2582 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 2583 { 2584 struct nvme_bdev_io *bio = ref; 2585 struct iovec *iov; 2586 2587 assert(bio->fused_iovpos < bio->fused_iovcnt); 2588 2589 iov = &bio->fused_iovs[bio->fused_iovpos]; 2590 2591 *address = iov->iov_base; 2592 *length = iov->iov_len; 2593 2594 if (bio->fused_iov_offset) { 2595 assert(bio->fused_iov_offset <= iov->iov_len); 2596 *address += bio->fused_iov_offset; 2597 *length -= bio->fused_iov_offset; 2598 } 2599 2600 bio->fused_iov_offset += *length; 2601 if (bio->fused_iov_offset == iov->iov_len) { 2602 bio->fused_iovpos++; 2603 bio->fused_iov_offset = 0; 2604 } 2605 2606 return 0; 2607 } 2608 2609 static int 2610 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2611 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2612 void *md, uint64_t lba_count, uint64_t lba) 2613 { 2614 int rc; 2615 2616 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 2617 lba_count, lba); 2618 2619 bio->iovs = iov; 2620 bio->iovcnt = iovcnt; 2621 bio->iovpos = 0; 2622 bio->iov_offset = 0; 2623 2624 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2625 bdev_nvme_no_pi_readv_done, bio, 0, 2626 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2627 md, 0, 0); 2628 2629 if (rc != 0 && rc != -ENOMEM) { 2630 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 2631 } 2632 return rc; 2633 } 2634 2635 static int 2636 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2637 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2638 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2639 { 2640 int rc; 2641 2642 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2643 lba_count, lba); 2644 2645 bio->iovs = iov; 2646 bio->iovcnt = iovcnt; 2647 bio->iovpos = 0; 2648 bio->iov_offset = 0; 2649 2650 if (iovcnt == 1) { 2651 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 2652 lba_count, 2653 bdev_nvme_readv_done, bio, 2654 flags, 2655 0, 0); 2656 } else { 2657 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2658 bdev_nvme_readv_done, bio, flags, 2659 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2660 md, 0, 0); 2661 } 2662 2663 if (rc != 0 && rc != -ENOMEM) { 2664 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 2665 } 2666 return rc; 2667 } 2668 2669 static int 2670 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2671 struct nvme_bdev_io *bio, 2672 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 2673 uint32_t flags) 2674 { 2675 int rc; 2676 2677 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2678 lba_count, lba); 2679 2680 bio->iovs = iov; 2681 bio->iovcnt = iovcnt; 2682 bio->iovpos = 0; 2683 bio->iov_offset 
= 0; 2684 2685 if (iovcnt == 1) { 2686 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 2687 lba_count, 2688 bdev_nvme_writev_done, bio, 2689 flags, 2690 0, 0); 2691 } else { 2692 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 2693 bdev_nvme_writev_done, bio, flags, 2694 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2695 md, 0, 0); 2696 } 2697 2698 if (rc != 0 && rc != -ENOMEM) { 2699 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 2700 } 2701 return rc; 2702 } 2703 2704 static int 2705 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2706 struct nvme_bdev_io *bio, 2707 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 2708 uint32_t flags) 2709 { 2710 int rc; 2711 2712 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2713 lba_count, lba); 2714 2715 bio->iovs = iov; 2716 bio->iovcnt = iovcnt; 2717 bio->iovpos = 0; 2718 bio->iov_offset = 0; 2719 2720 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 2721 bdev_nvme_comparev_done, bio, flags, 2722 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2723 md, 0, 0); 2724 2725 if (rc != 0 && rc != -ENOMEM) { 2726 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 2727 } 2728 return rc; 2729 } 2730 2731 static int 2732 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2733 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 2734 struct iovec *write_iov, int write_iovcnt, 2735 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2736 { 2737 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2738 int rc; 2739 2740 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2741 lba_count, lba); 2742 2743 bio->iovs = cmp_iov; 2744 bio->iovcnt = cmp_iovcnt; 2745 bio->iovpos = 0; 2746 bio->iov_offset = 0; 2747 bio->fused_iovs = write_iov; 2748 bio->fused_iovcnt = write_iovcnt; 2749 bio->fused_iovpos = 0; 2750 bio->fused_iov_offset = 0; 2751 2752 if (bdev_io->num_retries == 0) { 2753 bio->first_fused_submitted = false; 2754 } 2755 2756 if (!bio->first_fused_submitted) { 2757 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 2758 memset(&bio->cpl, 0, sizeof(bio->cpl)); 2759 2760 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 2761 bdev_nvme_comparev_and_writev_done, bio, flags, 2762 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 2763 if (rc == 0) { 2764 bio->first_fused_submitted = true; 2765 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 2766 } else { 2767 if (rc != -ENOMEM) { 2768 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 2769 } 2770 return rc; 2771 } 2772 } 2773 2774 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 2775 2776 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 2777 bdev_nvme_comparev_and_writev_done, bio, flags, 2778 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 2779 if (rc != 0 && rc != -ENOMEM) { 2780 SPDK_ERRLOG("write failed: rc = %d\n", rc); 2781 rc = 0; 2782 } 2783 2784 return rc; 2785 } 2786 2787 static int 2788 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2789 struct nvme_bdev_io *bio, 2790 uint64_t offset_blocks, 2791 uint64_t num_blocks) 2792 { 2793 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 2794 struct spdk_nvme_dsm_range *range; 2795 uint64_t offset, remaining; 2796 uint64_t num_ranges_u64; 2797 uint16_t num_ranges; 2798 int rc; 2799 2800 num_ranges_u64 = 
(num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 2801 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2802 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 2803 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 2804 return -EINVAL; 2805 } 2806 num_ranges = (uint16_t)num_ranges_u64; 2807 2808 offset = offset_blocks; 2809 remaining = num_blocks; 2810 range = &dsm_ranges[0]; 2811 2812 /* Fill max-size ranges until the remaining blocks fit into one range */ 2813 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 2814 range->attributes.raw = 0; 2815 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2816 range->starting_lba = offset; 2817 2818 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2819 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2820 range++; 2821 } 2822 2823 /* Final range describes the remaining blocks */ 2824 range->attributes.raw = 0; 2825 range->length = remaining; 2826 range->starting_lba = offset; 2827 2828 rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair, 2829 SPDK_NVME_DSM_ATTR_DEALLOCATE, 2830 dsm_ranges, num_ranges, 2831 bdev_nvme_queued_done, bio); 2832 2833 return rc; 2834 } 2835 2836 static int 2837 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 2838 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2839 { 2840 uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ch->ctrlr->ctrlr); 2841 2842 if (nbytes > max_xfer_size) { 2843 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2844 return -EINVAL; 2845 } 2846 2847 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2848 2849 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ch->ctrlr->ctrlr, cmd, buf, 2850 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 2851 } 2852 2853 static int 2854 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2855 struct nvme_bdev_io *bio, 2856 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2857 { 2858 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 2859 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2860 2861 if (nbytes > max_xfer_size) { 2862 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2863 return -EINVAL; 2864 } 2865 2866 /* 2867 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2868 * so fill it out automatically. 2869 */ 2870 cmd->nsid = spdk_nvme_ns_get_id(ns); 2871 2872 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 2873 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 2874 } 2875 2876 static int 2877 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2878 struct nvme_bdev_io *bio, 2879 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 2880 { 2881 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 2882 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 2883 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2884 2885 if (nbytes > max_xfer_size) { 2886 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2887 return -EINVAL; 2888 } 2889 2890 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 2891 SPDK_ERRLOG("invalid meta data buffer size\n"); 2892 return -EINVAL; 2893 } 2894 2895 /* 2896 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2897 * so fill it out automatically. 
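* Any nsid already set by the caller is overwritten with this namespace's ID.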
2898 */ 2899 cmd->nsid = spdk_nvme_ns_get_id(ns); 2900 2901 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 2902 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 2903 } 2904 2905 static void 2906 bdev_nvme_abort_admin_cmd(void *ctx) 2907 { 2908 struct nvme_bdev_io *bio = ctx; 2909 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2910 struct nvme_io_channel *nvme_ch; 2911 struct nvme_bdev_io *bio_to_abort; 2912 int rc; 2913 2914 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2915 bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2916 2917 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 2918 NULL, 2919 bio_to_abort, 2920 bdev_nvme_abort_done, bio); 2921 if (rc == -ENOENT) { 2922 /* If no admin command was found in admin qpair, complete the abort 2923 * request with failure. 2924 */ 2925 bio->cpl.cdw0 |= 1U; 2926 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 2927 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 2928 2929 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2930 } 2931 } 2932 2933 static int 2934 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 2935 struct nvme_bdev_io *bio_to_abort) 2936 { 2937 int rc; 2938 2939 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2940 2941 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 2942 nvme_ch->qpair, 2943 bio_to_abort, 2944 bdev_nvme_abort_done, bio); 2945 if (rc == -ENOENT) { 2946 /* If no command was found in I/O qpair, the target command may be 2947 * admin command. Only a single thread tries aborting admin command 2948 * to clean I/O flow. 2949 */ 2950 spdk_thread_send_msg(nvme_ch->ctrlr->thread, 2951 bdev_nvme_abort_admin_cmd, bio); 2952 rc = 0; 2953 } 2954 2955 return rc; 2956 } 2957 2958 static void 2959 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 2960 struct nvme_bdev_ns *nvme_ns) 2961 { 2962 /* nop */ 2963 } 2964 2965 static void 2966 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns) 2967 { 2968 g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns); 2969 } 2970 2971 static void 2972 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 2973 { 2974 const char *action; 2975 2976 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 2977 action = "reset"; 2978 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 2979 action = "abort"; 2980 } else { 2981 action = "none"; 2982 } 2983 2984 spdk_json_write_object_begin(w); 2985 2986 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 2987 2988 spdk_json_write_named_object_begin(w, "params"); 2989 spdk_json_write_named_string(w, "action_on_timeout", action); 2990 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 2991 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 2992 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 2993 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 2994 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 2995 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 2996 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 2997 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 2998 spdk_json_write_named_uint64(w, 
"nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 2999 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 3000 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 3001 spdk_json_write_object_end(w); 3002 3003 spdk_json_write_object_end(w); 3004 } 3005 3006 static void 3007 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w, 3008 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 3009 { 3010 struct spdk_nvme_transport_id *trid; 3011 3012 trid = nvme_bdev_ctrlr->connected_trid; 3013 3014 spdk_json_write_object_begin(w); 3015 3016 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 3017 3018 spdk_json_write_named_object_begin(w, "params"); 3019 spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); 3020 nvme_bdev_dump_trid_json(trid, w); 3021 spdk_json_write_named_bool(w, "prchk_reftag", 3022 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 3023 spdk_json_write_named_bool(w, "prchk_guard", 3024 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 3025 3026 spdk_json_write_object_end(w); 3027 3028 spdk_json_write_object_end(w); 3029 } 3030 3031 static void 3032 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 3033 { 3034 spdk_json_write_object_begin(w); 3035 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 3036 3037 spdk_json_write_named_object_begin(w, "params"); 3038 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 3039 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 3040 spdk_json_write_object_end(w); 3041 3042 spdk_json_write_object_end(w); 3043 } 3044 3045 static int 3046 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 3047 { 3048 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 3049 uint32_t nsid; 3050 3051 bdev_nvme_opts_config_json(w); 3052 3053 pthread_mutex_lock(&g_bdev_nvme_mutex); 3054 3055 TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 3056 nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr); 3057 3058 for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { 3059 if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) { 3060 continue; 3061 } 3062 3063 nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]); 3064 } 3065 } 3066 3067 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 3068 * before enabling hotplug poller. 3069 */ 3070 bdev_nvme_hotplug_config_json(w); 3071 3072 pthread_mutex_unlock(&g_bdev_nvme_mutex); 3073 return 0; 3074 } 3075 3076 struct spdk_nvme_ctrlr * 3077 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 3078 { 3079 if (!bdev || bdev->module != &nvme_if) { 3080 return NULL; 3081 } 3082 3083 return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr; 3084 } 3085 3086 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 3087