1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "bdev_nvme.h" 37 #include "bdev_ocssd.h" 38 39 #include "spdk/config.h" 40 #include "spdk/endian.h" 41 #include "spdk/bdev.h" 42 #include "spdk/json.h" 43 #include "spdk/nvme.h" 44 #include "spdk/nvme_ocssd.h" 45 #include "spdk/thread.h" 46 #include "spdk/string.h" 47 #include "spdk/util.h" 48 49 #include "spdk/bdev_module.h" 50 #include "spdk/log.h" 51 52 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 53 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 54 55 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 56 57 struct nvme_bdev_io { 58 /** array of iovecs to transfer. */ 59 struct iovec *iovs; 60 61 /** Number of iovecs in iovs array. */ 62 int iovcnt; 63 64 /** Current iovec position. */ 65 int iovpos; 66 67 /** Offset in current iovec. */ 68 uint32_t iov_offset; 69 70 /** array of iovecs to transfer. */ 71 struct iovec *fused_iovs; 72 73 /** Number of iovecs in iovs array. */ 74 int fused_iovcnt; 75 76 /** Current iovec position. */ 77 int fused_iovpos; 78 79 /** Offset in current iovec. */ 80 uint32_t fused_iov_offset; 81 82 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 83 struct spdk_nvme_cpl cpl; 84 85 /** Originating thread */ 86 struct spdk_thread *orig_thread; 87 88 /** Keeps track if first of fused commands was submitted */ 89 bool first_fused_submitted; 90 }; 91 92 struct nvme_probe_ctx { 93 size_t count; 94 struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS]; 95 struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS]; 96 const char *names[NVME_MAX_CONTROLLERS]; 97 uint32_t prchk_flags[NVME_MAX_CONTROLLERS]; 98 const char *hostnqn; 99 }; 100 101 struct nvme_probe_skip_entry { 102 struct spdk_nvme_transport_id trid; 103 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 104 }; 105 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 106 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 107 g_skipped_nvme_ctrlrs); 108 109 static struct spdk_bdev_nvme_opts g_opts = { 110 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 111 .timeout_us = 0, 112 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 113 .retry_count = 4, 114 .arbitration_burst = 0, 115 .low_priority_weight = 0, 116 .medium_priority_weight = 0, 117 .high_priority_weight = 0, 118 .nvme_adminq_poll_period_us = 10000ULL, 119 .nvme_ioq_poll_period_us = 0, 120 .io_queue_requests = 0, 121 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 122 }; 123 124 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 125 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 126 127 static int g_hot_insert_nvme_controller_index = 0; 128 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 129 static bool g_nvme_hotplug_enabled = false; 130 static struct spdk_thread *g_bdev_nvme_init_thread; 131 static struct spdk_poller *g_hotplug_poller; 132 static struct spdk_poller *g_hotplug_probe_poller; 133 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 134 135 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 136 struct nvme_async_probe_ctx *ctx); 137 static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx); 138 static int bdev_nvme_library_init(void); 139 static void bdev_nvme_library_fini(void); 140 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 141 struct nvme_bdev_io *bio, 142 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 143 uint32_t flags); 144 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 145 struct nvme_bdev_io *bio, 146 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba); 147 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 148 struct nvme_bdev_io *bio, 149 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 150 uint32_t flags); 151 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 152 struct nvme_bdev_io *bio, 153 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 154 uint32_t flags); 155 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, 156 struct spdk_nvme_qpair *qpair, 157 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 158 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 159 uint32_t flags); 160 static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, 161 struct nvme_bdev_io *bio, 162 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 163 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 164 struct nvme_bdev_io *bio, 165 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 166 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 167 struct nvme_bdev_io *bio, 168 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len); 169 static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch, 170 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 171 static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio); 172 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove); 173 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 174 175 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 176 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 177 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 178 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 179 180 static populate_namespace_fn g_populate_namespace_fn[] = { 181 NULL, 182 nvme_ctrlr_populate_standard_namespace, 183 bdev_ocssd_populate_namespace, 184 }; 185 186 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns); 187 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns); 188 189 static depopulate_namespace_fn g_depopulate_namespace_fn[] = { 190 NULL, 191 nvme_ctrlr_depopulate_standard_namespace, 192 bdev_ocssd_depopulate_namespace, 193 }; 194 195 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, 196 struct nvme_bdev_ns *nvme_ns); 197 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 198 struct nvme_bdev_ns *nvme_ns); 199 200 static config_json_namespace_fn g_config_json_namespace_fn[] = { 201 NULL, 202 nvme_ctrlr_config_json_standard_namespace, 203 bdev_ocssd_namespace_config_json, 204 }; 205 206 struct spdk_nvme_qpair * 207 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 208 { 209 struct nvme_io_channel *nvme_ch; 210 211 nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 212 213 return nvme_ch->qpair; 214 } 215 216 static int 217 bdev_nvme_get_ctx_size(void) 218 { 219 return sizeof(struct nvme_bdev_io); 220 } 221 222 static struct spdk_bdev_module nvme_if = { 223 .name = "nvme", 224 .async_fini = true, 225 .module_init = bdev_nvme_library_init, 226 .module_fini = bdev_nvme_library_fini, 227 .config_json = bdev_nvme_config_json, 228 .get_ctx_size = bdev_nvme_get_ctx_size, 229 230 }; 231 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 232 233 static void 234 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 235 { 236 int rc; 237 238 SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair); 239 /* 240 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will 241 * reconnect a qpair and we will stop getting a callback for this one. 242 */ 243 rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair); 244 if (rc != 0) { 245 SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc); 246 } 247 } 248 249 static int 250 bdev_nvme_poll(void *arg) 251 { 252 struct nvme_bdev_poll_group *group = arg; 253 int64_t num_completions; 254 255 if (group->collect_spin_stat && group->start_ticks == 0) { 256 group->start_ticks = spdk_get_ticks(); 257 } 258 259 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 260 bdev_nvme_disconnected_qpair_cb); 261 if (group->collect_spin_stat) { 262 if (num_completions > 0) { 263 if (group->end_ticks != 0) { 264 group->spin_ticks += (group->end_ticks - group->start_ticks); 265 group->end_ticks = 0; 266 } 267 group->start_ticks = 0; 268 } else { 269 group->end_ticks = spdk_get_ticks(); 270 } 271 } 272 273 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 274 } 275 276 static int 277 bdev_nvme_poll_adminq(void *arg) 278 { 279 int32_t rc; 280 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 281 282 assert(nvme_bdev_ctrlr != NULL); 283 284 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr); 285 if (rc < 0) { 286 bdev_nvme_failover(nvme_bdev_ctrlr, false); 287 } 288 289 return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 290 } 291 292 static int 293 bdev_nvme_destruct(void *ctx) 294 { 295 struct nvme_bdev *nvme_disk = ctx; 296 struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns; 297 298 nvme_ns->bdev = NULL; 299 300 nvme_bdev_ns_detach(nvme_ns); 301 302 free(nvme_disk->disk.name); 303 free(nvme_disk); 304 305 return 0; 306 } 307 308 static int 309 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 310 struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes) 311 { 312 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS); 313 314 return 0; 315 } 316 317 static int 318 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch) 319 { 320 struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr; 321 struct spdk_nvme_io_qpair_opts opts; 322 int rc; 323 324 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 325 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 326 opts.create_only = true; 327 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 328 g_opts.io_queue_requests = opts.io_queue_requests; 329 330 nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts)); 331 if (nvme_ch->qpair == NULL) { 332 return -1; 333 } 334 335 assert(nvme_ch->group != NULL); 336 337 rc = spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair); 338 if (rc != 0) { 339 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 340 goto err; 341 } 342 343 rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, nvme_ch->qpair); 344 if (rc != 0) { 345 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 346 goto err; 347 } 348 349 return 0; 350 351 err: 352 spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 353 354 return rc; 355 } 356 357 static void 358 _bdev_nvme_reset_destruct_ctrlr(struct spdk_io_channel_iter *i, int status) 359 { 360 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 361 362 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_do_destruct, 363 nvme_bdev_ctrlr); 364 } 365 366 static void 367 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 368 { 369 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 370 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); 371 struct spdk_bdev_io *bdev_io; 372 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 373 374 /* A NULL ctx means success. */ 375 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 376 status = SPDK_BDEV_IO_STATUS_FAILED; 377 } 378 379 while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) { 380 bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets); 381 TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link); 382 spdk_bdev_io_complete(bdev_io, status); 383 } 384 385 spdk_for_each_channel_continue(i, 0); 386 } 387 388 static void 389 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc) 390 { 391 /* we are using the for_each_channel cb_arg like a return code here. */ 392 /* If it's zero, we succeeded, otherwise, the reset failed. */ 393 void *cb_arg = NULL; 394 struct nvme_bdev_ctrlr_trid *curr_trid; 395 bool do_destruct = false; 396 397 if (rc) { 398 cb_arg = (void *)0x1; 399 SPDK_ERRLOG("Resetting controller failed.\n"); 400 } else { 401 SPDK_NOTICELOG("Resetting controller successful.\n"); 402 } 403 404 pthread_mutex_lock(&g_bdev_nvme_mutex); 405 nvme_bdev_ctrlr->resetting = false; 406 nvme_bdev_ctrlr->failover_in_progress = false; 407 408 curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 409 assert(curr_trid != NULL); 410 assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid); 411 412 curr_trid->is_failed = cb_arg != NULL ? true : false; 413 414 if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) { 415 /* Destruct ctrlr after clearing pending resets. */ 416 do_destruct = true; 417 } 418 419 pthread_mutex_unlock(&g_bdev_nvme_mutex); 420 /* Make sure we clear any pending resets before returning. */ 421 spdk_for_each_channel(nvme_bdev_ctrlr, 422 _bdev_nvme_complete_pending_resets, 423 cb_arg, 424 do_destruct ? _bdev_nvme_reset_destruct_ctrlr : NULL); 425 } 426 427 static void 428 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 429 { 430 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 431 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 432 int rc = SPDK_BDEV_IO_STATUS_SUCCESS; 433 434 if (status) { 435 rc = SPDK_BDEV_IO_STATUS_FAILED; 436 } 437 if (bio) { 438 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), rc); 439 } 440 _bdev_nvme_reset_complete(nvme_bdev_ctrlr, status); 441 } 442 443 static void 444 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 445 { 446 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 447 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); 448 int rc; 449 450 rc = bdev_nvme_create_qpair(nvme_ch); 451 452 spdk_for_each_channel_continue(i, rc); 453 } 454 455 static void 456 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 457 { 458 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 459 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 460 int rc; 461 462 if (status) { 463 rc = status; 464 goto err; 465 } 466 467 rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr); 468 if (rc != 0) { 469 goto err; 470 } 471 472 /* Recreate all of the I/O queue pairs */ 473 spdk_for_each_channel(nvme_bdev_ctrlr, 474 _bdev_nvme_reset_create_qpair, 475 bio, 476 _bdev_nvme_reset_create_qpairs_done); 477 return; 478 479 err: 480 if (bio) { 481 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED); 482 } 483 _bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc); 484 } 485 486 static void 487 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 488 { 489 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 490 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 491 int rc; 492 493 rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 494 if (!rc) { 495 nvme_ch->qpair = NULL; 496 } 497 498 spdk_for_each_channel_continue(i, rc); 499 } 500 501 static int 502 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, void *ctx) 503 { 504 pthread_mutex_lock(&g_bdev_nvme_mutex); 505 if (nvme_bdev_ctrlr->destruct) { 506 pthread_mutex_unlock(&g_bdev_nvme_mutex); 507 return -EBUSY; 508 } 509 510 if (nvme_bdev_ctrlr->resetting) { 511 pthread_mutex_unlock(&g_bdev_nvme_mutex); 512 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 513 return -EAGAIN; 514 } 515 516 nvme_bdev_ctrlr->resetting = true; 517 518 pthread_mutex_unlock(&g_bdev_nvme_mutex); 519 /* First, delete all NVMe I/O queue pairs. */ 520 spdk_for_each_channel(nvme_bdev_ctrlr, 521 _bdev_nvme_reset_destroy_qpair, 522 ctx, 523 _bdev_nvme_reset_ctrlr); 524 525 return 0; 526 } 527 528 static int 529 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio) 530 { 531 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 532 int rc; 533 534 rc = _bdev_nvme_reset(nvme_ch->ctrlr, bio); 535 if (rc == -EBUSY) { 536 /* Don't bother resetting if the controller is in the process of being destructed. */ 537 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 538 return 0; 539 } else if (rc == -EAGAIN) { 540 /* 541 * Reset call is queued only if it is from the app framework. This is on purpose so that 542 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 543 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 544 */ 545 TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link); 546 return 0; 547 } else { 548 return rc; 549 } 550 } 551 552 static int 553 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove) 554 { 555 struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL; 556 int rc = 0; 557 558 pthread_mutex_lock(&g_bdev_nvme_mutex); 559 if (nvme_bdev_ctrlr->destruct) { 560 pthread_mutex_unlock(&g_bdev_nvme_mutex); 561 /* Don't bother resetting if the controller is in the process of being destructed. */ 562 return 0; 563 } 564 565 curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 566 assert(curr_trid); 567 assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid); 568 next_trid = TAILQ_NEXT(curr_trid, link); 569 570 if (nvme_bdev_ctrlr->resetting) { 571 if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) { 572 rc = -EAGAIN; 573 } 574 pthread_mutex_unlock(&g_bdev_nvme_mutex); 575 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 576 return rc; 577 } 578 579 nvme_bdev_ctrlr->resetting = true; 580 curr_trid->is_failed = true; 581 582 if (next_trid) { 583 assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 584 585 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr, 586 curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid); 587 588 nvme_bdev_ctrlr->failover_in_progress = true; 589 spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr); 590 nvme_bdev_ctrlr->connected_trid = &next_trid->trid; 591 rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid); 592 assert(rc == 0); 593 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link); 594 if (!remove) { 595 /** Shuffle the old trid to the end of the list and use the new one. 596 * Allows for round robin through multiple connections. 597 */ 598 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link); 599 } else { 600 free(curr_trid); 601 } 602 } 603 604 pthread_mutex_unlock(&g_bdev_nvme_mutex); 605 /* First, delete all NVMe I/O queue pairs. */ 606 spdk_for_each_channel(nvme_bdev_ctrlr, 607 _bdev_nvme_reset_destroy_qpair, 608 NULL, 609 _bdev_nvme_reset_ctrlr); 610 611 return 0; 612 } 613 614 static int 615 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 616 struct nvme_bdev_io *bio, 617 uint64_t offset_blocks, 618 uint64_t num_blocks); 619 620 static void 621 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 622 bool success) 623 { 624 struct spdk_bdev *bdev = bdev_io->bdev; 625 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 626 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 627 struct nvme_bdev_ns *nvme_ns; 628 struct spdk_nvme_qpair *qpair; 629 int ret; 630 631 if (!success) { 632 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 633 return; 634 } 635 636 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 637 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 638 return; 639 } 640 641 ret = bdev_nvme_readv(nvme_ns->ns, 642 qpair, 643 (struct nvme_bdev_io *)bdev_io->driver_ctx, 644 bdev_io->u.bdev.iovs, 645 bdev_io->u.bdev.iovcnt, 646 bdev_io->u.bdev.md_buf, 647 bdev_io->u.bdev.num_blocks, 648 bdev_io->u.bdev.offset_blocks, 649 bdev->dif_check_flags); 650 651 if (spdk_likely(ret == 0)) { 652 return; 653 } else if (ret == -ENOMEM) { 654 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); 655 } else { 656 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 657 } 658 } 659 660 static int 661 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 662 { 663 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 664 struct spdk_bdev *bdev = bdev_io->bdev; 665 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 666 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 667 struct nvme_bdev_io *nbdev_io_to_abort; 668 struct nvme_bdev_ns *nvme_ns; 669 struct spdk_nvme_qpair *qpair; 670 671 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 672 return -1; 673 } 674 675 switch (bdev_io->type) { 676 case SPDK_BDEV_IO_TYPE_READ: 677 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 678 return bdev_nvme_readv(nvme_ns->ns, 679 qpair, 680 nbdev_io, 681 bdev_io->u.bdev.iovs, 682 bdev_io->u.bdev.iovcnt, 683 bdev_io->u.bdev.md_buf, 684 bdev_io->u.bdev.num_blocks, 685 bdev_io->u.bdev.offset_blocks, 686 bdev->dif_check_flags); 687 } else { 688 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 689 bdev_io->u.bdev.num_blocks * bdev->blocklen); 690 return 0; 691 } 692 693 case SPDK_BDEV_IO_TYPE_WRITE: 694 return bdev_nvme_writev(nvme_ns->ns, 695 qpair, 696 nbdev_io, 697 bdev_io->u.bdev.iovs, 698 bdev_io->u.bdev.iovcnt, 699 bdev_io->u.bdev.md_buf, 700 bdev_io->u.bdev.num_blocks, 701 bdev_io->u.bdev.offset_blocks, 702 bdev->dif_check_flags); 703 704 case SPDK_BDEV_IO_TYPE_COMPARE: 705 return bdev_nvme_comparev(nvme_ns->ns, 706 qpair, 707 nbdev_io, 708 bdev_io->u.bdev.iovs, 709 bdev_io->u.bdev.iovcnt, 710 bdev_io->u.bdev.md_buf, 711 bdev_io->u.bdev.num_blocks, 712 bdev_io->u.bdev.offset_blocks, 713 bdev->dif_check_flags); 714 715 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 716 return bdev_nvme_comparev_and_writev(nvme_ns->ns, 717 qpair, 718 nbdev_io, 719 bdev_io->u.bdev.iovs, 720 bdev_io->u.bdev.iovcnt, 721 bdev_io->u.bdev.fused_iovs, 722 bdev_io->u.bdev.fused_iovcnt, 723 bdev_io->u.bdev.md_buf, 724 bdev_io->u.bdev.num_blocks, 725 bdev_io->u.bdev.offset_blocks, 726 bdev->dif_check_flags); 727 728 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 729 return bdev_nvme_unmap(nvme_ns->ns, 730 qpair, 731 nbdev_io, 732 bdev_io->u.bdev.offset_blocks, 733 bdev_io->u.bdev.num_blocks); 734 735 case SPDK_BDEV_IO_TYPE_UNMAP: 736 return bdev_nvme_unmap(nvme_ns->ns, 737 qpair, 738 nbdev_io, 739 bdev_io->u.bdev.offset_blocks, 740 bdev_io->u.bdev.num_blocks); 741 742 case SPDK_BDEV_IO_TYPE_RESET: 743 return bdev_nvme_reset(nvme_ch, nbdev_io); 744 745 case SPDK_BDEV_IO_TYPE_FLUSH: 746 return bdev_nvme_flush(nvme_ns->ns, 747 qpair, 748 nbdev_io, 749 bdev_io->u.bdev.offset_blocks, 750 bdev_io->u.bdev.num_blocks); 751 752 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 753 return bdev_nvme_admin_passthru(nvme_ch, 754 nbdev_io, 755 &bdev_io->u.nvme_passthru.cmd, 756 bdev_io->u.nvme_passthru.buf, 757 bdev_io->u.nvme_passthru.nbytes); 758 759 case SPDK_BDEV_IO_TYPE_NVME_IO: 760 return bdev_nvme_io_passthru(nvme_ns->ns, 761 qpair, 762 nbdev_io, 763 &bdev_io->u.nvme_passthru.cmd, 764 bdev_io->u.nvme_passthru.buf, 765 bdev_io->u.nvme_passthru.nbytes); 766 767 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 768 return bdev_nvme_io_passthru_md(nvme_ns->ns, 769 qpair, 770 nbdev_io, 771 &bdev_io->u.nvme_passthru.cmd, 772 bdev_io->u.nvme_passthru.buf, 773 bdev_io->u.nvme_passthru.nbytes, 774 bdev_io->u.nvme_passthru.md_buf, 775 bdev_io->u.nvme_passthru.md_len); 776 777 case SPDK_BDEV_IO_TYPE_ABORT: 778 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 779 return bdev_nvme_abort(nvme_ch, 780 nbdev_io, 781 nbdev_io_to_abort); 782 783 default: 784 return -EINVAL; 785 } 786 return 0; 787 } 788 789 static void 790 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 791 { 792 int rc = _bdev_nvme_submit_request(ch, bdev_io); 793 794 if (spdk_unlikely(rc != 0)) { 795 if (rc == -ENOMEM) { 796 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); 797 } else { 798 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 799 } 800 } 801 } 802 803 static bool 804 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 805 { 806 struct nvme_bdev *nbdev = ctx; 807 struct nvme_bdev_ns *nvme_ns; 808 struct spdk_nvme_ns *ns; 809 struct spdk_nvme_ctrlr *ctrlr; 810 const struct spdk_nvme_ctrlr_data *cdata; 811 812 nvme_ns = nvme_bdev_to_bdev_ns(nbdev); 813 assert(nvme_ns != NULL); 814 ns = nvme_ns->ns; 815 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 816 817 switch (io_type) { 818 case SPDK_BDEV_IO_TYPE_READ: 819 case SPDK_BDEV_IO_TYPE_WRITE: 820 case SPDK_BDEV_IO_TYPE_RESET: 821 case SPDK_BDEV_IO_TYPE_FLUSH: 822 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 823 case SPDK_BDEV_IO_TYPE_NVME_IO: 824 case SPDK_BDEV_IO_TYPE_ABORT: 825 return true; 826 827 case SPDK_BDEV_IO_TYPE_COMPARE: 828 return spdk_nvme_ns_supports_compare(ns); 829 830 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 831 return spdk_nvme_ns_get_md_size(ns) ? true : false; 832 833 case SPDK_BDEV_IO_TYPE_UNMAP: 834 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 835 return cdata->oncs.dsm; 836 837 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 838 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 839 /* 840 * If an NVMe controller guarantees reading unallocated blocks returns zero, 841 * we can implement WRITE_ZEROES as an NVMe deallocate command. 842 */ 843 if (cdata->oncs.dsm && 844 spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) == 845 SPDK_NVME_DEALLOC_READ_00) { 846 return true; 847 } 848 /* 849 * The NVMe controller write_zeroes function is currently not used by our driver. 850 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail. 851 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read. 852 */ 853 return false; 854 855 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 856 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 857 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 858 return true; 859 } 860 return false; 861 862 default: 863 return false; 864 } 865 } 866 867 static int 868 bdev_nvme_create_cb(void *io_device, void *ctx_buf) 869 { 870 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device; 871 struct nvme_io_channel *nvme_ch = ctx_buf; 872 struct spdk_io_channel *pg_ch = NULL; 873 int rc; 874 875 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 876 rc = bdev_ocssd_create_io_channel(nvme_ch); 877 if (rc != 0) { 878 return rc; 879 } 880 } 881 882 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 883 if (!pg_ch) { 884 rc = -1; 885 goto err_pg_ch; 886 } 887 888 nvme_ch->group = spdk_io_channel_get_ctx(pg_ch); 889 890 #ifdef SPDK_CONFIG_VTUNE 891 nvme_ch->group->collect_spin_stat = true; 892 #else 893 nvme_ch->group->collect_spin_stat = false; 894 #endif 895 896 TAILQ_INIT(&nvme_ch->pending_resets); 897 898 nvme_ch->ctrlr = nvme_bdev_ctrlr; 899 900 rc = bdev_nvme_create_qpair(nvme_ch); 901 if (rc != 0) { 902 goto err_qpair; 903 } 904 905 return 0; 906 907 err_qpair: 908 spdk_put_io_channel(pg_ch); 909 err_pg_ch: 910 if (nvme_ch->ocssd_ch) { 911 bdev_ocssd_destroy_io_channel(nvme_ch); 912 } 913 914 return rc; 915 } 916 917 static void 918 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf) 919 { 920 struct nvme_io_channel *nvme_ch = ctx_buf; 921 922 assert(nvme_ch->group != NULL); 923 924 if (nvme_ch->ocssd_ch != NULL) { 925 bdev_ocssd_destroy_io_channel(nvme_ch); 926 } 927 928 spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 929 930 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group)); 931 } 932 933 static int 934 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf) 935 { 936 struct nvme_bdev_poll_group *group = ctx_buf; 937 938 group->group = spdk_nvme_poll_group_create(group); 939 if (group->group == NULL) { 940 return -1; 941 } 942 943 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 944 945 if (group->poller == NULL) { 946 spdk_nvme_poll_group_destroy(group->group); 947 return -1; 948 } 949 950 return 0; 951 } 952 953 static void 954 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf) 955 { 956 struct nvme_bdev_poll_group *group = ctx_buf; 957 958 spdk_poller_unregister(&group->poller); 959 if (spdk_nvme_poll_group_destroy(group->group)) { 960 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module."); 961 assert(false); 962 } 963 } 964 965 static struct spdk_io_channel * 966 bdev_nvme_get_io_channel(void *ctx) 967 { 968 struct nvme_bdev *nvme_bdev = ctx; 969 970 return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr); 971 } 972 973 static void * 974 bdev_nvme_get_module_ctx(void *ctx) 975 { 976 struct nvme_bdev *nvme_bdev = ctx; 977 978 return bdev_nvme_get_ctrlr(&nvme_bdev->disk); 979 } 980 981 static int 982 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 983 { 984 struct nvme_bdev *nvme_bdev = ctx; 985 struct nvme_bdev_ns *nvme_ns; 986 struct spdk_nvme_ns *ns; 987 struct spdk_nvme_ctrlr *ctrlr; 988 const struct spdk_nvme_ctrlr_data *cdata; 989 const struct spdk_nvme_transport_id *trid; 990 union spdk_nvme_vs_register vs; 991 union spdk_nvme_csts_register csts; 992 char buf[128]; 993 994 nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev); 995 assert(nvme_ns != NULL); 996 ns = nvme_ns->ns; 997 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 998 999 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1000 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 1001 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 1002 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1003 1004 spdk_json_write_named_object_begin(w, "nvme"); 1005 1006 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1007 spdk_json_write_named_string(w, "pci_address", trid->traddr); 1008 } 1009 1010 spdk_json_write_named_object_begin(w, "trid"); 1011 1012 nvme_bdev_dump_trid_json(trid, w); 1013 1014 spdk_json_write_object_end(w); 1015 1016 #ifdef SPDK_CONFIG_NVME_CUSE 1017 size_t cuse_name_size = 128; 1018 char cuse_name[cuse_name_size]; 1019 1020 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 1021 cuse_name, &cuse_name_size); 1022 if (rc == 0) { 1023 spdk_json_write_named_string(w, "cuse_device", cuse_name); 1024 } 1025 #endif 1026 1027 spdk_json_write_named_object_begin(w, "ctrlr_data"); 1028 1029 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 1030 1031 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 1032 spdk_str_trim(buf); 1033 spdk_json_write_named_string(w, "model_number", buf); 1034 1035 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 1036 spdk_str_trim(buf); 1037 spdk_json_write_named_string(w, "serial_number", buf); 1038 1039 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 1040 spdk_str_trim(buf); 1041 spdk_json_write_named_string(w, "firmware_revision", buf); 1042 1043 if (cdata->subnqn[0] != '\0') { 1044 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 1045 } 1046 1047 spdk_json_write_named_object_begin(w, "oacs"); 1048 1049 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 1050 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 1051 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 1052 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 1053 1054 spdk_json_write_object_end(w); 1055 1056 spdk_json_write_object_end(w); 1057 1058 spdk_json_write_named_object_begin(w, "vs"); 1059 1060 spdk_json_write_name(w, "nvme_version"); 1061 if (vs.bits.ter) { 1062 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 1063 } else { 1064 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 1065 } 1066 1067 spdk_json_write_object_end(w); 1068 1069 spdk_json_write_named_object_begin(w, "csts"); 1070 1071 spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy); 1072 spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs); 1073 1074 spdk_json_write_object_end(w); 1075 1076 spdk_json_write_named_object_begin(w, "ns_data"); 1077 1078 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 1079 1080 spdk_json_write_object_end(w); 1081 1082 if (cdata->oacs.security) { 1083 spdk_json_write_named_object_begin(w, "security"); 1084 1085 spdk_json_write_named_bool(w, "opal", nvme_bdev->opal); 1086 1087 spdk_json_write_object_end(w); 1088 } 1089 1090 spdk_json_write_object_end(w); 1091 1092 return 0; 1093 } 1094 1095 static void 1096 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1097 { 1098 /* No config per bdev needed */ 1099 } 1100 1101 static uint64_t 1102 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 1103 { 1104 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 1105 struct nvme_bdev_poll_group *group = nvme_ch->group; 1106 uint64_t spin_time; 1107 1108 if (!group || !group->collect_spin_stat) { 1109 return 0; 1110 } 1111 1112 if (group->end_ticks != 0) { 1113 group->spin_ticks += (group->end_ticks - group->start_ticks); 1114 group->end_ticks = 0; 1115 } 1116 1117 spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz(); 1118 group->start_ticks = 0; 1119 group->spin_ticks = 0; 1120 1121 return spin_time; 1122 } 1123 1124 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 1125 .destruct = bdev_nvme_destruct, 1126 .submit_request = bdev_nvme_submit_request, 1127 .io_type_supported = bdev_nvme_io_type_supported, 1128 .get_io_channel = bdev_nvme_get_io_channel, 1129 .dump_info_json = bdev_nvme_dump_info_json, 1130 .write_config_json = bdev_nvme_write_config_json, 1131 .get_spin_time = bdev_nvme_get_spin_time, 1132 .get_module_ctx = bdev_nvme_get_module_ctx, 1133 }; 1134 1135 static int 1136 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 1137 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 1138 uint32_t prchk_flags, void *ctx) 1139 { 1140 const struct spdk_uuid *uuid; 1141 const struct spdk_nvme_ctrlr_data *cdata; 1142 const struct spdk_nvme_ns_data *nsdata; 1143 int rc; 1144 1145 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1146 1147 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 1148 if (!disk->name) { 1149 return -ENOMEM; 1150 } 1151 disk->product_name = "NVMe disk"; 1152 1153 disk->write_cache = 0; 1154 if (cdata->vwc.present) { 1155 /* Enable if the Volatile Write Cache exists */ 1156 disk->write_cache = 1; 1157 } 1158 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 1159 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 1160 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 1161 1162 uuid = spdk_nvme_ns_get_uuid(ns); 1163 if (uuid != NULL) { 1164 disk->uuid = *uuid; 1165 } 1166 1167 nsdata = spdk_nvme_ns_get_data(ns); 1168 1169 disk->md_len = spdk_nvme_ns_get_md_size(ns); 1170 if (disk->md_len != 0) { 1171 disk->md_interleave = nsdata->flbas.extended; 1172 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 1173 if (disk->dif_type != SPDK_DIF_DISABLE) { 1174 disk->dif_is_head_of_md = nsdata->dps.md_start; 1175 disk->dif_check_flags = prchk_flags; 1176 } 1177 } 1178 1179 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 1180 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 1181 disk->acwu = 0; 1182 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 1183 disk->acwu = nsdata->nacwu; 1184 } else { 1185 disk->acwu = cdata->acwu; 1186 } 1187 1188 disk->ctxt = ctx; 1189 disk->fn_table = &nvmelib_fn_table; 1190 disk->module = &nvme_if; 1191 rc = spdk_bdev_register(disk); 1192 if (rc) { 1193 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 1194 free(disk->name); 1195 return rc; 1196 } 1197 1198 return 0; 1199 } 1200 1201 static int 1202 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns) 1203 { 1204 struct nvme_bdev *bdev; 1205 int rc; 1206 1207 bdev = calloc(1, sizeof(*bdev)); 1208 if (!bdev) { 1209 SPDK_ERRLOG("bdev calloc() failed\n"); 1210 return -ENOMEM; 1211 } 1212 1213 bdev->nvme_ns = nvme_ns; 1214 bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL; 1215 1216 rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr, 1217 nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev); 1218 if (rc != 0) { 1219 SPDK_ERRLOG("Failed to create NVMe disk\n"); 1220 free(bdev); 1221 return rc; 1222 } 1223 1224 nvme_ns->ref++; 1225 nvme_ns->bdev = bdev; 1226 1227 return 0; 1228 } 1229 1230 static void 1231 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1232 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx) 1233 { 1234 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1235 struct spdk_nvme_ns *ns; 1236 int rc = 0; 1237 1238 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 1239 if (!ns) { 1240 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 1241 rc = -EINVAL; 1242 goto done; 1243 } 1244 1245 nvme_ns->ns = ns; 1246 nvme_ns->ref = 1; 1247 1248 rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns); 1249 done: 1250 nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc); 1251 } 1252 1253 static bool 1254 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1255 struct spdk_nvme_ctrlr_opts *opts) 1256 { 1257 struct nvme_probe_skip_entry *entry; 1258 1259 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 1260 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 1261 return false; 1262 } 1263 } 1264 1265 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 1266 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 1267 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 1268 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 1269 1270 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 1271 1272 return true; 1273 } 1274 1275 static void 1276 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 1277 { 1278 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1279 1280 if (spdk_nvme_cpl_is_error(cpl)) { 1281 SPDK_WARNLOG("Abort failed. Resetting controller.\n"); 1282 _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); 1283 } 1284 } 1285 1286 static void 1287 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 1288 struct spdk_nvme_qpair *qpair, uint16_t cid) 1289 { 1290 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg; 1291 union spdk_nvme_csts_register csts; 1292 int rc; 1293 1294 assert(nvme_bdev_ctrlr->ctrlr == ctrlr); 1295 1296 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 1297 1298 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 1299 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 1300 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 1301 * completion recursively. 1302 */ 1303 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 1304 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1305 if (csts.bits.cfs) { 1306 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 1307 _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); 1308 return; 1309 } 1310 } 1311 1312 switch (g_opts.action_on_timeout) { 1313 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 1314 if (qpair) { 1315 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 1316 nvme_abort_cpl, nvme_bdev_ctrlr); 1317 if (rc == 0) { 1318 return; 1319 } 1320 1321 SPDK_ERRLOG("Unable to send abort. Resetting.\n"); 1322 } 1323 1324 /* FALLTHROUGH */ 1325 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 1326 _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); 1327 break; 1328 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 1329 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 1330 break; 1331 default: 1332 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 1333 break; 1334 } 1335 } 1336 1337 void 1338 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ns *nvme_ns) 1339 { 1340 nvme_bdev_ns_detach(nvme_ns); 1341 } 1342 1343 static void 1344 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns) 1345 { 1346 struct nvme_bdev *bdev; 1347 1348 bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1349 if (bdev != NULL) { 1350 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 1351 } 1352 1353 nvme_ns->populated = false; 1354 1355 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 1356 } 1357 1358 static void 1359 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns, 1360 struct nvme_async_probe_ctx *ctx) 1361 { 1362 g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx); 1363 } 1364 1365 static void 1366 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns) 1367 { 1368 g_depopulate_namespace_fn[nvme_ns->type](nvme_ns); 1369 } 1370 1371 void 1372 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx, 1373 struct nvme_bdev_ns *nvme_ns, int rc) 1374 { 1375 if (rc == 0) { 1376 nvme_ns->populated = true; 1377 pthread_mutex_lock(&g_bdev_nvme_mutex); 1378 nvme_ns->ctrlr->ref++; 1379 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1380 } else { 1381 memset(nvme_ns, 0, sizeof(*nvme_ns)); 1382 } 1383 1384 if (ctx) { 1385 ctx->populates_in_progress--; 1386 if (ctx->populates_in_progress == 0) { 1387 nvme_ctrlr_populate_namespaces_done(ctx); 1388 } 1389 } 1390 } 1391 1392 static void 1393 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1394 struct nvme_async_probe_ctx *ctx) 1395 { 1396 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1397 struct nvme_bdev_ns *nvme_ns; 1398 struct spdk_nvme_ns *ns; 1399 struct nvme_bdev *bdev; 1400 uint32_t i; 1401 int rc; 1402 uint64_t num_sectors; 1403 bool ns_is_active; 1404 1405 if (ctx) { 1406 /* Initialize this count to 1 to handle the populate functions 1407 * calling nvme_ctrlr_populate_namespace_done() immediately. 1408 */ 1409 ctx->populates_in_progress = 1; 1410 } 1411 1412 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1413 uint32_t nsid = i + 1; 1414 1415 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 1416 ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid); 1417 1418 if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) { 1419 /* NS is still there but attributes may have changed */ 1420 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 1421 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 1422 bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1423 assert(bdev != NULL); 1424 if (bdev->disk.blockcnt != num_sectors) { 1425 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 1426 nsid, 1427 bdev->disk.name, 1428 bdev->disk.blockcnt, 1429 num_sectors); 1430 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 1431 if (rc != 0) { 1432 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 1433 bdev->disk.name, rc); 1434 } 1435 } 1436 } 1437 1438 if (!nvme_ns->populated && ns_is_active) { 1439 nvme_ns->id = nsid; 1440 nvme_ns->ctrlr = nvme_bdev_ctrlr; 1441 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 1442 nvme_ns->type = NVME_BDEV_NS_OCSSD; 1443 } else { 1444 nvme_ns->type = NVME_BDEV_NS_STANDARD; 1445 } 1446 1447 nvme_ns->bdev = NULL; 1448 1449 if (ctx) { 1450 ctx->populates_in_progress++; 1451 } 1452 nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx); 1453 } 1454 1455 if (nvme_ns->populated && !ns_is_active) { 1456 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1457 } 1458 } 1459 1460 if (ctx) { 1461 /* Decrement this count now that the loop is over to account 1462 * for the one we started with. If the count is then 0, we 1463 * know any populate_namespace functions completed immediately, 1464 * so we'll kick the callback here. 1465 */ 1466 ctx->populates_in_progress--; 1467 if (ctx->populates_in_progress == 0) { 1468 nvme_ctrlr_populate_namespaces_done(ctx); 1469 } 1470 } 1471 1472 } 1473 1474 static void 1475 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 1476 { 1477 uint32_t i; 1478 struct nvme_bdev_ns *nvme_ns; 1479 1480 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1481 uint32_t nsid = i + 1; 1482 1483 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1484 if (nvme_ns->populated) { 1485 assert(nvme_ns->id == nsid); 1486 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1487 } 1488 } 1489 } 1490 1491 static void 1492 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 1493 { 1494 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 1495 union spdk_nvme_async_event_completion event; 1496 1497 if (spdk_nvme_cpl_is_error(cpl)) { 1498 SPDK_WARNLOG("AER request execute failed"); 1499 return; 1500 } 1501 1502 event.raw = cpl->cdw0; 1503 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 1504 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 1505 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1506 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) && 1507 (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) && 1508 spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1509 bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr); 1510 } 1511 } 1512 1513 static int 1514 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 1515 const char *name, 1516 const struct spdk_nvme_transport_id *trid, 1517 uint32_t prchk_flags) 1518 { 1519 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1520 struct nvme_bdev_ctrlr_trid *trid_entry; 1521 uint32_t i; 1522 int rc; 1523 1524 nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr)); 1525 if (nvme_bdev_ctrlr == NULL) { 1526 SPDK_ERRLOG("Failed to allocate device struct\n"); 1527 return -ENOMEM; 1528 } 1529 1530 TAILQ_INIT(&nvme_bdev_ctrlr->trids); 1531 nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 1532 if (nvme_bdev_ctrlr->num_ns != 0) { 1533 nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *)); 1534 if (!nvme_bdev_ctrlr->namespaces) { 1535 SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); 1536 rc = -ENOMEM; 1537 goto err_alloc_namespaces; 1538 } 1539 } 1540 1541 trid_entry = calloc(1, sizeof(*trid_entry)); 1542 if (trid_entry == NULL) { 1543 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 1544 rc = -ENOMEM; 1545 goto err_alloc_trid; 1546 } 1547 1548 trid_entry->trid = *trid; 1549 1550 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1551 nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns)); 1552 if (nvme_bdev_ctrlr->namespaces[i] == NULL) { 1553 SPDK_ERRLOG("Failed to allocate block namespace struct\n"); 1554 rc = -ENOMEM; 1555 goto err_alloc_namespace; 1556 } 1557 } 1558 1559 nvme_bdev_ctrlr->thread = spdk_get_thread(); 1560 nvme_bdev_ctrlr->adminq_timer_poller = NULL; 1561 nvme_bdev_ctrlr->ctrlr = ctrlr; 1562 nvme_bdev_ctrlr->ref = 1; 1563 nvme_bdev_ctrlr->connected_trid = &trid_entry->trid; 1564 nvme_bdev_ctrlr->name = strdup(name); 1565 if (nvme_bdev_ctrlr->name == NULL) { 1566 rc = -ENOMEM; 1567 goto err_alloc_name; 1568 } 1569 1570 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1571 rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr); 1572 if (spdk_unlikely(rc != 0)) { 1573 SPDK_ERRLOG("Unable to initialize OCSSD controller\n"); 1574 goto err_init_ocssd; 1575 } 1576 } 1577 1578 nvme_bdev_ctrlr->prchk_flags = prchk_flags; 1579 1580 spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb, 1581 sizeof(struct nvme_io_channel), 1582 name); 1583 1584 nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr, 1585 g_opts.nvme_adminq_poll_period_us); 1586 1587 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); 1588 1589 if (g_opts.timeout_us > 0) { 1590 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 1591 timeout_cb, nvme_bdev_ctrlr); 1592 } 1593 1594 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr); 1595 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr); 1596 1597 if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) & 1598 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 1599 nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr); 1600 if (nvme_bdev_ctrlr->opal_dev == NULL) { 1601 SPDK_ERRLOG("Failed to initialize Opal\n"); 1602 } 1603 } 1604 1605 TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link); 1606 return 0; 1607 1608 err_init_ocssd: 1609 free(nvme_bdev_ctrlr->name); 1610 err_alloc_name: 1611 err_alloc_namespace: 1612 for (; i > 0; i--) { 1613 free(nvme_bdev_ctrlr->namespaces[i - 1]); 1614 } 1615 free(trid_entry); 1616 err_alloc_trid: 1617 free(nvme_bdev_ctrlr->namespaces); 1618 err_alloc_namespaces: 1619 free(nvme_bdev_ctrlr); 1620 return rc; 1621 } 1622 1623 static void 1624 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1625 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 1626 { 1627 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1628 struct nvme_probe_ctx *ctx = cb_ctx; 1629 char *name = NULL; 1630 uint32_t prchk_flags = 0; 1631 size_t i; 1632 1633 if (ctx) { 1634 for (i = 0; i < ctx->count; i++) { 1635 if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { 1636 prchk_flags = ctx->prchk_flags[i]; 1637 name = strdup(ctx->names[i]); 1638 break; 1639 } 1640 } 1641 } else { 1642 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 1643 } 1644 if (!name) { 1645 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 1646 return; 1647 } 1648 1649 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 1650 1651 nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags); 1652 1653 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid); 1654 if (!nvme_bdev_ctrlr) { 1655 SPDK_ERRLOG("Failed to find new NVMe controller\n"); 1656 free(name); 1657 return; 1658 } 1659 1660 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1661 1662 free(name); 1663 } 1664 1665 static void 1666 _nvme_bdev_ctrlr_destruct(void *ctx) 1667 { 1668 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1669 1670 nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr); 1671 nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1672 } 1673 1674 static void 1675 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 1676 { 1677 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx; 1678 1679 pthread_mutex_lock(&g_bdev_nvme_mutex); 1680 assert(nvme_bdev_ctrlr->ctrlr == ctrlr); 1681 /* The controller's destruction was already started */ 1682 if (nvme_bdev_ctrlr->destruct) { 1683 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1684 return; 1685 } 1686 nvme_bdev_ctrlr->destruct = true; 1687 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1688 _nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1689 } 1690 1691 static int 1692 bdev_nvme_hotplug_probe(void *arg) 1693 { 1694 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 1695 g_hotplug_probe_ctx = NULL; 1696 spdk_poller_unregister(&g_hotplug_probe_poller); 1697 } 1698 1699 return SPDK_POLLER_BUSY; 1700 } 1701 1702 static int 1703 bdev_nvme_hotplug(void *arg) 1704 { 1705 struct spdk_nvme_transport_id trid_pcie; 1706 1707 if (g_hotplug_probe_ctx) { 1708 return SPDK_POLLER_BUSY; 1709 } 1710 1711 memset(&trid_pcie, 0, sizeof(trid_pcie)); 1712 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 1713 1714 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 1715 hotplug_probe_cb, attach_cb, NULL); 1716 1717 if (g_hotplug_probe_ctx) { 1718 assert(g_hotplug_probe_poller == NULL); 1719 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 1720 } 1721 1722 return SPDK_POLLER_BUSY; 1723 } 1724 1725 void 1726 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 1727 { 1728 *opts = g_opts; 1729 } 1730 1731 int 1732 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 1733 { 1734 if (g_bdev_nvme_init_thread != NULL) { 1735 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 1736 return -EPERM; 1737 } 1738 } 1739 1740 g_opts = *opts; 1741 1742 return 0; 1743 } 1744 1745 struct set_nvme_hotplug_ctx { 1746 uint64_t period_us; 1747 bool enabled; 1748 spdk_msg_fn fn; 1749 void *fn_ctx; 1750 }; 1751 1752 static void 1753 set_nvme_hotplug_period_cb(void *_ctx) 1754 { 1755 struct set_nvme_hotplug_ctx *ctx = _ctx; 1756 1757 spdk_poller_unregister(&g_hotplug_poller); 1758 if (ctx->enabled) { 1759 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 1760 } 1761 1762 g_nvme_hotplug_poll_period_us = ctx->period_us; 1763 g_nvme_hotplug_enabled = ctx->enabled; 1764 if (ctx->fn) { 1765 ctx->fn(ctx->fn_ctx); 1766 } 1767 1768 free(ctx); 1769 } 1770 1771 int 1772 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 1773 { 1774 struct set_nvme_hotplug_ctx *ctx; 1775 1776 if (enabled == true && !spdk_process_is_primary()) { 1777 return -EPERM; 1778 } 1779 1780 ctx = calloc(1, sizeof(*ctx)); 1781 if (ctx == NULL) { 1782 return -ENOMEM; 1783 } 1784 1785 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 1786 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 1787 ctx->enabled = enabled; 1788 ctx->fn = cb; 1789 ctx->fn_ctx = cb_ctx; 1790 1791 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 1792 return 0; 1793 } 1794 1795 static void 1796 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 1797 { 1798 if (ctx->cb_fn) { 1799 ctx->cb_fn(ctx->cb_ctx, count, rc); 1800 } 1801 1802 ctx->namespaces_populated = true; 1803 if (ctx->probe_done) { 1804 /* The probe was already completed, so we need to free the context 1805 * here. This can happen for cases like OCSSD, where we need to 1806 * send additional commands to the SSD after attach. 1807 */ 1808 free(ctx); 1809 } 1810 } 1811 1812 static void 1813 nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx) 1814 { 1815 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1816 struct nvme_bdev_ns *nvme_ns; 1817 struct nvme_bdev *nvme_bdev; 1818 uint32_t i, nsid; 1819 size_t j; 1820 1821 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name); 1822 assert(nvme_bdev_ctrlr != NULL); 1823 1824 /* 1825 * Report the new bdevs that were created in this call. 1826 * There can be more than one bdev per NVMe controller. 1827 */ 1828 j = 0; 1829 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1830 nsid = i + 1; 1831 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1832 if (!nvme_ns->populated) { 1833 continue; 1834 } 1835 assert(nvme_ns->id == nsid); 1836 nvme_bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1837 if (nvme_bdev == NULL) { 1838 assert(nvme_ns->type == NVME_BDEV_NS_OCSSD); 1839 continue; 1840 } 1841 if (j < ctx->count) { 1842 ctx->names[j] = nvme_bdev->disk.name; 1843 j++; 1844 } else { 1845 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 1846 ctx->count); 1847 populate_namespaces_cb(ctx, 0, -ERANGE); 1848 return; 1849 } 1850 } 1851 1852 populate_namespaces_cb(ctx, j, 0); 1853 } 1854 1855 static bool 1856 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 1857 { 1858 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 1859 1860 nsdata1 = spdk_nvme_ns_get_data(ns1); 1861 nsdata2 = spdk_nvme_ns_get_data(ns2); 1862 1863 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)); 1864 } 1865 1866 static int 1867 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_ctrlr *new_ctrlr, 1868 struct spdk_nvme_transport_id *trid) 1869 { 1870 uint32_t i, nsid; 1871 struct nvme_bdev_ns *nvme_ns; 1872 struct spdk_nvme_ns *new_ns; 1873 struct nvme_bdev_ctrlr_trid *new_trid, *tmp_trid; 1874 int rc = 0; 1875 1876 assert(nvme_bdev_ctrlr != NULL); 1877 1878 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1879 SPDK_ERRLOG("PCIe failover is not supported.\n"); 1880 return -ENOTSUP; 1881 } 1882 1883 pthread_mutex_lock(&g_bdev_nvme_mutex); 1884 1885 /* Currently we only support failover to the same transport type. */ 1886 if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) { 1887 rc = -EINVAL; 1888 goto exit; 1889 } 1890 1891 /* Currently we only support failover to the same NQN. */ 1892 if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 1893 rc = -EINVAL; 1894 goto exit; 1895 } 1896 1897 /* Skip all the other checks if we've already registered this path. */ 1898 TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) { 1899 if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) { 1900 rc = -EEXIST; 1901 goto exit; 1902 } 1903 } 1904 1905 if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) { 1906 rc = -EINVAL; 1907 goto exit; 1908 } 1909 1910 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1911 nsid = i + 1; 1912 1913 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 1914 if (!nvme_ns->populated) { 1915 continue; 1916 } 1917 1918 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid); 1919 assert(new_ns != NULL); 1920 1921 if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) { 1922 rc = -EINVAL; 1923 goto exit; 1924 } 1925 } 1926 1927 new_trid = calloc(1, sizeof(*new_trid)); 1928 if (new_trid == NULL) { 1929 rc = -ENOMEM; 1930 goto exit; 1931 } 1932 new_trid->trid = *trid; 1933 new_trid->is_failed = false; 1934 1935 TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) { 1936 if (tmp_trid->is_failed) { 1937 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 1938 goto exit; 1939 } 1940 } 1941 1942 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link); 1943 1944 exit: 1945 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1946 return rc; 1947 } 1948 1949 static void 1950 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1951 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 1952 { 1953 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 1954 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1955 struct nvme_async_probe_ctx *ctx; 1956 int rc; 1957 1958 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 1959 ctx->ctrlr_attached = true; 1960 1961 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name); 1962 if (nvme_bdev_ctrlr) { 1963 /* This is the case that a secondary path is added to an existing 1964 * nvme_bdev_ctrlr for failover. After checking if it can access the same 1965 * namespaces as the primary path, it is disconnected until failover occurs. 1966 */ 1967 rc = bdev_nvme_add_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid); 1968 1969 spdk_nvme_detach(ctrlr); 1970 goto exit; 1971 } 1972 1973 rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags); 1974 if (rc) { 1975 SPDK_ERRLOG("Failed to create new device\n"); 1976 goto exit; 1977 } 1978 1979 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid); 1980 assert(nvme_bdev_ctrlr != NULL); 1981 1982 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx); 1983 return; 1984 1985 exit: 1986 populate_namespaces_cb(ctx, 0, rc); 1987 } 1988 1989 static int 1990 bdev_nvme_async_poll(void *arg) 1991 { 1992 struct nvme_async_probe_ctx *ctx = arg; 1993 int rc; 1994 1995 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 1996 if (spdk_unlikely(rc != -EAGAIN)) { 1997 ctx->probe_done = true; 1998 spdk_poller_unregister(&ctx->poller); 1999 if (!ctx->ctrlr_attached) { 2000 /* The probe is done, but no controller was attached. 2001 * That means we had a failure, so report -EIO back to 2002 * the caller (usually the RPC). populate_namespaces_cb() 2003 * will take care of freeing the nvme_async_probe_ctx. 2004 */ 2005 populate_namespaces_cb(ctx, 0, -EIO); 2006 } else if (ctx->namespaces_populated) { 2007 /* The namespaces for the attached controller were all 2008 * populated and the response was already sent to the 2009 * caller (usually the RPC). So free the context here. 2010 */ 2011 free(ctx); 2012 } 2013 } 2014 2015 return SPDK_POLLER_BUSY; 2016 } 2017 2018 int 2019 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 2020 struct spdk_nvme_host_id *hostid, 2021 const char *base_name, 2022 const char **names, 2023 uint32_t count, 2024 const char *hostnqn, 2025 uint32_t prchk_flags, 2026 spdk_bdev_create_nvme_fn cb_fn, 2027 void *cb_ctx, 2028 struct spdk_nvme_ctrlr_opts *opts) 2029 { 2030 struct nvme_probe_skip_entry *entry, *tmp; 2031 struct nvme_async_probe_ctx *ctx; 2032 2033 /* TODO expand this check to include both the host and target TRIDs. 2034 * Only if both are the same should we fail. 2035 */ 2036 if (nvme_bdev_ctrlr_get(trid) != NULL) { 2037 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 2038 return -EEXIST; 2039 } 2040 2041 ctx = calloc(1, sizeof(*ctx)); 2042 if (!ctx) { 2043 return -ENOMEM; 2044 } 2045 ctx->base_name = base_name; 2046 ctx->names = names; 2047 ctx->count = count; 2048 ctx->cb_fn = cb_fn; 2049 ctx->cb_ctx = cb_ctx; 2050 ctx->prchk_flags = prchk_flags; 2051 ctx->trid = *trid; 2052 2053 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2054 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 2055 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2056 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2057 free(entry); 2058 break; 2059 } 2060 } 2061 } 2062 2063 if (opts) { 2064 memcpy(&ctx->opts, opts, sizeof(*opts)); 2065 } else { 2066 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 2067 } 2068 2069 ctx->opts.transport_retry_count = g_opts.retry_count; 2070 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 2071 2072 if (hostnqn) { 2073 snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn); 2074 } 2075 2076 if (hostid->hostaddr[0] != '\0') { 2077 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 2078 } 2079 2080 if (hostid->hostsvcid[0] != '\0') { 2081 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 2082 } 2083 2084 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb); 2085 if (ctx->probe_ctx == NULL) { 2086 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 2087 free(ctx); 2088 return -ENODEV; 2089 } 2090 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 2091 2092 return 0; 2093 } 2094 2095 int 2096 bdev_nvme_remove_trid(const char *name, struct spdk_nvme_transport_id *trid) 2097 { 2098 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2099 struct nvme_bdev_ctrlr_trid *ctrlr_trid, *tmp_trid; 2100 2101 if (name == NULL) { 2102 return -EINVAL; 2103 } 2104 2105 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 2106 if (nvme_bdev_ctrlr == NULL) { 2107 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2108 return -ENODEV; 2109 } 2110 2111 /* case 1: we are currently using the path to be removed. */ 2112 if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) { 2113 ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 2114 assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid); 2115 /* case 1A: the current path is the only path. */ 2116 if (!TAILQ_NEXT(ctrlr_trid, link)) { 2117 return bdev_nvme_delete(name); 2118 } 2119 2120 /* case 1B: there is an alternative path. */ 2121 return bdev_nvme_failover(nvme_bdev_ctrlr, true); 2122 } 2123 /* case 2: We are not using the specified path. */ 2124 TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) { 2125 if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { 2126 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link); 2127 free(ctrlr_trid); 2128 return 0; 2129 } 2130 } 2131 2132 /* case 2A: The address isn't even in the registered list. */ 2133 return -ENXIO; 2134 } 2135 2136 int 2137 bdev_nvme_delete(const char *name) 2138 { 2139 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2140 struct nvme_probe_skip_entry *entry; 2141 2142 if (name == NULL) { 2143 return -EINVAL; 2144 } 2145 2146 pthread_mutex_lock(&g_bdev_nvme_mutex); 2147 2148 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 2149 if (nvme_bdev_ctrlr == NULL) { 2150 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2151 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2152 return -ENODEV; 2153 } 2154 2155 /* The controller's destruction was already started */ 2156 if (nvme_bdev_ctrlr->destruct) { 2157 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2158 return 0; 2159 } 2160 2161 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2162 entry = calloc(1, sizeof(*entry)); 2163 if (!entry) { 2164 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2165 return -ENOMEM; 2166 } 2167 entry->trid = *nvme_bdev_ctrlr->connected_trid; 2168 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 2169 } 2170 2171 nvme_bdev_ctrlr->destruct = true; 2172 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2173 2174 _nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 2175 2176 return 0; 2177 } 2178 2179 static int 2180 bdev_nvme_library_init(void) 2181 { 2182 g_bdev_nvme_init_thread = spdk_get_thread(); 2183 2184 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb, 2185 bdev_nvme_poll_group_destroy_cb, 2186 sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups"); 2187 2188 return 0; 2189 } 2190 2191 static void 2192 bdev_nvme_library_fini(void) 2193 { 2194 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp; 2195 struct nvme_probe_skip_entry *entry, *entry_tmp; 2196 2197 spdk_poller_unregister(&g_hotplug_poller); 2198 free(g_hotplug_probe_ctx); 2199 2200 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2201 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2202 free(entry); 2203 } 2204 2205 pthread_mutex_lock(&g_bdev_nvme_mutex); 2206 TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) { 2207 if (nvme_bdev_ctrlr->destruct) { 2208 /* This controller's destruction was already started 2209 * before the application started shutting down 2210 */ 2211 continue; 2212 } 2213 nvme_bdev_ctrlr->destruct = true; 2214 2215 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct, 2216 nvme_bdev_ctrlr); 2217 } 2218 2219 g_bdev_nvme_module_finish = true; 2220 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 2221 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2222 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 2223 spdk_bdev_module_finish_done(); 2224 return; 2225 } 2226 2227 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2228 } 2229 2230 static void 2231 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io) 2232 { 2233 struct spdk_bdev *bdev = bdev_io->bdev; 2234 struct spdk_dif_ctx dif_ctx; 2235 struct spdk_dif_error err_blk = {}; 2236 int rc; 2237 2238 rc = spdk_dif_ctx_init(&dif_ctx, 2239 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2240 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2241 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2242 if (rc != 0) { 2243 SPDK_ERRLOG("Initialization of DIF context failed\n"); 2244 return; 2245 } 2246 2247 if (bdev->md_interleave) { 2248 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2249 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2250 } else { 2251 struct iovec md_iov = { 2252 .iov_base = bdev_io->u.bdev.md_buf, 2253 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2254 }; 2255 2256 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2257 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2258 } 2259 2260 if (rc != 0) { 2261 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 2262 err_blk.err_type, err_blk.err_offset); 2263 } else { 2264 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2265 } 2266 } 2267 2268 static void 2269 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2270 { 2271 struct nvme_bdev_io *bio = ref; 2272 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2273 2274 if (spdk_nvme_cpl_is_success(cpl)) { 2275 /* Run PI verification for read data buffer. */ 2276 bdev_nvme_verify_pi_error(bdev_io); 2277 } 2278 2279 /* Return original completion status */ 2280 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, 2281 bio->cpl.status.sc); 2282 } 2283 2284 static void 2285 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2286 { 2287 struct nvme_bdev_io *bio = ref; 2288 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2289 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2290 struct nvme_io_channel *nvme_ch; 2291 int ret; 2292 2293 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2294 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2295 cpl->status.sct, cpl->status.sc); 2296 2297 /* Save completion status to use after verifying PI error. */ 2298 bio->cpl = *cpl; 2299 2300 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2301 2302 /* Read without PI checking to verify PI error. */ 2303 ret = bdev_nvme_no_pi_readv(nbdev->nvme_ns->ns, 2304 nvme_ch->qpair, 2305 bio, 2306 bdev_io->u.bdev.iovs, 2307 bdev_io->u.bdev.iovcnt, 2308 bdev_io->u.bdev.md_buf, 2309 bdev_io->u.bdev.num_blocks, 2310 bdev_io->u.bdev.offset_blocks); 2311 if (ret == 0) { 2312 return; 2313 } 2314 } 2315 2316 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2317 } 2318 2319 static void 2320 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2321 { 2322 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2323 2324 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2325 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2326 cpl->status.sct, cpl->status.sc); 2327 /* Run PI verification for write data buffer if PI error is detected. */ 2328 bdev_nvme_verify_pi_error(bdev_io); 2329 } 2330 2331 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2332 } 2333 2334 static void 2335 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2336 { 2337 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2338 2339 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2340 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2341 cpl->status.sct, cpl->status.sc); 2342 /* Run PI verification for compare data buffer if PI error is detected. */ 2343 bdev_nvme_verify_pi_error(bdev_io); 2344 } 2345 2346 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2347 } 2348 2349 static void 2350 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2351 { 2352 struct nvme_bdev_io *bio = ref; 2353 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2354 2355 /* Compare operation completion */ 2356 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2357 /* Save compare result for write callback */ 2358 bio->cpl = *cpl; 2359 return; 2360 } 2361 2362 /* Write operation completion */ 2363 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2364 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2365 * complete the IO with the compare operation's status. 2366 */ 2367 if (!spdk_nvme_cpl_is_error(cpl)) { 2368 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2369 } 2370 2371 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2372 } else { 2373 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2374 } 2375 } 2376 2377 static void 2378 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 2379 { 2380 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2381 2382 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2383 } 2384 2385 static void 2386 bdev_nvme_admin_passthru_completion(void *ctx) 2387 { 2388 struct nvme_bdev_io *bio = ctx; 2389 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2390 2391 spdk_bdev_io_complete_nvme_status(bdev_io, 2392 bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2393 } 2394 2395 static void 2396 bdev_nvme_abort_completion(void *ctx) 2397 { 2398 struct nvme_bdev_io *bio = ctx; 2399 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2400 2401 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 2402 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2403 } else { 2404 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2405 } 2406 } 2407 2408 static void 2409 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 2410 { 2411 struct nvme_bdev_io *bio = ref; 2412 2413 bio->cpl = *cpl; 2414 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2415 } 2416 2417 static void 2418 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 2419 { 2420 struct nvme_bdev_io *bio = ref; 2421 2422 bio->cpl = *cpl; 2423 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 2424 } 2425 2426 static void 2427 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 2428 { 2429 struct nvme_bdev_io *bio = ref; 2430 struct iovec *iov; 2431 2432 bio->iov_offset = sgl_offset; 2433 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 2434 iov = &bio->iovs[bio->iovpos]; 2435 if (bio->iov_offset < iov->iov_len) { 2436 break; 2437 } 2438 2439 bio->iov_offset -= iov->iov_len; 2440 } 2441 } 2442 2443 static int 2444 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 2445 { 2446 struct nvme_bdev_io *bio = ref; 2447 struct iovec *iov; 2448 2449 assert(bio->iovpos < bio->iovcnt); 2450 2451 iov = &bio->iovs[bio->iovpos]; 2452 2453 *address = iov->iov_base; 2454 *length = iov->iov_len; 2455 2456 if (bio->iov_offset) { 2457 assert(bio->iov_offset <= iov->iov_len); 2458 *address += bio->iov_offset; 2459 *length -= bio->iov_offset; 2460 } 2461 2462 bio->iov_offset += *length; 2463 if (bio->iov_offset == iov->iov_len) { 2464 bio->iovpos++; 2465 bio->iov_offset = 0; 2466 } 2467 2468 return 0; 2469 } 2470 2471 static void 2472 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 2473 { 2474 struct nvme_bdev_io *bio = ref; 2475 struct iovec *iov; 2476 2477 bio->fused_iov_offset = sgl_offset; 2478 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 2479 iov = &bio->fused_iovs[bio->fused_iovpos]; 2480 if (bio->fused_iov_offset < iov->iov_len) { 2481 break; 2482 } 2483 2484 bio->fused_iov_offset -= iov->iov_len; 2485 } 2486 } 2487 2488 static int 2489 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 2490 { 2491 struct nvme_bdev_io *bio = ref; 2492 struct iovec *iov; 2493 2494 assert(bio->fused_iovpos < bio->fused_iovcnt); 2495 2496 iov = &bio->fused_iovs[bio->fused_iovpos]; 2497 2498 *address = iov->iov_base; 2499 *length = iov->iov_len; 2500 2501 if (bio->fused_iov_offset) { 2502 assert(bio->fused_iov_offset <= iov->iov_len); 2503 *address += bio->fused_iov_offset; 2504 *length -= bio->fused_iov_offset; 2505 } 2506 2507 bio->fused_iov_offset += *length; 2508 if (bio->fused_iov_offset == iov->iov_len) { 2509 bio->fused_iovpos++; 2510 bio->fused_iov_offset = 0; 2511 } 2512 2513 return 0; 2514 } 2515 2516 static int 2517 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2518 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2519 void *md, uint64_t lba_count, uint64_t lba) 2520 { 2521 int rc; 2522 2523 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 2524 lba_count, lba); 2525 2526 bio->iovs = iov; 2527 bio->iovcnt = iovcnt; 2528 bio->iovpos = 0; 2529 bio->iov_offset = 0; 2530 2531 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2532 bdev_nvme_no_pi_readv_done, bio, 0, 2533 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2534 md, 0, 0); 2535 2536 if (rc != 0 && rc != -ENOMEM) { 2537 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 2538 } 2539 return rc; 2540 } 2541 2542 static int 2543 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2544 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2545 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2546 { 2547 int rc; 2548 2549 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2550 lba_count, lba); 2551 2552 bio->iovs = iov; 2553 bio->iovcnt = iovcnt; 2554 bio->iovpos = 0; 2555 bio->iov_offset = 0; 2556 2557 if (iovcnt == 1) { 2558 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 2559 lba_count, 2560 bdev_nvme_readv_done, bio, 2561 flags, 2562 0, 0); 2563 } else { 2564 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2565 bdev_nvme_readv_done, bio, flags, 2566 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2567 md, 0, 0); 2568 } 2569 2570 if (rc != 0 && rc != -ENOMEM) { 2571 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 2572 } 2573 return rc; 2574 } 2575 2576 static int 2577 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2578 struct nvme_bdev_io *bio, 2579 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 2580 uint32_t flags) 2581 { 2582 int rc; 2583 2584 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2585 lba_count, lba); 2586 2587 bio->iovs = iov; 2588 bio->iovcnt = iovcnt; 2589 bio->iovpos = 0; 2590 bio->iov_offset = 0; 2591 2592 if (iovcnt == 1) { 2593 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 2594 lba_count, 2595 bdev_nvme_writev_done, bio, 2596 flags, 2597 0, 0); 2598 } else { 2599 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 2600 bdev_nvme_writev_done, bio, flags, 2601 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2602 md, 0, 0); 2603 } 2604 2605 if (rc != 0 && rc != -ENOMEM) { 2606 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 2607 } 2608 return rc; 2609 } 2610 2611 static int 2612 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2613 struct nvme_bdev_io *bio, 2614 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 2615 uint32_t flags) 2616 { 2617 int rc; 2618 2619 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2620 lba_count, lba); 2621 2622 bio->iovs = iov; 2623 bio->iovcnt = iovcnt; 2624 bio->iovpos = 0; 2625 bio->iov_offset = 0; 2626 2627 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 2628 bdev_nvme_comparev_done, bio, flags, 2629 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2630 md, 0, 0); 2631 2632 if (rc != 0 && rc != -ENOMEM) { 2633 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 2634 } 2635 return rc; 2636 } 2637 2638 static int 2639 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2640 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 2641 struct iovec *write_iov, int write_iovcnt, 2642 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2643 { 2644 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2645 int rc; 2646 2647 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2648 lba_count, lba); 2649 2650 bio->iovs = cmp_iov; 2651 bio->iovcnt = cmp_iovcnt; 2652 bio->iovpos = 0; 2653 bio->iov_offset = 0; 2654 bio->fused_iovs = write_iov; 2655 bio->fused_iovcnt = write_iovcnt; 2656 bio->fused_iovpos = 0; 2657 bio->fused_iov_offset = 0; 2658 2659 if (bdev_io->num_retries == 0) { 2660 bio->first_fused_submitted = false; 2661 } 2662 2663 if (!bio->first_fused_submitted) { 2664 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 2665 memset(&bio->cpl, 0, sizeof(bio->cpl)); 2666 2667 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 2668 bdev_nvme_comparev_and_writev_done, bio, flags, 2669 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 2670 if (rc == 0) { 2671 bio->first_fused_submitted = true; 2672 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 2673 } else { 2674 if (rc != -ENOMEM) { 2675 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 2676 } 2677 return rc; 2678 } 2679 } 2680 2681 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 2682 2683 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 2684 bdev_nvme_comparev_and_writev_done, bio, flags, 2685 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 2686 if (rc != 0 && rc != -ENOMEM) { 2687 SPDK_ERRLOG("write failed: rc = %d\n", rc); 2688 rc = 0; 2689 } 2690 2691 return rc; 2692 } 2693 2694 static int 2695 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2696 struct nvme_bdev_io *bio, 2697 uint64_t offset_blocks, 2698 uint64_t num_blocks) 2699 { 2700 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 2701 struct spdk_nvme_dsm_range *range; 2702 uint64_t offset, remaining; 2703 uint64_t num_ranges_u64; 2704 uint16_t num_ranges; 2705 int rc; 2706 2707 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 2708 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2709 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 2710 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 2711 return -EINVAL; 2712 } 2713 num_ranges = (uint16_t)num_ranges_u64; 2714 2715 offset = offset_blocks; 2716 remaining = num_blocks; 2717 range = &dsm_ranges[0]; 2718 2719 /* Fill max-size ranges until the remaining blocks fit into one range */ 2720 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 2721 range->attributes.raw = 0; 2722 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2723 range->starting_lba = offset; 2724 2725 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2726 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2727 range++; 2728 } 2729 2730 /* Final range describes the remaining blocks */ 2731 range->attributes.raw = 0; 2732 range->length = remaining; 2733 range->starting_lba = offset; 2734 2735 rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair, 2736 SPDK_NVME_DSM_ATTR_DEALLOCATE, 2737 dsm_ranges, num_ranges, 2738 bdev_nvme_queued_done, bio); 2739 2740 return rc; 2741 } 2742 2743 static int 2744 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 2745 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2746 { 2747 uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ch->ctrlr->ctrlr); 2748 2749 if (nbytes > max_xfer_size) { 2750 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2751 return -EINVAL; 2752 } 2753 2754 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2755 2756 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ch->ctrlr->ctrlr, cmd, buf, 2757 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 2758 } 2759 2760 static int 2761 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2762 struct nvme_bdev_io *bio, 2763 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2764 { 2765 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 2766 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2767 2768 if (nbytes > max_xfer_size) { 2769 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2770 return -EINVAL; 2771 } 2772 2773 /* 2774 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2775 * so fill it out automatically. 2776 */ 2777 cmd->nsid = spdk_nvme_ns_get_id(ns); 2778 2779 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 2780 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 2781 } 2782 2783 static int 2784 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2785 struct nvme_bdev_io *bio, 2786 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 2787 { 2788 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 2789 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 2790 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2791 2792 if (nbytes > max_xfer_size) { 2793 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2794 return -EINVAL; 2795 } 2796 2797 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 2798 SPDK_ERRLOG("invalid meta data buffer size\n"); 2799 return -EINVAL; 2800 } 2801 2802 /* 2803 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2804 * so fill it out automatically. 2805 */ 2806 cmd->nsid = spdk_nvme_ns_get_id(ns); 2807 2808 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 2809 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 2810 } 2811 2812 static void 2813 bdev_nvme_abort_admin_cmd(void *ctx) 2814 { 2815 struct nvme_bdev_io *bio = ctx; 2816 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2817 struct nvme_io_channel *nvme_ch; 2818 struct nvme_bdev_io *bio_to_abort; 2819 int rc; 2820 2821 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2822 bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2823 2824 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 2825 NULL, 2826 bio_to_abort, 2827 bdev_nvme_abort_done, bio); 2828 if (rc == -ENOENT) { 2829 /* If no admin command was found in admin qpair, complete the abort 2830 * request with failure. 2831 */ 2832 bio->cpl.cdw0 |= 1U; 2833 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 2834 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 2835 2836 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2837 } 2838 } 2839 2840 static int 2841 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 2842 struct nvme_bdev_io *bio_to_abort) 2843 { 2844 int rc; 2845 2846 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2847 2848 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 2849 nvme_ch->qpair, 2850 bio_to_abort, 2851 bdev_nvme_abort_done, bio); 2852 if (rc == -ENOENT) { 2853 /* If no command was found in I/O qpair, the target command may be 2854 * admin command. Only a single thread tries aborting admin command 2855 * to clean I/O flow. 2856 */ 2857 spdk_thread_send_msg(nvme_ch->ctrlr->thread, 2858 bdev_nvme_abort_admin_cmd, bio); 2859 rc = 0; 2860 } 2861 2862 return rc; 2863 } 2864 2865 static void 2866 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 2867 struct nvme_bdev_ns *nvme_ns) 2868 { 2869 /* nop */ 2870 } 2871 2872 static void 2873 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns) 2874 { 2875 g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns); 2876 } 2877 2878 static void 2879 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 2880 { 2881 const char *action; 2882 2883 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 2884 action = "reset"; 2885 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 2886 action = "abort"; 2887 } else { 2888 action = "none"; 2889 } 2890 2891 spdk_json_write_object_begin(w); 2892 2893 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 2894 2895 spdk_json_write_named_object_begin(w, "params"); 2896 spdk_json_write_named_string(w, "action_on_timeout", action); 2897 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 2898 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 2899 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 2900 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 2901 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 2902 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 2903 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 2904 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 2905 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 2906 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 2907 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 2908 spdk_json_write_object_end(w); 2909 2910 spdk_json_write_object_end(w); 2911 } 2912 2913 static void 2914 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w, 2915 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 2916 { 2917 struct spdk_nvme_transport_id *trid; 2918 2919 trid = nvme_bdev_ctrlr->connected_trid; 2920 2921 spdk_json_write_object_begin(w); 2922 2923 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 2924 2925 spdk_json_write_named_object_begin(w, "params"); 2926 spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); 2927 nvme_bdev_dump_trid_json(trid, w); 2928 spdk_json_write_named_bool(w, "prchk_reftag", 2929 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 2930 spdk_json_write_named_bool(w, "prchk_guard", 2931 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 2932 2933 spdk_json_write_object_end(w); 2934 2935 spdk_json_write_object_end(w); 2936 } 2937 2938 static void 2939 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 2940 { 2941 spdk_json_write_object_begin(w); 2942 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 2943 2944 spdk_json_write_named_object_begin(w, "params"); 2945 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 2946 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 2947 spdk_json_write_object_end(w); 2948 2949 spdk_json_write_object_end(w); 2950 } 2951 2952 static int 2953 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 2954 { 2955 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2956 uint32_t nsid; 2957 2958 bdev_nvme_opts_config_json(w); 2959 2960 pthread_mutex_lock(&g_bdev_nvme_mutex); 2961 2962 TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 2963 nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr); 2964 2965 for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { 2966 if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) { 2967 continue; 2968 } 2969 2970 nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]); 2971 } 2972 } 2973 2974 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 2975 * before enabling hotplug poller. 2976 */ 2977 bdev_nvme_hotplug_config_json(w); 2978 2979 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2980 return 0; 2981 } 2982 2983 struct spdk_nvme_ctrlr * 2984 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 2985 { 2986 if (!bdev || bdev->module != &nvme_if) { 2987 return NULL; 2988 } 2989 2990 return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr; 2991 } 2992 2993 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 2994