/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"
#include "bdev_ocssd.h"

#include "spdk/config.h"
#include "spdk/conf.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/likely.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true

static void bdev_nvme_get_spdk_running_config(FILE *fp);
static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;
};

struct nvme_probe_ctx {
	size_t count;
	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
	const char *names[NVME_MAX_CONTROLLERS];
	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
	const char *hostnqn;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
static char *g_nvme_hostnqn = NULL;

static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static int bdev_nvme_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
			   struct nvme_bdev_io *bio,
			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
				 struct nvme_bdev_io *bio,
				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
			    struct nvme_bdev_io *bio,
			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
			      struct nvme_bdev_io *bio,
			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_ns *nvme_ns,
		struct nvme_io_channel *nvme_ch,
		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_admin_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
				 struct nvme_bdev_io *bio,
				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_abort(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static int bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio,
			   bool failover);

typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);

static populate_namespace_fn g_populate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_populate_standard_namespace,
	bdev_ocssd_populate_namespace,
};

typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *ns);
static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns);

static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_depopulate_standard_namespace,
	bdev_ocssd_depopulate_namespace,
};

typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns);
static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
		struct nvme_bdev_ns *ns);

static config_json_namespace_fn g_config_json_namespace_fn[] = {
	NULL,
	nvme_ctrlr_config_json_standard_namespace,
	bdev_ocssd_namespace_config_json,
};

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_io_channel *nvme_ch;

	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return nvme_ch->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_text = bdev_nvme_get_spdk_running_config,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,
};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
	/*
	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
	 * reconnect a qpair and we will stop getting a callback for this one.
	 */
	spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_bdev_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;

	assert(nvme_bdev_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
	if (rc < 0) {
		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, true);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;

	nvme_bdev_detach_bdev_from_ns(nvme_disk);

	free(nvme_disk->disk.name);
	free(nvme_disk);

	return 0;
}

static int
bdev_nvme_flush(struct nvme_bdev_ns *nvme_ns, struct nvme_bdev_io *bio,
		uint64_t offset, uint64_t nbytes)
{
	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);

	return 0;
}

static void
_bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
	struct spdk_bdev_io *bdev_io;
	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;

	/* A NULL ctx means success. */
	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
		status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
		spdk_bdev_io_complete(bdev_io, status);
	}

	spdk_for_each_channel_continue(i, 0);
}

static void
_bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
{
	/* we are using the for_each_channel cb_arg like a return code here. */
	/* If it's zero, we succeeded, otherwise, the reset failed. */
	void *cb_arg = NULL;

	if (rc) {
		cb_arg = (void *)0x1;
		SPDK_ERRLOG("Resetting controller failed.\n");
	} else {
		SPDK_NOTICELOG("Resetting controller successful.\n");
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nvme_bdev_ctrlr->resetting = false;
	nvme_bdev_ctrlr->failover_in_progress = false;
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	/* Make sure we clear any pending resets before returning. */
	spdk_for_each_channel(nvme_bdev_ctrlr,
			      _bdev_nvme_complete_pending_resets,
			      cb_arg, NULL);
}

static void
_bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
	void *ctx = spdk_io_channel_iter_get_ctx(i);
	int rc = SPDK_BDEV_IO_STATUS_SUCCESS;

	if (status) {
		rc = SPDK_BDEV_IO_STATUS_FAILED;
	}
	if (ctx) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(ctx), rc);
	}
	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
}

static void
_bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
	struct spdk_nvme_io_qpair_opts opts;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;

	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
	if (!nvme_ch->qpair) {
		spdk_for_each_channel_continue(i, -1);
		return;
	}

	assert(nvme_ch->group != NULL);
	if (spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair) != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
		spdk_for_each_channel_continue(i, -1);
		return;
	}

	if (spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair)) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		spdk_nvme_poll_group_remove(nvme_ch->group->group, nvme_ch->qpair);
		spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
		spdk_for_each_channel_continue(i, -1);
		return;
	}

	spdk_for_each_channel_continue(i, 0);
}

static void
_bdev_nvme_reset(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
	int rc;

	if (status) {
		if (bio) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
		}
		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
		return;
	}

	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
	if (rc != 0) {
		if (bio) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
		}
		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
		return;
	}

	/* Recreate all of the I/O queue pairs */
	spdk_for_each_channel(nvme_bdev_ctrlr,
			      _bdev_nvme_reset_create_qpair,
			      bio,
			      _bdev_nvme_reset_create_qpairs_done);
}

static void
_bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
	if (!rc) {
		nvme_ch->qpair = NULL;
	}

	spdk_for_each_channel_continue(i, rc);
}

static int
bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio, bool failover)
{
	struct spdk_io_channel *ch;
	struct nvme_io_channel *nvme_ch;
	struct nvme_bdev_ctrlr_trid *next_trid = NULL, *tmp_trid = NULL;
	int rc = 0;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (nvme_bdev_ctrlr->destruct) {
		/* Don't bother resetting if the controller is in the process of being destructed. */
		if (bio) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
		}
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		return 0;
	}

	if (failover) {
		tmp_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
		assert(tmp_trid);
		assert(&tmp_trid->trid == nvme_bdev_ctrlr->connected_trid);
		next_trid = TAILQ_NEXT(tmp_trid, link);
		if (!next_trid) {
			failover = false;
		}
	}

	if (!nvme_bdev_ctrlr->resetting) {
		nvme_bdev_ctrlr->resetting = true;
		if (failover) {
			nvme_bdev_ctrlr->failover_in_progress = true;
		}
	} else {
		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
			rc = -EAGAIN;
		}
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		/*
		 * The internal reset calls won't be queued. This is on purpose so that we don't
		 * interfere with the app framework reset strategy. i.e. we are deferring to the
		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
		 */
		if (bio) {
			ch = spdk_get_io_channel(nvme_bdev_ctrlr);
			assert(ch != NULL);
			nvme_ch = spdk_io_channel_get_ctx(ch);
			TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, spdk_bdev_io_from_ctx(bio), module_link);
			spdk_put_io_channel(ch);
		}
		return rc;
	}

	if (failover) {
		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
		assert(rc == 0);
		/** Shuffle the old trid to the end of the list and use the new one.
		 * Allows for round robin through multiple connections.
		 */
		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, tmp_trid, link);
		TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, tmp_trid, link);
	}

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	/* First, delete all NVMe I/O queue pairs. */
	spdk_for_each_channel(nvme_bdev_ctrlr,
			      _bdev_nvme_reset_destroy_qpair,
			      bio,
			      _bdev_nvme_reset);

	return 0;
}

static int
bdev_nvme_unmap(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks);

static void
bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		     bool success)
{
	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
	int ret;

	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	ret = bdev_nvme_readv(nbdev->nvme_ns,
			      nvme_ch,
			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
			      bdev_io->u.bdev.iovs,
			      bdev_io->u.bdev.iovcnt,
			      bdev_io->u.bdev.md_buf,
			      bdev_io->u.bdev.num_blocks,
			      bdev_io->u.bdev.offset_blocks,
			      nbdev->disk.dif_check_flags);

	if (spdk_likely(ret == 0)) {
		return;
	} else if (ret == -ENOMEM) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
	} else {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static int
_bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct nvme_bdev_io *nbdev_io_to_abort;

	if (nvme_ch->qpair == NULL) {
		/* The device is currently resetting */
		return -1;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
			bdev_nvme_get_buf_cb(ch, bdev_io, true);
		} else {
			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		}
		return 0;

	case SPDK_BDEV_IO_TYPE_WRITE:
		return bdev_nvme_writev(nbdev->nvme_ns,
					nvme_ch,
					nbdev_io,
					bdev_io->u.bdev.iovs,
					bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.md_buf,
					bdev_io->u.bdev.num_blocks,
					bdev_io->u.bdev.offset_blocks,
					nbdev->disk.dif_check_flags);

	case SPDK_BDEV_IO_TYPE_COMPARE:
		return bdev_nvme_comparev(nbdev->nvme_ns,
					  nvme_ch,
					  nbdev_io,
					  bdev_io->u.bdev.iovs,
					  bdev_io->u.bdev.iovcnt,
					  bdev_io->u.bdev.md_buf,
					  bdev_io->u.bdev.num_blocks,
					  bdev_io->u.bdev.offset_blocks,
					  nbdev->disk.dif_check_flags);

	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		return bdev_nvme_comparev_and_writev(nbdev->nvme_ns,
						     nvme_ch,
						     nbdev_io,
						     bdev_io->u.bdev.iovs,
						     bdev_io->u.bdev.iovcnt,
						     bdev_io->u.bdev.fused_iovs,
						     bdev_io->u.bdev.fused_iovcnt,
						     bdev_io->u.bdev.md_buf,
						     bdev_io->u.bdev.num_blocks,
						     bdev_io->u.bdev.offset_blocks,
						     nbdev->disk.dif_check_flags);

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		return bdev_nvme_unmap(nbdev->nvme_ns,
				       nvme_ch,
				       nbdev_io,
				       bdev_io->u.bdev.offset_blocks,
				       bdev_io->u.bdev.num_blocks);

	case SPDK_BDEV_IO_TYPE_UNMAP:
		return bdev_nvme_unmap(nbdev->nvme_ns,
				       nvme_ch,
				       nbdev_io,
				       bdev_io->u.bdev.offset_blocks,
				       bdev_io->u.bdev.num_blocks);

	case SPDK_BDEV_IO_TYPE_RESET:
		return bdev_nvme_reset(nbdev->nvme_ns->ctrlr, nbdev_io, false);

	case SPDK_BDEV_IO_TYPE_FLUSH:
		return bdev_nvme_flush(nbdev->nvme_ns,
				       nbdev_io,
				       bdev_io->u.bdev.offset_blocks,
				       bdev_io->u.bdev.num_blocks);

	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
		return bdev_nvme_admin_passthru(nbdev->nvme_ns,
						nvme_ch,
						nbdev_io,
						&bdev_io->u.nvme_passthru.cmd,
						bdev_io->u.nvme_passthru.buf,
						bdev_io->u.nvme_passthru.nbytes);

	case SPDK_BDEV_IO_TYPE_NVME_IO:
		return bdev_nvme_io_passthru(nbdev->nvme_ns,
					     nvme_ch,
					     nbdev_io,
					     &bdev_io->u.nvme_passthru.cmd,
					     bdev_io->u.nvme_passthru.buf,
					     bdev_io->u.nvme_passthru.nbytes);

	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_nvme_io_passthru_md(nbdev->nvme_ns,
						nvme_ch,
						nbdev_io,
						&bdev_io->u.nvme_passthru.cmd,
						bdev_io->u.nvme_passthru.buf,
						bdev_io->u.nvme_passthru.nbytes,
						bdev_io->u.nvme_passthru.md_buf,
						bdev_io->u.nvme_passthru.md_len);

	case SPDK_BDEV_IO_TYPE_ABORT:
		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
		return bdev_nvme_abort(nbdev->nvme_ns,
				       nvme_ch,
				       nbdev_io,
				       nbdev_io_to_abort);

	default:
		return -EINVAL;
	}
	return 0;
}

static void
bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	int rc = _bdev_nvme_submit_request(ch, bdev_io);

	if (spdk_unlikely(rc != 0)) {
		if (rc == -ENOMEM) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static bool
bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct nvme_bdev *nbdev = ctx;
	struct nvme_bdev_ns *nvme_ns = nbdev->nvme_ns;
	const struct spdk_nvme_ctrlr_data *cdata;

	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;

	case SPDK_BDEV_IO_TYPE_COMPARE:
		return spdk_nvme_ns_supports_compare(nvme_ns->ns);

	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return spdk_nvme_ns_get_md_size(nvme_ns->ns) ? true : false;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
		return cdata->oncs.dsm;

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
		/*
		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
		 */
		if (cdata->oncs.dsm &&
		    spdk_nvme_ns_get_dealloc_logical_block_read_value(nvme_ns->ns) ==
		    SPDK_NVME_DEALLOC_READ_00) {
			return true;
		}
		/*
		 * The NVMe controller write_zeroes function is currently not used by our driver.
		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
		 */
		return false;

	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		if (spdk_nvme_ctrlr_get_flags(nvme_ns->ctrlr->ctrlr) &
		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
			return true;
		}
		return false;

	default:
		return false;
	}
}

static int
bdev_nvme_create_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
	struct nvme_io_channel *ch = ctx_buf;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_io_channel *pg_ch = NULL;
	int rc;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	opts.create_only = true;
	g_opts.io_queue_requests = opts.io_queue_requests;

	ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));

	if (ch->qpair == NULL) {
		return -1;
	}

	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
		if (bdev_ocssd_create_io_channel(ch)) {
			goto err;
		}
	}

	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
	if (!pg_ch) {
		goto err;
	}

	ch->group = spdk_io_channel_get_ctx(pg_ch);
	if (spdk_nvme_poll_group_add(ch->group->group, ch->qpair) != 0) {
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, ch->qpair);
	if (rc) {
		spdk_nvme_poll_group_remove(ch->group->group, ch->qpair);
		goto err;
	}

#ifdef SPDK_CONFIG_VTUNE
	ch->group->collect_spin_stat = true;
#else
	ch->group->collect_spin_stat = false;
#endif

	TAILQ_INIT(&ch->pending_resets);
	return 0;

err:
	if (pg_ch) {
		spdk_put_io_channel(pg_ch);
	}
	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
	return -1;
}

static void
bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
	struct nvme_io_channel *ch = ctx_buf;
	struct nvme_bdev_poll_group *group;

	group = ch->group;
	assert(group != NULL);

	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
		bdev_ocssd_destroy_io_channel(ch);
	}

	if (ch->qpair != NULL) {
		spdk_nvme_poll_group_remove(group->group, ch->qpair);
	}
	spdk_put_io_channel(spdk_io_channel_from_ctx(group));

	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
}

static int
bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_poll_group *group = ctx_buf;

	group->group = spdk_nvme_poll_group_create(group);
	if (group->group == NULL) {
		return -1;
	}

	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);

	if (group->poller == NULL) {
		spdk_nvme_poll_group_destroy(group->group);
		return -1;
	}

	return 0;
}

static void
bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_poll_group *group = ctx_buf;

	spdk_poller_unregister(&group->poller);
	if (spdk_nvme_poll_group_destroy(group->group)) {
		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.");
		assert(false);
	}
}

static struct spdk_io_channel *
bdev_nvme_get_io_channel(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
}

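/*
 * Implements the .dump_info_json callback of nvmelib_fn_table. The bdev layer
 * invokes it with a JSON object already open (e.g. while servicing the
 * bdev_get_bdevs RPC), so this function only emits a named "nvme" sub-object
 * containing controller, namespace and register state.
 */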
static int
bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct nvme_bdev *nvme_bdev = ctx;
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_bdev->nvme_ns->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	struct spdk_nvme_ns *ns;
	union spdk_nvme_vs_register vs;
	union spdk_nvme_csts_register csts;
	char buf[128];

	cdata = spdk_nvme_ctrlr_get_data(nvme_bdev_ctrlr->ctrlr);
	vs = spdk_nvme_ctrlr_get_regs_vs(nvme_bdev_ctrlr->ctrlr);
	csts = spdk_nvme_ctrlr_get_regs_csts(nvme_bdev_ctrlr->ctrlr);
	ns = nvme_bdev->nvme_ns->ns;

	spdk_json_write_named_object_begin(w, "nvme");

	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		spdk_json_write_named_string(w, "pci_address", nvme_bdev_ctrlr->connected_trid->traddr);
	}

	spdk_json_write_named_object_begin(w, "trid");

	nvme_bdev_dump_trid_json(nvme_bdev_ctrlr->connected_trid, w);

	spdk_json_write_object_end(w);

#ifdef SPDK_CONFIG_NVME_CUSE
	size_t cuse_name_size = 128;
	char cuse_name[cuse_name_size];

	int rc = spdk_nvme_cuse_get_ns_name(nvme_bdev_ctrlr->ctrlr, spdk_nvme_ns_get_id(ns),
					    cuse_name, &cuse_name_size);
	if (rc == 0) {
		spdk_json_write_named_string(w, "cuse_device", cuse_name);
	}
#endif

	spdk_json_write_named_object_begin(w, "ctrlr_data");

	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);

	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "model_number", buf);

	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "serial_number", buf);

	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "firmware_revision", buf);

	spdk_json_write_named_object_begin(w, "oacs");

	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "vs");

	spdk_json_write_name(w, "nvme_version");
	if (vs.bits.ter) {
		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
	} else {
		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
	}

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "csts");

	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "ns_data");

	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));

	spdk_json_write_object_end(w);

	if (cdata->oacs.security) {
		spdk_json_write_named_object_begin(w, "security");

		spdk_json_write_named_bool(w, "opal", nvme_bdev_ctrlr->opal_dev ? true : false);

		spdk_json_write_object_end(w);
	}

	spdk_json_write_object_end(w);

	return 0;
}

static void
bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed */
}

static uint64_t
bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
{
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_bdev_poll_group *group = nvme_ch->group;
	uint64_t spin_time;

	if (!group || !group->collect_spin_stat) {
		return 0;
	}

	if (group->end_ticks != 0) {
		group->spin_ticks += (group->end_ticks - group->start_ticks);
		group->end_ticks = 0;
	}

	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
	group->start_ticks = 0;
	group->spin_ticks = 0;

	return spin_time;
}

static const struct spdk_bdev_fn_table nvmelib_fn_table = {
	.destruct = bdev_nvme_destruct,
	.submit_request = bdev_nvme_submit_request,
	.io_type_supported = bdev_nvme_io_type_supported,
	.get_io_channel = bdev_nvme_get_io_channel,
	.dump_info_json = bdev_nvme_dump_info_json,
	.write_config_json = bdev_nvme_write_config_json,
	.get_spin_time = bdev_nvme_get_spin_time,
};

static void
nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr;
	struct nvme_bdev *bdev;
	struct spdk_nvme_ns *ns;
	const struct spdk_uuid *uuid;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_ns_data *nsdata;
	int rc;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
	if (!ns) {
		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -EINVAL);
		return;
	}

	bdev = calloc(1, sizeof(*bdev));
	if (!bdev) {
		SPDK_ERRLOG("bdev calloc() failed\n");
		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
		return;
	}

	nvme_ns->ns = ns;
	bdev->nvme_ns = nvme_ns;

	bdev->disk.name = spdk_sprintf_alloc("%sn%d", nvme_bdev_ctrlr->name, spdk_nvme_ns_get_id(ns));
	if (!bdev->disk.name) {
		free(bdev);
		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
		return;
	}
	bdev->disk.product_name = "NVMe disk";

	bdev->disk.write_cache = 0;
	if (cdata->vwc.present) {
		/* Enable if the Volatile Write Cache exists */
		bdev->disk.write_cache = 1;
	}
	bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
	bdev->disk.blockcnt = spdk_nvme_ns_get_num_sectors(ns);
	bdev->disk.optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);

	uuid = spdk_nvme_ns_get_uuid(ns);
	if (uuid != NULL) {
		bdev->disk.uuid = *uuid;
	}

	nsdata = spdk_nvme_ns_get_data(ns);

	bdev->disk.md_len = spdk_nvme_ns_get_md_size(ns);
	if (bdev->disk.md_len != 0) {
		bdev->disk.md_interleave = nsdata->flbas.extended;
		bdev->disk.dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
		if (bdev->disk.dif_type != SPDK_DIF_DISABLE) {
			bdev->disk.dif_is_head_of_md = nsdata->dps.md_start;
			bdev->disk.dif_check_flags = nvme_bdev_ctrlr->prchk_flags;
		}
	}

	if (!bdev_nvme_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
		bdev->disk.acwu = 0;
	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
		bdev->disk.acwu = nsdata->nacwu;
	} else {
		bdev->disk.acwu = cdata->acwu;
	}

	bdev->disk.ctxt = bdev;
	bdev->disk.fn_table = &nvmelib_fn_table;
	bdev->disk.module = &nvme_if;
	rc = spdk_bdev_register(&bdev->disk);
	if (rc) {
		free(bdev->disk.name);
		free(bdev);
		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
		return;
	}

	nvme_bdev_attach_bdev_to_ns(nvme_ns, bdev);
	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, 0);
}

static bool
hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
		 struct spdk_nvme_ctrlr_opts *opts)
{
	struct nvme_probe_skip_entry *entry;

	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
			return false;
		}
	}

	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;

	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);

	return true;
}

static bool
probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	 struct spdk_nvme_ctrlr_opts *opts)
{
	struct nvme_probe_ctx *ctx = cb_ctx;

	SPDK_DEBUGLOG(bdev_nvme, "Probing device %s\n", trid->traddr);

	if (nvme_bdev_ctrlr_get(trid)) {
		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
			    trid->traddr);
		return false;
	}

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		bool claim_device = false;
		size_t i;

		for (i = 0; i < ctx->count; i++) {
			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
				claim_device = true;
				break;
			}
		}

		if (!claim_device) {
			SPDK_DEBUGLOG(bdev_nvme, "Not claiming device at %s\n", trid->traddr);
			return false;
		}
	}

	if (ctx->hostnqn) {
		snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", ctx->hostnqn);
	}

	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;

	return true;
}

static void
nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct spdk_nvme_ctrlr *ctrlr = ctx;
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;

	if (spdk_nvme_cpl_is_error(cpl)) {
		SPDK_WARNLOG("Abort failed. Resetting controller.\n");
		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
		assert(nvme_bdev_ctrlr != NULL);
		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false);
	}
}

static void
timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
	   struct spdk_nvme_qpair *qpair, uint16_t cid)
{
	int rc;
	union spdk_nvme_csts_register csts;
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;

	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);

	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
	if (csts.bits.cfs) {
		SPDK_ERRLOG("Controller Fatal Status, reset required\n");
		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
		assert(nvme_bdev_ctrlr != NULL);
		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false);
		return;
	}

	switch (g_opts.action_on_timeout) {
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
		if (qpair) {
			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
						       nvme_abort_cpl, ctrlr);
			if (rc == 0) {
				return;
			}

			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
		}

	/* FALLTHROUGH */
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
		assert(nvme_bdev_ctrlr != NULL);
		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false);
		break;
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
		break;
	default:
		SPDK_ERRLOG("An invalid timeout action value is found.\n");
		break;
	}
}

void
nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
{
	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nvme_bdev_ctrlr->ref--;

	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
		return;
	}

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static void
nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns)
{
	struct nvme_bdev *bdev, *tmp;

	TAILQ_FOREACH_SAFE(bdev, &ns->bdevs, tailq, tmp) {
		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
	}

	ns->populated = false;

	nvme_ctrlr_depopulate_namespace_done(ns->ctrlr);
}

static void nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns,
		struct nvme_async_probe_ctx *ctx)
{
	g_populate_namespace_fn[ns->type](ctrlr, ns, ctx);
}

static void nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns)
{
	g_depopulate_namespace_fn[ns->type](ns);
}

void
nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
				   struct nvme_bdev_ns *ns, int rc)
{
	if (rc == 0) {
		ns->populated = true;
		pthread_mutex_lock(&g_bdev_nvme_mutex);
		ns->ctrlr->ref++;
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
	} else {
		memset(ns, 0, sizeof(*ns));
	}

	if (ctx) {
		ctx->populates_in_progress--;
		if (ctx->populates_in_progress == 0) {
			nvme_ctrlr_populate_namespaces_done(ctx);
		}
	}
}

static void
nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
			       struct nvme_async_probe_ctx *ctx)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr;
	struct nvme_bdev_ns *ns;
	struct spdk_nvme_ns *nvme_ns;
	struct nvme_bdev *bdev;
	uint32_t i;
	int rc;
	uint64_t num_sectors;
	bool ns_is_active;

	if (ctx) {
		/* Initialize this count to 1 to handle the populate functions
		 * calling nvme_ctrlr_populate_namespace_done() immediately.
		 */
		ctx->populates_in_progress = 1;
	}

	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
		uint32_t nsid = i + 1;

		ns = nvme_bdev_ctrlr->namespaces[i];
		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);

		if (ns->populated && ns_is_active && ns->type == NVME_BDEV_NS_STANDARD) {
			/* NS is still there but attributes may have changed */
			nvme_ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
			num_sectors = spdk_nvme_ns_get_num_sectors(nvme_ns);
			bdev = TAILQ_FIRST(&ns->bdevs);
			if (bdev->disk.blockcnt != num_sectors) {
				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %lu, new size %lu\n",
					       nsid,
					       bdev->disk.name,
					       bdev->disk.blockcnt,
					       num_sectors);
				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
				if (rc != 0) {
					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
						    bdev->disk.name, rc);
				}
			}
		}

		if (!ns->populated && ns_is_active) {
			ns->id = nsid;
			ns->ctrlr = nvme_bdev_ctrlr;
			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
				ns->type = NVME_BDEV_NS_OCSSD;
			} else {
				ns->type = NVME_BDEV_NS_STANDARD;
			}

			TAILQ_INIT(&ns->bdevs);

			if (ctx) {
				ctx->populates_in_progress++;
			}
			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, ns, ctx);
		}

		if (ns->populated && !ns_is_active) {
			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
		}
	}

	if (ctx) {
		/* Decrement this count now that the loop is over to account
		 * for the one we started with. If the count is then 0, we
		 * know any populate_namespace functions completed immediately,
		 * so we'll kick the callback here.
		 */
		ctx->populates_in_progress--;
		if (ctx->populates_in_progress == 0) {
			nvme_ctrlr_populate_namespaces_done(ctx);
		}
	}
}

static void
aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
	union spdk_nvme_async_event_completion event;

	if (spdk_nvme_cpl_is_error(cpl)) {
		SPDK_WARNLOG("AER request execute failed");
		return;
	}

	event.raw = cpl->cdw0;
	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
	}
}

static int
create_ctrlr(struct spdk_nvme_ctrlr *ctrlr,
	     const char *name,
	     const struct spdk_nvme_transport_id *trid,
	     uint32_t prchk_flags)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
	struct nvme_bdev_ctrlr_trid *trid_entry;
	uint32_t i;
	int rc;

	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
	if (nvme_bdev_ctrlr == NULL) {
		SPDK_ERRLOG("Failed to allocate device struct\n");
		return -ENOMEM;
	}

	TAILQ_INIT(&nvme_bdev_ctrlr->trids);
	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
	nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
	if (!nvme_bdev_ctrlr->namespaces) {
		SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
		free(nvme_bdev_ctrlr);
		return -ENOMEM;
	}

	trid_entry = calloc(1, sizeof(*trid_entry));
	if (trid_entry == NULL) {
		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
		free(nvme_bdev_ctrlr->namespaces);
		free(nvme_bdev_ctrlr);
		return -ENOMEM;
	}

	trid_entry->trid = *trid;

	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
			for (; i > 0; i--) {
				free(nvme_bdev_ctrlr->namespaces[i - 1]);
			}
			free(trid_entry);
			free(nvme_bdev_ctrlr->namespaces);
			free(nvme_bdev_ctrlr);
			return -ENOMEM;
		}
	}

	nvme_bdev_ctrlr->thread = spdk_get_thread();
	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
	nvme_bdev_ctrlr->ctrlr = ctrlr;
	nvme_bdev_ctrlr->ref = 0;
	nvme_bdev_ctrlr->connected_trid = &trid_entry->trid;
	nvme_bdev_ctrlr->name = strdup(name);
	if (nvme_bdev_ctrlr->name == NULL) {
		free(trid_entry);
		free(nvme_bdev_ctrlr->namespaces);
		free(nvme_bdev_ctrlr);
		return -ENOMEM;
	}

	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
		if (spdk_unlikely(rc != 0)) {
			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
			free(trid_entry);
			free(nvme_bdev_ctrlr->name);
			free(nvme_bdev_ctrlr->namespaces);
			free(nvme_bdev_ctrlr);
			return rc;
		}
	}

	nvme_bdev_ctrlr->prchk_flags = prchk_flags;

	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
				sizeof(struct nvme_io_channel),
				name);

	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr,
					       g_opts.nvme_adminq_poll_period_us);

	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);

	if (g_opts.timeout_us > 0) {
		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
				timeout_cb, NULL);
	}

	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);

	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
		if (nvme_bdev_ctrlr->opal_dev == NULL) {
			SPDK_ERRLOG("Failed to initialize Opal\n");
		}
	}

	TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link);
	return 0;
}

static void
attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
	struct nvme_probe_ctx *ctx = cb_ctx;
	char *name = NULL;
	uint32_t prchk_flags = 0;
	size_t i;

	if (ctx) {
		for (i = 0; i < ctx->count; i++) {
			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
				prchk_flags = ctx->prchk_flags[i];
				name = strdup(ctx->names[i]);
				break;
			}
		}
	} else {
		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
	}
	if (!name) {
		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
		return;
	}

	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);

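	/* Register the controller with the module; if registration failed, the
	 * lookup below returns NULL and the attach is abandoned. */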
	create_ctrlr(ctrlr, name, trid, prchk_flags);

	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid);
	if (!nvme_bdev_ctrlr) {
		SPDK_ERRLOG("Failed to find new NVMe controller\n");
		free(name);
		return;
	}

	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);

	free(name);
}

static void
remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
{
	uint32_t i;
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
	struct nvme_bdev_ns *ns;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (nvme_bdev_ctrlr->ctrlr == ctrlr) {
			/* The controller's destruction was already started */
			if (nvme_bdev_ctrlr->destruct) {
				pthread_mutex_unlock(&g_bdev_nvme_mutex);
				return;
			}
			pthread_mutex_unlock(&g_bdev_nvme_mutex);
			for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
				uint32_t nsid = i + 1;

				ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
				if (ns->populated) {
					assert(ns->id == nsid);
					nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
				}
			}

			pthread_mutex_lock(&g_bdev_nvme_mutex);
			nvme_bdev_ctrlr->destruct = true;
			if (nvme_bdev_ctrlr->ref == 0) {
				pthread_mutex_unlock(&g_bdev_nvme_mutex);
				nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
			} else {
				pthread_mutex_unlock(&g_bdev_nvme_mutex);
			}
			return;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
bdev_nvme_hotplug(void *arg)
{
	struct spdk_nvme_transport_id trid_pcie;
	int done;

	if (!g_hotplug_probe_ctx) {
		memset(&trid_pcie, 0, sizeof(trid_pcie));
		spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);

		g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
				      hotplug_probe_cb,
				      attach_cb, remove_cb);
		if (!g_hotplug_probe_ctx) {
			return SPDK_POLLER_BUSY;
		}
	}

	done = spdk_nvme_probe_poll_async(g_hotplug_probe_ctx);
	if (done != -EAGAIN) {
		g_hotplug_probe_ctx = NULL;
	}

	return SPDK_POLLER_BUSY;
}

void
bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
{
	*opts = g_opts;
}

int
bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
{
	if (g_bdev_nvme_init_thread != NULL) {
		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
			return -EPERM;
		}
	}

	g_opts = *opts;

	return 0;
}

struct set_nvme_hotplug_ctx {
	uint64_t period_us;
	bool enabled;
	spdk_msg_fn fn;
	void *fn_ctx;
};

static void
set_nvme_hotplug_period_cb(void *_ctx)
{
	struct set_nvme_hotplug_ctx *ctx = _ctx;

	spdk_poller_unregister(&g_hotplug_poller);
	if (ctx->enabled) {
		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
	}

	g_nvme_hotplug_poll_period_us = ctx->period_us;
	g_nvme_hotplug_enabled = ctx->enabled;
	if (ctx->fn) {
		ctx->fn(ctx->fn_ctx);
	}

	free(ctx);
}

int
bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
{
	struct set_nvme_hotplug_ctx *ctx;

	if (enabled == true && !spdk_process_is_primary()) {
		return -EPERM;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
	ctx->enabled = enabled;
	ctx->fn = cb;
	ctx->fn_ctx = cb_ctx;

	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
	return 0;
}

static void
populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
{
	if (ctx->cb_fn) {
		ctx->cb_fn(ctx->cb_ctx, count, rc);
	}

	free(ctx);
}

static void
nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
	struct nvme_bdev_ns *ns;
	struct nvme_bdev *nvme_bdev, *tmp;
	uint32_t i, nsid;
	size_t j;

	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
	assert(nvme_bdev_ctrlr != NULL);

	/*
	 * Report the new bdevs that were created in this call.
	 * There can be more than one bdev per NVMe controller.
	 */
	j = 0;
	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
		nsid = i + 1;
		ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
		if (!ns->populated) {
			continue;
		}
		assert(ns->id == nsid);
		TAILQ_FOREACH_SAFE(nvme_bdev, &ns->bdevs, tailq, tmp) {
			if (j < ctx->count) {
				ctx->names[j] = nvme_bdev->disk.name;
				j++;
			} else {
				SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
					    ctx->count);
				populate_namespaces_cb(ctx, 0, -ERANGE);
				return;
			}
		}
	}

	populate_namespaces_cb(ctx, j, 0);
}

static void
connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
	struct nvme_async_probe_ctx *ctx;
	int rc;

	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);

	spdk_poller_unregister(&ctx->poller);

	rc = create_ctrlr(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags);
	if (rc) {
		SPDK_ERRLOG("Failed to create new device\n");
		populate_namespaces_cb(ctx, 0, rc);
		return;
	}

	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
	assert(nvme_bdev_ctrlr != NULL);

	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
}

static int
bdev_nvme_async_poll(void *arg)
{
	struct nvme_async_probe_ctx *ctx = arg;
	int rc;

	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
	if (spdk_unlikely(rc != -EAGAIN && rc != 0)) {
		spdk_poller_unregister(&ctx->poller);
		free(ctx);
	}

	return SPDK_POLLER_BUSY;
}

static int
bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_transport_id *trid)
{
	struct spdk_nvme_ctrlr *new_ctrlr;
	struct spdk_nvme_ctrlr_opts opts;
	uint32_t i;
	struct spdk_nvme_ns *ns, *new_ns;
	const struct spdk_nvme_ns_data *ns_data, *new_ns_data;
	struct nvme_bdev_ctrlr_trid *new_trid;
	int rc = 0;

	assert(nvme_bdev_ctrlr != NULL);

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		SPDK_ERRLOG("PCIe failover is not supported.\n");
		return -ENOTSUP;
	}

	/* Currently we only support failover to the same transport type. */
	if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) {
		return -EINVAL;
	}

	/* Currently we only support failover to the same NQN. */
	if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
		return -EINVAL;
	}

	/* Skip all the other checks if we've already registered this path. */
	TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) {
		if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) {
			return -EEXIST;
		}
	}

	spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
	opts.transport_retry_count = g_opts.retry_count;

	new_ctrlr = spdk_nvme_connect(trid, &opts, sizeof(opts));

	if (new_ctrlr == NULL) {
		return -ENODEV;
	}

	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) {
		rc = -EINVAL;
		goto out;
	}

	for (i = 1; i <= nvme_bdev_ctrlr->num_ns; i++) {
		ns = spdk_nvme_ctrlr_get_ns(nvme_bdev_ctrlr->ctrlr, i);
		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, i);
		assert(ns != NULL);
		assert(new_ns != NULL);

		ns_data = spdk_nvme_ns_get_data(ns);
		new_ns_data = spdk_nvme_ns_get_data(new_ns);
		if (memcmp(ns_data->nguid, new_ns_data->nguid, sizeof(ns_data->nguid))) {
			rc = -EINVAL;
			goto out;
		}
	}

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		rc = -ENOMEM;
		goto out;
	}
	new_trid->trid = *trid;
	TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link);

out:
	spdk_nvme_detach(new_ctrlr);
	return rc;
}

int
bdev_nvme_remove_trid(const char *name, struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
	struct nvme_bdev_ctrlr_trid *ctrlr_trid, *tmp_trid;

	if (name == NULL) {
		return -EINVAL;
	}

	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nvme_bdev_ctrlr == NULL) {
		SPDK_ERRLOG("Failed to find NVMe controller\n");
		return -ENODEV;
	}

	/* case 1: we are currently using the path to be removed. */
	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
		ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
		assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid);
		/* case 1A: the current path is the only path. */
		if (!TAILQ_NEXT(ctrlr_trid, link)) {
			return bdev_nvme_delete(name);
		}

		/* case 1B: there is an alternative path. */
		if (bdev_nvme_reset(nvme_bdev_ctrlr, NULL, true) == -EAGAIN) {
			return -EAGAIN;
		}
		assert(nvme_bdev_ctrlr->connected_trid != &ctrlr_trid->trid);
		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
		free(ctrlr_trid);
		return 0;
	}
	/* case 2: We are not using the specified path. */
	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) {
		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
			TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
			free(ctrlr_trid);
			return 0;
		}
	}

	/* case 2A: The address isn't even in the registered list. */
*/ 1896 return -ENXIO; 1897 } 1898 1899 int 1900 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 1901 struct spdk_nvme_host_id *hostid, 1902 const char *base_name, 1903 const char **names, 1904 uint32_t count, 1905 const char *hostnqn, 1906 uint32_t prchk_flags, 1907 spdk_bdev_create_nvme_fn cb_fn, 1908 void *cb_ctx) 1909 { 1910 struct nvme_probe_skip_entry *entry, *tmp; 1911 struct nvme_async_probe_ctx *ctx; 1912 struct nvme_bdev_ctrlr *existing_ctrlr; 1913 int rc; 1914 1915 /* TODO expand this check to include both the host and target TRIDs. 1916 * Only if both are the same should we fail. 1917 */ 1918 if (nvme_bdev_ctrlr_get(trid) != NULL) { 1919 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 1920 return -EEXIST; 1921 } 1922 1923 ctx = calloc(1, sizeof(*ctx)); 1924 if (!ctx) { 1925 return -ENOMEM; 1926 } 1927 ctx->base_name = base_name; 1928 ctx->names = names; 1929 ctx->count = count; 1930 ctx->cb_fn = cb_fn; 1931 ctx->cb_ctx = cb_ctx; 1932 ctx->prchk_flags = prchk_flags; 1933 ctx->trid = *trid; 1934 1935 existing_ctrlr = nvme_bdev_ctrlr_get_by_name(base_name); 1936 if (existing_ctrlr) { 1937 rc = bdev_nvme_add_trid(existing_ctrlr, trid); 1938 if (rc) { 1939 free(ctx); 1940 return rc; 1941 } 1942 1943 nvme_ctrlr_populate_namespaces_done(ctx); 1944 return 0; 1945 } 1946 1947 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1948 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 1949 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 1950 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 1951 free(entry); 1952 break; 1953 } 1954 } 1955 } 1956 1957 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 1958 ctx->opts.transport_retry_count = g_opts.retry_count; 1959 1960 if (hostnqn) { 1961 snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn); 1962 } 1963 1964 if (hostid->hostaddr[0] != '\0') { 1965 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 1966 } 1967 1968 if (hostid->hostsvcid[0] != '\0') { 1969 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 1970 } 1971 1972 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb); 1973 if (ctx->probe_ctx == NULL) { 1974 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 1975 free(ctx); 1976 return -ENODEV; 1977 } 1978 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 1979 1980 return 0; 1981 } 1982 1983 int 1984 bdev_nvme_delete(const char *name) 1985 { 1986 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL; 1987 struct nvme_probe_skip_entry *entry; 1988 1989 if (name == NULL) { 1990 return -EINVAL; 1991 } 1992 1993 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 1994 if (nvme_bdev_ctrlr == NULL) { 1995 SPDK_ERRLOG("Failed to find NVMe controller\n"); 1996 return -ENODEV; 1997 } 1998 1999 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2000 entry = calloc(1, sizeof(*entry)); 2001 if (!entry) { 2002 return -ENOMEM; 2003 } 2004 entry->trid = *nvme_bdev_ctrlr->connected_trid; 2005 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 2006 } 2007 2008 remove_cb(NULL, nvme_bdev_ctrlr->ctrlr); 2009 return 0; 2010 } 2011 2012 static int 2013 bdev_nvme_library_init(void) 2014 { 2015 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2016 struct spdk_conf_section *sp; 2017 const char *val; 2018 int rc = 0; 2019 int64_t intval = 0; 2020 size_t i; 2021 struct nvme_probe_ctx 
*probe_ctx = NULL; 2022 int retry_count; 2023 uint32_t local_nvme_num = 0; 2024 int64_t hotplug_period; 2025 bool hotplug_enabled = g_nvme_hotplug_enabled; 2026 2027 g_bdev_nvme_init_thread = spdk_get_thread(); 2028 2029 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb, 2030 bdev_nvme_poll_group_destroy_cb, 2031 sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups"); 2032 2033 sp = spdk_conf_find_section(NULL, "Nvme"); 2034 if (sp == NULL) { 2035 goto end; 2036 } 2037 2038 probe_ctx = calloc(1, sizeof(*probe_ctx)); 2039 if (probe_ctx == NULL) { 2040 SPDK_ERRLOG("Failed to allocate probe_ctx\n"); 2041 rc = -1; 2042 goto end; 2043 } 2044 2045 retry_count = spdk_conf_section_get_intval(sp, "RetryCount"); 2046 if (retry_count >= 0) { 2047 g_opts.retry_count = retry_count; 2048 } 2049 2050 val = spdk_conf_section_get_val(sp, "TimeoutUsec"); 2051 if (val != NULL) { 2052 intval = spdk_strtoll(val, 10); 2053 if (intval < 0) { 2054 SPDK_ERRLOG("Invalid TimeoutUsec value\n"); 2055 rc = -1; 2056 goto end; 2057 } 2058 } 2059 2060 g_opts.timeout_us = intval; 2061 2062 if (g_opts.timeout_us > 0) { 2063 val = spdk_conf_section_get_val(sp, "ActionOnTimeout"); 2064 if (val != NULL) { 2065 if (!strcasecmp(val, "Reset")) { 2066 g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET; 2067 } else if (!strcasecmp(val, "Abort")) { 2068 g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT; 2069 } 2070 } 2071 } 2072 2073 intval = spdk_conf_section_get_intval(sp, "AdminPollRate"); 2074 if (intval > 0) { 2075 g_opts.nvme_adminq_poll_period_us = intval; 2076 } 2077 2078 intval = spdk_conf_section_get_intval(sp, "IOPollRate"); 2079 if (intval > 0) { 2080 g_opts.nvme_ioq_poll_period_us = intval; 2081 } 2082 2083 if (spdk_process_is_primary()) { 2084 hotplug_enabled = spdk_conf_section_get_boolval(sp, "HotplugEnable", false); 2085 } 2086 2087 hotplug_period = spdk_conf_section_get_intval(sp, "HotplugPollRate"); 2088 if (hotplug_period < 0) { 2089 hotplug_period = 0; 2090 } 2091 2092 g_nvme_hostnqn = spdk_conf_section_get_val(sp, "HostNQN"); 2093 probe_ctx->hostnqn = g_nvme_hostnqn; 2094 2095 g_opts.delay_cmd_submit = spdk_conf_section_get_boolval(sp, "DelayCmdSubmit", 2096 SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT); 2097 2098 for (i = 0; i < NVME_MAX_CONTROLLERS; i++) { 2099 val = spdk_conf_section_get_nmval(sp, "TransportID", i, 0); 2100 if (val == NULL) { 2101 break; 2102 } 2103 2104 rc = spdk_nvme_transport_id_parse(&probe_ctx->trids[i], val); 2105 if (rc < 0) { 2106 SPDK_ERRLOG("Unable to parse TransportID: %s\n", val); 2107 rc = -1; 2108 goto end; 2109 } 2110 2111 rc = spdk_nvme_host_id_parse(&probe_ctx->hostids[i], val); 2112 if (rc < 0) { 2113 SPDK_ERRLOG("Unable to parse HostID: %s\n", val); 2114 rc = -1; 2115 goto end; 2116 } 2117 2118 val = spdk_conf_section_get_nmval(sp, "TransportID", i, 1); 2119 if (val == NULL) { 2120 SPDK_ERRLOG("No name provided for TransportID\n"); 2121 rc = -1; 2122 goto end; 2123 } 2124 2125 probe_ctx->names[i] = val; 2126 2127 val = spdk_conf_section_get_nmval(sp, "TransportID", i, 2); 2128 if (val != NULL) { 2129 rc = spdk_nvme_prchk_flags_parse(&probe_ctx->prchk_flags[i], val); 2130 if (rc < 0) { 2131 SPDK_ERRLOG("Unable to parse prchk: %s\n", val); 2132 rc = -1; 2133 goto end; 2134 } 2135 } 2136 2137 probe_ctx->count++; 2138 2139 if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) { 2140 struct spdk_nvme_ctrlr *ctrlr; 2141 struct spdk_nvme_ctrlr_opts opts; 2142 2143 if (nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) { 2144 
SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", 2145 probe_ctx->trids[i].traddr); 2146 rc = -1; 2147 goto end; 2148 } 2149 2150 if (probe_ctx->trids[i].subnqn[0] == '\0') { 2151 SPDK_ERRLOG("Need to provide subsystem nqn\n"); 2152 rc = -1; 2153 goto end; 2154 } 2155 2156 spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts)); 2157 opts.transport_retry_count = g_opts.retry_count; 2158 2159 if (probe_ctx->hostnqn != NULL) { 2160 snprintf(opts.hostnqn, sizeof(opts.hostnqn), "%s", probe_ctx->hostnqn); 2161 } 2162 2163 if (probe_ctx->hostids[i].hostaddr[0] != '\0') { 2164 snprintf(opts.src_addr, sizeof(opts.src_addr), "%s", probe_ctx->hostids[i].hostaddr); 2165 } 2166 2167 if (probe_ctx->hostids[i].hostsvcid[0] != '\0') { 2168 snprintf(opts.src_svcid, sizeof(opts.src_svcid), "%s", probe_ctx->hostids[i].hostsvcid); 2169 } 2170 2171 ctrlr = spdk_nvme_connect(&probe_ctx->trids[i], &opts, sizeof(opts)); 2172 if (ctrlr == NULL) { 2173 SPDK_ERRLOG("Unable to connect to provided trid (traddr: %s)\n", 2174 probe_ctx->trids[i].traddr); 2175 rc = -1; 2176 goto end; 2177 } 2178 2179 rc = create_ctrlr(ctrlr, probe_ctx->names[i], &probe_ctx->trids[i], 0); 2180 if (rc) { 2181 goto end; 2182 } 2183 2184 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&probe_ctx->trids[i]); 2185 if (!nvme_bdev_ctrlr) { 2186 SPDK_ERRLOG("Failed to find new NVMe controller\n"); 2187 rc = -ENODEV; 2188 goto end; 2189 } 2190 2191 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 2192 } else { 2193 local_nvme_num++; 2194 } 2195 } 2196 2197 if (local_nvme_num > 0) { 2198 /* used to probe local NVMe device */ 2199 if (spdk_nvme_probe(NULL, probe_ctx, probe_cb, attach_cb, remove_cb)) { 2200 rc = -1; 2201 goto end; 2202 } 2203 2204 for (i = 0; i < probe_ctx->count; i++) { 2205 if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) { 2206 continue; 2207 } 2208 2209 if (!nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) { 2210 SPDK_ERRLOG("NVMe SSD \"%s\" could not be found.\n", probe_ctx->trids[i].traddr); 2211 SPDK_ERRLOG("Check PCIe BDF and that it is attached to UIO/VFIO driver.\n"); 2212 } 2213 } 2214 } 2215 2216 rc = bdev_nvme_set_hotplug(hotplug_enabled, hotplug_period, NULL, NULL); 2217 if (rc) { 2218 SPDK_ERRLOG("Failed to setup hotplug (%d): %s", rc, spdk_strerror(rc)); 2219 rc = -1; 2220 } 2221 end: 2222 free(probe_ctx); 2223 return rc; 2224 } 2225 2226 static void 2227 bdev_nvme_library_fini(void) 2228 { 2229 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp; 2230 struct nvme_probe_skip_entry *entry, *entry_tmp; 2231 struct nvme_bdev_ns *ns; 2232 uint32_t i; 2233 2234 spdk_poller_unregister(&g_hotplug_poller); 2235 free(g_hotplug_probe_ctx); 2236 2237 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2238 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2239 free(entry); 2240 } 2241 2242 pthread_mutex_lock(&g_bdev_nvme_mutex); 2243 TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) { 2244 if (nvme_bdev_ctrlr->destruct) { 2245 /* This controller's destruction was already started 2246 * before the application started shutting down 2247 */ 2248 continue; 2249 } 2250 2251 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2252 2253 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 2254 uint32_t nsid = i + 1; 2255 2256 ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 2257 if (ns->populated) { 2258 assert(ns->id == nsid); 2259 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns); 2260 } 2261 } 2262 2263 pthread_mutex_lock(&g_bdev_nvme_mutex); 2264 nvme_bdev_ctrlr->destruct = 
true; 2265 2266 if (nvme_bdev_ctrlr->ref == 0) { 2267 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2268 nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 2269 pthread_mutex_lock(&g_bdev_nvme_mutex); 2270 } 2271 } 2272 2273 g_bdev_nvme_module_finish = true; 2274 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 2275 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2276 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 2277 spdk_bdev_module_finish_done(); 2278 return; 2279 } 2280 2281 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2282 } 2283 2284 static void 2285 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io) 2286 { 2287 struct spdk_bdev *bdev = bdev_io->bdev; 2288 struct spdk_dif_ctx dif_ctx; 2289 struct spdk_dif_error err_blk = {}; 2290 int rc; 2291 2292 rc = spdk_dif_ctx_init(&dif_ctx, 2293 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2294 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2295 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2296 if (rc != 0) { 2297 SPDK_ERRLOG("Initialization of DIF context failed\n"); 2298 return; 2299 } 2300 2301 if (bdev->md_interleave) { 2302 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2303 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2304 } else { 2305 struct iovec md_iov = { 2306 .iov_base = bdev_io->u.bdev.md_buf, 2307 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2308 }; 2309 2310 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2311 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2312 } 2313 2314 if (rc != 0) { 2315 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 2316 err_blk.err_type, err_blk.err_offset); 2317 } else { 2318 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2319 } 2320 } 2321 2322 static void 2323 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2324 { 2325 struct nvme_bdev_io *bio = ref; 2326 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2327 2328 if (spdk_nvme_cpl_is_success(cpl)) { 2329 /* Run PI verification for read data buffer. */ 2330 bdev_nvme_verify_pi_error(bdev_io); 2331 } 2332 2333 /* Return original completion status */ 2334 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, 2335 bio->cpl.status.sc); 2336 } 2337 2338 static void 2339 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2340 { 2341 struct nvme_bdev_io *bio = ref; 2342 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2343 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2344 struct nvme_io_channel *nvme_ch; 2345 int ret; 2346 2347 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2348 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2349 cpl->status.sct, cpl->status.sc); 2350 2351 /* Save completion status to use after verifying PI error. */ 2352 bio->cpl = *cpl; 2353 2354 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2355 2356 /* Read without PI checking to verify PI error. 
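The same LBA range is re-read with protection information checks disabled; bdev_nvme_no_pi_readv_done() then verifies the PI fields in software, and the original completion status saved in bio->cpl above is what is finally reported to the bdev layer.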
*/ 2357 ret = bdev_nvme_no_pi_readv(nbdev->nvme_ns, 2358 nvme_ch, 2359 bio, 2360 bdev_io->u.bdev.iovs, 2361 bdev_io->u.bdev.iovcnt, 2362 bdev_io->u.bdev.md_buf, 2363 bdev_io->u.bdev.num_blocks, 2364 bdev_io->u.bdev.offset_blocks); 2365 if (ret == 0) { 2366 return; 2367 } 2368 } 2369 2370 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2371 } 2372 2373 static void 2374 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2375 { 2376 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2377 2378 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2379 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2380 cpl->status.sct, cpl->status.sc); 2381 /* Run PI verification for write data buffer if PI error is detected. */ 2382 bdev_nvme_verify_pi_error(bdev_io); 2383 } 2384 2385 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2386 } 2387 2388 static void 2389 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2390 { 2391 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2392 2393 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2394 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2395 cpl->status.sct, cpl->status.sc); 2396 /* Run PI verification for compare data buffer if PI error is detected. */ 2397 bdev_nvme_verify_pi_error(bdev_io); 2398 } 2399 2400 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2401 } 2402 2403 static void 2404 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2405 { 2406 struct nvme_bdev_io *bio = ref; 2407 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2408 2409 /* Compare operation completion */ 2410 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2411 /* Save compare result for write callback */ 2412 bio->cpl = *cpl; 2413 return; 2414 } 2415 2416 /* Write operation completion */ 2417 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2418 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2419 * complete the IO with the compare operation's status. 
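* The write half of a fused compare-and-write is expected to fail whenever the compare fails, so a successful write completion in that situation is logged as unexpected below.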
2420 */ 2421 if (!spdk_nvme_cpl_is_error(cpl)) { 2422 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2423 } 2424 2425 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2426 } else { 2427 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2428 } 2429 } 2430 2431 static void 2432 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 2433 { 2434 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2435 2436 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2437 } 2438 2439 static void 2440 bdev_nvme_admin_passthru_completion(void *ctx) 2441 { 2442 struct nvme_bdev_io *bio = ctx; 2443 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2444 2445 spdk_bdev_io_complete_nvme_status(bdev_io, 2446 bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2447 } 2448 2449 static void 2450 bdev_nvme_abort_completion(void *ctx) 2451 { 2452 struct nvme_bdev_io *bio = ctx; 2453 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2454 2455 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 2456 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2457 } else { 2458 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2459 } 2460 } 2461 2462 static void 2463 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 2464 { 2465 struct nvme_bdev_io *bio = ref; 2466 2467 bio->cpl = *cpl; 2468 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2469 } 2470 2471 static void 2472 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 2473 { 2474 struct nvme_bdev_io *bio = ref; 2475 2476 bio->cpl = *cpl; 2477 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 2478 } 2479 2480 static void 2481 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 2482 { 2483 struct nvme_bdev_io *bio = ref; 2484 struct iovec *iov; 2485 2486 bio->iov_offset = sgl_offset; 2487 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 2488 iov = &bio->iovs[bio->iovpos]; 2489 if (bio->iov_offset < iov->iov_len) { 2490 break; 2491 } 2492 2493 bio->iov_offset -= iov->iov_len; 2494 } 2495 } 2496 2497 static int 2498 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 2499 { 2500 struct nvme_bdev_io *bio = ref; 2501 struct iovec *iov; 2502 2503 assert(bio->iovpos < bio->iovcnt); 2504 2505 iov = &bio->iovs[bio->iovpos]; 2506 2507 *address = iov->iov_base; 2508 *length = iov->iov_len; 2509 2510 if (bio->iov_offset) { 2511 assert(bio->iov_offset <= iov->iov_len); 2512 *address += bio->iov_offset; 2513 *length -= bio->iov_offset; 2514 } 2515 2516 bio->iov_offset += *length; 2517 if (bio->iov_offset == iov->iov_len) { 2518 bio->iovpos++; 2519 bio->iov_offset = 0; 2520 } 2521 2522 return 0; 2523 } 2524 2525 static void 2526 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 2527 { 2528 struct nvme_bdev_io *bio = ref; 2529 struct iovec *iov; 2530 2531 bio->fused_iov_offset = sgl_offset; 2532 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 2533 iov = &bio->fused_iovs[bio->fused_iovpos]; 2534 if (bio->fused_iov_offset < iov->iov_len) { 2535 break; 2536 } 2537 2538 bio->fused_iov_offset -= iov->iov_len; 2539 } 2540 } 2541 2542 static int 2543 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 2544 { 2545 struct nvme_bdev_io *bio = ref; 2546 struct iovec 
*iov;
2547
2548 assert(bio->fused_iovpos < bio->fused_iovcnt);
2549
2550 iov = &bio->fused_iovs[bio->fused_iovpos];
2551
2552 *address = iov->iov_base;
2553 *length = iov->iov_len;
2554
2555 if (bio->fused_iov_offset) {
2556 assert(bio->fused_iov_offset <= iov->iov_len);
2557 *address += bio->fused_iov_offset;
2558 *length -= bio->fused_iov_offset;
2559 }
2560
2561 bio->fused_iov_offset += *length;
2562 if (bio->fused_iov_offset == iov->iov_len) {
2563 bio->fused_iovpos++;
2564 bio->fused_iov_offset = 0;
2565 }
2566
2567 return 0;
2568 }
2569
2570 static int
2571 bdev_nvme_no_pi_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2572 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2573 void *md, uint64_t lba_count, uint64_t lba)
2574 {
2575 int rc;
2576
2577 SPDK_DEBUGLOG(bdev_nvme, "read %lu blocks with offset %#lx without PI check\n",
2578 lba_count, lba);
2579
2580 bio->iovs = iov;
2581 bio->iovcnt = iovcnt;
2582 bio->iovpos = 0;
2583 bio->iov_offset = 0;
2584
2585 rc = spdk_nvme_ns_cmd_readv_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2586 bdev_nvme_no_pi_readv_done, bio, 0,
2587 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2588 md, 0, 0);
2589
2590 if (rc != 0 && rc != -ENOMEM) {
2591 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2592 }
2593 return rc;
2594 }
2595
2596 static int
2597 bdev_nvme_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2598 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2599 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2600 {
2601 int rc;
2602
2603 SPDK_DEBUGLOG(bdev_nvme, "read %lu blocks with offset %#lx\n",
2604 lba_count, lba);
2605
2606 bio->iovs = iov;
2607 bio->iovcnt = iovcnt;
2608 bio->iovpos = 0;
2609 bio->iov_offset = 0;
2610
2611 if (iovcnt == 1) {
2612 rc = spdk_nvme_ns_cmd_read_with_md(nvme_ns->ns, nvme_ch->qpair, iov[0].iov_base, md, lba,
2613 lba_count,
2614 bdev_nvme_readv_done, bio,
2615 flags,
2616 0, 0);
2617 } else {
2618 rc = spdk_nvme_ns_cmd_readv_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2619 bdev_nvme_readv_done, bio, flags,
2620 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2621 md, 0, 0);
2622 }
2623
2624 if (rc != 0 && rc != -ENOMEM) {
2625 SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2626 }
2627 return rc;
2628 }
2629
2630 static int
2631 bdev_nvme_writev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2632 struct nvme_bdev_io *bio,
2633 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2634 uint32_t flags)
2635 {
2636 int rc;
2637
2638 SPDK_DEBUGLOG(bdev_nvme, "write %lu blocks with offset %#lx\n",
2639 lba_count, lba);
2640
2641 bio->iovs = iov;
2642 bio->iovcnt = iovcnt;
2643 bio->iovpos = 0;
2644 bio->iov_offset = 0;
2645
2646 if (iovcnt == 1) {
2647 rc = spdk_nvme_ns_cmd_write_with_md(nvme_ns->ns, nvme_ch->qpair, iov[0].iov_base, md, lba,
2648 lba_count,
2649 bdev_nvme_writev_done, bio,
2650 flags,
2651 0, 0);
2652 } else {
2653 rc = spdk_nvme_ns_cmd_writev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2654 bdev_nvme_writev_done, bio, flags,
2655 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2656 md, 0, 0);
2657 }
2658
2659 if (rc != 0 && rc != -ENOMEM) {
2660 SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2661 }
2662 return rc;
2663 }
2664
2665 static int
2666 bdev_nvme_comparev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2667 struct nvme_bdev_io *bio,
2668 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2669 uint32_t
flags) 2670 { 2671 int rc; 2672 2673 SPDK_DEBUGLOG(bdev_nvme, "compare %lu blocks with offset %#lx\n", 2674 lba_count, lba); 2675 2676 bio->iovs = iov; 2677 bio->iovcnt = iovcnt; 2678 bio->iovpos = 0; 2679 bio->iov_offset = 0; 2680 2681 rc = spdk_nvme_ns_cmd_comparev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count, 2682 bdev_nvme_comparev_done, bio, flags, 2683 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2684 md, 0, 0); 2685 2686 if (rc != 0 && rc != -ENOMEM) { 2687 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 2688 } 2689 return rc; 2690 } 2691 2692 static int 2693 bdev_nvme_comparev_and_writev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch, 2694 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 2695 struct iovec *write_iov, int write_iovcnt, 2696 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2697 { 2698 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2699 int rc; 2700 2701 SPDK_DEBUGLOG(bdev_nvme, "compare and write %lu blocks with offset %#lx\n", 2702 lba_count, lba); 2703 2704 bio->iovs = cmp_iov; 2705 bio->iovcnt = cmp_iovcnt; 2706 bio->iovpos = 0; 2707 bio->iov_offset = 0; 2708 bio->fused_iovs = write_iov; 2709 bio->fused_iovcnt = write_iovcnt; 2710 bio->fused_iovpos = 0; 2711 bio->fused_iov_offset = 0; 2712 2713 if (bdev_io->num_retries == 0) { 2714 bio->first_fused_submitted = false; 2715 } 2716 2717 if (!bio->first_fused_submitted) { 2718 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 2719 memset(&bio->cpl, 0, sizeof(bio->cpl)); 2720 2721 rc = spdk_nvme_ns_cmd_comparev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count, 2722 bdev_nvme_comparev_and_writev_done, bio, flags, 2723 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 2724 if (rc == 0) { 2725 bio->first_fused_submitted = true; 2726 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 2727 } else { 2728 if (rc != -ENOMEM) { 2729 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 2730 } 2731 return rc; 2732 } 2733 } 2734 2735 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 2736 2737 rc = spdk_nvme_ns_cmd_writev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count, 2738 bdev_nvme_comparev_and_writev_done, bio, flags, 2739 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 2740 if (rc != 0 && rc != -ENOMEM) { 2741 SPDK_ERRLOG("write failed: rc = %d\n", rc); 2742 rc = 0; 2743 } 2744 2745 return rc; 2746 } 2747 2748 static int 2749 bdev_nvme_unmap(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch, 2750 struct nvme_bdev_io *bio, 2751 uint64_t offset_blocks, 2752 uint64_t num_blocks) 2753 { 2754 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 2755 struct spdk_nvme_dsm_range *range; 2756 uint64_t offset, remaining; 2757 uint64_t num_ranges_u64; 2758 uint16_t num_ranges; 2759 int rc; 2760 2761 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 2762 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2763 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 2764 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 2765 return -EINVAL; 2766 } 2767 num_ranges = (uint16_t)num_ranges_u64; 2768 2769 offset = offset_blocks; 2770 remaining = num_blocks; 2771 range = &dsm_ranges[0]; 2772 2773 /* Fill max-size ranges until the remaining blocks fit into one range */ 2774 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 2775 range->attributes.raw = 0; 2776 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2777 range->starting_lba = 
offset; 2778 2779 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2780 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2781 range++; 2782 } 2783 2784 /* Final range describes the remaining blocks */ 2785 range->attributes.raw = 0; 2786 range->length = remaining; 2787 range->starting_lba = offset; 2788 2789 rc = spdk_nvme_ns_cmd_dataset_management(nvme_ns->ns, nvme_ch->qpair, 2790 SPDK_NVME_DSM_ATTR_DEALLOCATE, 2791 dsm_ranges, num_ranges, 2792 bdev_nvme_queued_done, bio); 2793 2794 return rc; 2795 } 2796 2797 static int 2798 bdev_nvme_admin_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch, 2799 struct nvme_bdev_io *bio, 2800 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2801 { 2802 uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ns->ctrlr->ctrlr); 2803 2804 if (nbytes > max_xfer_size) { 2805 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2806 return -EINVAL; 2807 } 2808 2809 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2810 2811 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ns->ctrlr->ctrlr, cmd, buf, 2812 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 2813 } 2814 2815 static int 2816 bdev_nvme_io_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch, 2817 struct nvme_bdev_io *bio, 2818 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2819 { 2820 uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ns->ctrlr->ctrlr); 2821 2822 if (nbytes > max_xfer_size) { 2823 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2824 return -EINVAL; 2825 } 2826 2827 /* 2828 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2829 * so fill it out automatically. 2830 */ 2831 cmd->nsid = spdk_nvme_ns_get_id(nvme_ns->ns); 2832 2833 return spdk_nvme_ctrlr_cmd_io_raw(nvme_ns->ctrlr->ctrlr, nvme_ch->qpair, cmd, buf, 2834 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 2835 } 2836 2837 static int 2838 bdev_nvme_io_passthru_md(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch, 2839 struct nvme_bdev_io *bio, 2840 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 2841 { 2842 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(nvme_ns->ns); 2843 uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ns->ctrlr->ctrlr); 2844 2845 if (nbytes > max_xfer_size) { 2846 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2847 return -EINVAL; 2848 } 2849 2850 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(nvme_ns->ns)) { 2851 SPDK_ERRLOG("invalid meta data buffer size\n"); 2852 return -EINVAL; 2853 } 2854 2855 /* 2856 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2857 * so fill it out automatically. 
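* Any nsid the caller placed in the passthru command is overwritten with the ID of the namespace that backs this bdev.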
2858 */ 2859 cmd->nsid = spdk_nvme_ns_get_id(nvme_ns->ns); 2860 2861 return spdk_nvme_ctrlr_cmd_io_raw_with_md(nvme_ns->ctrlr->ctrlr, nvme_ch->qpair, cmd, buf, 2862 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 2863 } 2864 2865 static void 2866 bdev_nvme_abort_admin_cmd(void *ctx) 2867 { 2868 struct nvme_bdev_io *bio = ctx; 2869 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2870 struct nvme_bdev *nbdev; 2871 struct nvme_bdev_io *bio_to_abort; 2872 int rc; 2873 2874 nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2875 bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2876 2877 rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_ns->ctrlr->ctrlr, 2878 NULL, 2879 bio_to_abort, 2880 bdev_nvme_abort_done, bio); 2881 if (rc == -ENOENT) { 2882 /* If no admin command was found in admin qpair, complete the abort 2883 * request with failure. 2884 */ 2885 bio->cpl.cdw0 |= 1U; 2886 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 2887 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 2888 2889 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2890 } 2891 } 2892 2893 static int 2894 bdev_nvme_abort(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch, 2895 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort) 2896 { 2897 int rc; 2898 2899 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2900 2901 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ns->ctrlr->ctrlr, 2902 nvme_ch->qpair, 2903 bio_to_abort, 2904 bdev_nvme_abort_done, bio); 2905 if (rc == -ENOENT) { 2906 /* If no command was found in I/O qpair, the target command may be 2907 * admin command. Only a single thread tries aborting admin command 2908 * to clean I/O flow. 2909 */ 2910 spdk_thread_send_msg(nvme_ns->ctrlr->thread, 2911 bdev_nvme_abort_admin_cmd, bio); 2912 rc = 0; 2913 } 2914 2915 return rc; 2916 } 2917 2918 static void 2919 bdev_nvme_get_spdk_running_config(FILE *fp) 2920 { 2921 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2922 2923 fprintf(fp, "\n[Nvme]"); 2924 fprintf(fp, "\n" 2925 "# NVMe Device Whitelist\n" 2926 "# Users may specify which NVMe devices to claim by their transport id.\n" 2927 "# See spdk_nvme_transport_id_parse() in spdk/nvme.h for the correct format.\n" 2928 "# The second argument is the assigned name, which can be referenced from\n" 2929 "# other sections in the configuration file. 
For NVMe devices, a namespace\n"
2930 "# is automatically appended to each name in the format <YourName>nY, where\n"
2931 "# Y is the NSID (starts at 1).\n");
2932
2933 TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2934 const char *trtype;
2935 const char *prchk_flags;
2936
2937 trtype = spdk_nvme_transport_id_trtype_str(nvme_bdev_ctrlr->connected_trid->trtype);
2938 if (!trtype) {
2939 continue;
2940 }
2941
2942 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2943 fprintf(fp, "TransportID \"trtype:%s traddr:%s\" %s\n",
2944 trtype,
2945 nvme_bdev_ctrlr->connected_trid->traddr, nvme_bdev_ctrlr->name);
2946 } else {
2947 const char *adrfam;
2948
2949 adrfam = spdk_nvme_transport_id_adrfam_str(nvme_bdev_ctrlr->connected_trid->adrfam);
2950 prchk_flags = spdk_nvme_prchk_flags_str(nvme_bdev_ctrlr->prchk_flags);
2951
2952 if (adrfam) {
2953 fprintf(fp, "TransportID \"trtype:%s adrfam:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
2954 trtype, adrfam,
2955 nvme_bdev_ctrlr->connected_trid->traddr, nvme_bdev_ctrlr->connected_trid->trsvcid,
2956 nvme_bdev_ctrlr->connected_trid->subnqn, nvme_bdev_ctrlr->name);
2957 } else {
2958 fprintf(fp, "TransportID \"trtype:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
2959 trtype,
2960 nvme_bdev_ctrlr->connected_trid->traddr, nvme_bdev_ctrlr->connected_trid->trsvcid,
2961 nvme_bdev_ctrlr->connected_trid->subnqn, nvme_bdev_ctrlr->name);
2962 }
2963
2964 if (prchk_flags) {
2965 fprintf(fp, " \"%s\"\n", prchk_flags);
2966 } else {
2967 fprintf(fp, "\n");
2968 }
2969 }
2970 }
2971
2972 fprintf(fp, "\n"
2973 "# The number of attempts per I/O when an I/O fails. Do not include\n"
2974 "# this key to get the default behavior.\n");
2975 fprintf(fp, "RetryCount %d\n", g_opts.retry_count);
2976 fprintf(fp, "\n"
2977 "# Timeout for each command, in microseconds. If 0, don't track timeouts.\n");
2978 fprintf(fp, "TimeoutUsec %"PRIu64"\n", g_opts.timeout_us);
2979
2980 fprintf(fp, "\n"
2981 "# Action to take on command time out. Only valid when Timeout is greater\n"
2982 "# than 0. This may be 'Reset' to reset the controller, 'Abort' to abort\n"
2983 "# the command, or 'None' to just print a message but do nothing.\n"
2984 "# Admin command timeouts will always result in a reset.\n");
2985 switch (g_opts.action_on_timeout) {
2986 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
2987 fprintf(fp, "ActionOnTimeout None\n");
2988 break;
2989 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
2990 fprintf(fp, "ActionOnTimeout Reset\n");
2991 break;
2992 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
2993 fprintf(fp, "ActionOnTimeout Abort\n");
2994 break;
2995 }
2996
2997 fprintf(fp, "\n"
2998 "# Set how often the admin queue is polled for asynchronous events.\n"
2999 "# Units in microseconds.\n");
3000 fprintf(fp, "AdminPollRate %"PRIu64"\n", g_opts.nvme_adminq_poll_period_us);
3001 fprintf(fp, "IOPollRate %" PRIu64"\n", g_opts.nvme_ioq_poll_period_us);
3002 fprintf(fp, "\n"
3003 "# Handling of hotplug (runtime insert and remove) events is disabled by default;\n"
3004 "# users can set this to Yes if they want to enable it.\n"
3005 "# Default: No\n");
3006 fprintf(fp, "HotplugEnable %s\n", g_nvme_hotplug_enabled ? "Yes" : "No");
3007 fprintf(fp, "\n"
3008 "# Set how often the hotplug is processed for insert and remove events.\n"
3009 "# Units in microseconds.\n");
3010 fprintf(fp, "HotplugPollRate %"PRIu64"\n", g_nvme_hotplug_poll_period_us);
3011 if (g_nvme_hostnqn) {
3012 fprintf(fp, "HostNQN %s\n", g_nvme_hostnqn);
3013 }
3014 fprintf(fp, "DelayCmdSubmit %s\n", g_opts.delay_cmd_submit ?
"True" : "False"); 3015 3016 fprintf(fp, "\n"); 3017 } 3018 3019 static void 3020 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns) 3021 { 3022 /* nop */ 3023 } 3024 3025 static void 3026 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns) 3027 { 3028 g_config_json_namespace_fn[ns->type](w, ns); 3029 } 3030 3031 static int 3032 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 3033 { 3034 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 3035 struct spdk_nvme_transport_id *trid; 3036 const char *action; 3037 uint32_t nsid; 3038 3039 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 3040 action = "reset"; 3041 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 3042 action = "abort"; 3043 } else { 3044 action = "none"; 3045 } 3046 3047 spdk_json_write_object_begin(w); 3048 3049 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 3050 3051 spdk_json_write_named_object_begin(w, "params"); 3052 spdk_json_write_named_string(w, "action_on_timeout", action); 3053 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 3054 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 3055 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 3056 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 3057 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 3058 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 3059 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 3060 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 3061 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 3062 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 3063 spdk_json_write_object_end(w); 3064 3065 spdk_json_write_object_end(w); 3066 3067 pthread_mutex_lock(&g_bdev_nvme_mutex); 3068 TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 3069 trid = nvme_bdev_ctrlr->connected_trid; 3070 3071 spdk_json_write_object_begin(w); 3072 3073 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 3074 3075 spdk_json_write_named_object_begin(w, "params"); 3076 spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); 3077 nvme_bdev_dump_trid_json(trid, w); 3078 spdk_json_write_named_bool(w, "prchk_reftag", 3079 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 3080 spdk_json_write_named_bool(w, "prchk_guard", 3081 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 3082 3083 spdk_json_write_object_end(w); 3084 3085 spdk_json_write_object_end(w); 3086 3087 for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { 3088 if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) { 3089 continue; 3090 } 3091 3092 nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]); 3093 } 3094 } 3095 3096 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 3097 * before enabling hotplug poller. 
3098 */ 3099 spdk_json_write_object_begin(w); 3100 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 3101 3102 spdk_json_write_named_object_begin(w, "params"); 3103 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 3104 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 3105 spdk_json_write_object_end(w); 3106 3107 spdk_json_write_object_end(w); 3108 3109 pthread_mutex_unlock(&g_bdev_nvme_mutex); 3110 return 0; 3111 } 3112 3113 struct spdk_nvme_ctrlr * 3114 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 3115 { 3116 if (!bdev || bdev->module != &nvme_if) { 3117 return NULL; 3118 } 3119 3120 return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr; 3121 } 3122 3123 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 3124
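/*
 * Illustrative sketch (not compiled): roughly how the bdev_nvme_attach_controller
 * RPC drives bdev_nvme_create() for a fabrics target. The transport string, the
 * "Nvme0" base name, and the attach_done_cb completion callback are hypothetical
 * placeholders, not part of this driver.
 *
 *	struct spdk_nvme_transport_id trid = {};
 *	struct spdk_nvme_host_id hostid = {};
 *	const char *names[NVME_MAX_CONTROLLERS];
 *
 *	spdk_nvme_transport_id_parse(&trid,
 *		"trtype:TCP adrfam:IPv4 traddr:192.168.0.10 trsvcid:4420 subnqn:nqn.2016-06.io.spdk:cnode1");
 *	bdev_nvme_create(&trid, &hostid, "Nvme0", names, SPDK_COUNTOF(names),
 *			 NULL, 0, attach_done_cb, NULL);
 *
 * Calling bdev_nvme_create() again with the same base name but a different
 * transport ID does not attach a second controller; the new path is handed to
 * bdev_nvme_add_trid() and registered as a failover target for the existing
 * controller.
 */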