1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2017, IBM Corporation. All rights reserved. 6 * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * * Neither the name of Intel Corporation nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 /* 36 * NVMe over PCIe transport 37 */ 38 39 #include "spdk/stdinc.h" 40 #include "spdk/env.h" 41 #include "spdk/likely.h" 42 #include "spdk/string.h" 43 #include "nvme_internal.h" 44 #include "nvme_uevent.h" 45 46 /* 47 * Number of completion queue entries to process before ringing the 48 * completion queue doorbell. 49 */ 50 #define NVME_MIN_COMPLETIONS (1) 51 #define NVME_MAX_COMPLETIONS (128) 52 53 /* 54 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL 55 * segment. 56 */ 57 #define NVME_MAX_SGL_DESCRIPTORS (250) 58 59 #define NVME_MAX_PRP_LIST_ENTRIES (503) 60 61 struct nvme_pcie_enum_ctx { 62 struct spdk_nvme_probe_ctx *probe_ctx; 63 struct spdk_pci_addr pci_addr; 64 bool has_pci_addr; 65 }; 66 67 /* PCIe transport extensions for spdk_nvme_ctrlr */ 68 struct nvme_pcie_ctrlr { 69 struct spdk_nvme_ctrlr ctrlr; 70 71 /** NVMe MMIO register space */ 72 volatile struct spdk_nvme_registers *regs; 73 74 /** NVMe MMIO register size */ 75 uint64_t regs_size; 76 77 struct { 78 /* BAR mapping address which contains controller memory buffer */ 79 void *bar_va; 80 81 /* BAR physical address which contains controller memory buffer */ 82 uint64_t bar_pa; 83 84 /* Controller memory buffer size in Bytes */ 85 uint64_t size; 86 87 /* Current offset of controller memory buffer, relative to start of BAR virt addr */ 88 uint64_t current_offset; 89 90 void *mem_register_addr; 91 size_t mem_register_size; 92 } cmb; 93 94 /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */ 95 uint32_t doorbell_stride_u32; 96 97 /* Opaque handle to associated PCI device. 
*/ 98 struct spdk_pci_device *devhandle; 99 100 /* Flag to indicate the MMIO register has been remapped */ 101 bool is_remapped; 102 }; 103 104 struct nvme_tracker { 105 TAILQ_ENTRY(nvme_tracker) tq_list; 106 107 struct nvme_request *req; 108 uint16_t cid; 109 110 uint16_t rsvd0; 111 uint32_t rsvd1; 112 113 spdk_nvme_cmd_cb cb_fn; 114 void *cb_arg; 115 116 uint64_t prp_sgl_bus_addr; 117 118 /* Don't move, metadata SGL is always contiguous with Data Block SGL */ 119 struct spdk_nvme_sgl_descriptor meta_sgl; 120 union { 121 uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES]; 122 struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS]; 123 } u; 124 }; 125 /* 126 * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary 127 * and so that there is no padding required to meet alignment requirements. 128 */ 129 SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K"); 130 SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned"); 131 SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, meta_sgl) & 7) == 0, "SGL must be Qword aligned"); 132 133 struct nvme_pcie_poll_group { 134 struct spdk_nvme_transport_poll_group group; 135 }; 136 137 /* PCIe transport extensions for spdk_nvme_qpair */ 138 struct nvme_pcie_qpair { 139 /* Submission queue tail doorbell */ 140 volatile uint32_t *sq_tdbl; 141 142 /* Completion queue head doorbell */ 143 volatile uint32_t *cq_hdbl; 144 145 /* Submission queue */ 146 struct spdk_nvme_cmd *cmd; 147 148 /* Completion queue */ 149 struct spdk_nvme_cpl *cpl; 150 151 TAILQ_HEAD(, nvme_tracker) free_tr; 152 TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr; 153 154 /* Array of trackers indexed by command ID. */ 155 struct nvme_tracker *tr; 156 157 uint16_t num_entries; 158 159 uint8_t retry_count; 160 161 uint16_t max_completions_cap; 162 163 uint16_t last_sq_tail; 164 uint16_t sq_tail; 165 uint16_t cq_head; 166 uint16_t sq_head; 167 168 struct { 169 uint8_t phase : 1; 170 uint8_t delay_cmd_submit : 1; 171 uint8_t has_shadow_doorbell : 1; 172 } flags; 173 174 /* 175 * Base qpair structure. 176 * This is located after the hot data in this structure so that the important parts of 177 * nvme_pcie_qpair are in the same cache line. 178 */ 179 struct spdk_nvme_qpair qpair; 180 181 struct { 182 /* Submission queue shadow tail doorbell */ 183 volatile uint32_t *sq_tdbl; 184 185 /* Completion queue shadow head doorbell */ 186 volatile uint32_t *cq_hdbl; 187 188 /* Submission queue event index */ 189 volatile uint32_t *sq_eventidx; 190 191 /* Completion queue event index */ 192 volatile uint32_t *cq_eventidx; 193 } shadow_doorbell; 194 195 /* 196 * Fields below this point should not be touched on the normal I/O path. 
197 */ 198 199 bool sq_in_cmb; 200 201 uint64_t cmd_bus_addr; 202 uint64_t cpl_bus_addr; 203 204 struct spdk_nvme_cmd *sq_vaddr; 205 struct spdk_nvme_cpl *cq_vaddr; 206 }; 207 208 static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, 209 struct spdk_pci_addr *pci_addr); 210 static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair, 211 const struct spdk_nvme_io_qpair_opts *opts); 212 static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair); 213 214 __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL; 215 static uint16_t g_signal_lock; 216 static bool g_sigset = false; 217 static int g_hotplug_fd = -1; 218 219 static void 220 nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx) 221 { 222 void *map_address; 223 uint16_t flag = 0; 224 225 if (!__atomic_compare_exchange_n(&g_signal_lock, &flag, 1, false, __ATOMIC_ACQUIRE, 226 __ATOMIC_RELAXED)) { 227 SPDK_DEBUGLOG(SPDK_LOG_NVME, "request g_signal_lock failed\n"); 228 return; 229 } 230 231 assert(g_thread_mmio_ctrlr != NULL); 232 233 if (!g_thread_mmio_ctrlr->is_remapped) { 234 map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size, 235 PROT_READ | PROT_WRITE, 236 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); 237 if (map_address == MAP_FAILED) { 238 SPDK_ERRLOG("mmap failed\n"); 239 __atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE); 240 return; 241 } 242 memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers)); 243 g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address; 244 g_thread_mmio_ctrlr->is_remapped = true; 245 } 246 __atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE); 247 } 248 249 static void 250 nvme_pcie_ctrlr_setup_signal(void) 251 { 252 struct sigaction sa; 253 254 sa.sa_sigaction = nvme_sigbus_fault_sighandler; 255 sigemptyset(&sa.sa_mask); 256 sa.sa_flags = SA_SIGINFO; 257 sigaction(SIGBUS, &sa, NULL); 258 } 259 260 static inline struct nvme_pcie_ctrlr * 261 nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr) 262 { 263 assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE); 264 return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr); 265 } 266 267 static int 268 _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx) 269 { 270 struct spdk_nvme_ctrlr *ctrlr, *tmp; 271 struct spdk_uevent event; 272 struct spdk_pci_addr pci_addr; 273 union spdk_nvme_csts_register csts; 274 struct spdk_nvme_ctrlr_process *proc; 275 276 while (spdk_get_uevent(g_hotplug_fd, &event) > 0) { 277 if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO || 278 event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) { 279 if (event.action == SPDK_NVME_UEVENT_ADD) { 280 SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n", 281 event.traddr); 282 if (spdk_process_is_primary()) { 283 if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) { 284 nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr); 285 } 286 } 287 } else if (event.action == SPDK_NVME_UEVENT_REMOVE) { 288 struct spdk_nvme_transport_id trid; 289 290 memset(&trid, 0, sizeof(trid)); 291 spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE); 292 snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr); 293 294 ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid); 295 if (ctrlr == NULL) { 296 return 0; 297 } 298 SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n", 299 event.traddr); 300 301 nvme_ctrlr_fail(ctrlr, true); 302 303 /* get the user app to clean up and stop I/O */ 304 if (ctrlr->remove_cb) { 305 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); 306 
ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr); 307 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); 308 } 309 } 310 } 311 } 312 313 /* This is a work around for vfio-attached device hot remove detection. */ 314 TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) { 315 bool do_remove = false; 316 317 if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 318 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 319 320 if (spdk_pci_device_is_removed(pctrlr->devhandle)) { 321 do_remove = true; 322 } 323 } 324 325 /* NVMe controller BAR must be mapped in the current process before any access. */ 326 proc = spdk_nvme_ctrlr_get_current_process(ctrlr); 327 if (proc) { 328 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 329 if (csts.raw == 0xffffffffU) { 330 do_remove = true; 331 } 332 } 333 334 if (do_remove) { 335 nvme_ctrlr_fail(ctrlr, true); 336 if (ctrlr->remove_cb) { 337 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); 338 ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr); 339 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); 340 } 341 } 342 } 343 return 0; 344 } 345 346 static inline struct nvme_pcie_qpair * 347 nvme_pcie_qpair(struct spdk_nvme_qpair *qpair) 348 { 349 assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE); 350 return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair); 351 } 352 353 static volatile void * 354 nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset) 355 { 356 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 357 358 return (volatile void *)((uintptr_t)pctrlr->regs + offset); 359 } 360 361 static int 362 nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) 363 { 364 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 365 366 assert(offset <= sizeof(struct spdk_nvme_registers) - 4); 367 g_thread_mmio_ctrlr = pctrlr; 368 spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value); 369 g_thread_mmio_ctrlr = NULL; 370 return 0; 371 } 372 373 static int 374 nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) 375 { 376 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 377 378 assert(offset <= sizeof(struct spdk_nvme_registers) - 8); 379 g_thread_mmio_ctrlr = pctrlr; 380 spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value); 381 g_thread_mmio_ctrlr = NULL; 382 return 0; 383 } 384 385 static int 386 nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) 387 { 388 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 389 390 assert(offset <= sizeof(struct spdk_nvme_registers) - 4); 391 assert(value != NULL); 392 g_thread_mmio_ctrlr = pctrlr; 393 *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset)); 394 g_thread_mmio_ctrlr = NULL; 395 if (~(*value) == 0) { 396 return -1; 397 } 398 399 return 0; 400 } 401 402 static int 403 nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) 404 { 405 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 406 407 assert(offset <= sizeof(struct spdk_nvme_registers) - 8); 408 assert(value != NULL); 409 g_thread_mmio_ctrlr = pctrlr; 410 *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset)); 411 g_thread_mmio_ctrlr = NULL; 412 if (~(*value) == 0) { 413 return -1; 414 } 415 416 return 0; 417 } 418 419 static int 420 nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) 421 { 422 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq), 423 value); 424 } 425 426 
static int 427 nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) 428 { 429 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq), 430 value); 431 } 432 433 static int 434 nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa) 435 { 436 return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw), 437 aqa->raw); 438 } 439 440 static int 441 nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc) 442 { 443 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw), 444 &cmbloc->raw); 445 } 446 447 static int 448 nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz) 449 { 450 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw), 451 &cmbsz->raw); 452 } 453 454 static uint32_t 455 nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) 456 { 457 /* 458 * For commands requiring more than 2 PRP entries, one PRP will be 459 * embedded in the command (prp1), and the rest of the PRP entries 460 * will be in a list pointed to by the command (prp2). This means 461 * that real max number of PRP entries we support is 506+1, which 462 * results in a max xfer size of 506*ctrlr->page_size. 463 */ 464 return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size; 465 } 466 467 static uint16_t 468 nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) 469 { 470 return NVME_MAX_SGL_DESCRIPTORS; 471 } 472 473 static void 474 nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr) 475 { 476 int rc; 477 void *addr; 478 uint32_t bir; 479 union spdk_nvme_cmbsz_register cmbsz; 480 union spdk_nvme_cmbloc_register cmbloc; 481 uint64_t size, unit_size, offset, bar_size, bar_phys_addr; 482 483 if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || 484 nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { 485 SPDK_ERRLOG("get registers failed\n"); 486 goto exit; 487 } 488 489 if (!cmbsz.bits.sz) { 490 goto exit; 491 } 492 493 bir = cmbloc.bits.bir; 494 /* Values 0 2 3 4 5 are valid for BAR */ 495 if (bir > 5 || bir == 1) { 496 goto exit; 497 } 498 499 /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */ 500 unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu); 501 /* controller memory buffer size in Bytes */ 502 size = unit_size * cmbsz.bits.sz; 503 /* controller memory buffer offset from BAR in Bytes */ 504 offset = unit_size * cmbloc.bits.ofst; 505 506 rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr, 507 &bar_phys_addr, &bar_size); 508 if ((rc != 0) || addr == NULL) { 509 goto exit; 510 } 511 512 if (offset > bar_size) { 513 goto exit; 514 } 515 516 if (size > bar_size - offset) { 517 goto exit; 518 } 519 520 pctrlr->cmb.bar_va = addr; 521 pctrlr->cmb.bar_pa = bar_phys_addr; 522 pctrlr->cmb.size = size; 523 pctrlr->cmb.current_offset = offset; 524 525 if (!cmbsz.bits.sqs) { 526 pctrlr->ctrlr.opts.use_cmb_sqs = false; 527 } 528 529 return; 530 exit: 531 pctrlr->ctrlr.opts.use_cmb_sqs = false; 532 return; 533 } 534 535 static int 536 nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr) 537 { 538 int rc = 0; 539 union spdk_nvme_cmbloc_register cmbloc; 540 void *addr = pctrlr->cmb.bar_va; 541 542 if (addr) { 543 if (pctrlr->cmb.mem_register_addr) { 544 spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size); 545 } 546 547 if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { 548 
SPDK_ERRLOG("get_cmbloc() failed\n"); 549 return -EIO; 550 } 551 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr); 552 } 553 return rc; 554 } 555 556 static int 557 nvme_pcie_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr) 558 { 559 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 560 561 if (pctrlr->cmb.bar_va == NULL) { 562 SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); 563 return -ENOTSUP; 564 } 565 566 if (ctrlr->opts.use_cmb_sqs) { 567 SPDK_ERRLOG("CMB is already in use for submission queues.\n"); 568 return -ENOTSUP; 569 } 570 571 return 0; 572 } 573 574 static void * 575 nvme_pcie_ctrlr_map_io_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size) 576 { 577 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 578 union spdk_nvme_cmbsz_register cmbsz; 579 union spdk_nvme_cmbloc_register cmbloc; 580 uint64_t mem_register_start, mem_register_end; 581 int rc; 582 583 if (pctrlr->cmb.mem_register_addr != NULL) { 584 *size = pctrlr->cmb.mem_register_size; 585 return pctrlr->cmb.mem_register_addr; 586 } 587 588 *size = 0; 589 590 if (pctrlr->cmb.bar_va == NULL) { 591 SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); 592 return NULL; 593 } 594 595 if (ctrlr->opts.use_cmb_sqs) { 596 SPDK_ERRLOG("CMB is already in use for submission queues.\n"); 597 return NULL; 598 } 599 600 if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || 601 nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { 602 SPDK_ERRLOG("get registers failed\n"); 603 return NULL; 604 } 605 606 /* If only SQS is supported */ 607 if (!(cmbsz.bits.wds || cmbsz.bits.rds)) { 608 return NULL; 609 } 610 611 /* If CMB is less than 4MiB in size then abort CMB mapping */ 612 if (pctrlr->cmb.size < (1ULL << 22)) { 613 return NULL; 614 } 615 616 mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset + 617 VALUE_2MB - 1); 618 mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset + 619 pctrlr->cmb.size); 620 pctrlr->cmb.mem_register_addr = (void *)mem_register_start; 621 pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start; 622 623 rc = spdk_mem_register((void *)mem_register_start, mem_register_end - mem_register_start); 624 if (rc) { 625 SPDK_ERRLOG("spdk_mem_register() failed\n"); 626 return NULL; 627 } 628 629 pctrlr->cmb.mem_register_addr = (void *)mem_register_start; 630 pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start; 631 632 *size = pctrlr->cmb.mem_register_size; 633 return pctrlr->cmb.mem_register_addr; 634 } 635 636 static int 637 nvme_pcie_ctrlr_unmap_io_cmb(struct spdk_nvme_ctrlr *ctrlr) 638 { 639 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 640 int rc; 641 642 if (pctrlr->cmb.mem_register_addr == NULL) { 643 return 0; 644 } 645 646 rc = spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size); 647 648 if (rc == 0) { 649 pctrlr->cmb.mem_register_addr = NULL; 650 pctrlr->cmb.mem_register_size = 0; 651 } 652 653 return rc; 654 } 655 656 static int 657 nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr) 658 { 659 int rc; 660 void *addr; 661 uint64_t phys_addr, size; 662 663 rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr, 664 &phys_addr, &size); 665 pctrlr->regs = (volatile struct spdk_nvme_registers *)addr; 666 if ((pctrlr->regs == NULL) || (rc != 0)) { 667 SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n", 668 rc, pctrlr->regs); 669 return -1; 670 } 671 672 pctrlr->regs_size = size; 673 nvme_pcie_ctrlr_map_cmb(pctrlr); 674 675 
return 0; 676 } 677 678 static int 679 nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr) 680 { 681 int rc = 0; 682 void *addr = (void *)pctrlr->regs; 683 684 if (pctrlr->ctrlr.is_removed) { 685 return rc; 686 } 687 688 rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr); 689 if (rc != 0) { 690 SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc); 691 return -1; 692 } 693 694 if (addr) { 695 /* NOTE: addr may have been remapped here. We're relying on DPDK to call 696 * munmap internally. 697 */ 698 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr); 699 } 700 return rc; 701 } 702 703 static int 704 nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries) 705 { 706 struct nvme_pcie_qpair *pqpair; 707 int rc; 708 709 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 710 if (pqpair == NULL) { 711 return -ENOMEM; 712 } 713 714 pqpair->num_entries = num_entries; 715 pqpair->flags.delay_cmd_submit = 0; 716 717 ctrlr->adminq = &pqpair->qpair; 718 719 rc = nvme_qpair_init(ctrlr->adminq, 720 0, /* qpair ID */ 721 ctrlr, 722 SPDK_NVME_QPRIO_URGENT, 723 num_entries); 724 if (rc != 0) { 725 return rc; 726 } 727 728 return nvme_pcie_qpair_construct(ctrlr->adminq, NULL); 729 } 730 731 /* This function must only be called while holding g_spdk_nvme_driver->lock */ 732 static int 733 pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) 734 { 735 struct spdk_nvme_transport_id trid = {}; 736 struct nvme_pcie_enum_ctx *enum_ctx = ctx; 737 struct spdk_nvme_ctrlr *ctrlr; 738 struct spdk_pci_addr pci_addr; 739 740 pci_addr = spdk_pci_device_get_addr(pci_dev); 741 742 spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE); 743 spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr); 744 745 ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid); 746 if (!spdk_process_is_primary()) { 747 if (!ctrlr) { 748 SPDK_ERRLOG("Controller must be constructed in the primary process first.\n"); 749 return -1; 750 } 751 752 return nvme_ctrlr_add_process(ctrlr, pci_dev); 753 } 754 755 /* check whether user passes the pci_addr */ 756 if (enum_ctx->has_pci_addr && 757 (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) { 758 return 1; 759 } 760 761 return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev); 762 } 763 764 static int 765 nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, 766 bool direct_connect) 767 { 768 struct nvme_pcie_enum_ctx enum_ctx = {}; 769 770 enum_ctx.probe_ctx = probe_ctx; 771 772 if (strlen(probe_ctx->trid.traddr) != 0) { 773 if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) { 774 return -1; 775 } 776 enum_ctx.has_pci_addr = true; 777 } 778 779 /* Only the primary process can monitor hotplug. 
*/ 780 if (spdk_process_is_primary()) { 781 if (g_hotplug_fd < 0) { 782 g_hotplug_fd = spdk_uevent_connect(); 783 if (g_hotplug_fd < 0) { 784 SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n"); 785 } 786 } else { 787 _nvme_pcie_hotplug_monitor(probe_ctx); 788 } 789 } 790 791 if (enum_ctx.has_pci_addr == false) { 792 return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), 793 pcie_nvme_enum_cb, &enum_ctx); 794 } else { 795 return spdk_pci_device_attach(spdk_pci_nvme_get_driver(), 796 pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr); 797 } 798 } 799 800 static int 801 nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr) 802 { 803 struct nvme_pcie_enum_ctx enum_ctx; 804 805 enum_ctx.probe_ctx = probe_ctx; 806 enum_ctx.has_pci_addr = true; 807 enum_ctx.pci_addr = *pci_addr; 808 809 return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx); 810 } 811 812 static struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid, 813 const struct spdk_nvme_ctrlr_opts *opts, 814 void *devhandle) 815 { 816 struct spdk_pci_device *pci_dev = devhandle; 817 struct nvme_pcie_ctrlr *pctrlr; 818 union spdk_nvme_cap_register cap; 819 union spdk_nvme_vs_register vs; 820 uint32_t cmd_reg; 821 int rc; 822 struct spdk_pci_id pci_id; 823 824 rc = spdk_pci_device_claim(pci_dev); 825 if (rc < 0) { 826 SPDK_ERRLOG("could not claim device %s (%s)\n", 827 trid->traddr, spdk_strerror(-rc)); 828 return NULL; 829 } 830 831 pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL, 832 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 833 if (pctrlr == NULL) { 834 spdk_pci_device_unclaim(pci_dev); 835 SPDK_ERRLOG("could not allocate ctrlr\n"); 836 return NULL; 837 } 838 839 pctrlr->is_remapped = false; 840 pctrlr->ctrlr.is_removed = false; 841 pctrlr->devhandle = devhandle; 842 pctrlr->ctrlr.opts = *opts; 843 pctrlr->ctrlr.trid = *trid; 844 845 rc = nvme_ctrlr_construct(&pctrlr->ctrlr); 846 if (rc != 0) { 847 spdk_pci_device_unclaim(pci_dev); 848 spdk_free(pctrlr); 849 return NULL; 850 } 851 852 rc = nvme_pcie_ctrlr_allocate_bars(pctrlr); 853 if (rc != 0) { 854 spdk_pci_device_unclaim(pci_dev); 855 spdk_free(pctrlr); 856 return NULL; 857 } 858 859 /* Enable PCI busmaster and disable INTx */ 860 spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4); 861 cmd_reg |= 0x404; 862 spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4); 863 864 if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) { 865 SPDK_ERRLOG("get_cap() failed\n"); 866 spdk_pci_device_unclaim(pci_dev); 867 spdk_free(pctrlr); 868 return NULL; 869 } 870 871 if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) { 872 SPDK_ERRLOG("get_vs() failed\n"); 873 spdk_pci_device_unclaim(pci_dev); 874 spdk_free(pctrlr); 875 return NULL; 876 } 877 878 nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs); 879 880 /* Doorbell stride is 2 ^ (dstrd + 2), 881 * but we want multiples of 4, so drop the + 2 */ 882 pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd; 883 884 pci_id = spdk_pci_device_get_id(pci_dev); 885 pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id); 886 887 rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr, pctrlr->ctrlr.opts.admin_queue_size); 888 if (rc != 0) { 889 nvme_ctrlr_destruct(&pctrlr->ctrlr); 890 return NULL; 891 } 892 893 /* Construct the primary process properties */ 894 rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev); 895 if (rc != 0) { 896 nvme_ctrlr_destruct(&pctrlr->ctrlr); 897 return NULL; 898 } 899 900 if (g_sigset != true) { 901 
nvme_pcie_ctrlr_setup_signal(); 902 g_sigset = true; 903 } 904 905 return &pctrlr->ctrlr; 906 } 907 908 static int 909 nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) 910 { 911 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 912 struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq); 913 union spdk_nvme_aqa_register aqa; 914 915 if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) { 916 SPDK_ERRLOG("set_asq() failed\n"); 917 return -EIO; 918 } 919 920 if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) { 921 SPDK_ERRLOG("set_acq() failed\n"); 922 return -EIO; 923 } 924 925 aqa.raw = 0; 926 /* acqs and asqs are 0-based. */ 927 aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; 928 aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; 929 930 if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) { 931 SPDK_ERRLOG("set_aqa() failed\n"); 932 return -EIO; 933 } 934 935 return 0; 936 } 937 938 static int 939 nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) 940 { 941 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 942 struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr); 943 944 if (ctrlr->adminq) { 945 nvme_pcie_qpair_destroy(ctrlr->adminq); 946 } 947 948 nvme_ctrlr_destruct_finish(ctrlr); 949 950 nvme_ctrlr_free_processes(ctrlr); 951 952 nvme_pcie_ctrlr_free_bars(pctrlr); 953 954 if (devhandle) { 955 spdk_pci_device_unclaim(devhandle); 956 spdk_pci_device_detach(devhandle); 957 } 958 959 spdk_free(pctrlr); 960 961 return 0; 962 } 963 964 static void 965 nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr) 966 { 967 tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp); 968 tr->cid = cid; 969 tr->req = NULL; 970 } 971 972 static int 973 nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair) 974 { 975 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 976 uint32_t i; 977 978 /* all head/tail vals are set to 0 */ 979 pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0; 980 981 /* 982 * First time through the completion queue, HW will set phase 983 * bit on completions to 1. So set this to 1 here, indicating 984 * we're looking for a 1 to know which entries have completed. 985 * we'll toggle the bit each time when the completion queue 986 * rolls over. 
987 */ 988 pqpair->flags.phase = 1; 989 for (i = 0; i < pqpair->num_entries; i++) { 990 pqpair->cpl[i].status.p = 0; 991 } 992 993 return 0; 994 } 995 996 static void * 997 nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t size, uint64_t alignment, 998 uint64_t *phys_addr) 999 { 1000 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 1001 uintptr_t addr; 1002 1003 if (pctrlr->cmb.mem_register_addr != NULL) { 1004 /* BAR is mapped for data */ 1005 return NULL; 1006 } 1007 1008 addr = (uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset; 1009 addr = (addr + (alignment - 1)) & ~(alignment - 1); 1010 1011 /* CMB may only consume part of the BAR, calculate accordingly */ 1012 if (addr + size > ((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.size)) { 1013 SPDK_ERRLOG("Tried to allocate past valid CMB range!\n"); 1014 return NULL; 1015 } 1016 *phys_addr = pctrlr->cmb.bar_pa + addr - (uintptr_t)pctrlr->cmb.bar_va; 1017 1018 pctrlr->cmb.current_offset = (addr + size) - (uintptr_t)pctrlr->cmb.bar_va; 1019 1020 return (void *)addr; 1021 } 1022 1023 static int 1024 nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair, 1025 const struct spdk_nvme_io_qpair_opts *opts) 1026 { 1027 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 1028 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 1029 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1030 struct nvme_tracker *tr; 1031 uint16_t i; 1032 volatile uint32_t *doorbell_base; 1033 uint16_t num_trackers; 1034 size_t page_align = sysconf(_SC_PAGESIZE); 1035 size_t queue_align, queue_len; 1036 uint32_t flags = SPDK_MALLOC_DMA; 1037 uint64_t sq_paddr = 0; 1038 uint64_t cq_paddr = 0; 1039 1040 if (opts) { 1041 pqpair->sq_vaddr = opts->sq.vaddr; 1042 pqpair->cq_vaddr = opts->cq.vaddr; 1043 sq_paddr = opts->sq.paddr; 1044 cq_paddr = opts->cq.paddr; 1045 } 1046 1047 pqpair->retry_count = ctrlr->opts.transport_retry_count; 1048 1049 /* 1050 * Limit the maximum number of completions to return per call to prevent wraparound, 1051 * and calculate how many trackers can be submitted at once without overflowing the 1052 * completion queue. 1053 */ 1054 pqpair->max_completions_cap = pqpair->num_entries / 4; 1055 pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS); 1056 pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS); 1057 num_trackers = pqpair->num_entries - pqpair->max_completions_cap; 1058 1059 SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n", 1060 pqpair->max_completions_cap, num_trackers); 1061 1062 assert(num_trackers != 0); 1063 1064 pqpair->sq_in_cmb = false; 1065 1066 if (nvme_qpair_is_admin_queue(&pqpair->qpair)) { 1067 flags |= SPDK_MALLOC_SHARE; 1068 } 1069 1070 /* cmd and cpl rings must be aligned on page size boundaries. */ 1071 if (ctrlr->opts.use_cmb_sqs) { 1072 pqpair->cmd = nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd), 1073 page_align, &pqpair->cmd_bus_addr); 1074 if (pqpair->cmd != NULL) { 1075 pqpair->sq_in_cmb = true; 1076 } 1077 } 1078 1079 if (pqpair->sq_in_cmb == false) { 1080 if (pqpair->sq_vaddr) { 1081 pqpair->cmd = pqpair->sq_vaddr; 1082 } else { 1083 /* To ensure physical address contiguity we make each ring occupy 1084 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES. 
1085 */ 1086 queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cmd); 1087 queue_align = spdk_max(spdk_align32pow2(queue_len), page_align); 1088 pqpair->cmd = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags); 1089 if (pqpair->cmd == NULL) { 1090 SPDK_ERRLOG("alloc qpair_cmd failed\n"); 1091 return -ENOMEM; 1092 } 1093 } 1094 if (sq_paddr) { 1095 assert(pqpair->sq_vaddr != NULL); 1096 pqpair->cmd_bus_addr = sq_paddr; 1097 } else { 1098 pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL); 1099 if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) { 1100 SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n"); 1101 return -EFAULT; 1102 } 1103 } 1104 } 1105 1106 if (pqpair->cq_vaddr) { 1107 pqpair->cpl = pqpair->cq_vaddr; 1108 } else { 1109 queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cpl); 1110 queue_align = spdk_max(spdk_align32pow2(queue_len), page_align); 1111 pqpair->cpl = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags); 1112 if (pqpair->cpl == NULL) { 1113 SPDK_ERRLOG("alloc qpair_cpl failed\n"); 1114 return -ENOMEM; 1115 } 1116 } 1117 if (cq_paddr) { 1118 assert(pqpair->cq_vaddr != NULL); 1119 pqpair->cpl_bus_addr = cq_paddr; 1120 } else { 1121 pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL); 1122 if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) { 1123 SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n"); 1124 return -EFAULT; 1125 } 1126 } 1127 1128 doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl; 1129 pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32; 1130 pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32; 1131 1132 /* 1133 * Reserve space for all of the trackers in a single allocation. 1134 * struct nvme_tracker must be padded so that its size is already a power of 2. 1135 * This ensures the PRP list embedded in the nvme_tracker object will not span a 1136 * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing. 1137 */ 1138 pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL, 1139 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 1140 if (pqpair->tr == NULL) { 1141 SPDK_ERRLOG("nvme_tr failed\n"); 1142 return -ENOMEM; 1143 } 1144 1145 TAILQ_INIT(&pqpair->free_tr); 1146 TAILQ_INIT(&pqpair->outstanding_tr); 1147 1148 for (i = 0; i < num_trackers; i++) { 1149 tr = &pqpair->tr[i]; 1150 nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL)); 1151 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); 1152 } 1153 1154 nvme_pcie_qpair_reset(qpair); 1155 1156 return 0; 1157 } 1158 1159 static inline void 1160 nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) 1161 { 1162 /* dst and src are known to be non-overlapping and 64-byte aligned. */ 1163 #if defined(__SSE2__) 1164 __m128i *d128 = (__m128i *)dst; 1165 const __m128i *s128 = (const __m128i *)src; 1166 1167 _mm_stream_si128(&d128[0], _mm_load_si128(&s128[0])); 1168 _mm_stream_si128(&d128[1], _mm_load_si128(&s128[1])); 1169 _mm_stream_si128(&d128[2], _mm_load_si128(&s128[2])); 1170 _mm_stream_si128(&d128[3], _mm_load_si128(&s128[3])); 1171 #else 1172 *dst = *src; 1173 #endif 1174 } 1175 1176 /** 1177 * Note: the ctrlr_lock must be held when calling this function. 
1178 */ 1179 static void 1180 nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair, 1181 struct nvme_request *req, struct spdk_nvme_cpl *cpl) 1182 { 1183 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 1184 struct nvme_request *active_req = req; 1185 struct spdk_nvme_ctrlr_process *active_proc; 1186 1187 /* 1188 * The admin request is from another process. Move to the per 1189 * process list for that process to handle it later. 1190 */ 1191 assert(nvme_qpair_is_admin_queue(qpair)); 1192 assert(active_req->pid != getpid()); 1193 1194 active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid); 1195 if (active_proc) { 1196 /* Save the original completion information */ 1197 memcpy(&active_req->cpl, cpl, sizeof(*cpl)); 1198 STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq); 1199 } else { 1200 SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n", 1201 active_req->pid); 1202 1203 nvme_free_request(active_req); 1204 } 1205 } 1206 1207 /** 1208 * Note: the ctrlr_lock must be held when calling this function. 1209 */ 1210 static void 1211 nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair) 1212 { 1213 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 1214 struct nvme_request *req, *tmp_req; 1215 pid_t pid = getpid(); 1216 struct spdk_nvme_ctrlr_process *proc; 1217 1218 /* 1219 * Check whether there is any pending admin request from 1220 * other active processes. 1221 */ 1222 assert(nvme_qpair_is_admin_queue(qpair)); 1223 1224 proc = spdk_nvme_ctrlr_get_current_process(ctrlr); 1225 if (!proc) { 1226 SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid); 1227 assert(proc); 1228 return; 1229 } 1230 1231 STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { 1232 STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); 1233 1234 assert(req->pid == pid); 1235 1236 nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl); 1237 nvme_free_request(req); 1238 } 1239 } 1240 1241 static inline int 1242 nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) 1243 { 1244 return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old); 1245 } 1246 1247 static bool 1248 nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value, 1249 volatile uint32_t *shadow_db, 1250 volatile uint32_t *eventidx) 1251 { 1252 uint16_t old; 1253 1254 if (!shadow_db) { 1255 return true; 1256 } 1257 1258 old = *shadow_db; 1259 *shadow_db = value; 1260 1261 /* 1262 * Ensure that the doorbell is updated before reading the EventIdx from 1263 * memory 1264 */ 1265 spdk_mb(); 1266 1267 if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) { 1268 return false; 1269 } 1270 1271 return true; 1272 } 1273 1274 static inline void 1275 nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair) 1276 { 1277 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1278 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); 1279 bool need_mmio = true; 1280 1281 if (qpair->first_fused_submitted) { 1282 /* This is first cmd of two fused commands - don't ring doorbell */ 1283 qpair->first_fused_submitted = 0; 1284 return; 1285 } 1286 1287 if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { 1288 need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, 1289 pqpair->sq_tail, 1290 pqpair->shadow_doorbell.sq_tdbl, 1291 pqpair->shadow_doorbell.sq_eventidx); 1292 } 1293 1294 if (spdk_likely(need_mmio)) { 1295 spdk_wmb(); 1296 g_thread_mmio_ctrlr 
= pctrlr; 1297 spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail); 1298 g_thread_mmio_ctrlr = NULL; 1299 } 1300 } 1301 1302 static inline void 1303 nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair) 1304 { 1305 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1306 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); 1307 bool need_mmio = true; 1308 1309 if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { 1310 need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, 1311 pqpair->cq_head, 1312 pqpair->shadow_doorbell.cq_hdbl, 1313 pqpair->shadow_doorbell.cq_eventidx); 1314 } 1315 1316 if (spdk_likely(need_mmio)) { 1317 g_thread_mmio_ctrlr = pctrlr; 1318 spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head); 1319 g_thread_mmio_ctrlr = NULL; 1320 } 1321 } 1322 1323 static void 1324 nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) 1325 { 1326 struct nvme_request *req; 1327 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1328 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 1329 1330 req = tr->req; 1331 assert(req != NULL); 1332 1333 if (req->cmd.fuse == SPDK_NVME_IO_FLAGS_FUSE_FIRST) { 1334 /* This is first cmd of two fused commands - don't ring doorbell */ 1335 qpair->first_fused_submitted = 1; 1336 } 1337 1338 /* Don't use wide instructions to copy NVMe command, this is limited by QEMU 1339 * virtual NVMe controller, the maximum access width is 8 Bytes for one time. 1340 */ 1341 if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) { 1342 pqpair->cmd[pqpair->sq_tail] = req->cmd; 1343 } else { 1344 /* Copy the command from the tracker to the submission queue. */ 1345 nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd); 1346 } 1347 1348 if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) { 1349 pqpair->sq_tail = 0; 1350 } 1351 1352 if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) { 1353 SPDK_ERRLOG("sq_tail is passing sq_head!\n"); 1354 } 1355 1356 if (!pqpair->flags.delay_cmd_submit) { 1357 nvme_pcie_qpair_ring_sq_doorbell(qpair); 1358 } 1359 } 1360 1361 static void 1362 nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, 1363 struct spdk_nvme_cpl *cpl, bool print_on_error) 1364 { 1365 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1366 struct nvme_request *req; 1367 bool retry, error; 1368 bool req_from_current_proc = true; 1369 1370 req = tr->req; 1371 1372 assert(req != NULL); 1373 1374 error = spdk_nvme_cpl_is_error(cpl); 1375 retry = error && nvme_completion_is_retry(cpl) && 1376 req->retries < pqpair->retry_count; 1377 1378 if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) { 1379 spdk_nvme_qpair_print_command(qpair, &req->cmd); 1380 spdk_nvme_qpair_print_completion(qpair, cpl); 1381 } 1382 1383 assert(cpl->cid == req->cmd.cid); 1384 1385 if (retry) { 1386 req->retries++; 1387 nvme_pcie_qpair_submit_tracker(qpair, tr); 1388 } else { 1389 /* Only check admin requests from different processes. 
*/ 1390 if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) { 1391 req_from_current_proc = false; 1392 nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl); 1393 } else { 1394 nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl); 1395 } 1396 1397 if (req_from_current_proc == true) { 1398 nvme_qpair_free_request(qpair, req); 1399 } 1400 1401 tr->req = NULL; 1402 1403 TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list); 1404 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); 1405 } 1406 } 1407 1408 static void 1409 nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair, 1410 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, 1411 bool print_on_error) 1412 { 1413 struct spdk_nvme_cpl cpl; 1414 1415 memset(&cpl, 0, sizeof(cpl)); 1416 cpl.sqid = qpair->id; 1417 cpl.cid = tr->cid; 1418 cpl.status.sct = sct; 1419 cpl.status.sc = sc; 1420 cpl.status.dnr = dnr; 1421 nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); 1422 } 1423 1424 static void 1425 nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr) 1426 { 1427 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1428 struct nvme_tracker *tr, *temp, *last; 1429 1430 last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head); 1431 1432 /* Abort previously submitted (outstanding) trs */ 1433 TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) { 1434 if (!qpair->ctrlr->opts.disable_error_logging) { 1435 SPDK_ERRLOG("aborting outstanding command\n"); 1436 } 1437 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, 1438 SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); 1439 1440 if (tr == last) { 1441 break; 1442 } 1443 } 1444 } 1445 1446 static void 1447 nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) 1448 { 1449 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1450 struct nvme_tracker *tr; 1451 1452 tr = TAILQ_FIRST(&pqpair->outstanding_tr); 1453 while (tr != NULL) { 1454 assert(tr->req != NULL); 1455 if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { 1456 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, 1457 SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0, 1458 false); 1459 tr = TAILQ_FIRST(&pqpair->outstanding_tr); 1460 } else { 1461 tr = TAILQ_NEXT(tr, tq_list); 1462 } 1463 } 1464 } 1465 1466 static void 1467 nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair) 1468 { 1469 nvme_pcie_admin_qpair_abort_aers(qpair); 1470 } 1471 1472 static int 1473 nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair) 1474 { 1475 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1476 1477 if (nvme_qpair_is_admin_queue(qpair)) { 1478 nvme_pcie_admin_qpair_destroy(qpair); 1479 } 1480 /* 1481 * We check sq_vaddr and cq_vaddr to see if the user specified the memory 1482 * buffers when creating the I/O queue. 1483 * If the user specified them, we cannot free that memory. 1484 * Nor do we free it if it's in the CMB. 
1485 */ 1486 if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) { 1487 spdk_free(pqpair->cmd); 1488 } 1489 if (!pqpair->cq_vaddr && pqpair->cpl) { 1490 spdk_free(pqpair->cpl); 1491 } 1492 if (pqpair->tr) { 1493 spdk_free(pqpair->tr); 1494 } 1495 1496 nvme_qpair_deinit(qpair); 1497 1498 spdk_free(pqpair); 1499 1500 return 0; 1501 } 1502 1503 static void 1504 nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) 1505 { 1506 nvme_pcie_qpair_abort_trackers(qpair, dnr); 1507 } 1508 1509 static int 1510 nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr, 1511 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, 1512 void *cb_arg) 1513 { 1514 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); 1515 struct nvme_request *req; 1516 struct spdk_nvme_cmd *cmd; 1517 1518 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1519 if (req == NULL) { 1520 return -ENOMEM; 1521 } 1522 1523 cmd = &req->cmd; 1524 cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ; 1525 1526 cmd->cdw10_bits.create_io_q.qid = io_que->id; 1527 cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1; 1528 1529 cmd->cdw11_bits.create_io_cq.pc = 1; 1530 cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr; 1531 1532 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1533 } 1534 1535 static int 1536 nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr, 1537 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg) 1538 { 1539 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); 1540 struct nvme_request *req; 1541 struct spdk_nvme_cmd *cmd; 1542 1543 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1544 if (req == NULL) { 1545 return -ENOMEM; 1546 } 1547 1548 cmd = &req->cmd; 1549 cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ; 1550 1551 cmd->cdw10_bits.create_io_q.qid = io_que->id; 1552 cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1; 1553 cmd->cdw11_bits.create_io_sq.pc = 1; 1554 cmd->cdw11_bits.create_io_sq.qprio = io_que->qprio; 1555 cmd->cdw11_bits.create_io_sq.cqid = io_que->id; 1556 cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr; 1557 1558 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1559 } 1560 1561 static int 1562 nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, 1563 spdk_nvme_cmd_cb cb_fn, void *cb_arg) 1564 { 1565 struct nvme_request *req; 1566 struct spdk_nvme_cmd *cmd; 1567 1568 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1569 if (req == NULL) { 1570 return -ENOMEM; 1571 } 1572 1573 cmd = &req->cmd; 1574 cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ; 1575 cmd->cdw10_bits.delete_io_q.qid = qpair->id; 1576 1577 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1578 } 1579 1580 static int 1581 nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, 1582 spdk_nvme_cmd_cb cb_fn, void *cb_arg) 1583 { 1584 struct nvme_request *req; 1585 struct spdk_nvme_cmd *cmd; 1586 1587 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1588 if (req == NULL) { 1589 return -ENOMEM; 1590 } 1591 1592 cmd = &req->cmd; 1593 cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ; 1594 cmd->cdw10_bits.delete_io_q.qid = qpair->id; 1595 1596 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1597 } 1598 1599 static int 1600 _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, 1601 uint16_t qid) 1602 { 1603 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 1604 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1605 
struct nvme_completion_poll_status *status; 1606 int rc; 1607 1608 status = calloc(1, sizeof(*status)); 1609 if (!status) { 1610 SPDK_ERRLOG("Failed to allocate status tracker\n"); 1611 return -ENOMEM; 1612 } 1613 1614 rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status); 1615 if (rc != 0) { 1616 free(status); 1617 return rc; 1618 } 1619 1620 if (spdk_nvme_wait_for_completion(ctrlr->adminq, status)) { 1621 SPDK_ERRLOG("nvme_create_io_cq failed!\n"); 1622 if (!status->timed_out) { 1623 free(status); 1624 } 1625 return -1; 1626 } 1627 1628 memset(status, 0, sizeof(*status)); 1629 rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status); 1630 if (rc != 0) { 1631 free(status); 1632 return rc; 1633 } 1634 1635 if (spdk_nvme_wait_for_completion(ctrlr->adminq, status)) { 1636 SPDK_ERRLOG("nvme_create_io_sq failed!\n"); 1637 if (status->timed_out) { 1638 /* Request is still queued, the memory will be freed in a completion callback. 1639 allocate a new request */ 1640 status = calloc(1, sizeof(*status)); 1641 if (!status) { 1642 SPDK_ERRLOG("Failed to allocate status tracker\n"); 1643 return -ENOMEM; 1644 } 1645 } 1646 1647 memset(status, 0, sizeof(*status)); 1648 /* Attempt to delete the completion queue */ 1649 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status); 1650 if (rc != 0) { 1651 /* The originall or newly allocated status structure can be freed since 1652 * the corresponding request has been completed of failed to submit */ 1653 free(status); 1654 return -1; 1655 } 1656 spdk_nvme_wait_for_completion(ctrlr->adminq, status); 1657 if (!status->timed_out) { 1658 /* status can be freed regardless of spdk_nvme_wait_for_completion return value */ 1659 free(status); 1660 } 1661 return -1; 1662 } 1663 1664 if (ctrlr->shadow_doorbell) { 1665 pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) * 1666 pctrlr->doorbell_stride_u32; 1667 pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) * 1668 pctrlr->doorbell_stride_u32; 1669 pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) * 1670 pctrlr->doorbell_stride_u32; 1671 pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) * 1672 pctrlr->doorbell_stride_u32; 1673 pqpair->flags.has_shadow_doorbell = 1; 1674 } else { 1675 pqpair->flags.has_shadow_doorbell = 0; 1676 } 1677 nvme_pcie_qpair_reset(qpair); 1678 free(status); 1679 1680 return 0; 1681 } 1682 1683 static struct spdk_nvme_qpair * 1684 nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, 1685 const struct spdk_nvme_io_qpair_opts *opts) 1686 { 1687 struct nvme_pcie_qpair *pqpair; 1688 struct spdk_nvme_qpair *qpair; 1689 int rc; 1690 1691 assert(ctrlr != NULL); 1692 1693 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, 1694 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 1695 if (pqpair == NULL) { 1696 return NULL; 1697 } 1698 1699 pqpair->num_entries = opts->io_queue_size; 1700 pqpair->flags.delay_cmd_submit = opts->delay_cmd_submit; 1701 1702 qpair = &pqpair->qpair; 1703 1704 rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests); 1705 if (rc != 0) { 1706 nvme_pcie_qpair_destroy(qpair); 1707 return NULL; 1708 } 1709 1710 rc = nvme_pcie_qpair_construct(qpair, opts); 1711 1712 if (rc != 0) { 1713 nvme_pcie_qpair_destroy(qpair); 1714 return NULL; 1715 } 1716 1717 return qpair; 1718 } 1719 1720 static int 1721 nvme_pcie_ctrlr_connect_qpair(struct 
spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1722 { 1723 if (nvme_qpair_is_admin_queue(qpair)) { 1724 return 0; 1725 } else { 1726 return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id); 1727 } 1728 } 1729 1730 static void 1731 nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1732 { 1733 } 1734 1735 static int 1736 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1737 { 1738 struct nvme_completion_poll_status *status; 1739 int rc; 1740 1741 assert(ctrlr != NULL); 1742 1743 if (ctrlr->is_removed) { 1744 goto free; 1745 } 1746 1747 status = calloc(1, sizeof(*status)); 1748 if (!status) { 1749 SPDK_ERRLOG("Failed to allocate status tracker\n"); 1750 return -ENOMEM; 1751 } 1752 1753 /* Delete the I/O submission queue */ 1754 rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status); 1755 if (rc != 0) { 1756 SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc); 1757 free(status); 1758 return rc; 1759 } 1760 if (spdk_nvme_wait_for_completion(ctrlr->adminq, status)) { 1761 if (!status->timed_out) { 1762 free(status); 1763 } 1764 return -1; 1765 } 1766 1767 memset(status, 0, sizeof(*status)); 1768 /* Delete the completion queue */ 1769 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status); 1770 if (rc != 0) { 1771 SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc); 1772 free(status); 1773 return rc; 1774 } 1775 if (spdk_nvme_wait_for_completion(ctrlr->adminq, status)) { 1776 if (!status->timed_out) { 1777 free(status); 1778 } 1779 return -1; 1780 } 1781 free(status); 1782 1783 free: 1784 if (qpair->no_deletion_notification_needed == 0) { 1785 /* Abort the rest of the I/O */ 1786 nvme_pcie_qpair_abort_trackers(qpair, 1); 1787 } 1788 1789 nvme_pcie_qpair_destroy(qpair); 1790 return 0; 1791 } 1792 1793 static void 1794 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) 1795 { 1796 /* 1797 * Bad vtophys translation, so abort this request and return 1798 * immediately. 1799 */ 1800 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, 1801 SPDK_NVME_SC_INVALID_FIELD, 1802 1 /* do not retry */, true); 1803 } 1804 1805 /* 1806 * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes. 1807 * 1808 * *prp_index will be updated to account for the number of PRP entries used. 1809 */ 1810 static inline int 1811 nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len, 1812 uint32_t page_size) 1813 { 1814 struct spdk_nvme_cmd *cmd = &tr->req->cmd; 1815 uintptr_t page_mask = page_size - 1; 1816 uint64_t phys_addr; 1817 uint32_t i; 1818 1819 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n", 1820 *prp_index, virt_addr, (uint32_t)len); 1821 1822 if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) { 1823 SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); 1824 return -EFAULT; 1825 } 1826 1827 i = *prp_index; 1828 while (len) { 1829 uint32_t seg_len; 1830 1831 /* 1832 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array, 1833 * so prp_index == count is valid. 
1834 */ 1835 if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) { 1836 SPDK_ERRLOG("out of PRP entries\n"); 1837 return -EFAULT; 1838 } 1839 1840 phys_addr = spdk_vtophys(virt_addr, NULL); 1841 if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) { 1842 SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr); 1843 return -EFAULT; 1844 } 1845 1846 if (i == 0) { 1847 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr); 1848 cmd->dptr.prp.prp1 = phys_addr; 1849 seg_len = page_size - ((uintptr_t)virt_addr & page_mask); 1850 } else { 1851 if ((phys_addr & page_mask) != 0) { 1852 SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr); 1853 return -EFAULT; 1854 } 1855 1856 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr); 1857 tr->u.prp[i - 1] = phys_addr; 1858 seg_len = page_size; 1859 } 1860 1861 seg_len = spdk_min(seg_len, len); 1862 virt_addr += seg_len; 1863 len -= seg_len; 1864 i++; 1865 } 1866 1867 cmd->psdt = SPDK_NVME_PSDT_PRP; 1868 if (i <= 1) { 1869 cmd->dptr.prp.prp2 = 0; 1870 } else if (i == 2) { 1871 cmd->dptr.prp.prp2 = tr->u.prp[0]; 1872 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2); 1873 } else { 1874 cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr; 1875 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2); 1876 } 1877 1878 *prp_index = i; 1879 return 0; 1880 } 1881 1882 static int 1883 nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair, 1884 struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned) 1885 { 1886 assert(0); 1887 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1888 return -EINVAL; 1889 } 1890 1891 /** 1892 * Build PRP list describing physically contiguous payload buffer. 1893 */ 1894 static int 1895 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1896 struct nvme_tracker *tr, bool dword_aligned) 1897 { 1898 uint32_t prp_index = 0; 1899 int rc; 1900 1901 rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset, 1902 req->payload_size, qpair->ctrlr->page_size); 1903 if (rc) { 1904 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1905 } 1906 1907 return rc; 1908 } 1909 1910 /** 1911 * Build an SGL describing a physically contiguous payload buffer. 1912 * 1913 * This is more efficient than using PRP because large buffers can be 1914 * described this way. 
/**
 * Build an SGL describing a physically contiguous payload buffer.
 *
 * This is more efficient than PRP for large buffers: a single SGL data block
 * descriptor can cover a large physically contiguous region, whereas PRP needs
 * one entry per memory page.
 */
static int
nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
					    struct nvme_tracker *tr, bool dword_aligned)
{
	void *virt_addr;
	uint64_t phys_addr, mapping_length;
	uint32_t length;
	struct spdk_nvme_sgl_descriptor *sgl;
	uint32_t nseg = 0;

	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);

	sgl = tr->u.sgl;
	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.subtype = 0;

	length = req->payload_size;
	virt_addr = req->payload.contig_or_cb_arg + req->payload_offset;
	mapping_length = length;

	while (length > 0) {
		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
			SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		phys_addr = spdk_vtophys(virt_addr, &mapping_length);
		if (phys_addr == SPDK_VTOPHYS_ERROR) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		mapping_length = spdk_min(length, mapping_length);

		length -= mapping_length;
		virt_addr += mapping_length;

		sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		sgl->unkeyed.length = mapping_length;
		sgl->address = phys_addr;
		sgl->unkeyed.subtype = 0;

		sgl++;
		nseg++;
	}

	if (nseg == 1) {
		/*
		 * The whole transfer can be described by a single SGL descriptor.
		 * Use the special case described by the spec where SGL1's type is Data Block.
		 * This means the SGL in the tracker is not used at all, so copy the first (and only)
		 * SGL element into SGL1.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* The SPDK NVMe driver supports only one SGL segment for now.  This is sufficient
		 * because NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
	}

	return 0;
}
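
/*
 * Editorial sketch (not part of the driver): the scattered-payload builders
 * below consume the user's buffer description through the reset_sgl_fn /
 * next_sge_fn callbacks supplied with vectored I/O requests.  A minimal,
 * hypothetical callback pair over an application-defined iovec context might
 * look like:
 *
 *   static void example_reset_sgl(void *ref, uint32_t offset) {
 *           // 'ref' is an application context; seek 'offset' bytes into the iovecs
 *   }
 *
 *   static int example_next_sge(void *ref, void **address, uint32_t *length) {
 *           // return the virtual address and length of the next buffer segment
 *           return 0;
 *   }
 *
 * nvme_pcie_qpair_build_hw_sgl_request() and nvme_pcie_qpair_build_prps_sgl_request()
 * simply iterate these callbacks until req->payload_size bytes have been described.
 */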
/**
 * Build an SGL describing a scattered payload buffer.
 */
static int
nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				     struct nvme_tracker *tr, bool dword_aligned)
{
	int rc;
	void *virt_addr;
	uint64_t phys_addr;
	uint32_t remaining_transfer_len, remaining_user_sge_len, length;
	struct spdk_nvme_sgl_descriptor *sgl;
	uint32_t nseg = 0;

	/*
	 * Build scattered payloads.
	 */
	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	assert(req->payload.next_sge_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	sgl = tr->u.sgl;
	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.subtype = 0;

	remaining_transfer_len = req->payload_size;

	while (remaining_transfer_len > 0) {
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
					      &virt_addr, &remaining_user_sge_len);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		/* Bit Bucket SGL descriptor */
		if ((uint64_t)virt_addr == UINT64_MAX) {
			/* TODO: enable WRITE and COMPARE when necessary */
			if (req->cmd.opc != SPDK_NVME_OPC_READ) {
				SPDK_ERRLOG("Only READ commands support bit bucket SGL descriptors\n");
				goto exit;
			}
			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
				SPDK_ERRLOG("Too many SGL entries\n");
				goto exit;
			}

			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET;
			/* If the SGL describes a destination data buffer, the length of the data
			 * buffer shall be discarded by the controller, and the length is included
			 * in the Number of Logical Blocks (NLB) parameter. Otherwise, the length
			 * is not included in the NLB parameter.
			 */
			remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
			remaining_transfer_len -= remaining_user_sge_len;

			sgl->unkeyed.length = remaining_user_sge_len;
			sgl->address = 0;
			sgl->unkeyed.subtype = 0;

			sgl++;
			nseg++;

			continue;
		}

		remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
		remaining_transfer_len -= remaining_user_sge_len;
		while (remaining_user_sge_len > 0) {
			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
				SPDK_ERRLOG("Too many SGL entries\n");
				goto exit;
			}

			if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
				SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
				goto exit;
			}

			phys_addr = spdk_vtophys(virt_addr, NULL);
			if (phys_addr == SPDK_VTOPHYS_ERROR) {
				goto exit;
			}

			length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr));
			remaining_user_sge_len -= length;
			virt_addr += length;

			if (nseg > 0 && phys_addr ==
			    (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
				/* extend previous entry */
				(*(sgl - 1)).unkeyed.length += length;
				continue;
			}

			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
			sgl->unkeyed.length = length;
			sgl->address = phys_addr;
			sgl->unkeyed.subtype = 0;

			sgl++;
			nseg++;
		}
	}

	if (nseg == 1) {
		/*
		 * The whole transfer can be described by a single SGL descriptor.
		 * Use the special case described by the spec where SGL1's type is Data Block.
		 * This means the SGL in the tracker is not used at all, so copy the first (and only)
		 * SGL element into SGL1.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* The SPDK NVMe driver supports only one SGL segment for now.  This is sufficient
		 * because NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
	}

	return 0;

exit:
	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
	return -EFAULT;
}
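
/*
 * Editorial note: when a scattered payload is described with PRPs rather than
 * hardware SGLs, the PRP format imposes a constraint that the generic
 * request-splitting code is expected to have already satisfied -- every user
 * SGE except the last must end exactly on a memory-page boundary, and every
 * SGE after the first must begin page aligned so its PRP entries are valid.
 * For example, with a 4 KiB page size an iovec of { 4 KiB, 8 KiB, 1 KiB }
 * page-aligned segments is acceptable, while { 6 KiB, 7 KiB } is not.  The
 * assert() in the loop below re-checks the end-alignment requirement.
 */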
/**
 * Build a PRP list describing a scattered payload buffer.
 */
static int
nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				       struct nvme_tracker *tr, bool dword_aligned)
{
	int rc;
	void *virt_addr;
	uint32_t remaining_transfer_len, length;
	uint32_t prp_index = 0;
	uint32_t page_size = qpair->ctrlr->page_size;

	/*
	 * Build scattered payloads.
	 */
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	remaining_transfer_len = req->payload_size;
	while (remaining_transfer_len > 0) {
		assert(req->payload.next_sge_fn != NULL);
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		length = spdk_min(remaining_transfer_len, length);

		/*
		 * Any incompatible SGEs should have been handled up in the splitting routine,
		 * but assert here as an additional check.
		 *
		 * All SGEs except the last must end on a page boundary.
		 */
		assert((length == remaining_transfer_len) ||
		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));

		rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return rc;
		}

		remaining_transfer_len -= length;
	}

	return 0;
}

typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *,
			   bool);

static build_req_fn const g_nvme_pcie_build_req_table[][2] = {
	[NVME_PAYLOAD_TYPE_INVALID] = {
		nvme_pcie_qpair_build_request_invalid,		/* PRP */
		nvme_pcie_qpair_build_request_invalid		/* SGL */
	},
	[NVME_PAYLOAD_TYPE_CONTIG] = {
		nvme_pcie_qpair_build_contig_request,		/* PRP */
		nvme_pcie_qpair_build_contig_hw_sgl_request	/* SGL */
	},
	[NVME_PAYLOAD_TYPE_SGL] = {
		nvme_pcie_qpair_build_prps_sgl_request,		/* PRP */
		nvme_pcie_qpair_build_hw_sgl_request		/* SGL */
	}
};

static int
nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
			       bool sgl_supported, bool dword_aligned)
{
	void *md_payload;
	struct nvme_request *req = tr->req;

	if (req->payload.md) {
		md_payload = req->payload.md + req->md_offset;
		if (dword_aligned && ((uintptr_t)md_payload & 3)) {
			SPDK_ERRLOG("md_payload %p not dword aligned\n", md_payload);
			goto exit;
		}

		if (sgl_supported && dword_aligned) {
			assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG);
			req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL;
			tr->meta_sgl.address = spdk_vtophys(md_payload, NULL);
			if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR) {
				goto exit;
			}
			tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
			tr->meta_sgl.unkeyed.length = req->md_size;
			tr->meta_sgl.unkeyed.subtype = 0;
			req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor);
		} else {
			req->cmd.mptr = spdk_vtophys(md_payload, NULL);
			if (req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
				goto exit;
			}
		}
	}

	return 0;

exit:
	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
	return -EINVAL;
}
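
/*
 * Editorial note: g_nvme_pcie_build_req_table (defined above) is indexed first
 * by payload type and then by whether hardware SGLs are used, so the dispatch
 * in nvme_pcie_qpair_submit_request() below is equivalent to (illustrative only):
 *
 *   if (payload is CONTIG)
 *           sgl_supported ? build_contig_hw_sgl_request() : build_contig_request();
 *   else if (payload is SGL)
 *           sgl_supported ? build_hw_sgl_request()        : build_prps_sgl_request();
 *   else
 *           build_request_invalid();
 *
 * Admin commands always take the PRP column, since the NVMe specification
 * requires PRPs for admin commands over PCIe.
 */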
static int
nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int rc = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	enum nvme_payload_type payload_type;
	bool sgl_supported;
	bool dword_aligned = true;

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	tr = TAILQ_FIRST(&pqpair->free_tr);

	if (tr == NULL) {
		/* Inform the upper layer to try again later. */
		rc = -EAGAIN;
		goto exit;
	}

	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
	tr->req = req;
	tr->cb_fn = req->cb_fn;
	tr->cb_arg = req->cb_arg;
	req->cmd.cid = tr->cid;

	if (req->payload_size != 0) {
		payload_type = nvme_payload_type(&req->payload);
		/* According to the specification, PRPs shall be used for all
		 * Admin commands for NVMe over PCIe implementations.
		 */
		sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 &&
				!nvme_qpair_is_admin_queue(qpair);

		if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) {
			dword_aligned = false;
		}
		rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned);
		if (rc < 0) {
			goto exit;
		}

		rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, dword_aligned);
		if (rc < 0) {
			goto exit;
		}
	}

	nvme_pcie_qpair_submit_tracker(qpair, tr);

exit:
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return rc;
}

static void
nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
{
	uint64_t t02;
	struct nvme_tracker *tr, *tmp;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct spdk_nvme_ctrlr_process *active_proc;

	/* Don't check timeouts during controller initialization. */
	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
		return;
	}

	if (nvme_qpair_is_admin_queue(qpair)) {
		active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
	} else {
		active_proc = qpair->active_proc;
	}

	/* Only check timeouts if the current process has a timeout callback. */
	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
		return;
	}

	t02 = spdk_get_ticks();
	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
		assert(tr->req != NULL);

		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
			/*
			 * The requests are in order, so as soon as one has not timed out,
			 * stop iterating.
			 */
			break;
		}
	}
}
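
/*
 * Editorial note on the completion path below: each completion queue entry
 * carries a phase bit that the controller toggles every time the queue wraps.
 * The host treats an entry as new only while its phase bit matches the phase
 * the host expects, which is what the check against pqpair->flags.phase
 * implements.  Schematically (illustrative only):
 *
 *   cpl = &pqpair->cpl[pqpair->cq_head];
 *   if (cpl->status.p != pqpair->flags.phase) {
 *           // not yet written by the controller -- stop polling
 *   }
 *
 * After consuming the last slot, the head wraps to 0 and the expected phase is
 * inverted, so stale entries from the previous pass are never re-processed.
 */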
static int32_t
nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;
	struct spdk_nvme_cpl *cpl, *next_cpl;
	uint32_t num_completions = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	uint16_t next_cq_head;
	uint8_t next_phase;
	bool next_is_valid = false;

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
		/*
		 * max_completions == 0 means unlimited, but complete at most one batch of
		 * max_completions_cap I/O at a time so that the completion queue doorbell
		 * does not wrap around.
		 */
		max_completions = pqpair->max_completions_cap;
	}

	while (1) {
		cpl = &pqpair->cpl[pqpair->cq_head];

		if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
			break;
		}

		if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
			next_cq_head = pqpair->cq_head + 1;
			next_phase = pqpair->flags.phase;
		} else {
			next_cq_head = 0;
			next_phase = !pqpair->flags.phase;
		}
		next_cpl = &pqpair->cpl[next_cq_head];
		next_is_valid = (next_cpl->status.p == next_phase);
		if (next_is_valid) {
			__builtin_prefetch(&pqpair->tr[next_cpl->cid]);
		}

#ifdef __PPC64__
		/*
		 * This memory barrier prevents reordering of:
		 * - load after store from/to tr
		 * - load after load cpl phase and cpl cid
		 */
		spdk_mb();
#elif defined(__aarch64__)
		__asm volatile("dmb oshld" ::: "memory");
#endif

		if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
			pqpair->cq_head = 0;
			pqpair->flags.phase = !pqpair->flags.phase;
		}

		tr = &pqpair->tr[cpl->cid];
		/* Prefetch the req's STAILQ_ENTRY since we'll need to access it
		 * as part of putting the req back on the qpair's free list.
		 */
		__builtin_prefetch(&tr->req->stailq);
		pqpair->sq_head = cpl->sqhd;

		if (tr->req) {
			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
		} else {
			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
			spdk_nvme_qpair_print_completion(qpair, cpl);
			assert(0);
		}

		if (++num_completions == max_completions) {
			break;
		}
	}

	if (num_completions > 0) {
		nvme_pcie_qpair_ring_cq_doorbell(qpair);
	}

	if (pqpair->flags.delay_cmd_submit) {
		if (pqpair->last_sq_tail != pqpair->sq_tail) {
			nvme_pcie_qpair_ring_sq_doorbell(qpair);
			pqpair->last_sq_tail = pqpair->sq_tail;
		}
	}

	if (spdk_unlikely(ctrlr->timeout_enabled)) {
		/*
		 * User registered for timeout callback
		 */
		nvme_pcie_qpair_check_timeout(qpair);
	}

	/* Before returning, complete any pending admin request. */
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_pcie_qpair_complete_pending_admin_request(qpair);

		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return num_completions;
}
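
/*
 * Editorial note: unlike fabrics transports, the PCIe transport has no shared
 * completion channel, so most of the poll-group hooks below are no-ops and
 * nvme_pcie_poll_group_process_completions() simply polls each connected qpair
 * in turn.  From the application side this is still driven through the generic
 * poll-group API, e.g. (illustrative sketch only, error handling omitted):
 *
 *   struct spdk_nvme_poll_group *pg = spdk_nvme_poll_group_create(NULL);
 *   spdk_nvme_poll_group_add(pg, qpair);
 *   ...
 *   spdk_nvme_poll_group_process_completions(pg, 0, disconnected_cb);
 */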
static struct spdk_nvme_transport_poll_group *
nvme_pcie_poll_group_create(void)
{
	struct nvme_pcie_poll_group *group = calloc(1, sizeof(*group));

	if (group == NULL) {
		SPDK_ERRLOG("Unable to allocate poll group.\n");
		return NULL;
	}

	return &group->group;
}

static int
nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
{
	return 0;
}

static int
nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
{
	return 0;
}

static int
nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
			 struct spdk_nvme_qpair *qpair)
{
	return 0;
}

static int
nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
			    struct spdk_nvme_qpair *qpair)
{
	return 0;
}

static int64_t
nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
		uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
{
	struct spdk_nvme_qpair *qpair, *tmp_qpair;
	int32_t local_completions = 0;
	int64_t total_completions = 0;

	STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
		disconnected_qpair_cb(qpair, tgroup->group->ctx);
	}

	STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
		local_completions = spdk_nvme_qpair_process_completions(qpair, completions_per_qpair);
		if (local_completions < 0) {
			disconnected_qpair_cb(qpair, tgroup->group->ctx);
			local_completions = 0;
		}
		total_completions += local_completions;
	}

	return total_completions;
}

static int
nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
{
	if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
		return -EBUSY;
	}

	free(tgroup);

	return 0;
}
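
/*
 * Function table that plugs the PCIe transport into the generic NVMe driver.
 * SPDK_NVME_TRANSPORT_REGISTER() below adds it to the set of transports the
 * common code consults when a controller whose transport ID specifies
 * SPDK_NVME_TRANSPORT_PCIE is probed or attached.
 */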
const struct spdk_nvme_transport_ops pcie_ops = {
	.name = "PCIE",
	.type = SPDK_NVME_TRANSPORT_PCIE,
	.ctrlr_construct = nvme_pcie_ctrlr_construct,
	.ctrlr_scan = nvme_pcie_ctrlr_scan,
	.ctrlr_destruct = nvme_pcie_ctrlr_destruct,
	.ctrlr_enable = nvme_pcie_ctrlr_enable,

	.ctrlr_set_reg_4 = nvme_pcie_ctrlr_set_reg_4,
	.ctrlr_set_reg_8 = nvme_pcie_ctrlr_set_reg_8,
	.ctrlr_get_reg_4 = nvme_pcie_ctrlr_get_reg_4,
	.ctrlr_get_reg_8 = nvme_pcie_ctrlr_get_reg_8,

	.ctrlr_get_max_xfer_size = nvme_pcie_ctrlr_get_max_xfer_size,
	.ctrlr_get_max_sges = nvme_pcie_ctrlr_get_max_sges,

	.ctrlr_reserve_cmb = nvme_pcie_ctrlr_reserve_cmb,
	.ctrlr_map_cmb = nvme_pcie_ctrlr_map_io_cmb,
	.ctrlr_unmap_cmb = nvme_pcie_ctrlr_unmap_io_cmb,

	.ctrlr_create_io_qpair = nvme_pcie_ctrlr_create_io_qpair,
	.ctrlr_delete_io_qpair = nvme_pcie_ctrlr_delete_io_qpair,
	.ctrlr_connect_qpair = nvme_pcie_ctrlr_connect_qpair,
	.ctrlr_disconnect_qpair = nvme_pcie_ctrlr_disconnect_qpair,

	.qpair_abort_reqs = nvme_pcie_qpair_abort_reqs,
	.qpair_reset = nvme_pcie_qpair_reset,
	.qpair_submit_request = nvme_pcie_qpair_submit_request,
	.qpair_process_completions = nvme_pcie_qpair_process_completions,
	.admin_qpair_abort_aers = nvme_pcie_admin_qpair_abort_aers,

	.poll_group_create = nvme_pcie_poll_group_create,
	.poll_group_connect_qpair = nvme_pcie_poll_group_connect_qpair,
	.poll_group_disconnect_qpair = nvme_pcie_poll_group_disconnect_qpair,
	.poll_group_add = nvme_pcie_poll_group_add,
	.poll_group_remove = nvme_pcie_poll_group_remove,
	.poll_group_process_completions = nvme_pcie_poll_group_process_completions,
	.poll_group_destroy = nvme_pcie_poll_group_destroy,
};

SPDK_NVME_TRANSPORT_REGISTER(pcie, &pcie_ops);
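
/*
 * Editorial note: applications do not call into this file directly.  A local
 * PCIe controller is reached through the generic probe/attach flow, roughly
 * (illustrative sketch only):
 *
 *   struct spdk_nvme_transport_id trid = {};
 *   spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
 *   spdk_nvme_probe(&trid, NULL, probe_cb, attach_cb, NULL);
 *
 * The transport type carried in the trid selects the pcie_ops table
 * registered above.
 */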