1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * Copyright (c) 2017, IBM Corporation. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * * Neither the name of Intel Corporation nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 /* 36 * NVMe over PCIe transport 37 */ 38 39 #include "spdk/stdinc.h" 40 #include "spdk/env.h" 41 #include "spdk/likely.h" 42 #include "nvme_internal.h" 43 #include "nvme_uevent.h" 44 45 /* 46 * Number of completion queue entries to process before ringing the 47 * completion queue doorbell. 48 */ 49 #define NVME_MIN_COMPLETIONS (1) 50 #define NVME_MAX_COMPLETIONS (128) 51 52 #define NVME_ADMIN_ENTRIES (128) 53 54 /* 55 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL 56 * segment. 57 */ 58 #define NVME_MAX_SGL_DESCRIPTORS (253) 59 60 #define NVME_MAX_PRP_LIST_ENTRIES (506) 61 62 struct nvme_pcie_enum_ctx { 63 struct spdk_nvme_probe_ctx *probe_ctx; 64 struct spdk_pci_addr pci_addr; 65 bool has_pci_addr; 66 }; 67 68 /* PCIe transport extensions for spdk_nvme_ctrlr */ 69 struct nvme_pcie_ctrlr { 70 struct spdk_nvme_ctrlr ctrlr; 71 72 /** NVMe MMIO register space */ 73 volatile struct spdk_nvme_registers *regs; 74 75 /** NVMe MMIO register size */ 76 uint64_t regs_size; 77 78 /* BAR mapping address which contains controller memory buffer */ 79 void *cmb_bar_virt_addr; 80 81 /* BAR physical address which contains controller memory buffer */ 82 uint64_t cmb_bar_phys_addr; 83 84 /* Controller memory buffer size in Bytes */ 85 uint64_t cmb_size; 86 87 /* Current offset of controller memory buffer, relative to start of BAR virt addr */ 88 uint64_t cmb_current_offset; 89 90 /* Last valid offset into CMB, this differs if CMB memory registration occurs or not */ 91 uint64_t cmb_max_offset; 92 93 void *cmb_mem_register_addr; 94 size_t cmb_mem_register_size; 95 96 bool cmb_io_data_supported; 97 98 /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */ 99 uint32_t doorbell_stride_u32; 100 101 /* Opaque handle to associated PCI device. */ 102 struct spdk_pci_device *devhandle; 103 104 /* File descriptor returned from spdk_pci_device_claim(). Closed when ctrlr is detached. */ 105 int claim_fd; 106 107 /* Flag to indicate the MMIO register has been remapped */ 108 bool is_remapped; 109 }; 110 111 struct nvme_tracker { 112 TAILQ_ENTRY(nvme_tracker) tq_list; 113 114 struct nvme_request *req; 115 uint16_t cid; 116 117 uint16_t rsvd1: 15; 118 uint16_t active: 1; 119 120 uint32_t rsvd2; 121 122 uint64_t rsvd3; 123 124 uint64_t prp_sgl_bus_addr; 125 126 union { 127 uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES]; 128 struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS]; 129 } u; 130 }; 131 /* 132 * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary 133 * and so that there is no padding required to meet alignment requirements. 134 */ 135 SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K"); 136 SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned"); 137 138 /* PCIe transport extensions for spdk_nvme_qpair */ 139 struct nvme_pcie_qpair { 140 /* Submission queue tail doorbell */ 141 volatile uint32_t *sq_tdbl; 142 143 /* Completion queue head doorbell */ 144 volatile uint32_t *cq_hdbl; 145 146 /* Submission queue */ 147 struct spdk_nvme_cmd *cmd; 148 149 /* Completion queue */ 150 struct spdk_nvme_cpl *cpl; 151 152 TAILQ_HEAD(, nvme_tracker) free_tr; 153 TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr; 154 155 /* Array of trackers indexed by command ID. */ 156 struct nvme_tracker *tr; 157 158 uint16_t num_entries; 159 160 uint16_t max_completions_cap; 161 162 uint16_t last_sq_tail; 163 uint16_t sq_tail; 164 uint16_t cq_head; 165 uint16_t sq_head; 166 167 struct { 168 uint8_t phase : 1; 169 uint8_t is_enabled : 1; 170 uint8_t delay_pcie_doorbell : 1; 171 uint8_t has_shadow_doorbell : 1; 172 } flags; 173 174 /* 175 * Base qpair structure. 176 * This is located after the hot data in this structure so that the important parts of 177 * nvme_pcie_qpair are in the same cache line. 178 */ 179 struct spdk_nvme_qpair qpair; 180 181 struct { 182 /* Submission queue shadow tail doorbell */ 183 volatile uint32_t *sq_tdbl; 184 185 /* Completion queue shadow head doorbell */ 186 volatile uint32_t *cq_hdbl; 187 188 /* Submission queue event index */ 189 volatile uint32_t *sq_eventidx; 190 191 /* Completion queue event index */ 192 volatile uint32_t *cq_eventidx; 193 } shadow_doorbell; 194 195 /* 196 * Fields below this point should not be touched on the normal I/O path. 197 */ 198 199 bool sq_in_cmb; 200 201 uint64_t cmd_bus_addr; 202 uint64_t cpl_bus_addr; 203 }; 204 205 static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, 206 struct spdk_pci_addr *pci_addr); 207 static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair); 208 static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair); 209 210 __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL; 211 static volatile uint16_t g_signal_lock; 212 static bool g_sigset = false; 213 static int hotplug_fd = -1; 214 215 static void 216 nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx) 217 { 218 void *map_address; 219 220 if (!__sync_bool_compare_and_swap(&g_signal_lock, 0, 1)) { 221 return; 222 } 223 224 assert(g_thread_mmio_ctrlr != NULL); 225 226 if (!g_thread_mmio_ctrlr->is_remapped) { 227 map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size, 228 PROT_READ | PROT_WRITE, 229 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); 230 if (map_address == MAP_FAILED) { 231 SPDK_ERRLOG("mmap failed\n"); 232 g_signal_lock = 0; 233 return; 234 } 235 memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers)); 236 g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address; 237 g_thread_mmio_ctrlr->is_remapped = true; 238 } 239 g_signal_lock = 0; 240 return; 241 } 242 243 static void 244 nvme_pcie_ctrlr_setup_signal(void) 245 { 246 struct sigaction sa; 247 248 sa.sa_sigaction = nvme_sigbus_fault_sighandler; 249 sigemptyset(&sa.sa_mask); 250 sa.sa_flags = SA_SIGINFO; 251 sigaction(SIGBUS, &sa, NULL); 252 } 253 254 static int 255 _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx) 256 { 257 struct spdk_nvme_ctrlr *ctrlr, *tmp; 258 struct spdk_uevent event; 259 struct spdk_pci_addr pci_addr; 260 union spdk_nvme_csts_register csts; 261 struct spdk_nvme_ctrlr_process *proc; 262 263 while (spdk_get_uevent(hotplug_fd, &event) > 0) { 264 if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO || 265 event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) { 266 if (event.action == SPDK_NVME_UEVENT_ADD) { 267 SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n", 268 event.traddr); 269 if (spdk_process_is_primary()) { 270 if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) { 271 nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr); 272 } 273 } 274 } else if (event.action == SPDK_NVME_UEVENT_REMOVE) { 275 struct spdk_nvme_transport_id trid; 276 277 memset(&trid, 0, sizeof(trid)); 278 trid.trtype = SPDK_NVME_TRANSPORT_PCIE; 279 snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr); 280 281 ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid); 282 if (ctrlr == NULL) { 283 return 0; 284 } 285 SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n", 286 event.traddr); 287 288 nvme_ctrlr_fail(ctrlr, true); 289 290 /* get the user app to clean up and stop I/O */ 291 if (probe_ctx->remove_cb) { 292 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); 293 probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr); 294 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); 295 } 296 } 297 } 298 } 299 300 /* This is a work around for vfio-attached device hot remove detection. */ 301 TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) { 302 /* NVMe controller BAR must be mapped to secondary process space before any access. */ 303 proc = spdk_nvme_ctrlr_get_current_process(ctrlr); 304 if (proc) { 305 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 306 if (csts.raw == 0xffffffffU) { 307 nvme_ctrlr_fail(ctrlr, true); 308 if (probe_ctx->remove_cb) { 309 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); 310 probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr); 311 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); 312 } 313 } 314 } 315 } 316 return 0; 317 } 318 319 static inline struct nvme_pcie_ctrlr * 320 nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr) 321 { 322 assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE); 323 return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr); 324 } 325 326 static inline struct nvme_pcie_qpair * 327 nvme_pcie_qpair(struct spdk_nvme_qpair *qpair) 328 { 329 assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE); 330 return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair); 331 } 332 333 static volatile void * 334 nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset) 335 { 336 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 337 338 return (volatile void *)((uintptr_t)pctrlr->regs + offset); 339 } 340 341 int 342 nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) 343 { 344 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 345 346 assert(offset <= sizeof(struct spdk_nvme_registers) - 4); 347 g_thread_mmio_ctrlr = pctrlr; 348 spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value); 349 g_thread_mmio_ctrlr = NULL; 350 return 0; 351 } 352 353 int 354 nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) 355 { 356 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 357 358 assert(offset <= sizeof(struct spdk_nvme_registers) - 8); 359 g_thread_mmio_ctrlr = pctrlr; 360 spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value); 361 g_thread_mmio_ctrlr = NULL; 362 return 0; 363 } 364 365 int 366 nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) 367 { 368 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 369 370 assert(offset <= sizeof(struct spdk_nvme_registers) - 4); 371 assert(value != NULL); 372 g_thread_mmio_ctrlr = pctrlr; 373 *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset)); 374 g_thread_mmio_ctrlr = NULL; 375 if (~(*value) == 0) { 376 return -1; 377 } 378 379 return 0; 380 } 381 382 int 383 nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) 384 { 385 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 386 387 assert(offset <= sizeof(struct spdk_nvme_registers) - 8); 388 assert(value != NULL); 389 g_thread_mmio_ctrlr = pctrlr; 390 *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset)); 391 g_thread_mmio_ctrlr = NULL; 392 if (~(*value) == 0) { 393 return -1; 394 } 395 396 return 0; 397 } 398 399 static int 400 nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) 401 { 402 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq), 403 value); 404 } 405 406 static int 407 nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) 408 { 409 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq), 410 value); 411 } 412 413 static int 414 nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa) 415 { 416 return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw), 417 aqa->raw); 418 } 419 420 static int 421 nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc) 422 { 423 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw), 424 &cmbloc->raw); 425 } 426 427 static int 428 nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz) 429 { 430 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw), 431 &cmbsz->raw); 432 } 433 434 uint32_t 435 nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) 436 { 437 /* 438 * For commands requiring more than 2 PRP entries, one PRP will be 439 * embedded in the command (prp1), and the rest of the PRP entries 440 * will be in a list pointed to by the command (prp2). This means 441 * that real max number of PRP entries we support is 506+1, which 442 * results in a max xfer size of 506*ctrlr->page_size. 443 */ 444 return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size; 445 } 446 447 uint16_t 448 nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) 449 { 450 return NVME_MAX_SGL_DESCRIPTORS; 451 } 452 453 static void 454 nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr) 455 { 456 int rc; 457 void *addr; 458 uint32_t bir; 459 union spdk_nvme_cmbsz_register cmbsz; 460 union spdk_nvme_cmbloc_register cmbloc; 461 uint64_t size, unit_size, offset, bar_size, bar_phys_addr; 462 uint64_t mem_register_start, mem_register_end; 463 464 if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || 465 nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { 466 SPDK_ERRLOG("get registers failed\n"); 467 goto exit; 468 } 469 470 if (!cmbsz.bits.sz) { 471 goto exit; 472 } 473 474 bir = cmbloc.bits.bir; 475 /* Values 0 2 3 4 5 are valid for BAR */ 476 if (bir > 5 || bir == 1) { 477 goto exit; 478 } 479 480 /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */ 481 unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu); 482 /* controller memory buffer size in Bytes */ 483 size = unit_size * cmbsz.bits.sz; 484 /* controller memory buffer offset from BAR in Bytes */ 485 offset = unit_size * cmbloc.bits.ofst; 486 487 rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr, 488 &bar_phys_addr, &bar_size); 489 if ((rc != 0) || addr == NULL) { 490 goto exit; 491 } 492 493 if (offset > bar_size) { 494 goto exit; 495 } 496 497 if (size > bar_size - offset) { 498 goto exit; 499 } 500 501 pctrlr->cmb_bar_virt_addr = addr; 502 pctrlr->cmb_bar_phys_addr = bar_phys_addr; 503 pctrlr->cmb_size = size; 504 pctrlr->cmb_current_offset = offset; 505 pctrlr->cmb_max_offset = offset + size; 506 507 if (!cmbsz.bits.sqs) { 508 pctrlr->ctrlr.opts.use_cmb_sqs = false; 509 } 510 511 /* If only SQS is supported use legacy mapping */ 512 if (cmbsz.bits.sqs && !(cmbsz.bits.wds || cmbsz.bits.rds)) { 513 return; 514 } 515 516 /* If CMB is less than 4MiB in size then abort CMB mapping */ 517 if (pctrlr->cmb_size < (1ULL << 22)) { 518 goto exit; 519 } 520 521 mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + VALUE_2MB - 1); 522 mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + pctrlr->cmb_size); 523 pctrlr->cmb_mem_register_addr = (void *)mem_register_start; 524 pctrlr->cmb_mem_register_size = mem_register_end - mem_register_start; 525 526 rc = spdk_mem_register(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size); 527 if (rc) { 528 SPDK_ERRLOG("spdk_mem_register() failed\n"); 529 goto exit; 530 } 531 pctrlr->cmb_current_offset = mem_register_start - ((uint64_t)pctrlr->cmb_bar_virt_addr); 532 pctrlr->cmb_max_offset = mem_register_end - ((uint64_t)pctrlr->cmb_bar_virt_addr); 533 pctrlr->cmb_io_data_supported = true; 534 535 return; 536 exit: 537 pctrlr->cmb_bar_virt_addr = NULL; 538 pctrlr->ctrlr.opts.use_cmb_sqs = false; 539 return; 540 } 541 542 static int 543 nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr) 544 { 545 int rc = 0; 546 union spdk_nvme_cmbloc_register cmbloc; 547 void *addr = pctrlr->cmb_bar_virt_addr; 548 549 if (addr) { 550 if (pctrlr->cmb_mem_register_addr) { 551 spdk_mem_unregister(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size); 552 } 553 554 if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { 555 SPDK_ERRLOG("get_cmbloc() failed\n"); 556 return -EIO; 557 } 558 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr); 559 } 560 return rc; 561 } 562 563 static int 564 nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned, 565 uint64_t *offset) 566 { 567 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 568 uint64_t round_offset; 569 570 round_offset = pctrlr->cmb_current_offset; 571 round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1); 572 573 /* CMB may only consume part of the BAR, calculate accordingly */ 574 if (round_offset + length > pctrlr->cmb_max_offset) { 575 SPDK_ERRLOG("Tried to allocate past valid CMB range!\n"); 576 return -1; 577 } 578 579 *offset = round_offset; 580 pctrlr->cmb_current_offset = round_offset + length; 581 582 return 0; 583 } 584 585 void * 586 nvme_pcie_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size) 587 { 588 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 589 uint64_t offset; 590 591 if (pctrlr->cmb_bar_virt_addr == NULL) { 592 SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); 593 return NULL; 594 } 595 596 if (!pctrlr->cmb_io_data_supported) { 597 SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB doesn't support I/O data\n"); 598 return NULL; 599 } 600 601 if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, size, 4, &offset) != 0) { 602 SPDK_DEBUGLOG(SPDK_LOG_NVME, "%zu-byte CMB allocation failed\n", size); 603 return NULL; 604 } 605 606 return pctrlr->cmb_bar_virt_addr + offset; 607 } 608 609 int 610 nvme_pcie_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size) 611 { 612 /* 613 * Do nothing for now. 614 * TODO: Track free space so buffers may be reused. 615 */ 616 SPDK_ERRLOG("%s: no deallocation for CMB buffers yet!\n", 617 __func__); 618 return 0; 619 } 620 621 static int 622 nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr) 623 { 624 int rc; 625 void *addr; 626 uint64_t phys_addr, size; 627 628 rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr, 629 &phys_addr, &size); 630 pctrlr->regs = (volatile struct spdk_nvme_registers *)addr; 631 if ((pctrlr->regs == NULL) || (rc != 0)) { 632 SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n", 633 rc, pctrlr->regs); 634 return -1; 635 } 636 637 pctrlr->regs_size = size; 638 nvme_pcie_ctrlr_map_cmb(pctrlr); 639 640 return 0; 641 } 642 643 static int 644 nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr) 645 { 646 int rc = 0; 647 void *addr = (void *)pctrlr->regs; 648 649 if (pctrlr->ctrlr.is_removed) { 650 return rc; 651 } 652 653 rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr); 654 if (rc != 0) { 655 SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc); 656 return -1; 657 } 658 659 if (addr) { 660 /* NOTE: addr may have been remapped here. We're relying on DPDK to call 661 * munmap internally. 662 */ 663 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr); 664 } 665 return rc; 666 } 667 668 static int 669 nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr) 670 { 671 struct nvme_pcie_qpair *pqpair; 672 int rc; 673 674 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 675 if (pqpair == NULL) { 676 return -ENOMEM; 677 } 678 679 pqpair->num_entries = NVME_ADMIN_ENTRIES; 680 pqpair->flags.delay_pcie_doorbell = 0; 681 682 ctrlr->adminq = &pqpair->qpair; 683 684 rc = nvme_qpair_init(ctrlr->adminq, 685 0, /* qpair ID */ 686 ctrlr, 687 SPDK_NVME_QPRIO_URGENT, 688 NVME_ADMIN_ENTRIES); 689 if (rc != 0) { 690 return rc; 691 } 692 693 return nvme_pcie_qpair_construct(ctrlr->adminq); 694 } 695 696 /* This function must only be called while holding g_spdk_nvme_driver->lock */ 697 static int 698 pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) 699 { 700 struct spdk_nvme_transport_id trid = {}; 701 struct nvme_pcie_enum_ctx *enum_ctx = ctx; 702 struct spdk_nvme_ctrlr *ctrlr; 703 struct spdk_pci_addr pci_addr; 704 705 pci_addr = spdk_pci_device_get_addr(pci_dev); 706 707 trid.trtype = SPDK_NVME_TRANSPORT_PCIE; 708 spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr); 709 710 ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid); 711 if (!spdk_process_is_primary()) { 712 if (!ctrlr) { 713 SPDK_ERRLOG("Controller must be constructed in the primary process first.\n"); 714 return -1; 715 } 716 717 return nvme_ctrlr_add_process(ctrlr, pci_dev); 718 } 719 720 /* check whether user passes the pci_addr */ 721 if (enum_ctx->has_pci_addr && 722 (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) { 723 return 1; 724 } 725 726 return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev); 727 } 728 729 int 730 nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, 731 bool direct_connect) 732 { 733 struct nvme_pcie_enum_ctx enum_ctx = {}; 734 735 enum_ctx.probe_ctx = probe_ctx; 736 737 if (strlen(probe_ctx->trid.traddr) != 0) { 738 if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) { 739 return -1; 740 } 741 enum_ctx.has_pci_addr = true; 742 } 743 744 if (hotplug_fd < 0) { 745 hotplug_fd = spdk_uevent_connect(); 746 if (hotplug_fd < 0) { 747 SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n"); 748 } 749 } else { 750 _nvme_pcie_hotplug_monitor(probe_ctx); 751 } 752 753 if (enum_ctx.has_pci_addr == false) { 754 return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), 755 pcie_nvme_enum_cb, &enum_ctx); 756 } else { 757 return spdk_pci_device_attach(spdk_pci_nvme_get_driver(), 758 pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr); 759 } 760 } 761 762 static int 763 nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr) 764 { 765 struct nvme_pcie_enum_ctx enum_ctx; 766 767 enum_ctx.probe_ctx = probe_ctx; 768 enum_ctx.has_pci_addr = true; 769 enum_ctx.pci_addr = *pci_addr; 770 771 return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx); 772 } 773 774 struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid, 775 const struct spdk_nvme_ctrlr_opts *opts, 776 void *devhandle) 777 { 778 struct spdk_pci_device *pci_dev = devhandle; 779 struct nvme_pcie_ctrlr *pctrlr; 780 union spdk_nvme_cap_register cap; 781 union spdk_nvme_vs_register vs; 782 uint32_t cmd_reg; 783 int rc, claim_fd; 784 struct spdk_pci_id pci_id; 785 struct spdk_pci_addr pci_addr; 786 787 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) { 788 SPDK_ERRLOG("could not parse pci address\n"); 789 return NULL; 790 } 791 792 claim_fd = spdk_pci_device_claim(&pci_addr); 793 if (claim_fd < 0) { 794 SPDK_ERRLOG("could not claim device %s\n", trid->traddr); 795 return NULL; 796 } 797 798 pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL, 799 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 800 if (pctrlr == NULL) { 801 close(claim_fd); 802 SPDK_ERRLOG("could not allocate ctrlr\n"); 803 return NULL; 804 } 805 806 pctrlr->is_remapped = false; 807 pctrlr->ctrlr.is_removed = false; 808 pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE; 809 pctrlr->devhandle = devhandle; 810 pctrlr->ctrlr.opts = *opts; 811 pctrlr->claim_fd = claim_fd; 812 memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid)); 813 814 rc = nvme_pcie_ctrlr_allocate_bars(pctrlr); 815 if (rc != 0) { 816 close(claim_fd); 817 spdk_free(pctrlr); 818 return NULL; 819 } 820 821 /* Enable PCI busmaster and disable INTx */ 822 spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4); 823 cmd_reg |= 0x404; 824 spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4); 825 826 if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) { 827 SPDK_ERRLOG("get_cap() failed\n"); 828 close(claim_fd); 829 spdk_free(pctrlr); 830 return NULL; 831 } 832 833 if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) { 834 SPDK_ERRLOG("get_vs() failed\n"); 835 close(claim_fd); 836 spdk_free(pctrlr); 837 return NULL; 838 } 839 840 nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs); 841 842 /* Doorbell stride is 2 ^ (dstrd + 2), 843 * but we want multiples of 4, so drop the + 2 */ 844 pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd; 845 846 rc = nvme_ctrlr_construct(&pctrlr->ctrlr); 847 if (rc != 0) { 848 nvme_ctrlr_destruct(&pctrlr->ctrlr); 849 return NULL; 850 } 851 852 pci_id = spdk_pci_device_get_id(pci_dev); 853 pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id); 854 855 rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr); 856 if (rc != 0) { 857 nvme_ctrlr_destruct(&pctrlr->ctrlr); 858 return NULL; 859 } 860 861 /* Construct the primary process properties */ 862 rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev); 863 if (rc != 0) { 864 nvme_ctrlr_destruct(&pctrlr->ctrlr); 865 return NULL; 866 } 867 868 if (g_sigset != true) { 869 nvme_pcie_ctrlr_setup_signal(); 870 g_sigset = true; 871 } 872 873 return &pctrlr->ctrlr; 874 } 875 876 int 877 nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) 878 { 879 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 880 struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq); 881 union spdk_nvme_aqa_register aqa; 882 883 if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) { 884 SPDK_ERRLOG("set_asq() failed\n"); 885 return -EIO; 886 } 887 888 if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) { 889 SPDK_ERRLOG("set_acq() failed\n"); 890 return -EIO; 891 } 892 893 aqa.raw = 0; 894 /* acqs and asqs are 0-based. */ 895 aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; 896 aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; 897 898 if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) { 899 SPDK_ERRLOG("set_aqa() failed\n"); 900 return -EIO; 901 } 902 903 return 0; 904 } 905 906 int 907 nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) 908 { 909 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 910 struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr); 911 912 close(pctrlr->claim_fd); 913 914 if (ctrlr->adminq) { 915 nvme_pcie_qpair_destroy(ctrlr->adminq); 916 } 917 918 nvme_ctrlr_destruct_finish(ctrlr); 919 920 nvme_ctrlr_free_processes(ctrlr); 921 922 nvme_pcie_ctrlr_free_bars(pctrlr); 923 924 if (devhandle) { 925 spdk_pci_device_detach(devhandle); 926 } 927 928 spdk_free(pctrlr); 929 930 return 0; 931 } 932 933 static void 934 nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr) 935 { 936 tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp); 937 tr->cid = cid; 938 tr->active = false; 939 } 940 941 int 942 nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair) 943 { 944 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 945 946 pqpair->last_sq_tail = pqpair->sq_tail = pqpair->cq_head = 0; 947 948 /* 949 * First time through the completion queue, HW will set phase 950 * bit on completions to 1. So set this to 1 here, indicating 951 * we're looking for a 1 to know which entries have completed. 952 * we'll toggle the bit each time when the completion queue 953 * rolls over. 954 */ 955 pqpair->flags.phase = 1; 956 957 memset(pqpair->cmd, 0, 958 pqpair->num_entries * sizeof(struct spdk_nvme_cmd)); 959 memset(pqpair->cpl, 0, 960 pqpair->num_entries * sizeof(struct spdk_nvme_cpl)); 961 962 return 0; 963 } 964 965 static int 966 nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair) 967 { 968 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 969 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 970 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 971 struct nvme_tracker *tr; 972 uint16_t i; 973 volatile uint32_t *doorbell_base; 974 uint64_t offset; 975 uint16_t num_trackers; 976 size_t page_align = VALUE_2MB; 977 uint32_t flags = SPDK_MALLOC_DMA; 978 979 /* 980 * Limit the maximum number of completions to return per call to prevent wraparound, 981 * and calculate how many trackers can be submitted at once without overflowing the 982 * completion queue. 983 */ 984 pqpair->max_completions_cap = pqpair->num_entries / 4; 985 pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS); 986 pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS); 987 num_trackers = pqpair->num_entries - pqpair->max_completions_cap; 988 989 SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n", 990 pqpair->max_completions_cap, num_trackers); 991 992 assert(num_trackers != 0); 993 994 pqpair->sq_in_cmb = false; 995 996 if (nvme_qpair_is_admin_queue(&pqpair->qpair)) { 997 flags |= SPDK_MALLOC_SHARE; 998 } 999 1000 /* cmd and cpl rings must be aligned on page size boundaries. */ 1001 if (ctrlr->opts.use_cmb_sqs) { 1002 if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd), 1003 sysconf(_SC_PAGESIZE), &offset) == 0) { 1004 pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset; 1005 pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset; 1006 pqpair->sq_in_cmb = true; 1007 } 1008 } 1009 1010 /* To ensure physical address contiguity we make each ring occupy 1011 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES. 1012 */ 1013 if (pqpair->sq_in_cmb == false) { 1014 pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd), 1015 page_align, NULL, 1016 SPDK_ENV_SOCKET_ID_ANY, flags); 1017 if (pqpair->cmd == NULL) { 1018 SPDK_ERRLOG("alloc qpair_cmd failed\n"); 1019 return -ENOMEM; 1020 } 1021 1022 pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL); 1023 if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) { 1024 SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n"); 1025 return -EFAULT; 1026 } 1027 } 1028 1029 pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl), 1030 page_align, NULL, 1031 SPDK_ENV_SOCKET_ID_ANY, flags); 1032 if (pqpair->cpl == NULL) { 1033 SPDK_ERRLOG("alloc qpair_cpl failed\n"); 1034 return -ENOMEM; 1035 } 1036 1037 pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL); 1038 if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) { 1039 SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n"); 1040 return -EFAULT; 1041 } 1042 1043 doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl; 1044 pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32; 1045 pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32; 1046 1047 /* 1048 * Reserve space for all of the trackers in a single allocation. 1049 * struct nvme_tracker must be padded so that its size is already a power of 2. 1050 * This ensures the PRP list embedded in the nvme_tracker object will not span a 1051 * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing. 1052 */ 1053 pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL, 1054 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 1055 if (pqpair->tr == NULL) { 1056 SPDK_ERRLOG("nvme_tr failed\n"); 1057 return -ENOMEM; 1058 } 1059 1060 TAILQ_INIT(&pqpair->free_tr); 1061 TAILQ_INIT(&pqpair->outstanding_tr); 1062 1063 for (i = 0; i < num_trackers; i++) { 1064 tr = &pqpair->tr[i]; 1065 nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL)); 1066 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); 1067 } 1068 1069 nvme_pcie_qpair_reset(qpair); 1070 1071 return 0; 1072 } 1073 1074 static inline void 1075 nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) 1076 { 1077 /* dst and src are known to be non-overlapping and 64-byte aligned. */ 1078 #if defined(__AVX__) 1079 __m256i *d256 = (__m256i *)dst; 1080 const __m256i *s256 = (const __m256i *)src; 1081 1082 _mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0])); 1083 _mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1])); 1084 #elif defined(__SSE2__) 1085 __m128i *d128 = (__m128i *)dst; 1086 const __m128i *s128 = (const __m128i *)src; 1087 1088 _mm_store_si128(&d128[0], _mm_load_si128(&s128[0])); 1089 _mm_store_si128(&d128[1], _mm_load_si128(&s128[1])); 1090 _mm_store_si128(&d128[2], _mm_load_si128(&s128[2])); 1091 _mm_store_si128(&d128[3], _mm_load_si128(&s128[3])); 1092 #else 1093 *dst = *src; 1094 #endif 1095 } 1096 1097 /** 1098 * Note: the ctrlr_lock must be held when calling this function. 1099 */ 1100 static void 1101 nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair, 1102 struct nvme_request *req, struct spdk_nvme_cpl *cpl) 1103 { 1104 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 1105 struct nvme_request *active_req = req; 1106 struct spdk_nvme_ctrlr_process *active_proc; 1107 1108 /* 1109 * The admin request is from another process. Move to the per 1110 * process list for that process to handle it later. 1111 */ 1112 assert(nvme_qpair_is_admin_queue(qpair)); 1113 assert(active_req->pid != getpid()); 1114 1115 active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid); 1116 if (active_proc) { 1117 /* Save the original completion information */ 1118 memcpy(&active_req->cpl, cpl, sizeof(*cpl)); 1119 STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq); 1120 } else { 1121 SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n", 1122 active_req->pid); 1123 1124 nvme_free_request(active_req); 1125 } 1126 } 1127 1128 /** 1129 * Note: the ctrlr_lock must be held when calling this function. 1130 */ 1131 static void 1132 nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair) 1133 { 1134 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 1135 struct nvme_request *req, *tmp_req; 1136 pid_t pid = getpid(); 1137 struct spdk_nvme_ctrlr_process *proc; 1138 1139 /* 1140 * Check whether there is any pending admin request from 1141 * other active processes. 1142 */ 1143 assert(nvme_qpair_is_admin_queue(qpair)); 1144 1145 proc = spdk_nvme_ctrlr_get_current_process(ctrlr); 1146 if (!proc) { 1147 SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid); 1148 assert(proc); 1149 return; 1150 } 1151 1152 STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { 1153 STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); 1154 1155 assert(req->pid == pid); 1156 1157 nvme_complete_request(req, &req->cpl); 1158 nvme_free_request(req); 1159 } 1160 } 1161 1162 static inline int 1163 nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) 1164 { 1165 return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old); 1166 } 1167 1168 static bool 1169 nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value, 1170 volatile uint32_t *shadow_db, 1171 volatile uint32_t *eventidx) 1172 { 1173 uint16_t old; 1174 1175 if (!shadow_db) { 1176 return true; 1177 } 1178 1179 old = *shadow_db; 1180 *shadow_db = value; 1181 1182 if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) { 1183 return false; 1184 } 1185 1186 return true; 1187 } 1188 1189 static inline void 1190 nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair) 1191 { 1192 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1193 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); 1194 bool need_mmio = true; 1195 1196 if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { 1197 need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, 1198 pqpair->sq_tail, 1199 pqpair->shadow_doorbell.sq_tdbl, 1200 pqpair->shadow_doorbell.sq_eventidx); 1201 } 1202 1203 if (spdk_likely(need_mmio)) { 1204 spdk_wmb(); 1205 g_thread_mmio_ctrlr = pctrlr; 1206 spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail); 1207 g_thread_mmio_ctrlr = NULL; 1208 } 1209 } 1210 1211 static void 1212 nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) 1213 { 1214 struct nvme_request *req; 1215 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1216 1217 req = tr->req; 1218 assert(req != NULL); 1219 1220 pqpair->tr[tr->cid].active = true; 1221 1222 /* Copy the command from the tracker to the submission queue. */ 1223 nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd); 1224 1225 if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) { 1226 pqpair->sq_tail = 0; 1227 } 1228 1229 if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) { 1230 SPDK_ERRLOG("sq_tail is passing sq_head!\n"); 1231 } 1232 1233 if (!pqpair->flags.delay_pcie_doorbell) { 1234 nvme_pcie_qpair_ring_sq_doorbell(qpair); 1235 } 1236 } 1237 1238 static void 1239 nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, 1240 struct spdk_nvme_cpl *cpl, bool print_on_error) 1241 { 1242 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1243 struct nvme_request *req; 1244 bool retry, error, was_active; 1245 bool req_from_current_proc = true; 1246 1247 req = tr->req; 1248 1249 assert(req != NULL); 1250 1251 error = spdk_nvme_cpl_is_error(cpl); 1252 retry = error && nvme_completion_is_retry(cpl) && 1253 req->retries < spdk_nvme_retry_count; 1254 1255 if (error && print_on_error) { 1256 nvme_qpair_print_command(qpair, &req->cmd); 1257 nvme_qpair_print_completion(qpair, cpl); 1258 } 1259 1260 was_active = pqpair->tr[cpl->cid].active; 1261 pqpair->tr[cpl->cid].active = false; 1262 1263 assert(cpl->cid == req->cmd.cid); 1264 1265 if (retry) { 1266 req->retries++; 1267 nvme_pcie_qpair_submit_tracker(qpair, tr); 1268 } else { 1269 if (was_active) { 1270 /* Only check admin requests from different processes. */ 1271 if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) { 1272 req_from_current_proc = false; 1273 nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl); 1274 } else { 1275 nvme_complete_request(req, cpl); 1276 } 1277 } 1278 1279 if (req_from_current_proc == true) { 1280 nvme_free_request(req); 1281 } 1282 1283 tr->req = NULL; 1284 1285 TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list); 1286 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); 1287 1288 /* 1289 * If the controller is in the middle of resetting, don't 1290 * try to submit queued requests here - let the reset logic 1291 * handle that instead. 1292 */ 1293 if (!STAILQ_EMPTY(&qpair->queued_req) && 1294 !qpair->ctrlr->is_resetting) { 1295 req = STAILQ_FIRST(&qpair->queued_req); 1296 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); 1297 nvme_qpair_submit_request(qpair, req); 1298 } 1299 } 1300 } 1301 1302 static void 1303 nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair, 1304 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, 1305 bool print_on_error) 1306 { 1307 struct spdk_nvme_cpl cpl; 1308 1309 memset(&cpl, 0, sizeof(cpl)); 1310 cpl.sqid = qpair->id; 1311 cpl.cid = tr->cid; 1312 cpl.status.sct = sct; 1313 cpl.status.sc = sc; 1314 cpl.status.dnr = dnr; 1315 nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); 1316 } 1317 1318 static void 1319 nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr) 1320 { 1321 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1322 struct nvme_tracker *tr, *temp; 1323 1324 TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) { 1325 SPDK_ERRLOG("aborting outstanding command\n"); 1326 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, 1327 SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); 1328 } 1329 } 1330 1331 static void 1332 nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) 1333 { 1334 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1335 struct nvme_tracker *tr; 1336 1337 tr = TAILQ_FIRST(&pqpair->outstanding_tr); 1338 while (tr != NULL) { 1339 assert(tr->req != NULL); 1340 if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { 1341 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, 1342 SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0, 1343 false); 1344 tr = TAILQ_FIRST(&pqpair->outstanding_tr); 1345 } else { 1346 tr = TAILQ_NEXT(tr, tq_list); 1347 } 1348 } 1349 } 1350 1351 static void 1352 nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair) 1353 { 1354 nvme_pcie_admin_qpair_abort_aers(qpair); 1355 } 1356 1357 static int 1358 nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair) 1359 { 1360 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1361 1362 if (nvme_qpair_is_admin_queue(qpair)) { 1363 nvme_pcie_admin_qpair_destroy(qpair); 1364 } 1365 if (pqpair->cmd && !pqpair->sq_in_cmb) { 1366 spdk_free(pqpair->cmd); 1367 } 1368 if (pqpair->cpl) { 1369 spdk_free(pqpair->cpl); 1370 } 1371 if (pqpair->tr) { 1372 spdk_free(pqpair->tr); 1373 } 1374 1375 nvme_qpair_deinit(qpair); 1376 1377 spdk_free(pqpair); 1378 1379 return 0; 1380 } 1381 1382 static void 1383 nvme_pcie_admin_qpair_enable(struct spdk_nvme_qpair *qpair) 1384 { 1385 /* 1386 * Manually abort each outstanding admin command. Do not retry 1387 * admin commands found here, since they will be left over from 1388 * a controller reset and its likely the context in which the 1389 * command was issued no longer applies. 1390 */ 1391 nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */); 1392 } 1393 1394 static void 1395 nvme_pcie_io_qpair_enable(struct spdk_nvme_qpair *qpair) 1396 { 1397 /* Manually abort each outstanding I/O. */ 1398 nvme_pcie_qpair_abort_trackers(qpair, 0); 1399 } 1400 1401 int 1402 nvme_pcie_qpair_enable(struct spdk_nvme_qpair *qpair) 1403 { 1404 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1405 1406 pqpair->flags.is_enabled = true; 1407 if (nvme_qpair_is_io_queue(qpair)) { 1408 nvme_pcie_io_qpair_enable(qpair); 1409 } else { 1410 nvme_pcie_admin_qpair_enable(qpair); 1411 } 1412 1413 return 0; 1414 } 1415 1416 static void 1417 nvme_pcie_admin_qpair_disable(struct spdk_nvme_qpair *qpair) 1418 { 1419 nvme_pcie_admin_qpair_abort_aers(qpair); 1420 } 1421 1422 static void 1423 nvme_pcie_io_qpair_disable(struct spdk_nvme_qpair *qpair) 1424 { 1425 } 1426 1427 int 1428 nvme_pcie_qpair_disable(struct spdk_nvme_qpair *qpair) 1429 { 1430 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1431 1432 pqpair->flags.is_enabled = false; 1433 if (nvme_qpair_is_io_queue(qpair)) { 1434 nvme_pcie_io_qpair_disable(qpair); 1435 } else { 1436 nvme_pcie_admin_qpair_disable(qpair); 1437 } 1438 1439 return 0; 1440 } 1441 1442 1443 int 1444 nvme_pcie_qpair_fail(struct spdk_nvme_qpair *qpair) 1445 { 1446 nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */); 1447 1448 return 0; 1449 } 1450 1451 static int 1452 nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr, 1453 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, 1454 void *cb_arg) 1455 { 1456 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); 1457 struct nvme_request *req; 1458 struct spdk_nvme_cmd *cmd; 1459 1460 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1461 if (req == NULL) { 1462 return -ENOMEM; 1463 } 1464 1465 cmd = &req->cmd; 1466 cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ; 1467 1468 /* 1469 * TODO: create a create io completion queue command data 1470 * structure. 1471 */ 1472 cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id; 1473 /* 1474 * 0x2 = interrupts enabled 1475 * 0x1 = physically contiguous 1476 */ 1477 cmd->cdw11 = 0x1; 1478 cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr; 1479 1480 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1481 } 1482 1483 static int 1484 nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr, 1485 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg) 1486 { 1487 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); 1488 struct nvme_request *req; 1489 struct spdk_nvme_cmd *cmd; 1490 1491 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1492 if (req == NULL) { 1493 return -ENOMEM; 1494 } 1495 1496 cmd = &req->cmd; 1497 cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ; 1498 1499 /* 1500 * TODO: create a create io submission queue command data 1501 * structure. 1502 */ 1503 cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id; 1504 /* 0x1 = physically contiguous */ 1505 cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1; 1506 cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr; 1507 1508 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1509 } 1510 1511 static int 1512 nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, 1513 spdk_nvme_cmd_cb cb_fn, void *cb_arg) 1514 { 1515 struct nvme_request *req; 1516 struct spdk_nvme_cmd *cmd; 1517 1518 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1519 if (req == NULL) { 1520 return -ENOMEM; 1521 } 1522 1523 cmd = &req->cmd; 1524 cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ; 1525 cmd->cdw10 = qpair->id; 1526 1527 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1528 } 1529 1530 static int 1531 nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, 1532 spdk_nvme_cmd_cb cb_fn, void *cb_arg) 1533 { 1534 struct nvme_request *req; 1535 struct spdk_nvme_cmd *cmd; 1536 1537 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1538 if (req == NULL) { 1539 return -ENOMEM; 1540 } 1541 1542 cmd = &req->cmd; 1543 cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ; 1544 cmd->cdw10 = qpair->id; 1545 1546 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1547 } 1548 1549 static int 1550 _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, 1551 uint16_t qid) 1552 { 1553 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 1554 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1555 struct nvme_completion_poll_status status; 1556 int rc; 1557 1558 rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); 1559 if (rc != 0) { 1560 return rc; 1561 } 1562 1563 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1564 SPDK_ERRLOG("nvme_create_io_cq failed!\n"); 1565 return -1; 1566 } 1567 1568 rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status); 1569 if (rc != 0) { 1570 return rc; 1571 } 1572 1573 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1574 SPDK_ERRLOG("nvme_create_io_sq failed!\n"); 1575 /* Attempt to delete the completion queue */ 1576 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status); 1577 if (rc != 0) { 1578 return -1; 1579 } 1580 spdk_nvme_wait_for_completion(ctrlr->adminq, &status); 1581 return -1; 1582 } 1583 1584 if (ctrlr->shadow_doorbell) { 1585 pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) * 1586 pctrlr->doorbell_stride_u32; 1587 pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) * 1588 pctrlr->doorbell_stride_u32; 1589 pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) * 1590 pctrlr->doorbell_stride_u32; 1591 pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) * 1592 pctrlr->doorbell_stride_u32; 1593 pqpair->flags.has_shadow_doorbell = 1; 1594 } else { 1595 pqpair->flags.has_shadow_doorbell = 0; 1596 } 1597 nvme_pcie_qpair_reset(qpair); 1598 1599 return 0; 1600 } 1601 1602 struct spdk_nvme_qpair * 1603 nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, 1604 const struct spdk_nvme_io_qpair_opts *opts) 1605 { 1606 struct nvme_pcie_qpair *pqpair; 1607 struct spdk_nvme_qpair *qpair; 1608 int rc; 1609 1610 assert(ctrlr != NULL); 1611 1612 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, 1613 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 1614 if (pqpair == NULL) { 1615 return NULL; 1616 } 1617 1618 pqpair->num_entries = opts->io_queue_size; 1619 pqpair->flags.delay_pcie_doorbell = opts->delay_pcie_doorbell; 1620 1621 qpair = &pqpair->qpair; 1622 1623 rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests); 1624 if (rc != 0) { 1625 nvme_pcie_qpair_destroy(qpair); 1626 return NULL; 1627 } 1628 1629 rc = nvme_pcie_qpair_construct(qpair); 1630 if (rc != 0) { 1631 nvme_pcie_qpair_destroy(qpair); 1632 return NULL; 1633 } 1634 1635 rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid); 1636 1637 if (rc != 0) { 1638 SPDK_ERRLOG("I/O queue creation failed\n"); 1639 nvme_pcie_qpair_destroy(qpair); 1640 return NULL; 1641 } 1642 1643 return qpair; 1644 } 1645 1646 int 1647 nvme_pcie_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1648 { 1649 return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id); 1650 } 1651 1652 int 1653 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1654 { 1655 struct nvme_completion_poll_status status; 1656 int rc; 1657 1658 assert(ctrlr != NULL); 1659 1660 if (ctrlr->is_removed) { 1661 goto free; 1662 } 1663 1664 /* Delete the I/O submission queue */ 1665 rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); 1666 if (rc != 0) { 1667 SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc); 1668 return rc; 1669 } 1670 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1671 return -1; 1672 } 1673 1674 if (qpair->no_deletion_notification_needed == 0) { 1675 /* Complete any I/O in the completion queue */ 1676 nvme_pcie_qpair_process_completions(qpair, 0); 1677 1678 /* Abort the rest of the I/O */ 1679 nvme_pcie_qpair_abort_trackers(qpair, 1); 1680 } 1681 1682 /* Delete the completion queue */ 1683 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); 1684 if (rc != 0) { 1685 SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc); 1686 return rc; 1687 } 1688 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1689 return -1; 1690 } 1691 1692 free: 1693 nvme_pcie_qpair_destroy(qpair); 1694 return 0; 1695 } 1696 1697 static void 1698 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) 1699 { 1700 /* 1701 * Bad vtophys translation, so abort this request and return 1702 * immediately. 1703 */ 1704 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, 1705 SPDK_NVME_SC_INVALID_FIELD, 1706 1 /* do not retry */, true); 1707 } 1708 1709 /* 1710 * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes. 1711 * 1712 * *prp_index will be updated to account for the number of PRP entries used. 1713 */ 1714 static int 1715 nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len, 1716 uint32_t page_size) 1717 { 1718 struct spdk_nvme_cmd *cmd = &tr->req->cmd; 1719 uintptr_t page_mask = page_size - 1; 1720 uint64_t phys_addr; 1721 uint32_t i; 1722 1723 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n", 1724 *prp_index, virt_addr, (uint32_t)len); 1725 1726 if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) { 1727 SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); 1728 return -EINVAL; 1729 } 1730 1731 i = *prp_index; 1732 while (len) { 1733 uint32_t seg_len; 1734 1735 /* 1736 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array, 1737 * so prp_index == count is valid. 1738 */ 1739 if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) { 1740 SPDK_ERRLOG("out of PRP entries\n"); 1741 return -EINVAL; 1742 } 1743 1744 phys_addr = spdk_vtophys(virt_addr, NULL); 1745 if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) { 1746 SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr); 1747 return -EINVAL; 1748 } 1749 1750 if (i == 0) { 1751 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr); 1752 cmd->dptr.prp.prp1 = phys_addr; 1753 seg_len = page_size - ((uintptr_t)virt_addr & page_mask); 1754 } else { 1755 if ((phys_addr & page_mask) != 0) { 1756 SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr); 1757 return -EINVAL; 1758 } 1759 1760 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr); 1761 tr->u.prp[i - 1] = phys_addr; 1762 seg_len = page_size; 1763 } 1764 1765 seg_len = spdk_min(seg_len, len); 1766 virt_addr += seg_len; 1767 len -= seg_len; 1768 i++; 1769 } 1770 1771 cmd->psdt = SPDK_NVME_PSDT_PRP; 1772 if (i <= 1) { 1773 cmd->dptr.prp.prp2 = 0; 1774 } else if (i == 2) { 1775 cmd->dptr.prp.prp2 = tr->u.prp[0]; 1776 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2); 1777 } else { 1778 cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr; 1779 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2); 1780 } 1781 1782 *prp_index = i; 1783 return 0; 1784 } 1785 1786 /** 1787 * Build PRP list describing physically contiguous payload buffer. 1788 */ 1789 static int 1790 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1791 struct nvme_tracker *tr) 1792 { 1793 uint32_t prp_index = 0; 1794 int rc; 1795 1796 rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset, 1797 req->payload_size, qpair->ctrlr->page_size); 1798 if (rc) { 1799 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1800 return rc; 1801 } 1802 1803 return 0; 1804 } 1805 1806 /** 1807 * Build SGL list describing scattered payload buffer. 1808 */ 1809 static int 1810 nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1811 struct nvme_tracker *tr) 1812 { 1813 int rc; 1814 void *virt_addr; 1815 uint64_t phys_addr; 1816 uint32_t remaining_transfer_len, remaining_user_sge_len, length; 1817 struct spdk_nvme_sgl_descriptor *sgl; 1818 uint32_t nseg = 0; 1819 1820 /* 1821 * Build scattered payloads. 1822 */ 1823 assert(req->payload_size != 0); 1824 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); 1825 assert(req->payload.reset_sgl_fn != NULL); 1826 assert(req->payload.next_sge_fn != NULL); 1827 req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); 1828 1829 sgl = tr->u.sgl; 1830 req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; 1831 req->cmd.dptr.sgl1.unkeyed.subtype = 0; 1832 1833 remaining_transfer_len = req->payload_size; 1834 1835 while (remaining_transfer_len > 0) { 1836 rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, 1837 &virt_addr, &remaining_user_sge_len); 1838 if (rc) { 1839 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1840 return -1; 1841 } 1842 1843 remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); 1844 remaining_transfer_len -= remaining_user_sge_len; 1845 while (remaining_user_sge_len > 0) { 1846 if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { 1847 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1848 return -1; 1849 } 1850 1851 phys_addr = spdk_vtophys(virt_addr, NULL); 1852 if (phys_addr == SPDK_VTOPHYS_ERROR) { 1853 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1854 return -1; 1855 } 1856 1857 length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr)); 1858 remaining_user_sge_len -= length; 1859 virt_addr += length; 1860 1861 if (nseg > 0 && phys_addr == 1862 (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) { 1863 /* extend previous entry */ 1864 (*(sgl - 1)).unkeyed.length += length; 1865 continue; 1866 } 1867 1868 sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; 1869 sgl->unkeyed.length = length; 1870 sgl->address = phys_addr; 1871 sgl->unkeyed.subtype = 0; 1872 1873 sgl++; 1874 nseg++; 1875 } 1876 } 1877 1878 if (nseg == 1) { 1879 /* 1880 * The whole transfer can be described by a single SGL descriptor. 1881 * Use the special case described by the spec where SGL1's type is Data Block. 1882 * This means the SGL in the tracker is not used at all, so copy the first (and only) 1883 * SGL element into SGL1. 1884 */ 1885 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; 1886 req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; 1887 req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; 1888 } else { 1889 /* For now we can only support 1 SGL segment in NVMe controller */ 1890 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; 1891 req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; 1892 req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); 1893 } 1894 1895 return 0; 1896 } 1897 1898 /** 1899 * Build PRP list describing scattered payload buffer. 1900 */ 1901 static int 1902 nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1903 struct nvme_tracker *tr) 1904 { 1905 int rc; 1906 void *virt_addr; 1907 uint32_t remaining_transfer_len, length; 1908 uint32_t prp_index = 0; 1909 uint32_t page_size = qpair->ctrlr->page_size; 1910 1911 /* 1912 * Build scattered payloads. 1913 */ 1914 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); 1915 assert(req->payload.reset_sgl_fn != NULL); 1916 req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); 1917 1918 remaining_transfer_len = req->payload_size; 1919 while (remaining_transfer_len > 0) { 1920 assert(req->payload.next_sge_fn != NULL); 1921 rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length); 1922 if (rc) { 1923 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1924 return -1; 1925 } 1926 1927 length = spdk_min(remaining_transfer_len, length); 1928 1929 /* 1930 * Any incompatible sges should have been handled up in the splitting routine, 1931 * but assert here as an additional check. 1932 * 1933 * All SGEs except last must end on a page boundary. 1934 */ 1935 assert((length == remaining_transfer_len) || 1936 _is_page_aligned((uintptr_t)virt_addr + length, page_size)); 1937 1938 rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size); 1939 if (rc) { 1940 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1941 return rc; 1942 } 1943 1944 remaining_transfer_len -= length; 1945 } 1946 1947 return 0; 1948 } 1949 1950 static inline bool 1951 nvme_pcie_qpair_check_enabled(struct spdk_nvme_qpair *qpair) 1952 { 1953 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1954 1955 if (!pqpair->flags.is_enabled && 1956 !qpair->ctrlr->is_resetting) { 1957 nvme_qpair_enable(qpair); 1958 } 1959 return pqpair->flags.is_enabled; 1960 } 1961 1962 int 1963 nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) 1964 { 1965 struct nvme_tracker *tr; 1966 int rc = 0; 1967 void *md_payload; 1968 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 1969 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1970 1971 nvme_pcie_qpair_check_enabled(qpair); 1972 1973 if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { 1974 nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); 1975 } 1976 1977 tr = TAILQ_FIRST(&pqpair->free_tr); 1978 1979 if (tr == NULL || !pqpair->flags.is_enabled) { 1980 /* 1981 * No tracker is available, or the qpair is disabled due to 1982 * an in-progress controller-level reset. 1983 * 1984 * Put the request on the qpair's request queue to be 1985 * processed when a tracker frees up via a command 1986 * completion or when the controller reset is 1987 * completed. 1988 */ 1989 STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); 1990 goto exit; 1991 } 1992 1993 TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */ 1994 TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list); 1995 tr->req = req; 1996 req->cmd.cid = tr->cid; 1997 1998 if (req->payload_size && req->payload.md) { 1999 md_payload = req->payload.md + req->md_offset; 2000 tr->req->cmd.mptr = spdk_vtophys(md_payload, NULL); 2001 if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) { 2002 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 2003 rc = -EINVAL; 2004 goto exit; 2005 } 2006 } 2007 2008 if (req->payload_size == 0) { 2009 /* Null payload - leave PRP fields zeroed */ 2010 rc = 0; 2011 } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { 2012 rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr); 2013 } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) { 2014 if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 2015 rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr); 2016 } else { 2017 rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr); 2018 } 2019 } else { 2020 assert(0); 2021 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 2022 rc = -EINVAL; 2023 } 2024 2025 if (rc < 0) { 2026 goto exit; 2027 } 2028 2029 nvme_pcie_qpair_submit_tracker(qpair, tr); 2030 2031 exit: 2032 if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { 2033 nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); 2034 } 2035 2036 return rc; 2037 } 2038 2039 static void 2040 nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair) 2041 { 2042 uint64_t t02; 2043 struct nvme_tracker *tr, *tmp; 2044 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 2045 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 2046 struct spdk_nvme_ctrlr_process *active_proc; 2047 2048 /* Don't check timeouts during controller initialization. */ 2049 if (ctrlr->state != NVME_CTRLR_STATE_READY) { 2050 return; 2051 } 2052 2053 if (nvme_qpair_is_admin_queue(qpair)) { 2054 active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); 2055 } else { 2056 active_proc = qpair->active_proc; 2057 } 2058 2059 /* Only check timeouts if the current process has a timeout callback. */ 2060 if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { 2061 return; 2062 } 2063 2064 t02 = spdk_get_ticks(); 2065 TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) { 2066 assert(tr->req != NULL); 2067 2068 if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) { 2069 /* 2070 * The requests are in order, so as soon as one has not timed out, 2071 * stop iterating. 2072 */ 2073 break; 2074 } 2075 } 2076 } 2077 2078 int32_t 2079 nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) 2080 { 2081 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 2082 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); 2083 struct nvme_tracker *tr; 2084 struct spdk_nvme_cpl *cpl; 2085 uint32_t num_completions = 0; 2086 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 2087 2088 if (spdk_unlikely(!nvme_pcie_qpair_check_enabled(qpair))) { 2089 /* 2090 * qpair is not enabled, likely because a controller reset is 2091 * is in progress. Ignore the interrupt - any I/O that was 2092 * associated with this interrupt will get retried when the 2093 * reset is complete. 2094 */ 2095 return 0; 2096 } 2097 2098 if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { 2099 nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); 2100 } 2101 2102 if (max_completions == 0 || max_completions > pqpair->max_completions_cap) { 2103 /* 2104 * max_completions == 0 means unlimited, but complete at most 2105 * max_completions_cap batch of I/O at a time so that the completion 2106 * queue doorbells don't wrap around. 2107 */ 2108 max_completions = pqpair->max_completions_cap; 2109 } 2110 2111 while (1) { 2112 cpl = &pqpair->cpl[pqpair->cq_head]; 2113 2114 if (cpl->status.p != pqpair->flags.phase) { 2115 break; 2116 } 2117 #if defined(__PPC64__) || defined(__aarch64__) 2118 /* 2119 * This memory barrier prevents reordering of: 2120 * - load after store from/to tr 2121 * - load after load cpl phase and cpl cid 2122 */ 2123 spdk_mb(); 2124 #endif 2125 2126 if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) { 2127 pqpair->cq_head = 0; 2128 pqpair->flags.phase = !pqpair->flags.phase; 2129 } 2130 2131 tr = &pqpair->tr[cpl->cid]; 2132 pqpair->sq_head = cpl->sqhd; 2133 2134 if (tr->active) { 2135 nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true); 2136 } else { 2137 SPDK_ERRLOG("cpl does not map to outstanding cmd\n"); 2138 nvme_qpair_print_completion(qpair, cpl); 2139 assert(0); 2140 } 2141 2142 if (++num_completions == max_completions) { 2143 break; 2144 } 2145 } 2146 2147 if (num_completions > 0) { 2148 bool need_mmio = true; 2149 2150 if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { 2151 need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, 2152 pqpair->cq_head, 2153 pqpair->shadow_doorbell.cq_hdbl, 2154 pqpair->shadow_doorbell.cq_eventidx); 2155 } 2156 2157 if (spdk_likely(need_mmio)) { 2158 g_thread_mmio_ctrlr = pctrlr; 2159 spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head); 2160 g_thread_mmio_ctrlr = NULL; 2161 } 2162 } 2163 2164 if (pqpair->flags.delay_pcie_doorbell) { 2165 if (pqpair->last_sq_tail != pqpair->sq_tail) { 2166 nvme_pcie_qpair_ring_sq_doorbell(qpair); 2167 pqpair->last_sq_tail = pqpair->sq_tail; 2168 } 2169 } 2170 2171 if (spdk_unlikely(ctrlr->timeout_enabled)) { 2172 /* 2173 * User registered for timeout callback 2174 */ 2175 nvme_pcie_qpair_check_timeout(qpair); 2176 } 2177 2178 /* Before returning, complete any pending admin request. */ 2179 if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { 2180 nvme_pcie_qpair_complete_pending_admin_request(qpair); 2181 2182 nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); 2183 } 2184 2185 return num_completions; 2186 } 2187