/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   Copyright (c) 2017, IBM Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NVMe over PCIe transport
 */

#include "spdk/stdinc.h"
#include "spdk/env.h"
#include "spdk/likely.h"
#include "nvme_internal.h"
#include "nvme_uevent.h"

/*
 * Number of completion queue entries to process before ringing the
 * completion queue doorbell.
 */
#define NVME_MIN_COMPLETIONS	(1)
#define NVME_MAX_COMPLETIONS	(128)

#define NVME_ADMIN_ENTRIES	(128)

/*
 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
 * segment.
 */
#define NVME_MAX_SGL_DESCRIPTORS	(251)

#define NVME_MAX_PRP_LIST_ENTRIES	(505)

struct nvme_pcie_enum_ctx {
	struct spdk_nvme_probe_ctx *probe_ctx;
	struct spdk_pci_addr pci_addr;
	bool has_pci_addr;
};

/* PCIe transport extensions for spdk_nvme_ctrlr */
struct nvme_pcie_ctrlr {
	struct spdk_nvme_ctrlr ctrlr;

	/** NVMe MMIO register space */
	volatile struct spdk_nvme_registers *regs;

	/** NVMe MMIO register size */
	uint64_t regs_size;

	/* BAR mapping address which contains controller memory buffer */
	void *cmb_bar_virt_addr;

	/* BAR physical address which contains controller memory buffer */
	uint64_t cmb_bar_phys_addr;

	/* Controller memory buffer size in Bytes */
	uint64_t cmb_size;

	/* Current offset of controller memory buffer, relative to start of BAR virt addr */
	uint64_t cmb_current_offset;

	/* Last valid offset into CMB; this differs depending on whether CMB memory registration occurred */
	uint64_t cmb_max_offset;

	void *cmb_mem_register_addr;
	size_t cmb_mem_register_size;

	bool cmb_io_data_supported;

	/** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
	uint32_t doorbell_stride_u32;

	/* Opaque handle to associated PCI device. */
	struct spdk_pci_device *devhandle;

	/* File descriptor returned from spdk_pci_device_claim().  Closed when ctrlr is detached. */
	int claim_fd;

	/* Flag to indicate the MMIO register has been remapped */
	bool is_remapped;
};

struct nvme_tracker {
	TAILQ_ENTRY(nvme_tracker) tq_list;

	struct nvme_request *req;
	uint16_t cid;

	uint16_t rsvd0;
	uint32_t rsvd1;

	spdk_nvme_cmd_cb cb_fn;
	void *cb_arg;

	uint64_t prp_sgl_bus_addr;

	union {
		uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
		struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS];
	} u;
};
/*
 * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
 * and so that there is no padding required to meet alignment requirements.
 */
SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");

/* PCIe transport extensions for spdk_nvme_qpair */
struct nvme_pcie_qpair {
	/* Submission queue tail doorbell */
	volatile uint32_t *sq_tdbl;

	/* Completion queue head doorbell */
	volatile uint32_t *cq_hdbl;

	/* Submission queue */
	struct spdk_nvme_cmd *cmd;

	/* Completion queue */
	struct spdk_nvme_cpl *cpl;

	TAILQ_HEAD(, nvme_tracker) free_tr;
	TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;

	/* Array of trackers indexed by command ID. */
	struct nvme_tracker *tr;

	uint16_t num_entries;

	uint8_t retry_count;

	uint16_t max_completions_cap;

	uint16_t last_sq_tail;
	uint16_t sq_tail;
	uint16_t cq_head;
	uint16_t sq_head;

	struct {
		uint8_t phase : 1;
		uint8_t delay_pcie_doorbell : 1;
		uint8_t has_shadow_doorbell : 1;
	} flags;

	/*
	 * Base qpair structure.
	 * This is located after the hot data in this structure so that the important parts of
	 * nvme_pcie_qpair are in the same cache line.
	 */
	struct spdk_nvme_qpair qpair;

	struct {
		/* Submission queue shadow tail doorbell */
		volatile uint32_t *sq_tdbl;

		/* Completion queue shadow head doorbell */
		volatile uint32_t *cq_hdbl;

		/* Submission queue event index */
		volatile uint32_t *sq_eventidx;

		/* Completion queue event index */
		volatile uint32_t *cq_eventidx;
	} shadow_doorbell;

	/*
	 * Fields below this point should not be touched on the normal I/O path.
	 */

	bool sq_in_cmb;

	uint64_t cmd_bus_addr;
	uint64_t cpl_bus_addr;

	struct spdk_nvme_cmd *sq_vaddr;
	struct spdk_nvme_cpl *cq_vaddr;
};

static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx,
				  struct spdk_pci_addr *pci_addr);
static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
				     const struct spdk_nvme_io_qpair_opts *opts);
static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);

__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
static uint16_t g_signal_lock;
static bool g_sigset = false;
static int hotplug_fd = -1;

static void
nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx)
{
	void *map_address;
	uint16_t flag = 0;

	if (!__atomic_compare_exchange_n(&g_signal_lock, &flag, 1, false, __ATOMIC_ACQUIRE,
					 __ATOMIC_RELAXED)) {
		SPDK_DEBUGLOG(SPDK_LOG_NVME, "request g_signal_lock failed\n");
		return;
	}

	assert(g_thread_mmio_ctrlr != NULL);

	if (!g_thread_mmio_ctrlr->is_remapped) {
		map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
				   PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
		if (map_address == MAP_FAILED) {
			SPDK_ERRLOG("mmap failed\n");
			__atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE);
			return;
		}
		memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
		g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
		g_thread_mmio_ctrlr->is_remapped = true;
	}
	__atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE);
}

static void
nvme_pcie_ctrlr_setup_signal(void)
{
	struct sigaction sa;

	sa.sa_sigaction = nvme_sigbus_fault_sighandler;
	sigemptyset(&sa.sa_mask);
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);
}

static inline struct nvme_pcie_ctrlr *
nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
	assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
	return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
}

static int
_nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx)
{
	struct spdk_nvme_ctrlr *ctrlr, *tmp;
	struct spdk_uevent event;
	struct spdk_pci_addr pci_addr;
	union spdk_nvme_csts_register csts;
	struct spdk_nvme_ctrlr_process *proc;

	while (spdk_get_uevent(hotplug_fd, &event) > 0) {
		if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO ||
		    event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) {
			if (event.action == SPDK_NVME_UEVENT_ADD) {
				SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n",
					      event.traddr);
				if (spdk_process_is_primary()) {
					if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) {
						nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr);
					}
				}
			} else if (event.action == SPDK_NVME_UEVENT_REMOVE) {
				struct spdk_nvme_transport_id trid;

				memset(&trid, 0, sizeof(trid));
				trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
				snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr);

				ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
				if (ctrlr == NULL) {
					return 0;
				}
				SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n",
					      event.traddr);

				nvme_ctrlr_fail(ctrlr, true);

				/* get the user app to clean up and stop I/O */
				if (probe_ctx->remove_cb) {
					nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
					probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
					nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
				}
			}
		}
	}

	/* This is a workaround for vfio-attached device hot remove detection. */
	TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) {
		bool do_remove = false;

		if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
			struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

			if (spdk_pci_device_is_removed(pctrlr->devhandle)) {
				do_remove = true;
			}
		}

		/* NVMe controller BAR must be mapped in the current process before any access. */
		proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
		if (proc) {
			csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
			if (csts.raw == 0xffffffffU) {
				do_remove = true;
			}
		}

		if (do_remove) {
			nvme_ctrlr_fail(ctrlr, true);
			if (probe_ctx->remove_cb) {
				nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
				probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
				nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
			}
		}
	}
	return 0;
}

static inline struct nvme_pcie_qpair *
nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
{
	assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE);
	return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair);
}

static volatile void *
nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	return (volatile void *)((uintptr_t)pctrlr->regs + offset);
}

int
nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
	g_thread_mmio_ctrlr = pctrlr;
	spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
	g_thread_mmio_ctrlr = NULL;
	return 0;
}

int
nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
	g_thread_mmio_ctrlr = pctrlr;
	spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
	g_thread_mmio_ctrlr = NULL;
	return 0;
}

int
nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
	assert(value != NULL);
	g_thread_mmio_ctrlr = pctrlr;
	*value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
	g_thread_mmio_ctrlr = NULL;
	/* An all-ones value indicates the MMIO read failed, e.g. the device was hot removed. */
	if (~(*value) == 0) {
		return -1;
	}

	return 0;
}

int
nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
	assert(value != NULL);
	g_thread_mmio_ctrlr = pctrlr;
	*value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
	g_thread_mmio_ctrlr = NULL;
	if (~(*value) == 0) {
		return -1;
	}

	return 0;
}

static int
nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
{
	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
					 value);
}

static int
nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
{
	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
					 value);
}

static int
nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
{
	return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
					 aqa->raw);
}

static int
nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
					 &cmbloc->raw);
}

static int
nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
					 &cmbsz->raw);
}

uint32_t
nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
{
	/*
	 * For commands requiring more than 2 PRP entries, one PRP will be
	 * embedded in the command (prp1), and the rest of the PRP entries
	 * will be in a list pointed to by the command (prp2).  This means
	 * that the real max number of PRP entries we support is 505+1, which
	 * results in a max xfer size of 505*ctrlr->page_size.
	 */
	return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size;
}

uint16_t
nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
{
	return NVME_MAX_SGL_DESCRIPTORS;
}

static void
nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc;
	void *addr;
	uint32_t bir;
	union spdk_nvme_cmbsz_register cmbsz;
	union spdk_nvme_cmbloc_register cmbloc;
	uint64_t size, unit_size, offset, bar_size, bar_phys_addr;
	uint64_t mem_register_start, mem_register_end;

	if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
	    nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
		SPDK_ERRLOG("get registers failed\n");
		goto exit;
	}

	if (!cmbsz.bits.sz) {
		goto exit;
	}

	bir = cmbloc.bits.bir;
	/* Values 0, 2, 3, 4, 5 are valid for BAR */
	if (bir > 5 || bir == 1) {
		goto exit;
	}

	/* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
	unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
	/* controller memory buffer size in Bytes */
	size = unit_size * cmbsz.bits.sz;
	/* controller memory buffer offset from BAR in Bytes */
	offset = unit_size * cmbloc.bits.ofst;

	rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
				     &bar_phys_addr, &bar_size);
	if ((rc != 0) || addr == NULL) {
		goto exit;
	}

	if (offset > bar_size) {
		goto exit;
	}

	if (size > bar_size - offset) {
		goto exit;
	}

	pctrlr->cmb_bar_virt_addr = addr;
	pctrlr->cmb_bar_phys_addr = bar_phys_addr;
	pctrlr->cmb_size = size;
	pctrlr->cmb_current_offset = offset;
	pctrlr->cmb_max_offset = offset + size;

	if (!cmbsz.bits.sqs) {
		pctrlr->ctrlr.opts.use_cmb_sqs = false;
	}

	/* If only SQS is supported, use legacy mapping */
	if (cmbsz.bits.sqs && !(cmbsz.bits.wds || cmbsz.bits.rds)) {
		return;
	}

	/* If CMB is less than 4MiB in size, then abort CMB mapping */
	if (pctrlr->cmb_size < (1ULL << 22)) {
		goto exit;
	}

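	/*
	 * Only the 2MB-page-aligned portion of the CMB is registered with the SPDK
	 * memory map below: the start is rounded up and the end rounded down to a
	 * 2MB boundary, and cmb_current_offset/cmb_max_offset are then narrowed to
	 * that registered window before CMB I/O data buffers are handed out.
	 */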
	mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + VALUE_2MB - 1);
	mem_register_end =
		_2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + pctrlr->cmb_size);
	pctrlr->cmb_mem_register_addr = (void *)mem_register_start;
	pctrlr->cmb_mem_register_size = mem_register_end - mem_register_start;

	rc = spdk_mem_register(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
	if (rc) {
		SPDK_ERRLOG("spdk_mem_register() failed\n");
		goto exit;
	}
	pctrlr->cmb_current_offset = mem_register_start - ((uint64_t)pctrlr->cmb_bar_virt_addr);
	pctrlr->cmb_max_offset = mem_register_end - ((uint64_t)pctrlr->cmb_bar_virt_addr);
	pctrlr->cmb_io_data_supported = true;

	return;
exit:
	pctrlr->cmb_bar_virt_addr = NULL;
	pctrlr->ctrlr.opts.use_cmb_sqs = false;
	return;
}

static int
nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc = 0;
	union spdk_nvme_cmbloc_register cmbloc;
	void *addr = pctrlr->cmb_bar_virt_addr;

	if (addr) {
		if (pctrlr->cmb_mem_register_addr) {
			spdk_mem_unregister(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
		}

		if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
			SPDK_ERRLOG("get_cmbloc() failed\n");
			return -EIO;
		}
		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
	}
	return rc;
}

static int
nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned,
			  uint64_t *offset)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	uint64_t round_offset;

	round_offset = pctrlr->cmb_current_offset;
	round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);

	/* CMB may only consume part of the BAR, calculate accordingly */
	if (round_offset + length > pctrlr->cmb_max_offset) {
		SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
		return -1;
	}

	*offset = round_offset;
	pctrlr->cmb_current_offset = round_offset + length;

	return 0;
}

volatile struct spdk_nvme_registers *
nvme_pcie_ctrlr_get_registers(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	return pctrlr->regs;
}

void *
nvme_pcie_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	uint64_t offset;

	if (pctrlr->cmb_bar_virt_addr == NULL) {
		SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n");
		return NULL;
	}

	if (!pctrlr->cmb_io_data_supported) {
		SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB doesn't support I/O data\n");
		return NULL;
	}

	if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, size, 4, &offset) != 0) {
		SPDK_DEBUGLOG(SPDK_LOG_NVME, "%zu-byte CMB allocation failed\n", size);
		return NULL;
	}

	return pctrlr->cmb_bar_virt_addr + offset;
}

int
nvme_pcie_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
{
	/*
	 * Do nothing for now.
	 * TODO: Track free space so buffers may be reused.
	 */
	SPDK_ERRLOG("%s: no deallocation for CMB buffers yet!\n",
		    __func__);
	return 0;
}

static int
nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc;
	void *addr;
	uint64_t phys_addr, size;

	rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
				     &phys_addr, &size);
	pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
	if ((pctrlr->regs == NULL) || (rc != 0)) {
		SPDK_ERRLOG("spdk_pci_device_map_bar failed with rc %d or bar %p\n",
			    rc, pctrlr->regs);
		return -1;
	}

	pctrlr->regs_size = size;
	nvme_pcie_ctrlr_map_cmb(pctrlr);

	return 0;
}

static int
nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc = 0;
	void *addr = (void *)pctrlr->regs;

	if (pctrlr->ctrlr.is_removed) {
		return rc;
	}

	rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
	if (rc != 0) {
		SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc);
		return -1;
	}

	if (addr) {
		/* NOTE: addr may have been remapped here. We're relying on DPDK to call
		 * munmap internally.
		 */
		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
	}
	return rc;
}

static int
nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_qpair *pqpair;
	int rc;

	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
	if (pqpair == NULL) {
		return -ENOMEM;
	}

	pqpair->num_entries = NVME_ADMIN_ENTRIES;
	pqpair->flags.delay_pcie_doorbell = 0;

	ctrlr->adminq = &pqpair->qpair;

	rc = nvme_qpair_init(ctrlr->adminq,
			     0, /* qpair ID */
			     ctrlr,
			     SPDK_NVME_QPRIO_URGENT,
			     NVME_ADMIN_ENTRIES);
	if (rc != 0) {
		return rc;
	}

	return nvme_pcie_qpair_construct(ctrlr->adminq, NULL);
}

/* This function must only be called while holding g_spdk_nvme_driver->lock */
static int
pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
{
	struct spdk_nvme_transport_id trid = {};
	struct nvme_pcie_enum_ctx *enum_ctx = ctx;
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_pci_addr pci_addr;

	pci_addr = spdk_pci_device_get_addr(pci_dev);

	trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
	spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);

	ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
	if (!spdk_process_is_primary()) {
		if (!ctrlr) {
			SPDK_ERRLOG("Controller must be constructed in the primary process first.\n");
			return -1;
		}

		return nvme_ctrlr_add_process(ctrlr, pci_dev);
	}

	/* check whether the user passed a pci_addr to filter on */
	if (enum_ctx->has_pci_addr &&
	    (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
		return 1;
	}

	return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev);
}

int
nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
		     bool direct_connect)
{
	struct nvme_pcie_enum_ctx enum_ctx = {};

	enum_ctx.probe_ctx = probe_ctx;

	if (strlen(probe_ctx->trid.traddr) != 0) {
		if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) {
			return -1;
		}
		enum_ctx.has_pci_addr = true;
	}

	if (hotplug_fd < 0) {
		hotplug_fd = spdk_uevent_connect();
		if (hotplug_fd < 0) {
			SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n");
		}
	} else {
		_nvme_pcie_hotplug_monitor(probe_ctx);
	}

	if (enum_ctx.has_pci_addr == false) {
		return spdk_pci_enumerate(spdk_pci_nvme_get_driver(),
					  pcie_nvme_enum_cb, &enum_ctx);
	} else {
		return spdk_pci_device_attach(spdk_pci_nvme_get_driver(),
					      pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
	}
}

static int
nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr)
{
	struct nvme_pcie_enum_ctx enum_ctx;

	enum_ctx.probe_ctx = probe_ctx;
	enum_ctx.has_pci_addr = true;
	enum_ctx.pci_addr = *pci_addr;

	return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx);
}

struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
		const struct spdk_nvme_ctrlr_opts *opts,
		void *devhandle)
{
	struct spdk_pci_device *pci_dev = devhandle;
	struct nvme_pcie_ctrlr *pctrlr;
	union spdk_nvme_cap_register cap;
	union spdk_nvme_vs_register vs;
	uint32_t cmd_reg;
	int rc, claim_fd;
	struct spdk_pci_id pci_id;
	struct spdk_pci_addr pci_addr;

	if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
		SPDK_ERRLOG("could not parse pci address\n");
		return NULL;
	}

	claim_fd = spdk_pci_device_claim(&pci_addr);
	if (claim_fd < 0) {
		SPDK_ERRLOG("could not claim device %s\n", trid->traddr);
		return NULL;
	}

	pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL,
			      SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
	if (pctrlr == NULL) {
		close(claim_fd);
		SPDK_ERRLOG("could not allocate ctrlr\n");
		return NULL;
	}

	pctrlr->is_remapped = false;
	pctrlr->ctrlr.is_removed = false;
	pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
	pctrlr->devhandle = devhandle;
	pctrlr->ctrlr.opts = *opts;
	pctrlr->claim_fd = claim_fd;
	memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid));

	rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
	if (rc != 0) {
		close(claim_fd);
		spdk_free(pctrlr);
		return NULL;
	}

	/* Enable PCI busmaster and disable INTx */
	spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
	cmd_reg |= 0x404;
	spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);

	if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
		SPDK_ERRLOG("get_cap() failed\n");
		close(claim_fd);
		spdk_free(pctrlr);
		return NULL;
	}

	if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) {
		SPDK_ERRLOG("get_vs() failed\n");
		close(claim_fd);
		spdk_free(pctrlr);
		return NULL;
	}

	nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs);

	/* Doorbell stride is 2 ^ (dstrd + 2) bytes.  We want the stride in
	 * uint32_t (4-byte) units, so drop the + 2.
	 */
	pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;

	rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
	if (rc != 0) {
		nvme_ctrlr_destruct(&pctrlr->ctrlr);
		return NULL;
	}

	pci_id = spdk_pci_device_get_id(pci_dev);
	pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);

	rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr);
	if (rc != 0) {
		nvme_ctrlr_destruct(&pctrlr->ctrlr);
		return NULL;
	}

	/* Construct the primary process properties */
	rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
	if (rc != 0) {
		nvme_ctrlr_destruct(&pctrlr->ctrlr);
		return NULL;
	}

	if (g_sigset != true) {
		nvme_pcie_ctrlr_setup_signal();
		g_sigset = true;
	}

	return &pctrlr->ctrlr;
}

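/*
 * Program the admin queue hardware registers from the admin qpair allocated in
 * nvme_pcie_ctrlr_construct_admin_qpair(): ASQ/ACQ receive the submission and
 * completion ring bus addresses, and AQA receives the ring sizes (0-based).
 */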
int
nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
	union spdk_nvme_aqa_register aqa;

	if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
		SPDK_ERRLOG("set_asq() failed\n");
		return -EIO;
	}

	if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
		SPDK_ERRLOG("set_acq() failed\n");
		return -EIO;
	}

	aqa.raw = 0;
	/* acqs and asqs are 0-based. */
	aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
	aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;

	if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
		SPDK_ERRLOG("set_aqa() failed\n");
		return -EIO;
	}

	return 0;
}

int
nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr);

	close(pctrlr->claim_fd);

	if (ctrlr->adminq) {
		nvme_pcie_qpair_destroy(ctrlr->adminq);
	}

	nvme_ctrlr_destruct_finish(ctrlr);

	nvme_ctrlr_free_processes(ctrlr);

	nvme_pcie_ctrlr_free_bars(pctrlr);

	if (devhandle) {
		spdk_pci_device_detach(devhandle);
	}

	spdk_free(pctrlr);

	return 0;
}

static void
nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
{
	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
	tr->cid = cid;
	tr->req = NULL;
}

int
nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	/* all head/tail vals are set to 0 */
	pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0;

	/*
	 * First time through the completion queue, HW will set the phase
	 * bit on completions to 1.  So set this to 1 here, indicating
	 * we're looking for a 1 to know which entries have completed.
	 * We'll toggle the bit each time the completion queue rolls over.
	 */
	pqpair->flags.phase = 1;

	memset(pqpair->cmd, 0,
	       pqpair->num_entries * sizeof(struct spdk_nvme_cmd));
	memset(pqpair->cpl, 0,
	       pqpair->num_entries * sizeof(struct spdk_nvme_cpl));

	return 0;
}

static int
nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
			  const struct spdk_nvme_io_qpair_opts *opts)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;
	uint16_t i;
	volatile uint32_t *doorbell_base;
	uint64_t offset;
	uint16_t num_trackers;
	size_t page_align = VALUE_2MB;
	uint32_t flags = SPDK_MALLOC_DMA;
	uint64_t sq_paddr = 0;
	uint64_t cq_paddr = 0;

	if (opts) {
		pqpair->sq_vaddr = opts->sq.vaddr;
		pqpair->cq_vaddr = opts->cq.vaddr;
		sq_paddr = opts->sq.paddr;
		cq_paddr = opts->cq.paddr;
	}

	pqpair->retry_count = ctrlr->opts.transport_retry_count;

	/*
	 * Limit the maximum number of completions to return per call to prevent wraparound,
	 * and calculate how many trackers can be submitted at once without overflowing the
	 * completion queue.
	 */
	pqpair->max_completions_cap = pqpair->num_entries / 4;
	pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
	pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
	num_trackers = pqpair->num_entries - pqpair->max_completions_cap;

	SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
		     pqpair->max_completions_cap, num_trackers);

	assert(num_trackers != 0);

	pqpair->sq_in_cmb = false;

	if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
		flags |= SPDK_MALLOC_SHARE;
	}

	/* cmd and cpl rings must be aligned on page size boundaries. */
	if (ctrlr->opts.use_cmb_sqs) {
		if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
					      sysconf(_SC_PAGESIZE), &offset) == 0) {
			pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset;
			pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset;
			pqpair->sq_in_cmb = true;
		}
	}

	if (pqpair->sq_in_cmb == false) {
		if (pqpair->sq_vaddr) {
			pqpair->cmd = pqpair->sq_vaddr;
		} else {
			/* To ensure physical address contiguity we make each ring occupy
			 * a single hugepage only.  See MAX_IO_QUEUE_ENTRIES.
			 */
			pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
						   page_align, NULL,
						   SPDK_ENV_SOCKET_ID_ANY, flags);
			if (pqpair->cmd == NULL) {
				SPDK_ERRLOG("alloc qpair_cmd failed\n");
				return -ENOMEM;
			}
		}
		if (sq_paddr) {
			assert(pqpair->sq_vaddr != NULL);
			pqpair->cmd_bus_addr = sq_paddr;
		} else {
			pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL);
			if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
				SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
				return -EFAULT;
			}
		}
	}

	if (pqpair->cq_vaddr) {
		pqpair->cpl = pqpair->cq_vaddr;
	} else {
		pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl),
					   page_align, NULL,
					   SPDK_ENV_SOCKET_ID_ANY, flags);
		if (pqpair->cpl == NULL) {
			SPDK_ERRLOG("alloc qpair_cpl failed\n");
			return -ENOMEM;
		}
	}
	if (cq_paddr) {
		assert(pqpair->cq_vaddr != NULL);
		pqpair->cpl_bus_addr = cq_paddr;
	} else {
		pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL);
		if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
			SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
			return -EFAULT;
		}
	}

	doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl;
	pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
	pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;

	/*
	 * Reserve space for all of the trackers in a single allocation.
	 * struct nvme_tracker must be padded so that its size is already a power of 2.
	 * This ensures the PRP list embedded in the nvme_tracker object will not span a
	 * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
	 */
	pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
	if (pqpair->tr == NULL) {
		SPDK_ERRLOG("nvme_tr failed\n");
		return -ENOMEM;
	}

	TAILQ_INIT(&pqpair->free_tr);
	TAILQ_INIT(&pqpair->outstanding_tr);

	for (i = 0; i < num_trackers; i++) {
		tr = &pqpair->tr[i];
		nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL));
		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
	}

	nvme_pcie_qpair_reset(qpair);

	return 0;
}

static inline void
nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
{
	/* dst and src are known to be non-overlapping and 64-byte aligned. */
#if defined(__SSE2__)
	__m128i *d128 = (__m128i *)dst;
	const __m128i *s128 = (const __m128i *)src;

	_mm_stream_si128(&d128[0], _mm_load_si128(&s128[0]));
	_mm_stream_si128(&d128[1], _mm_load_si128(&s128[1]));
	_mm_stream_si128(&d128[2], _mm_load_si128(&s128[2]));
	_mm_stream_si128(&d128[3], _mm_load_si128(&s128[3]));
#else
	*dst = *src;
#endif
}

/**
 * Note: the ctrlr_lock must be held when calling this function.
 */
static void
nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
		struct nvme_request *req, struct spdk_nvme_cpl *cpl)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_request *active_req = req;
	struct spdk_nvme_ctrlr_process *active_proc;

	/*
	 * The admin request is from another process.  Move it to the per
	 * process list for that process to handle it later.
	 */
	assert(nvme_qpair_is_admin_queue(qpair));
	assert(active_req->pid != getpid());

	active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid);
	if (active_proc) {
		/* Save the original completion information */
		memcpy(&active_req->cpl, cpl, sizeof(*cpl));
		STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
	} else {
		SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
			    active_req->pid);

		nvme_free_request(active_req);
	}
}

/**
 * Note: the ctrlr_lock must be held when calling this function.
 */
static void
nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_request *req, *tmp_req;
	pid_t pid = getpid();
	struct spdk_nvme_ctrlr_process *proc;

	/*
	 * Check whether there is any pending admin request from
	 * other active processes.
	 */
	assert(nvme_qpair_is_admin_queue(qpair));

	proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
	if (!proc) {
		SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
		assert(proc);
		return;
	}

	STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
		STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);

		assert(req->pid == pid);

		nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl);
		nvme_free_request(req);
	}
}

/* Wrap-around-safe check (16-bit arithmetic): did the doorbell move from 'old'
 * to 'new_idx' past the event index the controller asked to be notified at?
 */
static inline int
nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
{
	return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
}

static bool
nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value,
				     volatile uint32_t *shadow_db,
				     volatile uint32_t *eventidx)
{
	uint16_t old;

	if (!shadow_db) {
		return true;
	}

	old = *shadow_db;
	*shadow_db = value;

	if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) {
		return false;
	}

	return true;
}

static inline void
nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
	bool need_mmio = true;

	if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
		need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
				pqpair->sq_tail,
				pqpair->shadow_doorbell.sq_tdbl,
				pqpair->shadow_doorbell.sq_eventidx);
	}

	if (spdk_likely(need_mmio)) {
		spdk_wmb();
		g_thread_mmio_ctrlr = pctrlr;
		spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
		g_thread_mmio_ctrlr = NULL;
	}
}

static inline void
nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
	bool need_mmio = true;

	if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
		need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
				pqpair->cq_head,
				pqpair->shadow_doorbell.cq_hdbl,
				pqpair->shadow_doorbell.cq_eventidx);
	}

	if (spdk_likely(need_mmio)) {
		g_thread_mmio_ctrlr = pctrlr;
		spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
		g_thread_mmio_ctrlr = NULL;
	}
}

static void
nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
{
	struct nvme_request *req;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	req = tr->req;
	assert(req != NULL);

	/* Copy the command from the tracker to the submission queue. */
	nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);

	if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
		pqpair->sq_tail = 0;
	}

	if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
		SPDK_ERRLOG("sq_tail is passing sq_head!\n");
	}

	if (!pqpair->flags.delay_pcie_doorbell) {
		nvme_pcie_qpair_ring_sq_doorbell(qpair);
	}
}

static void
nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
				 struct spdk_nvme_cpl *cpl, bool print_on_error)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_request *req;
	bool retry, error;
	bool req_from_current_proc = true;

	req = tr->req;

	assert(req != NULL);

	error = spdk_nvme_cpl_is_error(cpl);
	retry = error && nvme_completion_is_retry(cpl) &&
		req->retries < pqpair->retry_count;

	if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) {
		spdk_nvme_qpair_print_command(qpair, &req->cmd);
		spdk_nvme_qpair_print_completion(qpair, cpl);
	}

	assert(cpl->cid == req->cmd.cid);

	if (retry) {
		req->retries++;
		nvme_pcie_qpair_submit_tracker(qpair, tr);
	} else {
		/* Only check admin requests from different processes. */
		if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
			req_from_current_proc = false;
			nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
		} else {
			nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl);
		}

		if (req_from_current_proc == true) {
			nvme_qpair_free_request(qpair, req);
		}

		tr->req = NULL;

		TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);

		/*
		 * If the controller is in the middle of resetting, don't
		 * try to submit queued requests here - let the reset logic
		 * handle that instead.
		 */
		if (!STAILQ_EMPTY(&qpair->queued_req) &&
		    !qpair->ctrlr->is_resetting) {
			req = STAILQ_FIRST(&qpair->queued_req);
			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
			nvme_qpair_submit_request(qpair, req);
		}
	}
}

static void
nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
					struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
					bool print_on_error)
{
	struct spdk_nvme_cpl cpl;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
	cpl.status.sct = sct;
	cpl.status.sc = sc;
	cpl.status.dnr = dnr;
	nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
}

static void
nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr, *temp, *last;

	last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head);

	/* Abort previously submitted (outstanding) trs */
	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
		if (!qpair->ctrlr->opts.disable_error_logging) {
			SPDK_ERRLOG("aborting outstanding command\n");
		}
		nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
							SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);

		if (tr == last) {
			break;
		}
	}
}

void
nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;

	tr = TAILQ_FIRST(&pqpair->outstanding_tr);
	while (tr != NULL) {
		assert(tr->req != NULL);
		if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
			nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
					SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
					false);
			tr = TAILQ_FIRST(&pqpair->outstanding_tr);
		} else {
			tr = TAILQ_NEXT(tr, tq_list);
		}
	}
}

static void
nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
{
	nvme_pcie_admin_qpair_abort_aers(qpair);
}

static int
nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	if (nvme_qpair_is_admin_queue(qpair)) {
		nvme_pcie_admin_qpair_destroy(qpair);
	}
	/*
	 * We check sq_vaddr and cq_vaddr to see if the user specified the memory
	 * buffers when creating the I/O queue.
	 * If the user specified them, we cannot free that memory.
	 * Nor do we free it if it's in the CMB.
	 */
	if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) {
		spdk_free(pqpair->cmd);
	}
	if (!pqpair->cq_vaddr && pqpair->cpl) {
		spdk_free(pqpair->cpl);
	}
	if (pqpair->tr) {
		spdk_free(pqpair->tr);
	}

	nvme_qpair_deinit(qpair);

	spdk_free(pqpair);

	return 0;
}

void
nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
{
	nvme_pcie_qpair_abort_trackers(qpair, dnr);
}

static int
nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
				 void *cb_arg)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;

	/*
	 * TODO: create a create io completion queue command data
	 * structure.
	 */
	cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
	/*
	 * 0x2 = interrupts enabled
	 * 0x1 = physically contiguous
	 */
	cmd->cdw11 = 0x1;
	cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static int
nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;

	/*
	 * TODO: create a create io submission queue command data
	 * structure.
	 */
	cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
	/* 0x1 = physically contiguous */
	cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1;
	cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static int
nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
	cmd->cdw10 = qpair->id;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static int
nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
	cmd->cdw10 = qpair->id;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static int
_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 uint16_t qid)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_completion_poll_status status;
	int rc;

	rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
	if (rc != 0) {
		return rc;
	}

	if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
		SPDK_ERRLOG("nvme_create_io_cq failed!\n");
		return -1;
	}

	rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
	if (rc != 0) {
		return rc;
	}

	if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
		SPDK_ERRLOG("nvme_create_io_sq failed!\n");
		/* Attempt to delete the completion queue */
		rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
		if (rc != 0) {
			return -1;
		}
		spdk_nvme_wait_for_completion(ctrlr->adminq, &status);
		return -1;
	}

	if (ctrlr->shadow_doorbell) {
		pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
						  pctrlr->doorbell_stride_u32;
		pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
						  pctrlr->doorbell_stride_u32;
		pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
						      pctrlr->doorbell_stride_u32;
		pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
						      pctrlr->doorbell_stride_u32;
		pqpair->flags.has_shadow_doorbell = 1;
	} else {
		pqpair->flags.has_shadow_doorbell = 0;
	}
	nvme_pcie_qpair_reset(qpair);

	return 0;
}

struct spdk_nvme_qpair *
nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
				const struct spdk_nvme_io_qpair_opts *opts)
{
	struct nvme_pcie_qpair *pqpair;
	struct spdk_nvme_qpair *qpair;
	int rc;

	assert(ctrlr != NULL);

	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 1631 if (pqpair == NULL) { 1632 return NULL; 1633 } 1634 1635 pqpair->num_entries = opts->io_queue_size; 1636 pqpair->flags.delay_pcie_doorbell = opts->delay_pcie_doorbell; 1637 1638 qpair = &pqpair->qpair; 1639 1640 rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests); 1641 if (rc != 0) { 1642 nvme_pcie_qpair_destroy(qpair); 1643 return NULL; 1644 } 1645 1646 rc = nvme_pcie_qpair_construct(qpair, opts); 1647 1648 if (rc != 0) { 1649 nvme_pcie_qpair_destroy(qpair); 1650 return NULL; 1651 } 1652 1653 rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid); 1654 1655 if (rc != 0) { 1656 SPDK_ERRLOG("I/O queue creation failed\n"); 1657 nvme_pcie_qpair_destroy(qpair); 1658 return NULL; 1659 } 1660 1661 return qpair; 1662 } 1663 1664 int 1665 nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1666 { 1667 if (nvme_qpair_is_admin_queue(qpair)) { 1668 return 0; 1669 } else { 1670 return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id); 1671 } 1672 } 1673 1674 void 1675 nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1676 { 1677 } 1678 1679 int 1680 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1681 { 1682 struct nvme_completion_poll_status status; 1683 int rc; 1684 1685 assert(ctrlr != NULL); 1686 1687 if (ctrlr->is_removed) { 1688 goto free; 1689 } 1690 1691 /* Delete the I/O submission queue */ 1692 rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); 1693 if (rc != 0) { 1694 SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc); 1695 return rc; 1696 } 1697 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1698 return -1; 1699 } 1700 1701 /* Delete the completion queue */ 1702 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); 1703 if (rc != 0) { 1704 SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc); 1705 return rc; 1706 } 1707 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1708 return -1; 1709 } 1710 1711 free: 1712 if (qpair->no_deletion_notification_needed == 0) { 1713 /* Abort the rest of the I/O */ 1714 nvme_pcie_qpair_abort_trackers(qpair, 1); 1715 } 1716 1717 nvme_pcie_qpair_destroy(qpair); 1718 return 0; 1719 } 1720 1721 static void 1722 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) 1723 { 1724 /* 1725 * Bad vtophys translation, so abort this request and return 1726 * immediately. 1727 */ 1728 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, 1729 SPDK_NVME_SC_INVALID_FIELD, 1730 1 /* do not retry */, true); 1731 } 1732 1733 /* 1734 * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes. 1735 * 1736 * *prp_index will be updated to account for the number of PRP entries used. 
1737 */ 1738 static inline int 1739 nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len, 1740 uint32_t page_size) 1741 { 1742 struct spdk_nvme_cmd *cmd = &tr->req->cmd; 1743 uintptr_t page_mask = page_size - 1; 1744 uint64_t phys_addr; 1745 uint32_t i; 1746 1747 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n", 1748 *prp_index, virt_addr, (uint32_t)len); 1749 1750 if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) { 1751 SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); 1752 return -EINVAL; 1753 } 1754 1755 i = *prp_index; 1756 while (len) { 1757 uint32_t seg_len; 1758 1759 /* 1760 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array, 1761 * so prp_index == count is valid. 1762 */ 1763 if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) { 1764 SPDK_ERRLOG("out of PRP entries\n"); 1765 return -EINVAL; 1766 } 1767 1768 phys_addr = spdk_vtophys(virt_addr, NULL); 1769 if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) { 1770 SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr); 1771 return -EINVAL; 1772 } 1773 1774 if (i == 0) { 1775 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr); 1776 cmd->dptr.prp.prp1 = phys_addr; 1777 seg_len = page_size - ((uintptr_t)virt_addr & page_mask); 1778 } else { 1779 if ((phys_addr & page_mask) != 0) { 1780 SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr); 1781 return -EINVAL; 1782 } 1783 1784 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr); 1785 tr->u.prp[i - 1] = phys_addr; 1786 seg_len = page_size; 1787 } 1788 1789 seg_len = spdk_min(seg_len, len); 1790 virt_addr += seg_len; 1791 len -= seg_len; 1792 i++; 1793 } 1794 1795 cmd->psdt = SPDK_NVME_PSDT_PRP; 1796 if (i <= 1) { 1797 cmd->dptr.prp.prp2 = 0; 1798 } else if (i == 2) { 1799 cmd->dptr.prp.prp2 = tr->u.prp[0]; 1800 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2); 1801 } else { 1802 cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr; 1803 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2); 1804 } 1805 1806 *prp_index = i; 1807 return 0; 1808 } 1809 1810 /** 1811 * Build PRP list describing physically contiguous payload buffer. 1812 */ 1813 static int 1814 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1815 struct nvme_tracker *tr) 1816 { 1817 uint32_t prp_index = 0; 1818 int rc; 1819 1820 rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset, 1821 req->payload_size, qpair->ctrlr->page_size); 1822 if (rc) { 1823 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1824 return rc; 1825 } 1826 1827 return 0; 1828 } 1829 1830 /** 1831 * Build SGL list describing scattered payload buffer. 1832 */ 1833 static int 1834 nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1835 struct nvme_tracker *tr) 1836 { 1837 int rc; 1838 void *virt_addr; 1839 uint64_t phys_addr; 1840 uint32_t remaining_transfer_len, remaining_user_sge_len, length; 1841 struct spdk_nvme_sgl_descriptor *sgl; 1842 uint32_t nseg = 0; 1843 1844 /* 1845 * Build scattered payloads. 
1846 */ 1847 assert(req->payload_size != 0); 1848 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); 1849 assert(req->payload.reset_sgl_fn != NULL); 1850 assert(req->payload.next_sge_fn != NULL); 1851 req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); 1852 1853 sgl = tr->u.sgl; 1854 req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; 1855 req->cmd.dptr.sgl1.unkeyed.subtype = 0; 1856 1857 remaining_transfer_len = req->payload_size; 1858 1859 while (remaining_transfer_len > 0) { 1860 rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, 1861 &virt_addr, &remaining_user_sge_len); 1862 if (rc) { 1863 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1864 return -1; 1865 } 1866 1867 remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); 1868 remaining_transfer_len -= remaining_user_sge_len; 1869 while (remaining_user_sge_len > 0) { 1870 if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { 1871 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1872 return -1; 1873 } 1874 1875 phys_addr = spdk_vtophys(virt_addr, NULL); 1876 if (phys_addr == SPDK_VTOPHYS_ERROR) { 1877 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1878 return -1; 1879 } 1880 1881 length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr)); 1882 remaining_user_sge_len -= length; 1883 virt_addr += length; 1884 1885 if (nseg > 0 && phys_addr == 1886 (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) { 1887 /* extend previous entry */ 1888 (*(sgl - 1)).unkeyed.length += length; 1889 continue; 1890 } 1891 1892 sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; 1893 sgl->unkeyed.length = length; 1894 sgl->address = phys_addr; 1895 sgl->unkeyed.subtype = 0; 1896 1897 sgl++; 1898 nseg++; 1899 } 1900 } 1901 1902 if (nseg == 1) { 1903 /* 1904 * The whole transfer can be described by a single SGL descriptor. 1905 * Use the special case described by the spec where SGL1's type is Data Block. 1906 * This means the SGL in the tracker is not used at all, so copy the first (and only) 1907 * SGL element into SGL1. 1908 */ 1909 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; 1910 req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; 1911 req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; 1912 } else { 1913 /* For now we can only support 1 SGL segment in NVMe controller */ 1914 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; 1915 req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; 1916 req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); 1917 } 1918 1919 return 0; 1920 } 1921 1922 /** 1923 * Build PRP list describing scattered payload buffer. 1924 */ 1925 static int 1926 nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1927 struct nvme_tracker *tr) 1928 { 1929 int rc; 1930 void *virt_addr; 1931 uint32_t remaining_transfer_len, length; 1932 uint32_t prp_index = 0; 1933 uint32_t page_size = qpair->ctrlr->page_size; 1934 1935 /* 1936 * Build scattered payloads. 

/**
 * Build PRP list describing scattered payload buffer.
 */
static int
nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				       struct nvme_tracker *tr)
{
	int rc;
	void *virt_addr;
	uint32_t remaining_transfer_len, length;
	uint32_t prp_index = 0;
	uint32_t page_size = qpair->ctrlr->page_size;

	/*
	 * Build scattered payloads.
	 */
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	remaining_transfer_len = req->payload_size;
	while (remaining_transfer_len > 0) {
		assert(req->payload.next_sge_fn != NULL);
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}

		length = spdk_min(remaining_transfer_len, length);

		/*
		 * Any incompatible SGEs should have been handled in the splitting routine,
		 * but assert here as an additional check.
		 *
		 * All SGEs except the last must end on a page boundary.
		 */
		assert((length == remaining_transfer_len) ||
		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));

		rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return rc;
		}

		remaining_transfer_len -= length;
	}

	return 0;
}
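
/*
 * Illustrative sketch of the PRP-from-SGL constraints above (hypothetical,
 * 4 KiB page size, buffers assumed page aligned in both virtual and physical
 * address space):
 *
 *	SGE 0: 8 KiB at a page-aligned address   -> valid: ends on a page boundary
 *	SGE 1: 2 KiB at a page-aligned address   -> valid: last SGE may end anywhere
 *
 * A 6 KiB first SGE followed by another SGE would trip the assert, because it
 * does not end on a page boundary; similarly, nvme_pcie_prp_list_append()
 * rejects any PRP entry after the first whose physical address is not page
 * aligned.
 */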

int
nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int rc = 0;
	void *md_payload;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	tr = TAILQ_FIRST(&pqpair->free_tr);

	if (tr == NULL) {
		/*
		 * Put the request on the qpair's request queue to be
		 * processed when a tracker frees up via a command
		 * completion.
		 */
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		goto exit;
	}

	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
	tr->req = req;
	tr->cb_fn = req->cb_fn;
	tr->cb_arg = req->cb_arg;
	req->cmd.cid = tr->cid;

	if (req->payload_size && req->payload.md) {
		md_payload = req->payload.md + req->md_offset;
		tr->req->cmd.mptr = spdk_vtophys(md_payload, NULL);
		if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			rc = -EINVAL;
			goto exit;
		}
	}

	if (req->payload_size == 0) {
		/* Null payload - leave PRP fields untouched */
		rc = 0;
	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
		rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr);
	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
		if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
			rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr);
		} else {
			rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr);
		}
	} else {
		assert(0);
		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
		rc = -EINVAL;
	}

	if (rc < 0) {
		goto exit;
	}

	nvme_pcie_qpair_submit_tracker(qpair, tr);

exit:
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return rc;
}

static void
nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
{
	uint64_t t02;
	struct nvme_tracker *tr, *tmp;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct spdk_nvme_ctrlr_process *active_proc;

	/* Don't check timeouts during controller initialization. */
	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
		return;
	}

	if (nvme_qpair_is_admin_queue(qpair)) {
		active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
	} else {
		active_proc = qpair->active_proc;
	}

	/* Only check timeouts if the current process has a timeout callback. */
	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
		return;
	}

	t02 = spdk_get_ticks();
	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
		assert(tr->req != NULL);

		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
			/*
			 * The requests are in order, so as soon as one has not timed out,
			 * stop iterating.
			 */
			break;
		}
	}
}
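
/*
 * Illustrative walk-through (hypothetical queue of num_entries == 4) of the
 * phase bit handling in nvme_pcie_qpair_process_completions() below.  The host
 * starts with flags.phase == 1 and the controller tags its first lap of
 * completion entries with p == 1, so an entry whose phase bit matches
 * flags.phase is new and one that does not match terminates the loop.  When
 * cq_head advances past the last slot it wraps to 0 and flags.phase flips, so
 * stale entries from the previous lap are not mistaken for new completions:
 *
 *	slot:          0  1  2  3
 *	lap 1 p bits:  1  1  1  1   (consumed while flags.phase == 1)
 *	lap 2 p bits:  0  0  0  0   (consumed while flags.phase == 0 after wrap)
 */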

int32_t
nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;
	struct spdk_nvme_cpl *cpl, *next_cpl;
	uint32_t num_completions = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	uint16_t next_cq_head;
	uint8_t next_phase;
	bool next_is_valid = false;

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
		/*
		 * max_completions == 0 means unlimited, but complete at most one
		 * max_completions_cap-sized batch of I/O at a time so that the
		 * completion queue doorbells don't wrap around.
		 */
		max_completions = pqpair->max_completions_cap;
	}

	while (1) {
		cpl = &pqpair->cpl[pqpair->cq_head];

		if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
			break;
		}

		if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
			next_cq_head = pqpair->cq_head + 1;
			next_phase = pqpair->flags.phase;
		} else {
			next_cq_head = 0;
			next_phase = !pqpair->flags.phase;
		}
		next_cpl = &pqpair->cpl[next_cq_head];
		next_is_valid = (next_cpl->status.p == next_phase);
		if (next_is_valid) {
			__builtin_prefetch(&pqpair->tr[next_cpl->cid]);
		}

#ifdef __PPC64__
		/*
		 * This memory barrier prevents reordering of:
		 * - load after store from/to tr
		 * - load after load cpl phase and cpl cid
		 */
		spdk_mb();
#elif defined(__aarch64__)
		__asm volatile("dmb oshld" ::: "memory");
#endif

		if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
			pqpair->cq_head = 0;
			pqpair->flags.phase = !pqpair->flags.phase;
		}

		tr = &pqpair->tr[cpl->cid];
		/* Prefetch the req's STAILQ_ENTRY since we'll need to access it
		 * as part of putting the req back on the qpair's free list.
		 */
		__builtin_prefetch(&tr->req->stailq);
		pqpair->sq_head = cpl->sqhd;

		if (tr->req) {
			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
		} else {
			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
			spdk_nvme_qpair_print_completion(qpair, cpl);
			assert(0);
		}

		if (++num_completions == max_completions) {
			break;
		}
	}

	if (num_completions > 0) {
		nvme_pcie_qpair_ring_cq_doorbell(qpair);
	}

	if (pqpair->flags.delay_pcie_doorbell) {
		if (pqpair->last_sq_tail != pqpair->sq_tail) {
			nvme_pcie_qpair_ring_sq_doorbell(qpair);
			pqpair->last_sq_tail = pqpair->sq_tail;
		}
	}

	if (spdk_unlikely(ctrlr->timeout_enabled)) {
		/*
		 * The user registered a timeout callback.
		 */
		nvme_pcie_qpair_check_timeout(qpair);
	}

	/* Before returning, complete any pending admin request. */
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_pcie_qpair_complete_pending_admin_request(qpair);

		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return num_completions;
}
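
/*
 * Usage sketch (illustrative only, not part of this file): an application
 * reaches the submission and completion paths above through the public API.
 * spdk_nvme_ns_cmd_read() ends up in nvme_pcie_qpair_submit_request() via the
 * transport, and each spdk_nvme_qpair_process_completions() call polls the
 * completion queue as implemented in nvme_pcie_qpair_process_completions().
 * The ns, qpair, buf, and lba variables are assumed to be set up elsewhere.
 *
 *	static void
 *	read_done(void *cb_arg, const struct spdk_nvme_cpl *cpl)
 *	{
 *		*(bool *)cb_arg = true;
 *	}
 *
 *	bool done = false;
 *	int rc;
 *
 *	rc = spdk_nvme_ns_cmd_read(ns, qpair, buf, lba, 1, read_done, &done, 0);
 *	while (rc == 0 && !done) {
 *		spdk_nvme_qpair_process_completions(qpair, 0);
 *	}
 */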