/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   Copyright (c) 2017, IBM Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NVMe over PCIe transport
 */

#include "spdk/stdinc.h"
#include "spdk/env.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "nvme_internal.h"
#include "nvme_uevent.h"

/*
 * Number of completion queue entries to process before ringing the
 * completion queue doorbell.
 */
#define NVME_MIN_COMPLETIONS	(1)
#define NVME_MAX_COMPLETIONS	(128)

#define NVME_ADMIN_ENTRIES	(128)

/*
 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
 * segment.
 */
#define NVME_MAX_SGL_DESCRIPTORS	(251)

#define NVME_MAX_PRP_LIST_ENTRIES	(505)

struct nvme_pcie_enum_ctx {
	struct spdk_nvme_probe_ctx *probe_ctx;
	struct spdk_pci_addr pci_addr;
	bool has_pci_addr;
};

/* PCIe transport extensions for spdk_nvme_ctrlr */
struct nvme_pcie_ctrlr {
	struct spdk_nvme_ctrlr ctrlr;

	/** NVMe MMIO register space */
	volatile struct spdk_nvme_registers *regs;

	/** NVMe MMIO register size */
	uint64_t regs_size;

	/* BAR mapping address which contains controller memory buffer */
	void *cmb_bar_virt_addr;

	/* BAR physical address which contains controller memory buffer */
	uint64_t cmb_bar_phys_addr;

	/* Controller memory buffer size in Bytes */
	uint64_t cmb_size;

	/* Current offset of controller memory buffer, relative to start of BAR virt addr */
	uint64_t cmb_current_offset;

	/* Last valid offset into CMB, this differs if CMB memory registration occurs or not */
	uint64_t cmb_max_offset;

	void *cmb_mem_register_addr;
	size_t cmb_mem_register_size;

	bool cmb_io_data_supported;

	/** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
	uint32_t doorbell_stride_u32;

	/* Opaque handle to associated PCI device. */
	struct spdk_pci_device *devhandle;

	/* Flag to indicate the MMIO register has been remapped */
	bool is_remapped;
};

struct nvme_tracker {
	TAILQ_ENTRY(nvme_tracker) tq_list;

	struct nvme_request *req;
	uint16_t cid;

	uint16_t rsvd0;
	uint32_t rsvd1;

	spdk_nvme_cmd_cb cb_fn;
	void *cb_arg;

	uint64_t prp_sgl_bus_addr;

	union {
		uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
		struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS];
	} u;
};
/*
 * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
 * and so that there is no padding required to meet alignment requirements.
 */
SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");

/* PCIe transport extensions for spdk_nvme_qpair */
struct nvme_pcie_qpair {
	/* Submission queue tail doorbell */
	volatile uint32_t *sq_tdbl;

	/* Completion queue head doorbell */
	volatile uint32_t *cq_hdbl;

	/* Submission queue */
	struct spdk_nvme_cmd *cmd;

	/* Completion queue */
	struct spdk_nvme_cpl *cpl;

	TAILQ_HEAD(, nvme_tracker) free_tr;
	TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;

	/* Array of trackers indexed by command ID. */
	struct nvme_tracker *tr;

	uint16_t num_entries;

	uint8_t retry_count;

	uint16_t max_completions_cap;

	uint16_t last_sq_tail;
	uint16_t sq_tail;
	uint16_t cq_head;
	uint16_t sq_head;

	struct {
		uint8_t phase : 1;
		uint8_t delay_pcie_doorbell : 1;
		uint8_t has_shadow_doorbell : 1;
	} flags;

	/*
	 * Base qpair structure.
	 * This is located after the hot data in this structure so that the important parts of
	 * nvme_pcie_qpair are in the same cache line.
	 */
	struct spdk_nvme_qpair qpair;

	struct {
		/* Submission queue shadow tail doorbell */
		volatile uint32_t *sq_tdbl;

		/* Completion queue shadow head doorbell */
		volatile uint32_t *cq_hdbl;

		/* Submission queue event index */
		volatile uint32_t *sq_eventidx;

		/* Completion queue event index */
		volatile uint32_t *cq_eventidx;
	} shadow_doorbell;

	/*
	 * Fields below this point should not be touched on the normal I/O path.
	 */

	bool sq_in_cmb;

	uint64_t cmd_bus_addr;
	uint64_t cpl_bus_addr;

	struct spdk_nvme_cmd *sq_vaddr;
	struct spdk_nvme_cpl *cq_vaddr;
};

static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx,
				  struct spdk_pci_addr *pci_addr);
static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
				     const struct spdk_nvme_io_qpair_opts *opts);
static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);

__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
static uint16_t g_signal_lock;
static bool g_sigset = false;
static int hotplug_fd = -1;

static void
nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx)
{
	void *map_address;
	uint16_t flag = 0;

	if (!__atomic_compare_exchange_n(&g_signal_lock, &flag, 1, false, __ATOMIC_ACQUIRE,
					 __ATOMIC_RELAXED)) {
		SPDK_DEBUGLOG(SPDK_LOG_NVME, "request g_signal_lock failed\n");
		return;
	}

	assert(g_thread_mmio_ctrlr != NULL);

	if (!g_thread_mmio_ctrlr->is_remapped) {
		map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
				   PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
		if (map_address == MAP_FAILED) {
			SPDK_ERRLOG("mmap failed\n");
			__atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE);
			return;
		}
		memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
		g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
		g_thread_mmio_ctrlr->is_remapped = true;
	}
	__atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE);
}

static void
nvme_pcie_ctrlr_setup_signal(void)
{
	struct sigaction sa;

	sa.sa_sigaction = nvme_sigbus_fault_sighandler;
	sigemptyset(&sa.sa_mask);
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);
}

static inline struct nvme_pcie_ctrlr *
nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
	assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
	return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
}

static int
_nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx)
{
	struct spdk_nvme_ctrlr *ctrlr, *tmp;
	struct spdk_uevent event;
	struct spdk_pci_addr pci_addr;
	union spdk_nvme_csts_register csts;
	struct spdk_nvme_ctrlr_process *proc;

	while (spdk_get_uevent(hotplug_fd, &event) > 0) {
		if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO ||
		    event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) {
			if (event.action == SPDK_NVME_UEVENT_ADD) {
				SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n",
					      event.traddr);
				if (spdk_process_is_primary()) {
					if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) {
						nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr);
					}
				}
			} else if (event.action == SPDK_NVME_UEVENT_REMOVE) {
				struct spdk_nvme_transport_id trid;

				memset(&trid, 0, sizeof(trid));
				trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
				snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr);

				ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
				if (ctrlr == NULL) {
					return 0;
				}
				SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n",
					      event.traddr);

				nvme_ctrlr_fail(ctrlr, true);

				/* get the user app to clean up and stop I/O */
				if (probe_ctx->remove_cb) {
					nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
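					/*
					 * The driver lock is released above so that the
					 * application's remove callback can safely call back
					 * into the driver (for example, to detach the
					 * controller); it is re-acquired once the callback
					 * returns.
					 */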
					probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
					nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
				}
			}
		}
	}

	/* This is a workaround for vfio-attached device hot remove detection. */
	TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) {
		bool do_remove = false;

		if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
			struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

			if (spdk_pci_device_is_removed(pctrlr->devhandle)) {
				do_remove = true;
			}
		}

		/* NVMe controller BAR must be mapped in the current process before any access. */
		proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
		if (proc) {
			csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
			if (csts.raw == 0xffffffffU) {
				do_remove = true;
			}
		}

		if (do_remove) {
			nvme_ctrlr_fail(ctrlr, true);
			if (probe_ctx->remove_cb) {
				nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
				probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
				nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
			}
		}
	}
	return 0;
}

static inline struct nvme_pcie_qpair *
nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
{
	assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE);
	return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair);
}

static volatile void *
nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	return (volatile void *)((uintptr_t)pctrlr->regs + offset);
}

int
nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
	g_thread_mmio_ctrlr = pctrlr;
	spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
	g_thread_mmio_ctrlr = NULL;
	return 0;
}

int
nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
	g_thread_mmio_ctrlr = pctrlr;
	spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
	g_thread_mmio_ctrlr = NULL;
	return 0;
}

int
nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
	assert(value != NULL);
	g_thread_mmio_ctrlr = pctrlr;
	*value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
	g_thread_mmio_ctrlr = NULL;
	if (~(*value) == 0) {
		return -1;
	}

	return 0;
}

int
nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
	assert(value != NULL);
	g_thread_mmio_ctrlr = pctrlr;
	*value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
	g_thread_mmio_ctrlr = NULL;
	if (~(*value) == 0) {
		return -1;
	}

	return 0;
}

static int
nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
{
	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
					 value);
}

static int
nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
{
	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
					 value);
}

static int
nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
{
	return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
					 aqa->raw);
}

static int
nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
					 &cmbloc->raw);
}

static int
nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
					 &cmbsz->raw);
}

uint32_t
nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
{
	/*
	 * For commands requiring more than 2 PRP entries, one PRP will be
	 * embedded in the command (prp1), and the rest of the PRP entries
	 * will be in a list pointed to by the command (prp2). This means
	 * that the real max number of PRP entries we support is 505+1, which
	 * results in a max xfer size of 505*ctrlr->page_size.
	 */
	return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size;
}

uint16_t
nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
{
	return NVME_MAX_SGL_DESCRIPTORS;
}

static void
nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc;
	void *addr;
	uint32_t bir;
	union spdk_nvme_cmbsz_register cmbsz;
	union spdk_nvme_cmbloc_register cmbloc;
	uint64_t size, unit_size, offset, bar_size, bar_phys_addr;
	uint64_t mem_register_start, mem_register_end;

	if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
	    nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
		SPDK_ERRLOG("get registers failed\n");
		goto exit;
	}

	if (!cmbsz.bits.sz) {
		goto exit;
	}

	bir = cmbloc.bits.bir;
	/* Values 0, 2, 3, 4, 5 are valid for BAR */
	if (bir > 5 || bir == 1) {
		goto exit;
	}

	/* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
	unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
	/* controller memory buffer size in Bytes */
	size = unit_size * cmbsz.bits.sz;
	/* controller memory buffer offset from BAR in Bytes */
	offset = unit_size * cmbloc.bits.ofst;

	rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
				     &bar_phys_addr, &bar_size);
	if ((rc != 0) || addr == NULL) {
		goto exit;
	}

	if (offset > bar_size) {
		goto exit;
	}

	if (size > bar_size - offset) {
		goto exit;
	}

	pctrlr->cmb_bar_virt_addr = addr;
	pctrlr->cmb_bar_phys_addr = bar_phys_addr;
	pctrlr->cmb_size = size;
	pctrlr->cmb_current_offset = offset;
	pctrlr->cmb_max_offset = offset + size;

	if (!cmbsz.bits.sqs) {
		pctrlr->ctrlr.opts.use_cmb_sqs = false;
	}

	/* If only SQS is supported use legacy mapping */
	if (cmbsz.bits.sqs && !(cmbsz.bits.wds || cmbsz.bits.rds)) {
		return;
	}

	/* If CMB is less than 4MiB in size then abort CMB mapping */
	if (pctrlr->cmb_size < (1ULL << 22)) {
		goto exit;
	}

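	/*
	 * spdk_mem_register() expects 2MB-aligned regions, so round the start
	 * of the usable CMB range up and the end down to hugepage boundaries
	 * before registering it.
	 */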
	mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + VALUE_2MB - 1);
	mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + pctrlr->cmb_size);
	pctrlr->cmb_mem_register_addr = (void *)mem_register_start;
	pctrlr->cmb_mem_register_size = mem_register_end - mem_register_start;

	rc = spdk_mem_register(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
	if (rc) {
		SPDK_ERRLOG("spdk_mem_register() failed\n");
		goto exit;
	}
	pctrlr->cmb_current_offset = mem_register_start - ((uint64_t)pctrlr->cmb_bar_virt_addr);
	pctrlr->cmb_max_offset = mem_register_end - ((uint64_t)pctrlr->cmb_bar_virt_addr);
	pctrlr->cmb_io_data_supported = true;

	return;
exit:
	pctrlr->cmb_bar_virt_addr = NULL;
	pctrlr->ctrlr.opts.use_cmb_sqs = false;
	return;
}

static int
nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc = 0;
	union spdk_nvme_cmbloc_register cmbloc;
	void *addr = pctrlr->cmb_bar_virt_addr;

	if (addr) {
		if (pctrlr->cmb_mem_register_addr) {
			spdk_mem_unregister(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
		}

		if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
			SPDK_ERRLOG("get_cmbloc() failed\n");
			return -EIO;
		}
		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
	}
	return rc;
}

static int
nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned,
			  uint64_t *offset)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	uint64_t round_offset;

	round_offset = pctrlr->cmb_current_offset;
	round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);

	/* CMB may only consume part of the BAR, calculate accordingly */
	if (round_offset + length > pctrlr->cmb_max_offset) {
		SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
		return -1;
	}

	*offset = round_offset;
	pctrlr->cmb_current_offset = round_offset + length;

	return 0;
}

volatile struct spdk_nvme_registers *
nvme_pcie_ctrlr_get_registers(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	return pctrlr->regs;
}

void *
nvme_pcie_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	uint64_t offset;

	if (pctrlr->cmb_bar_virt_addr == NULL) {
		SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n");
		return NULL;
	}

	if (!pctrlr->cmb_io_data_supported) {
		SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB doesn't support I/O data\n");
		return NULL;
	}

	if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, size, 4, &offset) != 0) {
		SPDK_DEBUGLOG(SPDK_LOG_NVME, "%zu-byte CMB allocation failed\n", size);
		return NULL;
	}

	return pctrlr->cmb_bar_virt_addr + offset;
}

int
nvme_pcie_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
{
	/*
	 * Do nothing for now.
	 * TODO: Track free space so buffers may be reused.
	 */
	SPDK_ERRLOG("%s: no deallocation for CMB buffers yet!\n",
		    __func__);
	return 0;
}

static int
nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc;
	void *addr;
	uint64_t phys_addr, size;

	rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
				     &phys_addr, &size);
	pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
	if ((pctrlr->regs == NULL) || (rc != 0)) {
		SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n",
			    rc, pctrlr->regs);
		return -1;
	}

	pctrlr->regs_size = size;
	nvme_pcie_ctrlr_map_cmb(pctrlr);

	return 0;
}

static int
nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc = 0;
	void *addr = (void *)pctrlr->regs;

	if (pctrlr->ctrlr.is_removed) {
		return rc;
	}

	rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
	if (rc != 0) {
		SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc);
		return -1;
	}

	if (addr) {
		/* NOTE: addr may have been remapped here. We're relying on DPDK to call
		 * munmap internally.
		 */
		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
	}
	return rc;
}

static int
nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_qpair *pqpair;
	int rc;

	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
	if (pqpair == NULL) {
		return -ENOMEM;
	}

	pqpair->num_entries = NVME_ADMIN_ENTRIES;
	pqpair->flags.delay_pcie_doorbell = 0;

	ctrlr->adminq = &pqpair->qpair;

	rc = nvme_qpair_init(ctrlr->adminq,
			     0, /* qpair ID */
			     ctrlr,
			     SPDK_NVME_QPRIO_URGENT,
			     NVME_ADMIN_ENTRIES);
	if (rc != 0) {
		return rc;
	}

	return nvme_pcie_qpair_construct(ctrlr->adminq, NULL);
}

/* This function must only be called while holding g_spdk_nvme_driver->lock */
static int
pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
{
	struct spdk_nvme_transport_id trid = {};
	struct nvme_pcie_enum_ctx *enum_ctx = ctx;
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_pci_addr pci_addr;

	pci_addr = spdk_pci_device_get_addr(pci_dev);

	trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
	spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);

	ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
	if (!spdk_process_is_primary()) {
		if (!ctrlr) {
			SPDK_ERRLOG("Controller must be constructed in the primary process first.\n");
			return -1;
		}

		return nvme_ctrlr_add_process(ctrlr, pci_dev);
	}

	/* check whether user passes the pci_addr */
	if (enum_ctx->has_pci_addr &&
	    (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
		return 1;
	}

	return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev);
}

int
nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
		     bool direct_connect)
{
	struct nvme_pcie_enum_ctx enum_ctx = {};

	enum_ctx.probe_ctx = probe_ctx;

	if (strlen(probe_ctx->trid.traddr) != 0) {
		if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) {
			return -1;
		}
		enum_ctx.has_pci_addr = true;
	}

	if (hotplug_fd < 0) {
		hotplug_fd = spdk_uevent_connect();
		if (hotplug_fd < 0) {
			SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n");
		}
	} else {
		_nvme_pcie_hotplug_monitor(probe_ctx);
	}

	if (enum_ctx.has_pci_addr == false) {
		return spdk_pci_enumerate(spdk_pci_nvme_get_driver(),
					  pcie_nvme_enum_cb, &enum_ctx);
	} else {
		return spdk_pci_device_attach(spdk_pci_nvme_get_driver(),
					      pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
	}
}

static int
nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr)
{
	struct nvme_pcie_enum_ctx enum_ctx;

	enum_ctx.probe_ctx = probe_ctx;
	enum_ctx.has_pci_addr = true;
	enum_ctx.pci_addr = *pci_addr;

	return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx);
}

struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
		const struct spdk_nvme_ctrlr_opts *opts,
		void *devhandle)
{
	struct spdk_pci_device *pci_dev = devhandle;
	struct nvme_pcie_ctrlr *pctrlr;
	union spdk_nvme_cap_register cap;
	union spdk_nvme_vs_register vs;
	uint32_t cmd_reg;
	int rc;
	struct spdk_pci_id pci_id;

	rc = spdk_pci_device_claim(pci_dev);
	if (rc < 0) {
		SPDK_ERRLOG("could not claim device %s (%s)\n",
			    trid->traddr, spdk_strerror(-rc));
		return NULL;
	}

	pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL,
			      SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
	if (pctrlr == NULL) {
		spdk_pci_device_unclaim(pci_dev);
		SPDK_ERRLOG("could not allocate ctrlr\n");
		return NULL;
	}

	pctrlr->is_remapped = false;
	pctrlr->ctrlr.is_removed = false;
	pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
	pctrlr->devhandle = devhandle;
	pctrlr->ctrlr.opts = *opts;
	memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid));

	rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
	if (rc != 0) {
		spdk_pci_device_unclaim(pci_dev);
		spdk_free(pctrlr);
		return NULL;
	}

	rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
	if (rc != 0) {
		spdk_pci_device_unclaim(pci_dev);
		spdk_free(pctrlr);
		return NULL;
	}

	/* Enable PCI busmaster and disable INTx */
	spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
	cmd_reg |= 0x404;
	spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);

	if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
		SPDK_ERRLOG("get_cap() failed\n");
		spdk_pci_device_unclaim(pci_dev);
		spdk_free(pctrlr);
		return NULL;
	}

	if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) {
		SPDK_ERRLOG("get_vs() failed\n");
		spdk_pci_device_unclaim(pci_dev);
		spdk_free(pctrlr);
		return NULL;
	}

	nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs);

	/* Doorbell stride is 2 ^ (dstrd + 2),
	 * but we want multiples of 4, so drop the + 2 */
	pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;

	pci_id = spdk_pci_device_get_id(pci_dev);
	pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);

	rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr);
	if (rc != 0) {
		nvme_ctrlr_destruct(&pctrlr->ctrlr);
		return NULL;
	}

	/* Construct the primary process properties */
	rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
	if (rc != 0) {
		nvme_ctrlr_destruct(&pctrlr->ctrlr);
		return NULL;
	}

	if (g_sigset != true) {
		nvme_pcie_ctrlr_setup_signal();
		g_sigset = true;
	}

	return &pctrlr->ctrlr;
}

int
nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
{
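	/*
	 * Program the admin submission/completion queue base addresses and
	 * sizes. Per the NVMe specification these must be set before the
	 * controller is enabled through CC.EN.
	 */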
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
	union spdk_nvme_aqa_register aqa;

	if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
		SPDK_ERRLOG("set_asq() failed\n");
		return -EIO;
	}

	if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
		SPDK_ERRLOG("set_acq() failed\n");
		return -EIO;
	}

	aqa.raw = 0;
	/* acqs and asqs are 0-based. */
	aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
	aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;

	if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
		SPDK_ERRLOG("set_aqa() failed\n");
		return -EIO;
	}

	return 0;
}

int
nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr);

	if (ctrlr->adminq) {
		nvme_pcie_qpair_destroy(ctrlr->adminq);
	}

	nvme_ctrlr_destruct_finish(ctrlr);

	nvme_ctrlr_free_processes(ctrlr);

	nvme_pcie_ctrlr_free_bars(pctrlr);

	if (devhandle) {
		spdk_pci_device_unclaim(devhandle);
		spdk_pci_device_detach(devhandle);
	}

	spdk_free(pctrlr);

	return 0;
}

static void
nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
{
	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
	tr->cid = cid;
	tr->req = NULL;
}

int
nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	/* all head/tail vals are set to 0 */
	pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0;

	/*
	 * First time through the completion queue, HW will set phase
	 * bit on completions to 1. So set this to 1 here, indicating
	 * we're looking for a 1 to know which entries have completed.
	 * We'll toggle the bit each time the completion queue rolls over.
	 */
	pqpair->flags.phase = 1;

	memset(pqpair->cmd, 0,
	       pqpair->num_entries * sizeof(struct spdk_nvme_cmd));
	memset(pqpair->cpl, 0,
	       pqpair->num_entries * sizeof(struct spdk_nvme_cpl));

	return 0;
}

static int
nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
			  const struct spdk_nvme_io_qpair_opts *opts)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;
	uint16_t i;
	volatile uint32_t *doorbell_base;
	uint64_t offset;
	uint16_t num_trackers;
	size_t page_align = VALUE_2MB;
	uint32_t flags = SPDK_MALLOC_DMA;
	uint64_t sq_paddr = 0;
	uint64_t cq_paddr = 0;

	if (opts) {
		pqpair->sq_vaddr = opts->sq.vaddr;
		pqpair->cq_vaddr = opts->cq.vaddr;
		sq_paddr = opts->sq.paddr;
		cq_paddr = opts->cq.paddr;
	}

	pqpair->retry_count = ctrlr->opts.transport_retry_count;

	/*
	 * Limit the maximum number of completions to return per call to prevent wraparound,
	 * and calculate how many trackers can be submitted at once without overflowing the
	 * completion queue.
	 */
	pqpair->max_completions_cap = pqpair->num_entries / 4;
	pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
	pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
	num_trackers = pqpair->num_entries - pqpair->max_completions_cap;

	SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
		     pqpair->max_completions_cap, num_trackers);

	assert(num_trackers != 0);

	pqpair->sq_in_cmb = false;

	if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
		flags |= SPDK_MALLOC_SHARE;
	}

	/* cmd and cpl rings must be aligned on page size boundaries. */
	if (ctrlr->opts.use_cmb_sqs) {
		if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
					      sysconf(_SC_PAGESIZE), &offset) == 0) {
			pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset;
			pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset;
			pqpair->sq_in_cmb = true;
		}
	}

	if (pqpair->sq_in_cmb == false) {
		if (pqpair->sq_vaddr) {
			pqpair->cmd = pqpair->sq_vaddr;
		} else {
			/* To ensure physical address contiguity we make each ring occupy
			 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
			 */
			pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
						   page_align, NULL,
						   SPDK_ENV_SOCKET_ID_ANY, flags);
			if (pqpair->cmd == NULL) {
				SPDK_ERRLOG("alloc qpair_cmd failed\n");
				return -ENOMEM;
			}
		}
		if (sq_paddr) {
			assert(pqpair->sq_vaddr != NULL);
			pqpair->cmd_bus_addr = sq_paddr;
		} else {
			pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL);
			if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
				SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
				return -EFAULT;
			}
		}
	}

	if (pqpair->cq_vaddr) {
		pqpair->cpl = pqpair->cq_vaddr;
	} else {
		pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl),
					   page_align, NULL,
					   SPDK_ENV_SOCKET_ID_ANY, flags);
		if (pqpair->cpl == NULL) {
			SPDK_ERRLOG("alloc qpair_cpl failed\n");
			return -ENOMEM;
		}
	}
	if (cq_paddr) {
		assert(pqpair->cq_vaddr != NULL);
		pqpair->cpl_bus_addr = cq_paddr;
	} else {
		pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL);
		if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
			SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
			return -EFAULT;
		}
	}

	doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl;
	pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
	pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;

	/*
	 * Reserve space for all of the trackers in a single allocation.
	 * struct nvme_tracker must be padded so that its size is already a power of 2.
	 * This ensures the PRP list embedded in the nvme_tracker object will not span a
	 * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
	 */
	pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
	if (pqpair->tr == NULL) {
		SPDK_ERRLOG("nvme_tr failed\n");
		return -ENOMEM;
	}

	TAILQ_INIT(&pqpair->free_tr);
	TAILQ_INIT(&pqpair->outstanding_tr);

	for (i = 0; i < num_trackers; i++) {
		tr = &pqpair->tr[i];
		nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL));
		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
	}

	nvme_pcie_qpair_reset(qpair);

	return 0;
}

static inline void
nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
{
	/* dst and src are known to be non-overlapping and 64-byte aligned. */
#if defined(__SSE2__)
	__m128i *d128 = (__m128i *)dst;
	const __m128i *s128 = (const __m128i *)src;

	_mm_stream_si128(&d128[0], _mm_load_si128(&s128[0]));
	_mm_stream_si128(&d128[1], _mm_load_si128(&s128[1]));
	_mm_stream_si128(&d128[2], _mm_load_si128(&s128[2]));
	_mm_stream_si128(&d128[3], _mm_load_si128(&s128[3]));
#else
	*dst = *src;
#endif
}

/**
 * Note: the ctrlr_lock must be held when calling this function.
 */
static void
nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
		struct nvme_request *req, struct spdk_nvme_cpl *cpl)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_request *active_req = req;
	struct spdk_nvme_ctrlr_process *active_proc;

	/*
	 * The admin request is from another process. Move to the per
	 * process list for that process to handle it later.
	 */
	assert(nvme_qpair_is_admin_queue(qpair));
	assert(active_req->pid != getpid());

	active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid);
	if (active_proc) {
		/* Save the original completion information */
		memcpy(&active_req->cpl, cpl, sizeof(*cpl));
		STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
	} else {
		SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
			    active_req->pid);

		nvme_free_request(active_req);
	}
}

/**
 * Note: the ctrlr_lock must be held when calling this function.
 */
static void
nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_request *req, *tmp_req;
	pid_t pid = getpid();
	struct spdk_nvme_ctrlr_process *proc;

	/*
	 * Check whether there is any pending admin request from
	 * other active processes.
	 */
	assert(nvme_qpair_is_admin_queue(qpair));

	proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
	if (!proc) {
		SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
		assert(proc);
		return;
	}

	STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
		STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);

		assert(req->pid == pid);

		nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl);
		nvme_free_request(req);
	}
}

/*
 * Return true if the controller's event index falls between the previously
 * written doorbell value and the new one (taking 16-bit wrap-around into
 * account), meaning the controller still expects an MMIO doorbell write.
 */
static inline int
nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
{
	return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
}

static bool
nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value,
				     volatile uint32_t *shadow_db,
				     volatile uint32_t *eventidx)
{
	uint16_t old;

	if (!shadow_db) {
		return true;
	}

	old = *shadow_db;
	*shadow_db = value;

	/*
	 * Ensure that the doorbell is updated before reading the EventIdx from
	 * memory
	 */
	spdk_mb();

	if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) {
		return false;
	}

	return true;
}

static inline void
nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
	bool need_mmio = true;

	if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
		need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
				pqpair->sq_tail,
				pqpair->shadow_doorbell.sq_tdbl,
				pqpair->shadow_doorbell.sq_eventidx);
	}

	if (spdk_likely(need_mmio)) {
		spdk_wmb();
		g_thread_mmio_ctrlr = pctrlr;
		spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
		g_thread_mmio_ctrlr = NULL;
	}
}

static inline void
nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
	bool need_mmio = true;

	if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
		need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
				pqpair->cq_head,
				pqpair->shadow_doorbell.cq_hdbl,
				pqpair->shadow_doorbell.cq_eventidx);
	}

	if (spdk_likely(need_mmio)) {
		g_thread_mmio_ctrlr = pctrlr;
		spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
		g_thread_mmio_ctrlr = NULL;
	}
}

static void
nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
{
	struct nvme_request *req;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	req = tr->req;
	assert(req != NULL);

	/* Copy the command from the tracker to the submission queue. */
	nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);

	if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
		pqpair->sq_tail = 0;
	}

	if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
		SPDK_ERRLOG("sq_tail is passing sq_head!\n");
	}

	if (!pqpair->flags.delay_pcie_doorbell) {
		nvme_pcie_qpair_ring_sq_doorbell(qpair);
	}
}

static void
nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
				 struct spdk_nvme_cpl *cpl, bool print_on_error)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_request *req;
	bool retry, error;
	bool req_from_current_proc = true;

	req = tr->req;

	assert(req != NULL);

	error = spdk_nvme_cpl_is_error(cpl);
	retry = error && nvme_completion_is_retry(cpl) &&
		req->retries < pqpair->retry_count;

	if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) {
		spdk_nvme_qpair_print_command(qpair, &req->cmd);
		spdk_nvme_qpair_print_completion(qpair, cpl);
	}

	assert(cpl->cid == req->cmd.cid);

	if (retry) {
		req->retries++;
		nvme_pcie_qpair_submit_tracker(qpair, tr);
	} else {
		/* Only check admin requests from different processes. */
		if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
			req_from_current_proc = false;
			nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
		} else {
			nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl);
		}

		if (req_from_current_proc == true) {
			nvme_qpair_free_request(qpair, req);
		}

		tr->req = NULL;

		TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);

		/*
		 * If the controller is in the middle of resetting, don't
		 * try to submit queued requests here - let the reset logic
		 * handle that instead.
		 */
		if (!STAILQ_EMPTY(&qpair->queued_req) &&
		    !qpair->ctrlr->is_resetting) {
			req = STAILQ_FIRST(&qpair->queued_req);
			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
			nvme_qpair_submit_request(qpair, req);
		}
	}
}

static void
nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
					struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
					bool print_on_error)
{
	struct spdk_nvme_cpl cpl;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
	cpl.status.sct = sct;
	cpl.status.sc = sc;
	cpl.status.dnr = dnr;
	nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
}

static void
nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr, *temp, *last;

	last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head);

	/* Abort previously submitted (outstanding) trs */
	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
		if (!qpair->ctrlr->opts.disable_error_logging) {
			SPDK_ERRLOG("aborting outstanding command\n");
		}
		nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
							SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);

		if (tr == last) {
			break;
		}
	}
}

void
nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;

	tr = TAILQ_FIRST(&pqpair->outstanding_tr);
	while (tr != NULL) {
		assert(tr->req != NULL);
		if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
			nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
								SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
								false);
			tr = TAILQ_FIRST(&pqpair->outstanding_tr);
		} else {
			tr = TAILQ_NEXT(tr, tq_list);
		}
	}
}

static void
nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
{
	nvme_pcie_admin_qpair_abort_aers(qpair);
}

static int
nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	if (nvme_qpair_is_admin_queue(qpair)) {
		nvme_pcie_admin_qpair_destroy(qpair);
	}
	/*
	 * We check sq_vaddr and cq_vaddr to see if the user specified the memory
	 * buffers when creating the I/O queue.
	 * If the user specified them, we cannot free that memory.
	 * Nor do we free it if it's in the CMB.
	 */
	if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) {
		spdk_free(pqpair->cmd);
	}
	if (!pqpair->cq_vaddr && pqpair->cpl) {
		spdk_free(pqpair->cpl);
	}
	if (pqpair->tr) {
		spdk_free(pqpair->tr);
	}

	nvme_qpair_deinit(qpair);

	spdk_free(pqpair);

	return 0;
}

void
nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
{
	nvme_pcie_qpair_abort_trackers(qpair, dnr);
}

static int
nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
				 void *cb_arg)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;

	/*
	 * TODO: create a create io completion queue command data
	 * structure.
	 */
	cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
	/*
	 * 0x2 = interrupts enabled
	 * 0x1 = physically contiguous
	 */
	cmd->cdw11 = 0x1;
	cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static int
nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;

	/*
	 * TODO: create a create io submission queue command data
	 * structure.
	 */
	cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
	/* 0x1 = physically contiguous */
	cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1;
	cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static int
nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
	cmd->cdw10 = qpair->id;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static int
nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
	cmd->cdw10 = qpair->id;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static int
_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 uint16_t qid)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_completion_poll_status status;
	int rc;

	rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
	if (rc != 0) {
		return rc;
	}

	if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
		SPDK_ERRLOG("nvme_create_io_cq failed!\n");
		return -1;
	}

	rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
	if (rc != 0) {
		return rc;
	}

	if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
		SPDK_ERRLOG("nvme_create_io_sq failed!\n");
		/* Attempt to delete the completion queue */
		rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
		if (rc != 0) {
			return -1;
		}
		spdk_nvme_wait_for_completion(ctrlr->adminq, &status);
		return -1;
	}

	if (ctrlr->shadow_doorbell) {
		pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
						  pctrlr->doorbell_stride_u32;
		pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
						  pctrlr->doorbell_stride_u32;
		pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
						      pctrlr->doorbell_stride_u32;
		pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
						      pctrlr->doorbell_stride_u32;
		pqpair->flags.has_shadow_doorbell = 1;
	} else {
		pqpair->flags.has_shadow_doorbell = 0;
	}
	nvme_pcie_qpair_reset(qpair);

	return 0;
}

struct spdk_nvme_qpair *
nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
				const struct spdk_nvme_io_qpair_opts *opts)
{
	struct nvme_pcie_qpair *pqpair;
	struct spdk_nvme_qpair *qpair;
	int rc;

	assert(ctrlr != NULL);

	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 1629 if (pqpair == NULL) { 1630 return NULL; 1631 } 1632 1633 pqpair->num_entries = opts->io_queue_size; 1634 pqpair->flags.delay_pcie_doorbell = opts->delay_pcie_doorbell; 1635 1636 qpair = &pqpair->qpair; 1637 1638 rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests); 1639 if (rc != 0) { 1640 nvme_pcie_qpair_destroy(qpair); 1641 return NULL; 1642 } 1643 1644 rc = nvme_pcie_qpair_construct(qpair, opts); 1645 1646 if (rc != 0) { 1647 nvme_pcie_qpair_destroy(qpair); 1648 return NULL; 1649 } 1650 1651 rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid); 1652 1653 if (rc != 0) { 1654 SPDK_ERRLOG("I/O queue creation failed\n"); 1655 nvme_pcie_qpair_destroy(qpair); 1656 return NULL; 1657 } 1658 1659 return qpair; 1660 } 1661 1662 int 1663 nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1664 { 1665 if (nvme_qpair_is_admin_queue(qpair)) { 1666 return 0; 1667 } else { 1668 return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id); 1669 } 1670 } 1671 1672 void 1673 nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1674 { 1675 } 1676 1677 int 1678 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1679 { 1680 struct nvme_completion_poll_status status; 1681 int rc; 1682 1683 assert(ctrlr != NULL); 1684 1685 if (ctrlr->is_removed) { 1686 goto free; 1687 } 1688 1689 /* Delete the I/O submission queue */ 1690 rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); 1691 if (rc != 0) { 1692 SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc); 1693 return rc; 1694 } 1695 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1696 return -1; 1697 } 1698 1699 /* Delete the completion queue */ 1700 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); 1701 if (rc != 0) { 1702 SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc); 1703 return rc; 1704 } 1705 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1706 return -1; 1707 } 1708 1709 free: 1710 if (qpair->no_deletion_notification_needed == 0) { 1711 /* Abort the rest of the I/O */ 1712 nvme_pcie_qpair_abort_trackers(qpair, 1); 1713 } 1714 1715 nvme_pcie_qpair_destroy(qpair); 1716 return 0; 1717 } 1718 1719 static void 1720 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) 1721 { 1722 /* 1723 * Bad vtophys translation, so abort this request and return 1724 * immediately. 1725 */ 1726 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, 1727 SPDK_NVME_SC_INVALID_FIELD, 1728 1 /* do not retry */, true); 1729 } 1730 1731 /* 1732 * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes. 1733 * 1734 * *prp_index will be updated to account for the number of PRP entries used. 
1735 */ 1736 static inline int 1737 nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len, 1738 uint32_t page_size) 1739 { 1740 struct spdk_nvme_cmd *cmd = &tr->req->cmd; 1741 uintptr_t page_mask = page_size - 1; 1742 uint64_t phys_addr; 1743 uint32_t i; 1744 1745 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n", 1746 *prp_index, virt_addr, (uint32_t)len); 1747 1748 if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) { 1749 SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); 1750 return -EINVAL; 1751 } 1752 1753 i = *prp_index; 1754 while (len) { 1755 uint32_t seg_len; 1756 1757 /* 1758 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array, 1759 * so prp_index == count is valid. 1760 */ 1761 if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) { 1762 SPDK_ERRLOG("out of PRP entries\n"); 1763 return -EINVAL; 1764 } 1765 1766 phys_addr = spdk_vtophys(virt_addr, NULL); 1767 if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) { 1768 SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr); 1769 return -EINVAL; 1770 } 1771 1772 if (i == 0) { 1773 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr); 1774 cmd->dptr.prp.prp1 = phys_addr; 1775 seg_len = page_size - ((uintptr_t)virt_addr & page_mask); 1776 } else { 1777 if ((phys_addr & page_mask) != 0) { 1778 SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr); 1779 return -EINVAL; 1780 } 1781 1782 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr); 1783 tr->u.prp[i - 1] = phys_addr; 1784 seg_len = page_size; 1785 } 1786 1787 seg_len = spdk_min(seg_len, len); 1788 virt_addr += seg_len; 1789 len -= seg_len; 1790 i++; 1791 } 1792 1793 cmd->psdt = SPDK_NVME_PSDT_PRP; 1794 if (i <= 1) { 1795 cmd->dptr.prp.prp2 = 0; 1796 } else if (i == 2) { 1797 cmd->dptr.prp.prp2 = tr->u.prp[0]; 1798 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2); 1799 } else { 1800 cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr; 1801 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2); 1802 } 1803 1804 *prp_index = i; 1805 return 0; 1806 } 1807 1808 /** 1809 * Build PRP list describing physically contiguous payload buffer. 1810 */ 1811 static int 1812 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1813 struct nvme_tracker *tr) 1814 { 1815 uint32_t prp_index = 0; 1816 int rc; 1817 1818 rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset, 1819 req->payload_size, qpair->ctrlr->page_size); 1820 if (rc) { 1821 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1822 return rc; 1823 } 1824 1825 return 0; 1826 } 1827 1828 /** 1829 * Build SGL list describing scattered payload buffer. 1830 */ 1831 static int 1832 nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1833 struct nvme_tracker *tr) 1834 { 1835 int rc; 1836 void *virt_addr; 1837 uint64_t phys_addr; 1838 uint32_t remaining_transfer_len, remaining_user_sge_len, length; 1839 struct spdk_nvme_sgl_descriptor *sgl; 1840 uint32_t nseg = 0; 1841 1842 /* 1843 * Build scattered payloads. 
1844 */ 1845 assert(req->payload_size != 0); 1846 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); 1847 assert(req->payload.reset_sgl_fn != NULL); 1848 assert(req->payload.next_sge_fn != NULL); 1849 req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); 1850 1851 sgl = tr->u.sgl; 1852 req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; 1853 req->cmd.dptr.sgl1.unkeyed.subtype = 0; 1854 1855 remaining_transfer_len = req->payload_size; 1856 1857 while (remaining_transfer_len > 0) { 1858 rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, 1859 &virt_addr, &remaining_user_sge_len); 1860 if (rc) { 1861 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1862 return -1; 1863 } 1864 1865 remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); 1866 remaining_transfer_len -= remaining_user_sge_len; 1867 while (remaining_user_sge_len > 0) { 1868 if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { 1869 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1870 return -1; 1871 } 1872 1873 phys_addr = spdk_vtophys(virt_addr, NULL); 1874 if (phys_addr == SPDK_VTOPHYS_ERROR) { 1875 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1876 return -1; 1877 } 1878 1879 length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr)); 1880 remaining_user_sge_len -= length; 1881 virt_addr += length; 1882 1883 if (nseg > 0 && phys_addr == 1884 (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) { 1885 /* extend previous entry */ 1886 (*(sgl - 1)).unkeyed.length += length; 1887 continue; 1888 } 1889 1890 sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; 1891 sgl->unkeyed.length = length; 1892 sgl->address = phys_addr; 1893 sgl->unkeyed.subtype = 0; 1894 1895 sgl++; 1896 nseg++; 1897 } 1898 } 1899 1900 if (nseg == 1) { 1901 /* 1902 * The whole transfer can be described by a single SGL descriptor. 1903 * Use the special case described by the spec where SGL1's type is Data Block. 1904 * This means the SGL in the tracker is not used at all, so copy the first (and only) 1905 * SGL element into SGL1. 1906 */ 1907 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; 1908 req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; 1909 req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; 1910 } else { 1911 /* For now we can only support 1 SGL segment in NVMe controller */ 1912 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; 1913 req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; 1914 req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); 1915 } 1916 1917 return 0; 1918 } 1919 1920 /** 1921 * Build PRP list describing scattered payload buffer. 1922 */ 1923 static int 1924 nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1925 struct nvme_tracker *tr) 1926 { 1927 int rc; 1928 void *virt_addr; 1929 uint32_t remaining_transfer_len, length; 1930 uint32_t prp_index = 0; 1931 uint32_t page_size = qpair->ctrlr->page_size; 1932 1933 /* 1934 * Build scattered payloads. 
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	remaining_transfer_len = req->payload_size;
	while (remaining_transfer_len > 0) {
		assert(req->payload.next_sge_fn != NULL);
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}

		length = spdk_min(remaining_transfer_len, length);

		/*
		 * Any incompatible SGEs should already have been handled by the request
		 * splitting code, but assert here as an additional check.
		 *
		 * All SGEs except the last must end on a page boundary.
		 */
		assert((length == remaining_transfer_len) ||
		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));

		rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return rc;
		}

		remaining_transfer_len -= length;
	}

	return 0;
}

int
nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int rc = 0;
	void *md_payload;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	tr = TAILQ_FIRST(&pqpair->free_tr);

	if (tr == NULL) {
		/*
		 * Put the request on the qpair's request queue to be
		 * processed when a tracker frees up via a command
		 * completion.
		 */
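		/*
		 * Note (editorial): rc stays 0 here, so the caller sees success; the
		 * queued request is expected to be resubmitted later, once a command
		 * completion returns a tracker to free_tr.
		 */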
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		goto exit;
	}

	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
	tr->req = req;
	tr->cb_fn = req->cb_fn;
	tr->cb_arg = req->cb_arg;
	req->cmd.cid = tr->cid;

	if (req->payload_size && req->payload.md) {
		md_payload = req->payload.md + req->md_offset;
		tr->req->cmd.mptr = spdk_vtophys(md_payload, NULL);
		if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			rc = -EINVAL;
			goto exit;
		}
	}

	if (req->payload_size == 0) {
		/* Null payload - leave PRP fields untouched */
		rc = 0;
	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
		rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr);
	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
		if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
			rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr);
		} else {
			rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr);
		}
	} else {
		assert(0);
		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
		rc = -EINVAL;
	}

	if (rc < 0) {
		goto exit;
	}

	nvme_pcie_qpair_submit_tracker(qpair, tr);

exit:
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return rc;
}

static void
nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
{
	uint64_t t02;
	struct nvme_tracker *tr, *tmp;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct spdk_nvme_ctrlr_process *active_proc;

	/* Don't check timeouts during controller initialization. */
	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
		return;
	}

	if (nvme_qpair_is_admin_queue(qpair)) {
		active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
	} else {
		active_proc = qpair->active_proc;
	}

	/* Only check timeouts if the current process has a timeout callback. */
	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
		return;
	}

	t02 = spdk_get_ticks();
	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
		assert(tr->req != NULL);

		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
			/*
			 * The requests are in order, so as soon as one has not timed out,
			 * stop iterating.
			 */
			break;
		}
	}
}

int32_t
nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;
	struct spdk_nvme_cpl *cpl, *next_cpl;
	uint32_t num_completions = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	uint16_t next_cq_head;
	uint8_t next_phase;
	bool next_is_valid = false;

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
		/*
		 * max_completions == 0 means unlimited, but complete at most a batch of
		 * max_completions_cap I/Os at a time so that the completion queue
		 * doorbell doesn't wrap around.
		 */
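		/*
		 * For example (editorial): with max_completions_cap == 64 and 100 entries
		 * ready in the completion queue, this call reaps 64, rings the CQ doorbell
		 * once, and leaves the remaining 36 for the next poll.
		 */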
		max_completions = pqpair->max_completions_cap;
	}

	while (1) {
		cpl = &pqpair->cpl[pqpair->cq_head];

		if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
			break;
		}

		if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
			next_cq_head = pqpair->cq_head + 1;
			next_phase = pqpair->flags.phase;
		} else {
			next_cq_head = 0;
			next_phase = !pqpair->flags.phase;
		}
		next_cpl = &pqpair->cpl[next_cq_head];
		next_is_valid = (next_cpl->status.p == next_phase);
		if (next_is_valid) {
			__builtin_prefetch(&pqpair->tr[next_cpl->cid]);
		}

#ifdef __PPC64__
		/*
		 * This memory barrier prevents reordering of:
		 * - load after store from/to tr
		 * - load after load cpl phase and cpl cid
		 */
		spdk_mb();
#elif defined(__aarch64__)
		__asm volatile("dmb oshld" ::: "memory");
#endif

		if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
			pqpair->cq_head = 0;
			pqpair->flags.phase = !pqpair->flags.phase;
		}

		tr = &pqpair->tr[cpl->cid];
		/* Prefetch the req's STAILQ_ENTRY since we'll need to access it
		 * as part of putting the req back on the qpair's free list.
		 */
		__builtin_prefetch(&tr->req->stailq);
		pqpair->sq_head = cpl->sqhd;

		if (tr->req) {
			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
		} else {
			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
			spdk_nvme_qpair_print_completion(qpair, cpl);
			assert(0);
		}

		if (++num_completions == max_completions) {
			break;
		}
	}

	if (num_completions > 0) {
		nvme_pcie_qpair_ring_cq_doorbell(qpair);
	}

	if (pqpair->flags.delay_pcie_doorbell) {
		if (pqpair->last_sq_tail != pqpair->sq_tail) {
			nvme_pcie_qpair_ring_sq_doorbell(qpair);
			pqpair->last_sq_tail = pqpair->sq_tail;
		}
	}

	if (spdk_unlikely(ctrlr->timeout_enabled)) {
		/* The user registered a timeout callback, so check for timed-out requests. */
		nvme_pcie_qpair_check_timeout(qpair);
	}

	/* Before returning, complete any pending admin request. */
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_pcie_qpair_complete_pending_admin_request(qpair);

		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return num_completions;
}
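/*
 * Editorial usage sketch (not compiled): a minimal polling loop showing how an
 * application-level read reaches nvme_pcie_qpair_submit_request() and how its
 * completion is reaped by nvme_pcie_qpair_process_completions() via the public
 * SPDK API. Error handling is omitted, and the helper name read_one_block is
 * made up for illustration.
 */
#if 0
#include "spdk/nvme.h"
#include "spdk/env.h"

static void
io_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	bool *done = cb_arg;

	*done = true;
}

static void
read_one_block(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
{
	struct spdk_nvme_qpair *qpair;
	bool done = false;
	void *buf;

	/* Allocate an I/O queue pair; NULL opts selects the controller defaults. */
	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0);

	/* Use a pinned DMA-able buffer so spdk_vtophys() succeeds during PRP/SGL building. */
	buf = spdk_zmalloc(spdk_nvme_ns_get_sector_size(ns), 0x1000, NULL,
			   SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);

	/* For a PCIe controller this ends up in nvme_pcie_qpair_submit_request(). */
	spdk_nvme_ns_cmd_read(ns, qpair, buf, 0 /* LBA */, 1 /* LBA count */,
			      io_complete, &done, 0);

	/* Poll the queue pair until the completion path invokes io_complete(). */
	while (!done) {
		spdk_nvme_qpair_process_completions(qpair, 0);
	}

	spdk_free(buf);
	spdk_nvme_ctrlr_free_io_qpair(qpair);
}
#endif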