1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * Copyright (c) 2017, IBM Corporation. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * * Neither the name of Intel Corporation nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 /* 36 * NVMe over PCIe transport 37 */ 38 39 #include "spdk/stdinc.h" 40 #include "spdk/env.h" 41 #include "spdk/likely.h" 42 #include "nvme_internal.h" 43 #include "nvme_uevent.h" 44 45 /* 46 * Number of completion queue entries to process before ringing the 47 * completion queue doorbell. 48 */ 49 #define NVME_MIN_COMPLETIONS (1) 50 #define NVME_MAX_COMPLETIONS (128) 51 52 #define NVME_ADMIN_ENTRIES (128) 53 54 /* 55 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL 56 * segment. 57 */ 58 #define NVME_MAX_SGL_DESCRIPTORS (251) 59 60 #define NVME_MAX_PRP_LIST_ENTRIES (505) 61 62 struct nvme_pcie_enum_ctx { 63 struct spdk_nvme_probe_ctx *probe_ctx; 64 struct spdk_pci_addr pci_addr; 65 bool has_pci_addr; 66 }; 67 68 /* PCIe transport extensions for spdk_nvme_ctrlr */ 69 struct nvme_pcie_ctrlr { 70 struct spdk_nvme_ctrlr ctrlr; 71 72 /** NVMe MMIO register space */ 73 volatile struct spdk_nvme_registers *regs; 74 75 /** NVMe MMIO register size */ 76 uint64_t regs_size; 77 78 /* BAR mapping address which contains controller memory buffer */ 79 void *cmb_bar_virt_addr; 80 81 /* BAR physical address which contains controller memory buffer */ 82 uint64_t cmb_bar_phys_addr; 83 84 /* Controller memory buffer size in Bytes */ 85 uint64_t cmb_size; 86 87 /* Current offset of controller memory buffer, relative to start of BAR virt addr */ 88 uint64_t cmb_current_offset; 89 90 /* Last valid offset into CMB, this differs if CMB memory registration occurs or not */ 91 uint64_t cmb_max_offset; 92 93 void *cmb_mem_register_addr; 94 size_t cmb_mem_register_size; 95 96 bool cmb_io_data_supported; 97 98 /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */ 99 uint32_t doorbell_stride_u32; 100 101 /* Opaque handle to associated PCI device. 
*/ 102 struct spdk_pci_device *devhandle; 103 104 /* File descriptor returned from spdk_pci_device_claim(). Closed when ctrlr is detached. */ 105 int claim_fd; 106 107 /* Flag to indicate the MMIO register has been remapped */ 108 bool is_remapped; 109 }; 110 111 struct nvme_tracker { 112 TAILQ_ENTRY(nvme_tracker) tq_list; 113 114 struct nvme_request *req; 115 uint16_t cid; 116 117 uint16_t rsvd0; 118 uint32_t rsvd1; 119 120 spdk_nvme_cmd_cb cb_fn; 121 void *cb_arg; 122 123 uint64_t prp_sgl_bus_addr; 124 125 union { 126 uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES]; 127 struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS]; 128 } u; 129 }; 130 /* 131 * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary 132 * and so that there is no padding required to meet alignment requirements. 133 */ 134 SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K"); 135 SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned"); 136 137 /* PCIe transport extensions for spdk_nvme_qpair */ 138 struct nvme_pcie_qpair { 139 /* Submission queue tail doorbell */ 140 volatile uint32_t *sq_tdbl; 141 142 /* Completion queue head doorbell */ 143 volatile uint32_t *cq_hdbl; 144 145 /* Submission queue */ 146 struct spdk_nvme_cmd *cmd; 147 148 /* Completion queue */ 149 struct spdk_nvme_cpl *cpl; 150 151 TAILQ_HEAD(, nvme_tracker) free_tr; 152 TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr; 153 154 /* Array of trackers indexed by command ID. */ 155 struct nvme_tracker *tr; 156 157 uint16_t num_entries; 158 159 uint16_t max_completions_cap; 160 161 uint16_t last_sq_tail; 162 uint16_t sq_tail; 163 uint16_t cq_head; 164 uint16_t sq_head; 165 166 struct { 167 uint8_t phase : 1; 168 uint8_t is_enabled : 1; 169 uint8_t delay_pcie_doorbell : 1; 170 uint8_t has_shadow_doorbell : 1; 171 } flags; 172 173 /* 174 * Base qpair structure. 175 * This is located after the hot data in this structure so that the important parts of 176 * nvme_pcie_qpair are in the same cache line. 177 */ 178 struct spdk_nvme_qpair qpair; 179 180 struct { 181 /* Submission queue shadow tail doorbell */ 182 volatile uint32_t *sq_tdbl; 183 184 /* Completion queue shadow head doorbell */ 185 volatile uint32_t *cq_hdbl; 186 187 /* Submission queue event index */ 188 volatile uint32_t *sq_eventidx; 189 190 /* Completion queue event index */ 191 volatile uint32_t *cq_eventidx; 192 } shadow_doorbell; 193 194 /* 195 * Fields below this point should not be touched on the normal I/O path. 
196 */ 197 198 bool sq_in_cmb; 199 200 uint64_t cmd_bus_addr; 201 uint64_t cpl_bus_addr; 202 }; 203 204 static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, 205 struct spdk_pci_addr *pci_addr); 206 static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair); 207 static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair); 208 209 __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL; 210 static volatile uint16_t g_signal_lock; 211 static bool g_sigset = false; 212 static int hotplug_fd = -1; 213 214 static void 215 nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx) 216 { 217 void *map_address; 218 219 if (!__sync_bool_compare_and_swap(&g_signal_lock, 0, 1)) { 220 return; 221 } 222 223 assert(g_thread_mmio_ctrlr != NULL); 224 225 if (!g_thread_mmio_ctrlr->is_remapped) { 226 map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size, 227 PROT_READ | PROT_WRITE, 228 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); 229 if (map_address == MAP_FAILED) { 230 SPDK_ERRLOG("mmap failed\n"); 231 g_signal_lock = 0; 232 return; 233 } 234 memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers)); 235 g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address; 236 g_thread_mmio_ctrlr->is_remapped = true; 237 } 238 g_signal_lock = 0; 239 return; 240 } 241 242 static void 243 nvme_pcie_ctrlr_setup_signal(void) 244 { 245 struct sigaction sa; 246 247 sa.sa_sigaction = nvme_sigbus_fault_sighandler; 248 sigemptyset(&sa.sa_mask); 249 sa.sa_flags = SA_SIGINFO; 250 sigaction(SIGBUS, &sa, NULL); 251 } 252 253 static inline struct nvme_pcie_ctrlr * 254 nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr) 255 { 256 assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE); 257 return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr); 258 } 259 260 static int 261 _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx) 262 { 263 struct spdk_nvme_ctrlr *ctrlr, *tmp; 264 struct spdk_uevent event; 265 struct spdk_pci_addr pci_addr; 266 union spdk_nvme_csts_register csts; 267 struct spdk_nvme_ctrlr_process *proc; 268 269 while (spdk_get_uevent(hotplug_fd, &event) > 0) { 270 if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO || 271 event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) { 272 if (event.action == SPDK_NVME_UEVENT_ADD) { 273 SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n", 274 event.traddr); 275 if (spdk_process_is_primary()) { 276 if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) { 277 nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr); 278 } 279 } 280 } else if (event.action == SPDK_NVME_UEVENT_REMOVE) { 281 struct spdk_nvme_transport_id trid; 282 283 memset(&trid, 0, sizeof(trid)); 284 trid.trtype = SPDK_NVME_TRANSPORT_PCIE; 285 snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr); 286 287 ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid); 288 if (ctrlr == NULL) { 289 return 0; 290 } 291 SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n", 292 event.traddr); 293 294 nvme_ctrlr_fail(ctrlr, true); 295 296 /* get the user app to clean up and stop I/O */ 297 if (probe_ctx->remove_cb) { 298 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); 299 probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr); 300 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); 301 } 302 } 303 } 304 } 305 306 /* This is a work around for vfio-attached device hot remove detection. 
*/ 307 TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) { 308 bool do_remove = false; 309 310 if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 311 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 312 313 if (spdk_pci_device_is_removed(pctrlr->devhandle)) { 314 do_remove = true; 315 } 316 } 317 318 /* NVMe controller BAR must be mapped in the current process before any access. */ 319 proc = spdk_nvme_ctrlr_get_current_process(ctrlr); 320 if (proc) { 321 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 322 if (csts.raw == 0xffffffffU) { 323 do_remove = true; 324 } 325 } 326 327 if (do_remove) { 328 nvme_ctrlr_fail(ctrlr, true); 329 if (probe_ctx->remove_cb) { 330 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); 331 probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr); 332 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); 333 } 334 } 335 } 336 return 0; 337 } 338 339 static inline struct nvme_pcie_qpair * 340 nvme_pcie_qpair(struct spdk_nvme_qpair *qpair) 341 { 342 assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE); 343 return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair); 344 } 345 346 static volatile void * 347 nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset) 348 { 349 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 350 351 return (volatile void *)((uintptr_t)pctrlr->regs + offset); 352 } 353 354 int 355 nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) 356 { 357 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 358 359 assert(offset <= sizeof(struct spdk_nvme_registers) - 4); 360 g_thread_mmio_ctrlr = pctrlr; 361 spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value); 362 g_thread_mmio_ctrlr = NULL; 363 return 0; 364 } 365 366 int 367 nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) 368 { 369 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 370 371 assert(offset <= sizeof(struct spdk_nvme_registers) - 8); 372 g_thread_mmio_ctrlr = pctrlr; 373 spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value); 374 g_thread_mmio_ctrlr = NULL; 375 return 0; 376 } 377 378 int 379 nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) 380 { 381 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 382 383 assert(offset <= sizeof(struct spdk_nvme_registers) - 4); 384 assert(value != NULL); 385 g_thread_mmio_ctrlr = pctrlr; 386 *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset)); 387 g_thread_mmio_ctrlr = NULL; 388 if (~(*value) == 0) { 389 return -1; 390 } 391 392 return 0; 393 } 394 395 int 396 nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) 397 { 398 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 399 400 assert(offset <= sizeof(struct spdk_nvme_registers) - 8); 401 assert(value != NULL); 402 g_thread_mmio_ctrlr = pctrlr; 403 *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset)); 404 g_thread_mmio_ctrlr = NULL; 405 if (~(*value) == 0) { 406 return -1; 407 } 408 409 return 0; 410 } 411 412 static int 413 nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) 414 { 415 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq), 416 value); 417 } 418 419 static int 420 nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) 421 { 422 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq), 423 value); 424 } 425 426 static int 
427 nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa) 428 { 429 return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw), 430 aqa->raw); 431 } 432 433 static int 434 nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc) 435 { 436 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw), 437 &cmbloc->raw); 438 } 439 440 static int 441 nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz) 442 { 443 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw), 444 &cmbsz->raw); 445 } 446 447 uint32_t 448 nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) 449 { 450 /* 451 * For commands requiring more than 2 PRP entries, one PRP will be 452 * embedded in the command (prp1), and the rest of the PRP entries 453 * will be in a list pointed to by the command (prp2). This means 454 * that real max number of PRP entries we support is 506+1, which 455 * results in a max xfer size of 506*ctrlr->page_size. 456 */ 457 return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size; 458 } 459 460 uint16_t 461 nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) 462 { 463 return NVME_MAX_SGL_DESCRIPTORS; 464 } 465 466 static void 467 nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr) 468 { 469 int rc; 470 void *addr; 471 uint32_t bir; 472 union spdk_nvme_cmbsz_register cmbsz; 473 union spdk_nvme_cmbloc_register cmbloc; 474 uint64_t size, unit_size, offset, bar_size, bar_phys_addr; 475 uint64_t mem_register_start, mem_register_end; 476 477 if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || 478 nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { 479 SPDK_ERRLOG("get registers failed\n"); 480 goto exit; 481 } 482 483 if (!cmbsz.bits.sz) { 484 goto exit; 485 } 486 487 bir = cmbloc.bits.bir; 488 /* Values 0 2 3 4 5 are valid for BAR */ 489 if (bir > 5 || bir == 1) { 490 goto exit; 491 } 492 493 /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */ 494 unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu); 495 /* controller memory buffer size in Bytes */ 496 size = unit_size * cmbsz.bits.sz; 497 /* controller memory buffer offset from BAR in Bytes */ 498 offset = unit_size * cmbloc.bits.ofst; 499 500 rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr, 501 &bar_phys_addr, &bar_size); 502 if ((rc != 0) || addr == NULL) { 503 goto exit; 504 } 505 506 if (offset > bar_size) { 507 goto exit; 508 } 509 510 if (size > bar_size - offset) { 511 goto exit; 512 } 513 514 pctrlr->cmb_bar_virt_addr = addr; 515 pctrlr->cmb_bar_phys_addr = bar_phys_addr; 516 pctrlr->cmb_size = size; 517 pctrlr->cmb_current_offset = offset; 518 pctrlr->cmb_max_offset = offset + size; 519 520 if (!cmbsz.bits.sqs) { 521 pctrlr->ctrlr.opts.use_cmb_sqs = false; 522 } 523 524 /* If only SQS is supported use legacy mapping */ 525 if (cmbsz.bits.sqs && !(cmbsz.bits.wds || cmbsz.bits.rds)) { 526 return; 527 } 528 529 /* If CMB is less than 4MiB in size then abort CMB mapping */ 530 if (pctrlr->cmb_size < (1ULL << 22)) { 531 goto exit; 532 } 533 534 mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + VALUE_2MB - 1); 535 mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + pctrlr->cmb_size); 536 pctrlr->cmb_mem_register_addr = (void *)mem_register_start; 537 pctrlr->cmb_mem_register_size = mem_register_end - mem_register_start; 538 539 rc = 
spdk_mem_register(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size); 540 if (rc) { 541 SPDK_ERRLOG("spdk_mem_register() failed\n"); 542 goto exit; 543 } 544 pctrlr->cmb_current_offset = mem_register_start - ((uint64_t)pctrlr->cmb_bar_virt_addr); 545 pctrlr->cmb_max_offset = mem_register_end - ((uint64_t)pctrlr->cmb_bar_virt_addr); 546 pctrlr->cmb_io_data_supported = true; 547 548 return; 549 exit: 550 pctrlr->cmb_bar_virt_addr = NULL; 551 pctrlr->ctrlr.opts.use_cmb_sqs = false; 552 return; 553 } 554 555 static int 556 nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr) 557 { 558 int rc = 0; 559 union spdk_nvme_cmbloc_register cmbloc; 560 void *addr = pctrlr->cmb_bar_virt_addr; 561 562 if (addr) { 563 if (pctrlr->cmb_mem_register_addr) { 564 spdk_mem_unregister(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size); 565 } 566 567 if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { 568 SPDK_ERRLOG("get_cmbloc() failed\n"); 569 return -EIO; 570 } 571 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr); 572 } 573 return rc; 574 } 575 576 static int 577 nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned, 578 uint64_t *offset) 579 { 580 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 581 uint64_t round_offset; 582 583 round_offset = pctrlr->cmb_current_offset; 584 round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1); 585 586 /* CMB may only consume part of the BAR, calculate accordingly */ 587 if (round_offset + length > pctrlr->cmb_max_offset) { 588 SPDK_ERRLOG("Tried to allocate past valid CMB range!\n"); 589 return -1; 590 } 591 592 *offset = round_offset; 593 pctrlr->cmb_current_offset = round_offset + length; 594 595 return 0; 596 } 597 598 void * 599 nvme_pcie_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size) 600 { 601 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 602 uint64_t offset; 603 604 if (pctrlr->cmb_bar_virt_addr == NULL) { 605 SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); 606 return NULL; 607 } 608 609 if (!pctrlr->cmb_io_data_supported) { 610 SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB doesn't support I/O data\n"); 611 return NULL; 612 } 613 614 if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, size, 4, &offset) != 0) { 615 SPDK_DEBUGLOG(SPDK_LOG_NVME, "%zu-byte CMB allocation failed\n", size); 616 return NULL; 617 } 618 619 return pctrlr->cmb_bar_virt_addr + offset; 620 } 621 622 int 623 nvme_pcie_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size) 624 { 625 /* 626 * Do nothing for now. 627 * TODO: Track free space so buffers may be reused. 
628 */ 629 SPDK_ERRLOG("%s: no deallocation for CMB buffers yet!\n", 630 __func__); 631 return 0; 632 } 633 634 static int 635 nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr) 636 { 637 int rc; 638 void *addr; 639 uint64_t phys_addr, size; 640 641 rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr, 642 &phys_addr, &size); 643 pctrlr->regs = (volatile struct spdk_nvme_registers *)addr; 644 if ((pctrlr->regs == NULL) || (rc != 0)) { 645 SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n", 646 rc, pctrlr->regs); 647 return -1; 648 } 649 650 pctrlr->regs_size = size; 651 nvme_pcie_ctrlr_map_cmb(pctrlr); 652 653 return 0; 654 } 655 656 static int 657 nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr) 658 { 659 int rc = 0; 660 void *addr = (void *)pctrlr->regs; 661 662 if (pctrlr->ctrlr.is_removed) { 663 return rc; 664 } 665 666 rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr); 667 if (rc != 0) { 668 SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc); 669 return -1; 670 } 671 672 if (addr) { 673 /* NOTE: addr may have been remapped here. We're relying on DPDK to call 674 * munmap internally. 675 */ 676 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr); 677 } 678 return rc; 679 } 680 681 static int 682 nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr) 683 { 684 struct nvme_pcie_qpair *pqpair; 685 int rc; 686 687 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 688 if (pqpair == NULL) { 689 return -ENOMEM; 690 } 691 692 pqpair->num_entries = NVME_ADMIN_ENTRIES; 693 pqpair->flags.delay_pcie_doorbell = 0; 694 695 ctrlr->adminq = &pqpair->qpair; 696 697 rc = nvme_qpair_init(ctrlr->adminq, 698 0, /* qpair ID */ 699 ctrlr, 700 SPDK_NVME_QPRIO_URGENT, 701 NVME_ADMIN_ENTRIES); 702 if (rc != 0) { 703 return rc; 704 } 705 706 return nvme_pcie_qpair_construct(ctrlr->adminq); 707 } 708 709 /* This function must only be called while holding g_spdk_nvme_driver->lock */ 710 static int 711 pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) 712 { 713 struct spdk_nvme_transport_id trid = {}; 714 struct nvme_pcie_enum_ctx *enum_ctx = ctx; 715 struct spdk_nvme_ctrlr *ctrlr; 716 struct spdk_pci_addr pci_addr; 717 718 pci_addr = spdk_pci_device_get_addr(pci_dev); 719 720 trid.trtype = SPDK_NVME_TRANSPORT_PCIE; 721 spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr); 722 723 ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid); 724 if (!spdk_process_is_primary()) { 725 if (!ctrlr) { 726 SPDK_ERRLOG("Controller must be constructed in the primary process first.\n"); 727 return -1; 728 } 729 730 return nvme_ctrlr_add_process(ctrlr, pci_dev); 731 } 732 733 /* check whether user passes the pci_addr */ 734 if (enum_ctx->has_pci_addr && 735 (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) { 736 return 1; 737 } 738 739 return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev); 740 } 741 742 int 743 nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, 744 bool direct_connect) 745 { 746 struct nvme_pcie_enum_ctx enum_ctx = {}; 747 748 enum_ctx.probe_ctx = probe_ctx; 749 750 if (strlen(probe_ctx->trid.traddr) != 0) { 751 if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) { 752 return -1; 753 } 754 enum_ctx.has_pci_addr = true; 755 } 756 757 if (hotplug_fd < 0) { 758 hotplug_fd = spdk_uevent_connect(); 759 if (hotplug_fd < 0) { 760 SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n"); 761 } 762 } else { 763 
_nvme_pcie_hotplug_monitor(probe_ctx); 764 } 765 766 if (enum_ctx.has_pci_addr == false) { 767 return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), 768 pcie_nvme_enum_cb, &enum_ctx); 769 } else { 770 return spdk_pci_device_attach(spdk_pci_nvme_get_driver(), 771 pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr); 772 } 773 } 774 775 static int 776 nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr) 777 { 778 struct nvme_pcie_enum_ctx enum_ctx; 779 780 enum_ctx.probe_ctx = probe_ctx; 781 enum_ctx.has_pci_addr = true; 782 enum_ctx.pci_addr = *pci_addr; 783 784 return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx); 785 } 786 787 struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid, 788 const struct spdk_nvme_ctrlr_opts *opts, 789 void *devhandle) 790 { 791 struct spdk_pci_device *pci_dev = devhandle; 792 struct nvme_pcie_ctrlr *pctrlr; 793 union spdk_nvme_cap_register cap; 794 union spdk_nvme_vs_register vs; 795 uint32_t cmd_reg; 796 int rc, claim_fd; 797 struct spdk_pci_id pci_id; 798 struct spdk_pci_addr pci_addr; 799 800 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) { 801 SPDK_ERRLOG("could not parse pci address\n"); 802 return NULL; 803 } 804 805 claim_fd = spdk_pci_device_claim(&pci_addr); 806 if (claim_fd < 0) { 807 SPDK_ERRLOG("could not claim device %s\n", trid->traddr); 808 return NULL; 809 } 810 811 pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL, 812 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 813 if (pctrlr == NULL) { 814 close(claim_fd); 815 SPDK_ERRLOG("could not allocate ctrlr\n"); 816 return NULL; 817 } 818 819 pctrlr->is_remapped = false; 820 pctrlr->ctrlr.is_removed = false; 821 pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE; 822 pctrlr->devhandle = devhandle; 823 pctrlr->ctrlr.opts = *opts; 824 pctrlr->claim_fd = claim_fd; 825 memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid)); 826 827 rc = nvme_pcie_ctrlr_allocate_bars(pctrlr); 828 if (rc != 0) { 829 close(claim_fd); 830 spdk_free(pctrlr); 831 return NULL; 832 } 833 834 /* Enable PCI busmaster and disable INTx */ 835 spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4); 836 cmd_reg |= 0x404; 837 spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4); 838 839 if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) { 840 SPDK_ERRLOG("get_cap() failed\n"); 841 close(claim_fd); 842 spdk_free(pctrlr); 843 return NULL; 844 } 845 846 if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) { 847 SPDK_ERRLOG("get_vs() failed\n"); 848 close(claim_fd); 849 spdk_free(pctrlr); 850 return NULL; 851 } 852 853 nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs); 854 855 /* Doorbell stride is 2 ^ (dstrd + 2), 856 * but we want multiples of 4, so drop the + 2 */ 857 pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd; 858 859 rc = nvme_ctrlr_construct(&pctrlr->ctrlr); 860 if (rc != 0) { 861 nvme_ctrlr_destruct(&pctrlr->ctrlr); 862 return NULL; 863 } 864 865 pci_id = spdk_pci_device_get_id(pci_dev); 866 pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id); 867 868 rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr); 869 if (rc != 0) { 870 nvme_ctrlr_destruct(&pctrlr->ctrlr); 871 return NULL; 872 } 873 874 /* Construct the primary process properties */ 875 rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev); 876 if (rc != 0) { 877 nvme_ctrlr_destruct(&pctrlr->ctrlr); 878 return NULL; 879 } 880 881 if (g_sigset != true) { 882 nvme_pcie_ctrlr_setup_signal(); 883 g_sigset = true; 884 } 885 886 return &pctrlr->ctrlr; 887 } 888 889 int 890 
nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) 891 { 892 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 893 struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq); 894 union spdk_nvme_aqa_register aqa; 895 896 if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) { 897 SPDK_ERRLOG("set_asq() failed\n"); 898 return -EIO; 899 } 900 901 if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) { 902 SPDK_ERRLOG("set_acq() failed\n"); 903 return -EIO; 904 } 905 906 aqa.raw = 0; 907 /* acqs and asqs are 0-based. */ 908 aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; 909 aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; 910 911 if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) { 912 SPDK_ERRLOG("set_aqa() failed\n"); 913 return -EIO; 914 } 915 916 return 0; 917 } 918 919 int 920 nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) 921 { 922 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 923 struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr); 924 925 close(pctrlr->claim_fd); 926 927 if (ctrlr->adminq) { 928 nvme_pcie_qpair_destroy(ctrlr->adminq); 929 } 930 931 nvme_ctrlr_destruct_finish(ctrlr); 932 933 nvme_ctrlr_free_processes(ctrlr); 934 935 nvme_pcie_ctrlr_free_bars(pctrlr); 936 937 if (devhandle) { 938 spdk_pci_device_detach(devhandle); 939 } 940 941 spdk_free(pctrlr); 942 943 return 0; 944 } 945 946 static void 947 nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr) 948 { 949 tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp); 950 tr->cid = cid; 951 tr->req = NULL; 952 } 953 954 int 955 nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair) 956 { 957 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 958 959 pqpair->last_sq_tail = pqpair->sq_tail = pqpair->cq_head = 0; 960 961 /* 962 * First time through the completion queue, HW will set phase 963 * bit on completions to 1. So set this to 1 here, indicating 964 * we're looking for a 1 to know which entries have completed. 965 * we'll toggle the bit each time when the completion queue 966 * rolls over. 967 */ 968 pqpair->flags.phase = 1; 969 970 memset(pqpair->cmd, 0, 971 pqpair->num_entries * sizeof(struct spdk_nvme_cmd)); 972 memset(pqpair->cpl, 0, 973 pqpair->num_entries * sizeof(struct spdk_nvme_cpl)); 974 975 return 0; 976 } 977 978 static int 979 nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair) 980 { 981 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 982 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 983 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 984 struct nvme_tracker *tr; 985 uint16_t i; 986 volatile uint32_t *doorbell_base; 987 uint64_t offset; 988 uint16_t num_trackers; 989 size_t page_align = VALUE_2MB; 990 uint32_t flags = SPDK_MALLOC_DMA; 991 992 /* 993 * Limit the maximum number of completions to return per call to prevent wraparound, 994 * and calculate how many trackers can be submitted at once without overflowing the 995 * completion queue. 
996 */ 997 pqpair->max_completions_cap = pqpair->num_entries / 4; 998 pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS); 999 pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS); 1000 num_trackers = pqpair->num_entries - pqpair->max_completions_cap; 1001 1002 SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n", 1003 pqpair->max_completions_cap, num_trackers); 1004 1005 assert(num_trackers != 0); 1006 1007 pqpair->sq_in_cmb = false; 1008 1009 if (nvme_qpair_is_admin_queue(&pqpair->qpair)) { 1010 flags |= SPDK_MALLOC_SHARE; 1011 } 1012 1013 /* cmd and cpl rings must be aligned on page size boundaries. */ 1014 if (ctrlr->opts.use_cmb_sqs) { 1015 if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd), 1016 sysconf(_SC_PAGESIZE), &offset) == 0) { 1017 pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset; 1018 pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset; 1019 pqpair->sq_in_cmb = true; 1020 } 1021 } 1022 1023 /* To ensure physical address contiguity we make each ring occupy 1024 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES. 1025 */ 1026 if (pqpair->sq_in_cmb == false) { 1027 pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd), 1028 page_align, NULL, 1029 SPDK_ENV_SOCKET_ID_ANY, flags); 1030 if (pqpair->cmd == NULL) { 1031 SPDK_ERRLOG("alloc qpair_cmd failed\n"); 1032 return -ENOMEM; 1033 } 1034 1035 pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL); 1036 if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) { 1037 SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n"); 1038 return -EFAULT; 1039 } 1040 } 1041 1042 pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl), 1043 page_align, NULL, 1044 SPDK_ENV_SOCKET_ID_ANY, flags); 1045 if (pqpair->cpl == NULL) { 1046 SPDK_ERRLOG("alloc qpair_cpl failed\n"); 1047 return -ENOMEM; 1048 } 1049 1050 pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL); 1051 if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) { 1052 SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n"); 1053 return -EFAULT; 1054 } 1055 1056 doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl; 1057 pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32; 1058 pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32; 1059 1060 /* 1061 * Reserve space for all of the trackers in a single allocation. 1062 * struct nvme_tracker must be padded so that its size is already a power of 2. 1063 * This ensures the PRP list embedded in the nvme_tracker object will not span a 1064 * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing. 1065 */ 1066 pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL, 1067 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 1068 if (pqpair->tr == NULL) { 1069 SPDK_ERRLOG("nvme_tr failed\n"); 1070 return -ENOMEM; 1071 } 1072 1073 TAILQ_INIT(&pqpair->free_tr); 1074 TAILQ_INIT(&pqpair->outstanding_tr); 1075 1076 for (i = 0; i < num_trackers; i++) { 1077 tr = &pqpair->tr[i]; 1078 nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL)); 1079 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); 1080 } 1081 1082 nvme_pcie_qpair_reset(qpair); 1083 1084 return 0; 1085 } 1086 1087 static inline void 1088 nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) 1089 { 1090 /* dst and src are known to be non-overlapping and 64-byte aligned. 
*/ 1091 #if defined(__AVX__) 1092 __m256i *d256 = (__m256i *)dst; 1093 const __m256i *s256 = (const __m256i *)src; 1094 1095 _mm256_stream_si256(&d256[0], _mm256_load_si256(&s256[0])); 1096 _mm256_stream_si256(&d256[1], _mm256_load_si256(&s256[1])); 1097 #elif defined(__SSE2__) 1098 __m128i *d128 = (__m128i *)dst; 1099 const __m128i *s128 = (const __m128i *)src; 1100 1101 _mm_stream_si128(&d128[0], _mm_load_si128(&s128[0])); 1102 _mm_stream_si128(&d128[1], _mm_load_si128(&s128[1])); 1103 _mm_stream_si128(&d128[2], _mm_load_si128(&s128[2])); 1104 _mm_stream_si128(&d128[3], _mm_load_si128(&s128[3])); 1105 #else 1106 *dst = *src; 1107 #endif 1108 } 1109 1110 /** 1111 * Note: the ctrlr_lock must be held when calling this function. 1112 */ 1113 static void 1114 nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair, 1115 struct nvme_request *req, struct spdk_nvme_cpl *cpl) 1116 { 1117 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 1118 struct nvme_request *active_req = req; 1119 struct spdk_nvme_ctrlr_process *active_proc; 1120 1121 /* 1122 * The admin request is from another process. Move to the per 1123 * process list for that process to handle it later. 1124 */ 1125 assert(nvme_qpair_is_admin_queue(qpair)); 1126 assert(active_req->pid != getpid()); 1127 1128 active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid); 1129 if (active_proc) { 1130 /* Save the original completion information */ 1131 memcpy(&active_req->cpl, cpl, sizeof(*cpl)); 1132 STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq); 1133 } else { 1134 SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n", 1135 active_req->pid); 1136 1137 nvme_free_request(active_req); 1138 } 1139 } 1140 1141 /** 1142 * Note: the ctrlr_lock must be held when calling this function. 1143 */ 1144 static void 1145 nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair) 1146 { 1147 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 1148 struct nvme_request *req, *tmp_req; 1149 pid_t pid = getpid(); 1150 struct spdk_nvme_ctrlr_process *proc; 1151 1152 /* 1153 * Check whether there is any pending admin request from 1154 * other active processes. 
1155 */ 1156 assert(nvme_qpair_is_admin_queue(qpair)); 1157 1158 proc = spdk_nvme_ctrlr_get_current_process(ctrlr); 1159 if (!proc) { 1160 SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid); 1161 assert(proc); 1162 return; 1163 } 1164 1165 STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { 1166 STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); 1167 1168 assert(req->pid == pid); 1169 1170 nvme_complete_request(req->cb_fn, req->cb_arg, req, &req->cpl); 1171 nvme_free_request(req); 1172 } 1173 } 1174 1175 static inline int 1176 nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) 1177 { 1178 return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old); 1179 } 1180 1181 static bool 1182 nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value, 1183 volatile uint32_t *shadow_db, 1184 volatile uint32_t *eventidx) 1185 { 1186 uint16_t old; 1187 1188 if (!shadow_db) { 1189 return true; 1190 } 1191 1192 old = *shadow_db; 1193 *shadow_db = value; 1194 1195 if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) { 1196 return false; 1197 } 1198 1199 return true; 1200 } 1201 1202 static inline void 1203 nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair) 1204 { 1205 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1206 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); 1207 bool need_mmio = true; 1208 1209 if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { 1210 need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, 1211 pqpair->sq_tail, 1212 pqpair->shadow_doorbell.sq_tdbl, 1213 pqpair->shadow_doorbell.sq_eventidx); 1214 } 1215 1216 if (spdk_likely(need_mmio)) { 1217 spdk_wmb(); 1218 g_thread_mmio_ctrlr = pctrlr; 1219 spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail); 1220 g_thread_mmio_ctrlr = NULL; 1221 } 1222 } 1223 1224 static inline void 1225 nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair) 1226 { 1227 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1228 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); 1229 bool need_mmio = true; 1230 1231 if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { 1232 need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, 1233 pqpair->cq_head, 1234 pqpair->shadow_doorbell.cq_hdbl, 1235 pqpair->shadow_doorbell.cq_eventidx); 1236 } 1237 1238 if (spdk_likely(need_mmio)) { 1239 g_thread_mmio_ctrlr = pctrlr; 1240 spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head); 1241 g_thread_mmio_ctrlr = NULL; 1242 } 1243 } 1244 1245 static void 1246 nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) 1247 { 1248 struct nvme_request *req; 1249 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1250 1251 req = tr->req; 1252 assert(req != NULL); 1253 1254 /* Copy the command from the tracker to the submission queue. 
*/ 1255 nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd); 1256 1257 if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) { 1258 pqpair->sq_tail = 0; 1259 } 1260 1261 if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) { 1262 SPDK_ERRLOG("sq_tail is passing sq_head!\n"); 1263 } 1264 1265 if (!pqpair->flags.delay_pcie_doorbell) { 1266 nvme_pcie_qpair_ring_sq_doorbell(qpair); 1267 } 1268 } 1269 1270 static void 1271 nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, 1272 struct spdk_nvme_cpl *cpl, bool print_on_error) 1273 { 1274 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1275 struct nvme_request *req; 1276 bool retry, error; 1277 bool req_from_current_proc = true; 1278 1279 req = tr->req; 1280 1281 assert(req != NULL); 1282 1283 error = spdk_nvme_cpl_is_error(cpl); 1284 retry = error && nvme_completion_is_retry(cpl) && 1285 req->retries < spdk_nvme_retry_count; 1286 1287 if (error && print_on_error) { 1288 nvme_qpair_print_command(qpair, &req->cmd); 1289 nvme_qpair_print_completion(qpair, cpl); 1290 } 1291 1292 assert(cpl->cid == req->cmd.cid); 1293 1294 if (retry) { 1295 req->retries++; 1296 nvme_pcie_qpair_submit_tracker(qpair, tr); 1297 } else { 1298 /* Only check admin requests from different processes. */ 1299 if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) { 1300 req_from_current_proc = false; 1301 nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl); 1302 } else { 1303 nvme_complete_request(tr->cb_fn, tr->cb_arg, req, cpl); 1304 } 1305 1306 if (req_from_current_proc == true) { 1307 nvme_free_request(req); 1308 } 1309 1310 tr->req = NULL; 1311 1312 TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list); 1313 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); 1314 1315 /* 1316 * If the controller is in the middle of resetting, don't 1317 * try to submit queued requests here - let the reset logic 1318 * handle that instead. 
1319 */ 1320 if (!STAILQ_EMPTY(&qpair->queued_req) && 1321 !qpair->ctrlr->is_resetting) { 1322 req = STAILQ_FIRST(&qpair->queued_req); 1323 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); 1324 nvme_qpair_submit_request(qpair, req); 1325 } 1326 } 1327 } 1328 1329 static void 1330 nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair, 1331 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, 1332 bool print_on_error) 1333 { 1334 struct spdk_nvme_cpl cpl; 1335 1336 memset(&cpl, 0, sizeof(cpl)); 1337 cpl.sqid = qpair->id; 1338 cpl.cid = tr->cid; 1339 cpl.status.sct = sct; 1340 cpl.status.sc = sc; 1341 cpl.status.dnr = dnr; 1342 nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); 1343 } 1344 1345 static void 1346 nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr) 1347 { 1348 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1349 struct nvme_tracker *tr, *temp; 1350 1351 TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) { 1352 SPDK_ERRLOG("aborting outstanding command\n"); 1353 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, 1354 SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); 1355 } 1356 } 1357 1358 static void 1359 nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) 1360 { 1361 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1362 struct nvme_tracker *tr; 1363 1364 tr = TAILQ_FIRST(&pqpair->outstanding_tr); 1365 while (tr != NULL) { 1366 assert(tr->req != NULL); 1367 if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { 1368 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, 1369 SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0, 1370 false); 1371 tr = TAILQ_FIRST(&pqpair->outstanding_tr); 1372 } else { 1373 tr = TAILQ_NEXT(tr, tq_list); 1374 } 1375 } 1376 } 1377 1378 static void 1379 nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair) 1380 { 1381 nvme_pcie_admin_qpair_abort_aers(qpair); 1382 } 1383 1384 static int 1385 nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair) 1386 { 1387 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1388 1389 if (nvme_qpair_is_admin_queue(qpair)) { 1390 nvme_pcie_admin_qpair_destroy(qpair); 1391 } 1392 if (pqpair->cmd && !pqpair->sq_in_cmb) { 1393 spdk_free(pqpair->cmd); 1394 } 1395 if (pqpair->cpl) { 1396 spdk_free(pqpair->cpl); 1397 } 1398 if (pqpair->tr) { 1399 spdk_free(pqpair->tr); 1400 } 1401 1402 nvme_qpair_deinit(qpair); 1403 1404 spdk_free(pqpair); 1405 1406 return 0; 1407 } 1408 1409 static void 1410 nvme_pcie_admin_qpair_enable(struct spdk_nvme_qpair *qpair) 1411 { 1412 /* 1413 * Manually abort each outstanding admin command. Do not retry 1414 * admin commands found here, since they will be left over from 1415 * a controller reset and its likely the context in which the 1416 * command was issued no longer applies. 1417 */ 1418 nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */); 1419 } 1420 1421 static void 1422 nvme_pcie_io_qpair_enable(struct spdk_nvme_qpair *qpair) 1423 { 1424 /* Manually abort each outstanding I/O. 
*/ 1425 nvme_pcie_qpair_abort_trackers(qpair, 0); 1426 } 1427 1428 int 1429 nvme_pcie_qpair_enable(struct spdk_nvme_qpair *qpair) 1430 { 1431 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1432 1433 pqpair->flags.is_enabled = true; 1434 if (nvme_qpair_is_io_queue(qpair)) { 1435 nvme_pcie_io_qpair_enable(qpair); 1436 } else { 1437 nvme_pcie_admin_qpair_enable(qpair); 1438 } 1439 1440 return 0; 1441 } 1442 1443 static void 1444 nvme_pcie_admin_qpair_disable(struct spdk_nvme_qpair *qpair) 1445 { 1446 nvme_pcie_admin_qpair_abort_aers(qpair); 1447 } 1448 1449 static void 1450 nvme_pcie_io_qpair_disable(struct spdk_nvme_qpair *qpair) 1451 { 1452 } 1453 1454 int 1455 nvme_pcie_qpair_disable(struct spdk_nvme_qpair *qpair) 1456 { 1457 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1458 1459 pqpair->flags.is_enabled = false; 1460 if (nvme_qpair_is_io_queue(qpair)) { 1461 nvme_pcie_io_qpair_disable(qpair); 1462 } else { 1463 nvme_pcie_admin_qpair_disable(qpair); 1464 } 1465 1466 return 0; 1467 } 1468 1469 1470 int 1471 nvme_pcie_qpair_fail(struct spdk_nvme_qpair *qpair) 1472 { 1473 nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */); 1474 1475 return 0; 1476 } 1477 1478 static int 1479 nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr, 1480 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, 1481 void *cb_arg) 1482 { 1483 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); 1484 struct nvme_request *req; 1485 struct spdk_nvme_cmd *cmd; 1486 1487 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1488 if (req == NULL) { 1489 return -ENOMEM; 1490 } 1491 1492 cmd = &req->cmd; 1493 cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ; 1494 1495 /* 1496 * TODO: create a create io completion queue command data 1497 * structure. 1498 */ 1499 cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id; 1500 /* 1501 * 0x2 = interrupts enabled 1502 * 0x1 = physically contiguous 1503 */ 1504 cmd->cdw11 = 0x1; 1505 cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr; 1506 1507 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1508 } 1509 1510 static int 1511 nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr, 1512 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg) 1513 { 1514 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); 1515 struct nvme_request *req; 1516 struct spdk_nvme_cmd *cmd; 1517 1518 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1519 if (req == NULL) { 1520 return -ENOMEM; 1521 } 1522 1523 cmd = &req->cmd; 1524 cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ; 1525 1526 /* 1527 * TODO: create a create io submission queue command data 1528 * structure. 
1529 */ 1530 cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id; 1531 /* 0x1 = physically contiguous */ 1532 cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1; 1533 cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr; 1534 1535 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1536 } 1537 1538 static int 1539 nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, 1540 spdk_nvme_cmd_cb cb_fn, void *cb_arg) 1541 { 1542 struct nvme_request *req; 1543 struct spdk_nvme_cmd *cmd; 1544 1545 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1546 if (req == NULL) { 1547 return -ENOMEM; 1548 } 1549 1550 cmd = &req->cmd; 1551 cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ; 1552 cmd->cdw10 = qpair->id; 1553 1554 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1555 } 1556 1557 static int 1558 nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, 1559 spdk_nvme_cmd_cb cb_fn, void *cb_arg) 1560 { 1561 struct nvme_request *req; 1562 struct spdk_nvme_cmd *cmd; 1563 1564 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); 1565 if (req == NULL) { 1566 return -ENOMEM; 1567 } 1568 1569 cmd = &req->cmd; 1570 cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ; 1571 cmd->cdw10 = qpair->id; 1572 1573 return nvme_ctrlr_submit_admin_request(ctrlr, req); 1574 } 1575 1576 static int 1577 _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, 1578 uint16_t qid) 1579 { 1580 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); 1581 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); 1582 struct nvme_completion_poll_status status; 1583 int rc; 1584 1585 rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); 1586 if (rc != 0) { 1587 return rc; 1588 } 1589 1590 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1591 SPDK_ERRLOG("nvme_create_io_cq failed!\n"); 1592 return -1; 1593 } 1594 1595 rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status); 1596 if (rc != 0) { 1597 return rc; 1598 } 1599 1600 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1601 SPDK_ERRLOG("nvme_create_io_sq failed!\n"); 1602 /* Attempt to delete the completion queue */ 1603 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status); 1604 if (rc != 0) { 1605 return -1; 1606 } 1607 spdk_nvme_wait_for_completion(ctrlr->adminq, &status); 1608 return -1; 1609 } 1610 1611 if (ctrlr->shadow_doorbell) { 1612 pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) * 1613 pctrlr->doorbell_stride_u32; 1614 pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) * 1615 pctrlr->doorbell_stride_u32; 1616 pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) * 1617 pctrlr->doorbell_stride_u32; 1618 pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) * 1619 pctrlr->doorbell_stride_u32; 1620 pqpair->flags.has_shadow_doorbell = 1; 1621 } else { 1622 pqpair->flags.has_shadow_doorbell = 0; 1623 } 1624 nvme_pcie_qpair_reset(qpair); 1625 1626 return 0; 1627 } 1628 1629 struct spdk_nvme_qpair * 1630 nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, 1631 const struct spdk_nvme_io_qpair_opts *opts) 1632 { 1633 struct nvme_pcie_qpair *pqpair; 1634 struct spdk_nvme_qpair *qpair; 1635 int rc; 1636 1637 assert(ctrlr != NULL); 1638 1639 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, 1640 
SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); 1641 if (pqpair == NULL) { 1642 return NULL; 1643 } 1644 1645 pqpair->num_entries = opts->io_queue_size; 1646 pqpair->flags.delay_pcie_doorbell = opts->delay_pcie_doorbell; 1647 1648 qpair = &pqpair->qpair; 1649 1650 rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests); 1651 if (rc != 0) { 1652 nvme_pcie_qpair_destroy(qpair); 1653 return NULL; 1654 } 1655 1656 rc = nvme_pcie_qpair_construct(qpair); 1657 if (rc != 0) { 1658 nvme_pcie_qpair_destroy(qpair); 1659 return NULL; 1660 } 1661 1662 rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid); 1663 1664 if (rc != 0) { 1665 SPDK_ERRLOG("I/O queue creation failed\n"); 1666 nvme_pcie_qpair_destroy(qpair); 1667 return NULL; 1668 } 1669 1670 return qpair; 1671 } 1672 1673 int 1674 nvme_pcie_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1675 { 1676 return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id); 1677 } 1678 1679 int 1680 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 1681 { 1682 struct nvme_completion_poll_status status; 1683 int rc; 1684 1685 assert(ctrlr != NULL); 1686 1687 if (ctrlr->is_removed) { 1688 goto free; 1689 } 1690 1691 /* Delete the I/O submission queue */ 1692 rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); 1693 if (rc != 0) { 1694 SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc); 1695 return rc; 1696 } 1697 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1698 return -1; 1699 } 1700 1701 /* Delete the completion queue */ 1702 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); 1703 if (rc != 0) { 1704 SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc); 1705 return rc; 1706 } 1707 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { 1708 return -1; 1709 } 1710 1711 free: 1712 if (qpair->no_deletion_notification_needed == 0) { 1713 /* Abort the rest of the I/O */ 1714 nvme_pcie_qpair_abort_trackers(qpair, 1); 1715 } 1716 1717 nvme_pcie_qpair_destroy(qpair); 1718 return 0; 1719 } 1720 1721 static void 1722 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) 1723 { 1724 /* 1725 * Bad vtophys translation, so abort this request and return 1726 * immediately. 1727 */ 1728 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, 1729 SPDK_NVME_SC_INVALID_FIELD, 1730 1 /* do not retry */, true); 1731 } 1732 1733 /* 1734 * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes. 1735 * 1736 * *prp_index will be updated to account for the number of PRP entries used. 1737 */ 1738 static int 1739 nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len, 1740 uint32_t page_size) 1741 { 1742 struct spdk_nvme_cmd *cmd = &tr->req->cmd; 1743 uintptr_t page_mask = page_size - 1; 1744 uint64_t phys_addr; 1745 uint32_t i; 1746 1747 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n", 1748 *prp_index, virt_addr, (uint32_t)len); 1749 1750 if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) { 1751 SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); 1752 return -EINVAL; 1753 } 1754 1755 i = *prp_index; 1756 while (len) { 1757 uint32_t seg_len; 1758 1759 /* 1760 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array, 1761 * so prp_index == count is valid. 
1762 */ 1763 if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) { 1764 SPDK_ERRLOG("out of PRP entries\n"); 1765 return -EINVAL; 1766 } 1767 1768 phys_addr = spdk_vtophys(virt_addr, NULL); 1769 if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) { 1770 SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr); 1771 return -EINVAL; 1772 } 1773 1774 if (i == 0) { 1775 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr); 1776 cmd->dptr.prp.prp1 = phys_addr; 1777 seg_len = page_size - ((uintptr_t)virt_addr & page_mask); 1778 } else { 1779 if ((phys_addr & page_mask) != 0) { 1780 SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr); 1781 return -EINVAL; 1782 } 1783 1784 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr); 1785 tr->u.prp[i - 1] = phys_addr; 1786 seg_len = page_size; 1787 } 1788 1789 seg_len = spdk_min(seg_len, len); 1790 virt_addr += seg_len; 1791 len -= seg_len; 1792 i++; 1793 } 1794 1795 cmd->psdt = SPDK_NVME_PSDT_PRP; 1796 if (i <= 1) { 1797 cmd->dptr.prp.prp2 = 0; 1798 } else if (i == 2) { 1799 cmd->dptr.prp.prp2 = tr->u.prp[0]; 1800 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2); 1801 } else { 1802 cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr; 1803 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2); 1804 } 1805 1806 *prp_index = i; 1807 return 0; 1808 } 1809 1810 /** 1811 * Build PRP list describing physically contiguous payload buffer. 1812 */ 1813 static int 1814 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1815 struct nvme_tracker *tr) 1816 { 1817 uint32_t prp_index = 0; 1818 int rc; 1819 1820 rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset, 1821 req->payload_size, qpair->ctrlr->page_size); 1822 if (rc) { 1823 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1824 return rc; 1825 } 1826 1827 return 0; 1828 } 1829 1830 /** 1831 * Build SGL list describing scattered payload buffer. 1832 */ 1833 static int 1834 nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1835 struct nvme_tracker *tr) 1836 { 1837 int rc; 1838 void *virt_addr; 1839 uint64_t phys_addr; 1840 uint32_t remaining_transfer_len, remaining_user_sge_len, length; 1841 struct spdk_nvme_sgl_descriptor *sgl; 1842 uint32_t nseg = 0; 1843 1844 /* 1845 * Build scattered payloads. 
1846 */ 1847 assert(req->payload_size != 0); 1848 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); 1849 assert(req->payload.reset_sgl_fn != NULL); 1850 assert(req->payload.next_sge_fn != NULL); 1851 req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); 1852 1853 sgl = tr->u.sgl; 1854 req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; 1855 req->cmd.dptr.sgl1.unkeyed.subtype = 0; 1856 1857 remaining_transfer_len = req->payload_size; 1858 1859 while (remaining_transfer_len > 0) { 1860 rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, 1861 &virt_addr, &remaining_user_sge_len); 1862 if (rc) { 1863 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1864 return -1; 1865 } 1866 1867 remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); 1868 remaining_transfer_len -= remaining_user_sge_len; 1869 while (remaining_user_sge_len > 0) { 1870 if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { 1871 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1872 return -1; 1873 } 1874 1875 phys_addr = spdk_vtophys(virt_addr, NULL); 1876 if (phys_addr == SPDK_VTOPHYS_ERROR) { 1877 nvme_pcie_fail_request_bad_vtophys(qpair, tr); 1878 return -1; 1879 } 1880 1881 length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr)); 1882 remaining_user_sge_len -= length; 1883 virt_addr += length; 1884 1885 if (nseg > 0 && phys_addr == 1886 (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) { 1887 /* extend previous entry */ 1888 (*(sgl - 1)).unkeyed.length += length; 1889 continue; 1890 } 1891 1892 sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; 1893 sgl->unkeyed.length = length; 1894 sgl->address = phys_addr; 1895 sgl->unkeyed.subtype = 0; 1896 1897 sgl++; 1898 nseg++; 1899 } 1900 } 1901 1902 if (nseg == 1) { 1903 /* 1904 * The whole transfer can be described by a single SGL descriptor. 1905 * Use the special case described by the spec where SGL1's type is Data Block. 1906 * This means the SGL in the tracker is not used at all, so copy the first (and only) 1907 * SGL element into SGL1. 1908 */ 1909 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; 1910 req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; 1911 req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; 1912 } else { 1913 /* For now we can only support 1 SGL segment in NVMe controller */ 1914 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; 1915 req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; 1916 req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); 1917 } 1918 1919 return 0; 1920 } 1921 1922 /** 1923 * Build PRP list describing scattered payload buffer. 1924 */ 1925 static int 1926 nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, 1927 struct nvme_tracker *tr) 1928 { 1929 int rc; 1930 void *virt_addr; 1931 uint32_t remaining_transfer_len, length; 1932 uint32_t prp_index = 0; 1933 uint32_t page_size = qpair->ctrlr->page_size; 1934 1935 /* 1936 * Build scattered payloads. 

/**
 * Build PRP list describing scattered payload buffer.
 */
static int
nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				       struct nvme_tracker *tr)
{
	int rc;
	void *virt_addr;
	uint32_t remaining_transfer_len, length;
	uint32_t prp_index = 0;
	uint32_t page_size = qpair->ctrlr->page_size;

	/*
	 * Build scattered payloads.
	 */
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	remaining_transfer_len = req->payload_size;
	while (remaining_transfer_len > 0) {
		assert(req->payload.next_sge_fn != NULL);
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}

		length = spdk_min(remaining_transfer_len, length);

		/*
		 * Any incompatible SGEs should have been handled in the splitting routine,
		 * but assert here as an additional check.
		 *
		 * All SGEs except the last must end on a page boundary.
		 */
		assert((length == remaining_transfer_len) ||
		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));

		rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return rc;
		}

		remaining_transfer_len -= length;
	}

	return 0;
}

static inline bool
nvme_pcie_qpair_check_enabled(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	if (!pqpair->flags.is_enabled &&
	    !qpair->ctrlr->is_resetting) {
		nvme_qpair_enable(qpair);
	}
	return pqpair->flags.is_enabled;
}
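
/*
 * Request submission below proceeds in three steps: take a tracker from
 * free_tr (or queue the request on qpair->queued_req when no tracker is
 * available or the qpair is disabled), build the data pointer according to
 * the payload type (contiguous PRP, hardware SGL, or PRP built from an SGL
 * callback), and finally hand the tracker to nvme_pcie_qpair_submit_tracker().
 * The admin queue is additionally serialized with ctrlr_lock.
 */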

int
nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int rc = 0;
	void *md_payload;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	nvme_pcie_qpair_check_enabled(qpair);

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	tr = TAILQ_FIRST(&pqpair->free_tr);

	if (tr == NULL || !pqpair->flags.is_enabled) {
		/*
		 * No tracker is available, or the qpair is disabled due to
		 * an in-progress controller-level reset.
		 *
		 * Put the request on the qpair's request queue to be
		 * processed when a tracker frees up via a command
		 * completion or when the controller reset is
		 * completed.
		 */
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		goto exit;
	}

	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
	tr->req = req;
	tr->cb_fn = req->cb_fn;
	tr->cb_arg = req->cb_arg;
	req->cmd.cid = tr->cid;

	if (req->payload_size && req->payload.md) {
		md_payload = req->payload.md + req->md_offset;
		tr->req->cmd.mptr = spdk_vtophys(md_payload, NULL);
		if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			rc = -EINVAL;
			goto exit;
		}
	}

	if (req->payload_size == 0) {
		/* Null payload - leave PRP fields zeroed */
		rc = 0;
	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
		rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr);
	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
		if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
			rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr);
		} else {
			rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr);
		}
	} else {
		assert(0);
		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
		rc = -EINVAL;
	}

	if (rc < 0) {
		goto exit;
	}

	nvme_pcie_qpair_submit_tracker(qpair, tr);

exit:
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return rc;
}

static void
nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
{
	uint64_t t02;
	struct nvme_tracker *tr, *tmp;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct spdk_nvme_ctrlr_process *active_proc;

	/* Don't check timeouts during controller initialization. */
	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
		return;
	}

	if (nvme_qpair_is_admin_queue(qpair)) {
		active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
	} else {
		active_proc = qpair->active_proc;
	}

	/* Only check timeouts if the current process has a timeout callback. */
	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
		return;
	}

	t02 = spdk_get_ticks();
	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
		assert(tr->req != NULL);

		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
			/*
			 * The requests are in order, so as soon as one has not timed out,
			 * stop iterating.
			 */
			break;
		}
	}
}
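
/*
 * Completion processing below is driven by the phase bit: the expected phase
 * is kept in pqpair->flags.phase and flipped each time cq_head wraps, so a
 * completion queue entry is new only while its phase tag matches the expected
 * value. The completion queue head doorbell is rung once per batch, and when
 * delay_pcie_doorbell is set the submission queue tail doorbell write is also
 * deferred until this point.
 */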

int32_t
nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;
	struct spdk_nvme_cpl *cpl;
	uint32_t num_completions = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;

	if (spdk_unlikely(!nvme_pcie_qpair_check_enabled(qpair))) {
		/*
		 * qpair is not enabled, likely because a controller reset is
		 * in progress.  Ignore the interrupt - any I/O that was
		 * associated with this interrupt will get retried when the
		 * reset is complete.
		 */
		return 0;
	}

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
		/*
		 * max_completions == 0 means unlimited, but complete at most
		 * max_completions_cap I/O at a time so that the completion
		 * queue doorbell does not wrap around.
		 */
		max_completions = pqpair->max_completions_cap;
	}

	while (1) {
		cpl = &pqpair->cpl[pqpair->cq_head];

		if (cpl->status.p != pqpair->flags.phase) {
			break;
		}
#ifdef __PPC64__
		/*
		 * This memory barrier prevents reordering of:
		 * - load after store from/to tr
		 * - load after load cpl phase and cpl cid
		 */
		spdk_mb();
#elif defined(__aarch64__)
		__asm volatile("dmb oshld" ::: "memory");
#endif

		if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
			pqpair->cq_head = 0;
			pqpair->flags.phase = !pqpair->flags.phase;
		}

		tr = &pqpair->tr[cpl->cid];
		pqpair->sq_head = cpl->sqhd;

		if (tr->req) {
			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
		} else {
			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
			nvme_qpair_print_completion(qpair, cpl);
			assert(0);
		}

		if (++num_completions == max_completions) {
			break;
		}
	}

	if (num_completions > 0) {
		nvme_pcie_qpair_ring_cq_doorbell(qpair);
	}

	if (pqpair->flags.delay_pcie_doorbell) {
		if (pqpair->last_sq_tail != pqpair->sq_tail) {
			nvme_pcie_qpair_ring_sq_doorbell(qpair);
			pqpair->last_sq_tail = pqpair->sq_tail;
		}
	}

	if (spdk_unlikely(ctrlr->timeout_enabled)) {
		/*
		 * User registered for timeout callback
		 */
		nvme_pcie_qpair_check_timeout(qpair);
	}

	/* Before returning, complete any pending admin request. */
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_pcie_qpair_complete_pending_admin_request(qpair);

		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return num_completions;
}