/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2017, IBM Corporation. All rights reserved.
 *   Copyright (c) 2019-2021 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NVMe over PCIe transport
 */

#include "spdk/stdinc.h"
#include "spdk/env.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "nvme_internal.h"
#include "nvme_pcie_internal.h"

struct nvme_pcie_enum_ctx {
	struct spdk_nvme_probe_ctx *probe_ctx;
	struct spdk_pci_addr pci_addr;
	bool has_pci_addr;
};

static uint16_t g_signal_lock;
static bool g_sigset = false;
static spdk_nvme_pcie_hotplug_filter_cb g_hotplug_filter_cb;
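/*
 * SIGBUS handling: if an NVMe device is hot-removed while its BAR is still
 * mapped, an in-flight MMIO access faults with SIGBUS. The handler below
 * remaps the controller's register region onto anonymous memory and fills
 * the register window with 0xFF, so subsequent register reads return all
 * ones instead of crashing the process. This is also why the get_reg
 * helpers further down treat an all-ones read (~value == 0) as an error.
 */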
static void
nvme_sigbus_fault_sighandler(siginfo_t *info, void *ctx)
{
	void *map_address;
	uint16_t flag = 0;

	if (!__atomic_compare_exchange_n(&g_signal_lock, &flag, 1, false, __ATOMIC_ACQUIRE,
					 __ATOMIC_RELAXED)) {
		SPDK_DEBUGLOG(nvme, "request g_signal_lock failed\n");
		return;
	}

	if (g_thread_mmio_ctrlr == NULL) {
		return;
	}

	if (!g_thread_mmio_ctrlr->is_remapped) {
		map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
				   PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
		if (map_address == MAP_FAILED) {
			SPDK_ERRLOG("mmap failed\n");
			__atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE);
			return;
		}
		memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
		g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
		g_thread_mmio_ctrlr->is_remapped = true;
	}
	__atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE);
}

static void
_nvme_pcie_event_process(struct spdk_pci_event *event, void *cb_ctx)
{
	struct spdk_nvme_transport_id trid;
	struct spdk_nvme_ctrlr *ctrlr;

	if (event->action == SPDK_UEVENT_ADD) {
		if (spdk_process_is_primary()) {
			if (g_hotplug_filter_cb == NULL ||
			    g_hotplug_filter_cb(&event->traddr)) {
				/* The enumerate interface implements the add operation */
				spdk_pci_device_allow(&event->traddr);
			}
		}
	} else if (event->action == SPDK_UEVENT_REMOVE) {
		memset(&trid, 0, sizeof(trid));
		spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);

		if (spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &event->traddr) < 0) {
			SPDK_ERRLOG("Failed to format pci address\n");
			return;
		}

		ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid);
		if (ctrlr == NULL) {
			return;
		}
		SPDK_DEBUGLOG(nvme, "remove nvme address: %s\n", trid.traddr);

		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
		nvme_ctrlr_fail(ctrlr, true);
		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);

		/* get the user app to clean up and stop I/O */
		if (ctrlr->remove_cb) {
			nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
			ctrlr->remove_cb(cb_ctx, ctrlr);
			nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
		}
	}
}

static int
_nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx)
{
	struct spdk_nvme_ctrlr *ctrlr, *tmp;
	struct spdk_pci_event event;

	if (g_spdk_nvme_driver->hotplug_fd < 0) {
		return 0;
	}

	while (spdk_pci_get_event(g_spdk_nvme_driver->hotplug_fd, &event) > 0) {
		_nvme_pcie_event_process(&event, probe_ctx->cb_ctx);
	}

	/* Initiate removal of physically hot-removed PCI controllers. Even after
	 * they're hot-removed from the system, SPDK might still report them via RPC.
	 */
	TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) {
		bool do_remove = false;
		struct nvme_pcie_ctrlr *pctrlr;

		if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) {
			continue;
		}

		pctrlr = nvme_pcie_ctrlr(ctrlr);
		if (spdk_pci_device_is_removed(pctrlr->devhandle)) {
			do_remove = true;
		}

		if (do_remove) {
			nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
			nvme_ctrlr_fail(ctrlr, true);
			nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
			if (ctrlr->remove_cb) {
				nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
				ctrlr->remove_cb(ctrlr->cb_ctx, ctrlr);
				nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
			}
		}
	}
	return 0;
}
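/*
 * Illustrative sketch (not part of this file): an application can restrict
 * which hot-inserted devices are attached by installing a filter through
 * spdk_nvme_pcie_set_hotplug_filter(), defined near the bottom of this file.
 * The callback receives the struct spdk_pci_addr of the new device and
 * returns true to allow it. The exact prototype is given by
 * spdk_nvme_pcie_hotplug_filter_cb in the public headers; the snippet below
 * assumes a const pointer parameter.
 *
 *	static bool
 *	allow_only_0000_5e(const struct spdk_pci_addr *addr)
 *	{
 *		struct spdk_pci_addr allowed;
 *
 *		spdk_pci_addr_parse(&allowed, "0000:5e:00.0");
 *		return spdk_pci_addr_compare(addr, &allowed) == 0;
 *	}
 *
 *	spdk_nvme_pcie_set_hotplug_filter(allow_only_0000_5e);
 */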
static volatile void *
nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	return (volatile void *)((uintptr_t)pctrlr->regs + offset);
}

static int
nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
	g_thread_mmio_ctrlr = pctrlr;
	spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
	g_thread_mmio_ctrlr = NULL;
	return 0;
}

static int
nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
	g_thread_mmio_ctrlr = pctrlr;
	spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
	g_thread_mmio_ctrlr = NULL;
	return 0;
}

static int
nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
	assert(value != NULL);
	g_thread_mmio_ctrlr = pctrlr;
	*value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
	g_thread_mmio_ctrlr = NULL;
	if (~(*value) == 0) {
		return -1;
	}

	return 0;
}

static int
nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
	assert(value != NULL);
	g_thread_mmio_ctrlr = pctrlr;
	*value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
	g_thread_mmio_ctrlr = NULL;
	if (~(*value) == 0) {
		return -1;
	}

	return 0;
}

static int
nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
{
	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
					 value);
}

static int
nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
{
	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
					 value);
}

static int
nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
{
	return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
					 aqa->raw);
}

static int
nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
					 &cmbloc->raw);
}

static int
nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
					 &cmbsz->raw);
}

static int
nvme_pcie_ctrlr_get_pmrcap(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_pmrcap_register *pmrcap)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, pmrcap.raw),
					 &pmrcap->raw);
}

static int
nvme_pcie_ctrlr_set_pmrctl(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_pmrctl_register *pmrctl)
{
	return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, pmrctl.raw),
					 pmrctl->raw);
}

static int
nvme_pcie_ctrlr_get_pmrctl(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_pmrctl_register *pmrctl)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, pmrctl.raw),
					 &pmrctl->raw);
}

static int
nvme_pcie_ctrlr_get_pmrsts(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_pmrsts_register *pmrsts)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, pmrsts.raw),
					 &pmrsts->raw);
}

static int
nvme_pcie_ctrlr_set_pmrmscl(struct nvme_pcie_ctrlr *pctrlr, uint32_t value)
{
	return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, pmrmscl.raw),
					 value);
}

static int
nvme_pcie_ctrlr_set_pmrmscu(struct nvme_pcie_ctrlr *pctrlr, uint32_t value)
{
	return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, pmrmscu),
					 value);
}

static uint32_t
nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
{
	/*
	 * For commands requiring more than 2 PRP entries, one PRP will be
	 * embedded in the command (prp1), and the rest of the PRP entries
	 * will be in a list pointed to by the command (prp2). The number
	 * of PRP entries in the list is defined by
	 * NVME_MAX_PRP_LIST_ENTRIES.
	 *
	 * Note that the max xfer size is not (MAX_ENTRIES + 1) * page_size
	 * because the first PRP entry may not be aligned on a 4KiB
	 * boundary.
	 */
	return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size;
}
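/*
 * Worked example of the limit above (values are illustrative): with a 4 KiB
 * controller page size and a PRP list of N entries, the worst case is a
 * buffer that starts mid-page. prp1 then covers only the tail of the first
 * page, and each of the N list entries covers one full page, so the driver
 * can only guarantee N * 4 KiB per command, not (N + 1) * 4 KiB.
 */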
static uint16_t
nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
{
	return NVME_MAX_SGL_DESCRIPTORS;
}

static void
nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc;
	void *addr = NULL;
	uint32_t bir;
	union spdk_nvme_cmbsz_register cmbsz;
	union spdk_nvme_cmbloc_register cmbloc;
	uint64_t size, unit_size, offset, bar_size = 0, bar_phys_addr = 0;

	if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
	    nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
		SPDK_ERRLOG("get registers failed\n");
		goto exit;
	}

	if (!cmbsz.bits.sz) {
		goto exit;
	}

	bir = cmbloc.bits.bir;
	/* Values 0, 2, 3, 4, 5 are valid for BAR */
	if (bir > 5 || bir == 1) {
		goto exit;
	}

	/* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
	unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
	/* controller memory buffer size in Bytes */
	size = unit_size * cmbsz.bits.sz;
	/* controller memory buffer offset from BAR in Bytes */
	offset = unit_size * cmbloc.bits.ofst;

	rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
				     &bar_phys_addr, &bar_size);
	if ((rc != 0) || addr == NULL) {
		goto exit;
	}

	if (offset > bar_size) {
		goto exit;
	}

	if (size > bar_size - offset) {
		goto exit;
	}

	pctrlr->cmb.bar_va = addr;
	pctrlr->cmb.bar_pa = bar_phys_addr;
	pctrlr->cmb.size = size;
	pctrlr->cmb.current_offset = offset;

	if (!cmbsz.bits.sqs) {
		pctrlr->ctrlr.opts.use_cmb_sqs = false;
	}

	return;
exit:
	pctrlr->ctrlr.opts.use_cmb_sqs = false;
	return;
}
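/*
 * CMB sizing, spelled out: CMBSZ.SZU selects the size unit as
 * 1 << (12 + 4 * SZU), i.e. SZU = 0 -> 4 KiB, 1 -> 64 KiB, 2 -> 1 MiB, and
 * so on up to 64 GiB. For example, SZU = 2 and SZ = 16 describe a 16 MiB
 * CMB located OFST * 1 MiB bytes into the BAR selected by CMBLOC.BIR.
 */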
static int
nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc = 0;
	union spdk_nvme_cmbloc_register cmbloc;
	void *addr = pctrlr->cmb.bar_va;

	if (addr) {
		if (pctrlr->cmb.mem_register_addr) {
			spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size);
		}

		if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
			SPDK_ERRLOG("get_cmbloc() failed\n");
			return -EIO;
		}
		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
	}
	return rc;
}

static int
nvme_pcie_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	if (pctrlr->cmb.bar_va == NULL) {
		SPDK_DEBUGLOG(nvme, "CMB not available\n");
		return -ENOTSUP;
	}

	if (ctrlr->opts.use_cmb_sqs) {
		SPDK_ERRLOG("CMB is already in use for submission queues.\n");
		return -ENOTSUP;
	}

	return 0;
}

static void *
nvme_pcie_ctrlr_map_io_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	union spdk_nvme_cmbsz_register cmbsz;
	union spdk_nvme_cmbloc_register cmbloc;
	uint64_t mem_register_start, mem_register_end;
	int rc;

	if (pctrlr->cmb.mem_register_addr != NULL) {
		*size = pctrlr->cmb.mem_register_size;
		return pctrlr->cmb.mem_register_addr;
	}

	*size = 0;

	if (pctrlr->cmb.bar_va == NULL) {
		SPDK_DEBUGLOG(nvme, "CMB not available\n");
		return NULL;
	}

	if (ctrlr->opts.use_cmb_sqs) {
		SPDK_ERRLOG("CMB is already in use for submission queues.\n");
		return NULL;
	}

	if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
	    nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
		SPDK_ERRLOG("get registers failed\n");
		return NULL;
	}

	/* If only SQS is supported */
	if (!(cmbsz.bits.wds || cmbsz.bits.rds)) {
		return NULL;
	}

	/* If CMB is less than 4MiB in size then abort CMB mapping */
	if (pctrlr->cmb.size < (1ULL << 22)) {
		return NULL;
	}

	mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset +
				       VALUE_2MB - 1);
	mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset +
				     pctrlr->cmb.size);

	rc = spdk_mem_register((void *)mem_register_start, mem_register_end - mem_register_start);
	if (rc) {
		SPDK_ERRLOG("spdk_mem_register() failed\n");
		return NULL;
	}

	pctrlr->cmb.mem_register_addr = (void *)mem_register_start;
	pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start;

	*size = pctrlr->cmb.mem_register_size;
	return pctrlr->cmb.mem_register_addr;
}

static int
nvme_pcie_ctrlr_unmap_io_cmb(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	int rc;

	if (pctrlr->cmb.mem_register_addr == NULL) {
		return 0;
	}

	rc = spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size);

	if (rc == 0) {
		pctrlr->cmb.mem_register_addr = NULL;
		pctrlr->cmb.mem_register_size = 0;
	}

	return rc;
}
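/*
 * Registration rounding, for reference: spdk_mem_register() works on 2 MiB
 * units, so the usable CMB window is trimmed to 2 MiB boundaries. The start
 * is rounded up (_2MB_PAGE(va + offset + VALUE_2MB - 1)) and the end is
 * rounded down (_2MB_PAGE(va + offset + size)). Assuming, for illustration,
 * a 2 MiB aligned BAR virtual address, a CMB data region spanning
 * [va + 1 MiB, va + 9 MiB) registers only [va + 2 MiB, va + 8 MiB). The
 * 4 MiB minimum enforced above guarantees at least one full 2 MiB unit
 * survives this rounding.
 */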
static void
nvme_pcie_ctrlr_map_pmr(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc;
	void *addr = NULL;
	uint32_t bir;
	union spdk_nvme_pmrcap_register pmrcap;
	uint64_t bar_size = 0, bar_phys_addr = 0;

	if (!pctrlr->regs->cap.bits.pmrs) {
		return;
	}

	if (nvme_pcie_ctrlr_get_pmrcap(pctrlr, &pmrcap)) {
		SPDK_ERRLOG("get registers failed\n");
		return;
	}

	bir = pmrcap.bits.bir;
	/* Values 2, 3, 4, 5 are valid for BAR */
	if (bir > 5 || bir < 2) {
		SPDK_ERRLOG("invalid base indicator register value\n");
		return;
	}

	rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr, &bar_phys_addr, &bar_size);
	if ((rc != 0) || addr == NULL) {
		SPDK_ERRLOG("could not map the bar %d\n", bir);
		return;
	}

	if (pmrcap.bits.cmss) {
		uint32_t pmrmscl, pmrmscu, cmse = 1;
		union spdk_nvme_pmrsts_register pmrsts;

		/* Enable Controller Memory Space */
		pmrmscl = (uint32_t)((bar_phys_addr & 0xFFFFF000ULL) | (cmse << 1));
		pmrmscu = (uint32_t)((bar_phys_addr >> 32ULL) & 0xFFFFFFFFULL);

		if (nvme_pcie_ctrlr_set_pmrmscu(pctrlr, pmrmscu)) {
			SPDK_ERRLOG("set_pmrmscu() failed\n");
			spdk_pci_device_unmap_bar(pctrlr->devhandle, bir, addr);
			return;
		}

		if (nvme_pcie_ctrlr_set_pmrmscl(pctrlr, pmrmscl)) {
			SPDK_ERRLOG("set_pmrmscl() failed\n");
			spdk_pci_device_unmap_bar(pctrlr->devhandle, bir, addr);
			return;
		}

		if (nvme_pcie_ctrlr_get_pmrsts(pctrlr, &pmrsts)) {
			SPDK_ERRLOG("get pmrsts failed\n");
			spdk_pci_device_unmap_bar(pctrlr->devhandle, bir, addr);
			return;
		}

		if (pmrsts.bits.cbai) {
			SPDK_ERRLOG("Controller Memory Space Enable Failure\n");
			SPDK_ERRLOG("CBA Invalid - Host Addresses cannot reference PMR\n");
		} else {
			SPDK_DEBUGLOG(nvme, "Controller Memory Space Enable Success\n");
			SPDK_DEBUGLOG(nvme, "Host Addresses can reference PMR\n");
		}
	}

	pctrlr->pmr.bar_va = addr;
	pctrlr->pmr.bar_pa = bar_phys_addr;
	pctrlr->pmr.size = pctrlr->ctrlr.pmr_size = bar_size;
}

static int
nvme_pcie_ctrlr_unmap_pmr(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc = 0;
	union spdk_nvme_pmrcap_register pmrcap;
	void *addr = pctrlr->pmr.bar_va;

	if (addr == NULL) {
		return rc;
	}

	if (pctrlr->pmr.mem_register_addr) {
		spdk_mem_unregister(pctrlr->pmr.mem_register_addr, pctrlr->pmr.mem_register_size);
	}

	if (nvme_pcie_ctrlr_get_pmrcap(pctrlr, &pmrcap)) {
		SPDK_ERRLOG("get_pmrcap() failed\n");
		return -EIO;
	}

	if (pmrcap.bits.cmss) {
		if (nvme_pcie_ctrlr_set_pmrmscu(pctrlr, 0)) {
			SPDK_ERRLOG("set_pmrmscu() failed\n");
		}

		if (nvme_pcie_ctrlr_set_pmrmscl(pctrlr, 0)) {
			SPDK_ERRLOG("set_pmrmscl() failed\n");
		}
	}

	rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, pmrcap.bits.bir, addr);

	return rc;
}

static int
nvme_pcie_ctrlr_config_pmr(struct spdk_nvme_ctrlr *ctrlr, bool enable)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	union spdk_nvme_pmrcap_register pmrcap;
	union spdk_nvme_pmrctl_register pmrctl;
	union spdk_nvme_pmrsts_register pmrsts;
	uint8_t pmrto, pmrtu;
	uint64_t timeout_in_ms, ticks_per_ms, timeout_in_ticks, now_ticks;

	if (!pctrlr->regs->cap.bits.pmrs) {
		SPDK_ERRLOG("PMR is not supported by the controller\n");
		return -ENOTSUP;
	}

	if (nvme_pcie_ctrlr_get_pmrcap(pctrlr, &pmrcap)) {
		SPDK_ERRLOG("get registers failed\n");
		return -EIO;
	}

	pmrto = pmrcap.bits.pmrto;
	pmrtu = pmrcap.bits.pmrtu;

	if (pmrtu > 1) {
		SPDK_ERRLOG("PMR Time Units Invalid\n");
		return -EINVAL;
	}

	ticks_per_ms = spdk_get_ticks_hz() / 1000;
	timeout_in_ms = pmrto * (pmrtu ? (60 * 1000) : 500);
	timeout_in_ticks = timeout_in_ms * ticks_per_ms;

	if (nvme_pcie_ctrlr_get_pmrctl(pctrlr, &pmrctl)) {
		SPDK_ERRLOG("get pmrctl failed\n");
		return -EIO;
	}

	if (enable && pmrctl.bits.en != 0) {
		SPDK_ERRLOG("PMR is already enabled\n");
		return -EINVAL;
	} else if (!enable && pmrctl.bits.en != 1) {
		SPDK_ERRLOG("PMR is already disabled\n");
		return -EINVAL;
	}

	pmrctl.bits.en = enable;

	if (nvme_pcie_ctrlr_set_pmrctl(pctrlr, &pmrctl)) {
		SPDK_ERRLOG("set pmrctl failed\n");
		return -EIO;
	}

	now_ticks = spdk_get_ticks();

	do {
		if (nvme_pcie_ctrlr_get_pmrsts(pctrlr, &pmrsts)) {
			SPDK_ERRLOG("get pmrsts failed\n");
			return -EIO;
		}

		if (pmrsts.bits.nrdy == enable &&
		    spdk_get_ticks() > now_ticks + timeout_in_ticks) {
			SPDK_ERRLOG("PMR Enable - Timed Out\n");
			return -ETIMEDOUT;
		}
	} while (pmrsts.bits.nrdy == enable);

	SPDK_DEBUGLOG(nvme, "PMR %s\n", enable ? "Enabled" : "Disabled");

	return 0;
}

static int
nvme_pcie_ctrlr_enable_pmr(struct spdk_nvme_ctrlr *ctrlr)
{
	return nvme_pcie_ctrlr_config_pmr(ctrlr, true);
}

static int
nvme_pcie_ctrlr_disable_pmr(struct spdk_nvme_ctrlr *ctrlr)
{
	return nvme_pcie_ctrlr_config_pmr(ctrlr, false);
}
static void *
nvme_pcie_ctrlr_map_io_pmr(struct spdk_nvme_ctrlr *ctrlr, size_t *size)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	union spdk_nvme_pmrcap_register pmrcap;
	uint64_t mem_register_start, mem_register_end;
	int rc;

	if (!pctrlr->regs->cap.bits.pmrs) {
		SPDK_ERRLOG("PMR is not supported by the controller\n");
		return NULL;
	}

	if (pctrlr->pmr.mem_register_addr != NULL) {
		*size = pctrlr->pmr.mem_register_size;
		return pctrlr->pmr.mem_register_addr;
	}

	*size = 0;

	if (pctrlr->pmr.bar_va == NULL) {
		SPDK_DEBUGLOG(nvme, "PMR not available\n");
		return NULL;
	}

	if (nvme_pcie_ctrlr_get_pmrcap(pctrlr, &pmrcap)) {
		SPDK_ERRLOG("get registers failed\n");
		return NULL;
	}

	/* Check if WDS / RDS is supported */
	if (!(pmrcap.bits.wds || pmrcap.bits.rds)) {
		return NULL;
	}

	/* If PMR is less than 4MiB in size then abort PMR mapping */
	if (pctrlr->pmr.size < (1ULL << 22)) {
		return NULL;
	}

	mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->pmr.bar_va + VALUE_2MB - 1);
	mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->pmr.bar_va + pctrlr->pmr.size);

	rc = spdk_mem_register((void *)mem_register_start, mem_register_end - mem_register_start);
	if (rc) {
		SPDK_ERRLOG("spdk_mem_register() failed\n");
		return NULL;
	}

	pctrlr->pmr.mem_register_addr = (void *)mem_register_start;
	pctrlr->pmr.mem_register_size = mem_register_end - mem_register_start;

	*size = pctrlr->pmr.mem_register_size;
	return pctrlr->pmr.mem_register_addr;
}

static int
nvme_pcie_ctrlr_unmap_io_pmr(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	int rc;

	if (pctrlr->pmr.mem_register_addr == NULL) {
		return -ENXIO;
	}

	rc = spdk_mem_unregister(pctrlr->pmr.mem_register_addr, pctrlr->pmr.mem_register_size);

	if (rc == 0) {
		pctrlr->pmr.mem_register_addr = NULL;
		pctrlr->pmr.mem_register_size = 0;
	}

	return rc;
}

static int
nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc;
	void *addr = NULL;
	uint64_t phys_addr = 0, size = 0;

	rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
				     &phys_addr, &size);

	if ((addr == NULL) || (rc != 0)) {
		SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n",
			    rc, addr);
		return -1;
	}

	pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
	pctrlr->regs_size = size;
	pctrlr->doorbell_base = (volatile uint32_t *)&pctrlr->regs->doorbell[0].sq_tdbl;
	nvme_pcie_ctrlr_map_cmb(pctrlr);
	nvme_pcie_ctrlr_map_pmr(pctrlr);

	return 0;
}

static int
nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc = 0;
	void *addr = (void *)pctrlr->regs;

	if (pctrlr->ctrlr.is_removed) {
		return rc;
	}

	rc = nvme_pcie_ctrlr_unmap_pmr(pctrlr);
	if (rc != 0) {
		SPDK_ERRLOG("nvme_ctrlr_unmap_pmr failed with error code %d\n", rc);
		return -1;
	}

	rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
	if (rc != 0) {
		SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc);
		return -1;
	}

	if (addr) {
		/* NOTE: addr may have been remapped here. We're relying on DPDK to call
		 * munmap internally.
		 */
		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
	}
	return rc;
}
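/*
 * Note on the BAR 0 mapping above: pctrlr->regs covers the controller
 * register map, and doorbell_base is simply the address of the admin
 * submission queue tail doorbell (doorbell[0].sq_tdbl, which sits at offset
 * 0x1000 in the NVMe register layout). Per-queue doorbells are then located
 * relative to it using doorbell_stride_u32, set in
 * nvme_pcie_ctrlr_construct().
 */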
/* This function must only be called while holding g_spdk_nvme_driver->lock */
static int
pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
{
	struct spdk_nvme_transport_id trid = {};
	struct nvme_pcie_enum_ctx *enum_ctx = ctx;
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_pci_addr pci_addr;

	pci_addr = spdk_pci_device_get_addr(pci_dev);

	spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
	spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);

	ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid);
	if (!spdk_process_is_primary()) {
		if (!ctrlr) {
			SPDK_ERRLOG("Controller must be constructed in the primary process first.\n");
			return -1;
		}

		return nvme_ctrlr_add_process(ctrlr, pci_dev);
	}

	/* check whether user passes the pci_addr */
	if (enum_ctx->has_pci_addr &&
	    (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
		return 1;
	}

	return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev);
}

static int
nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
		     bool direct_connect)
{
	struct nvme_pcie_enum_ctx enum_ctx = {};

	enum_ctx.probe_ctx = probe_ctx;

	if (strlen(probe_ctx->trid.traddr) != 0) {
		if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) {
			return -1;
		}
		enum_ctx.has_pci_addr = true;
	}

	/* Only the primary process can monitor hotplug. */
	if (spdk_process_is_primary()) {
		_nvme_pcie_hotplug_monitor(probe_ctx);
	}

	if (enum_ctx.has_pci_addr == false) {
		return spdk_pci_enumerate(spdk_pci_nvme_get_driver(),
					  pcie_nvme_enum_cb, &enum_ctx);
	} else {
		return spdk_pci_device_attach(spdk_pci_nvme_get_driver(),
					      pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
	}
}

static struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
		const struct spdk_nvme_ctrlr_opts *opts,
		void *devhandle)
{
	struct spdk_pci_device *pci_dev = devhandle;
	struct nvme_pcie_ctrlr *pctrlr;
	union spdk_nvme_cap_register cap;
	union spdk_nvme_vs_register vs;
	uint16_t cmd_reg;
	int rc;
	struct spdk_pci_id pci_id;

	rc = spdk_pci_device_claim(pci_dev);
	if (rc < 0) {
		SPDK_ERRLOG("could not claim device %s (%s)\n",
			    trid->traddr, spdk_strerror(-rc));
		return NULL;
	}

	pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL,
			      SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
	if (pctrlr == NULL) {
		spdk_pci_device_unclaim(pci_dev);
		SPDK_ERRLOG("could not allocate ctrlr\n");
		return NULL;
	}

	pctrlr->is_remapped = false;
	pctrlr->ctrlr.is_removed = false;
	pctrlr->devhandle = devhandle;
	pctrlr->ctrlr.opts = *opts;
	pctrlr->ctrlr.trid = *trid;

	rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
	if (rc != 0) {
		spdk_pci_device_unclaim(pci_dev);
		spdk_free(pctrlr);
		return NULL;
	}

	rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
	if (rc != 0) {
		spdk_pci_device_unclaim(pci_dev);
		spdk_free(pctrlr);
		return NULL;
	}

	/* Enable PCI busmaster and disable INTx: 0x404 sets Bus Master Enable (bit 2)
	 * and Interrupt Disable (bit 10) in the PCI command register at config offset 4.
	 */
	spdk_pci_device_cfg_read16(pci_dev, &cmd_reg, 4);
	cmd_reg |= 0x404;
	spdk_pci_device_cfg_write16(pci_dev, cmd_reg, 4);

	if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
		SPDK_ERRLOG("get_cap() failed\n");
		spdk_pci_device_unclaim(pci_dev);
		spdk_free(pctrlr);
		return NULL;
	}

	if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) {
		SPDK_ERRLOG("get_vs() failed\n");
		spdk_pci_device_unclaim(pci_dev);
		spdk_free(pctrlr);
		return NULL;
	}

	nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs);

	/* Doorbell stride is 2 ^ (dstrd + 2),
	 * but we want multiples of 4, so drop the + 2 */
	pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;

	pci_id = spdk_pci_device_get_id(pci_dev);
	pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);

	rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr, pctrlr->ctrlr.opts.admin_queue_size);
	if (rc != 0) {
		nvme_ctrlr_destruct(&pctrlr->ctrlr);
		return NULL;
	}

	/* Construct the primary process properties */
	rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
	if (rc != 0) {
		nvme_ctrlr_destruct(&pctrlr->ctrlr);
		return NULL;
	}

	if (g_sigset != true) {
		spdk_pci_register_error_handler(nvme_sigbus_fault_sighandler,
						NULL);
		g_sigset = true;
	}

	return &pctrlr->ctrlr;
}
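/*
 * Doorbell stride, by example: CAP.DSTRD gives the spacing between doorbell
 * registers as 2 ^ (2 + DSTRD) bytes. Since doorbells are addressed here as
 * an array of uint32_t, the "+ 2" is dropped: DSTRD = 0 yields a stride of
 * one uint32_t (doorbells packed 4 bytes apart), DSTRD = 1 yields two
 * uint32_t (8 bytes apart), and so on.
 */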
static int
nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
	union spdk_nvme_aqa_register aqa;

	if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
		SPDK_ERRLOG("set_asq() failed\n");
		return -EIO;
	}

	if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
		SPDK_ERRLOG("set_acq() failed\n");
		return -EIO;
	}

	aqa.raw = 0;
	/* acqs and asqs are 0-based. */
	aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
	aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;

	if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
		SPDK_ERRLOG("set_aqa() failed\n");
		return -EIO;
	}

	return 0;
}
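/*
 * AQA encoding, for reference: ASQS and ACQS hold the admin queue depths as
 * 0-based values, so an admin queue created with 32 entries is programmed
 * as ASQS = ACQS = 31, while ASQ and ACQ (written just above) carry the
 * physical base addresses of the admin submission and completion queues.
 */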
static int
nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr);

	if (ctrlr->adminq) {
		nvme_pcie_qpair_destroy(ctrlr->adminq);
	}

	nvme_ctrlr_destruct_finish(ctrlr);

	nvme_ctrlr_free_processes(ctrlr);

	nvme_pcie_ctrlr_free_bars(pctrlr);

	if (devhandle) {
		spdk_pci_device_unclaim(devhandle);
		spdk_pci_device_detach(devhandle);
	}

	spdk_free(pctrlr);

	return 0;
}

static int
nvme_pcie_qpair_iterate_requests(struct spdk_nvme_qpair *qpair,
				 int (*iter_fn)(struct nvme_request *req, void *arg),
				 void *arg)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr, *tmp;
	int rc;

	assert(iter_fn != NULL);

	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
		assert(tr->req != NULL);

		rc = iter_fn(tr->req, arg);
		if (rc != 0) {
			return rc;
		}
	}

	return 0;
}

static void
nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
{
	/*
	 * Bad vtophys translation, so abort this request and return
	 * immediately.
	 */
	nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
						SPDK_NVME_SC_INVALID_FIELD,
						1 /* do not retry */, true);
}

/*
 * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
 *
 * *prp_index will be updated to account for the number of PRP entries used.
 */
static inline int
nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len,
			  uint32_t page_size)
{
	struct spdk_nvme_cmd *cmd = &tr->req->cmd;
	uintptr_t page_mask = page_size - 1;
	uint64_t phys_addr;
	uint32_t i;

	SPDK_DEBUGLOG(nvme, "prp_index:%u virt_addr:%p len:%u\n",
		      *prp_index, virt_addr, (uint32_t)len);

	if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
		SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
		return -EFAULT;
	}

	i = *prp_index;
	while (len) {
		uint32_t seg_len;

		/*
		 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
		 * so prp_index == count is valid.
		 */
		if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
			SPDK_ERRLOG("out of PRP entries\n");
			return -EFAULT;
		}

		phys_addr = spdk_vtophys(virt_addr, NULL);
		if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
			SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
			return -EFAULT;
		}

		if (i == 0) {
			SPDK_DEBUGLOG(nvme, "prp1 = %p\n", (void *)phys_addr);
			cmd->dptr.prp.prp1 = phys_addr;
			seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
		} else {
			if ((phys_addr & page_mask) != 0) {
				SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
				return -EFAULT;
			}

			SPDK_DEBUGLOG(nvme, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
			tr->u.prp[i - 1] = phys_addr;
			seg_len = page_size;
		}

		seg_len = spdk_min(seg_len, len);
		virt_addr += seg_len;
		len -= seg_len;
		i++;
	}

	cmd->psdt = SPDK_NVME_PSDT_PRP;
	if (i <= 1) {
		cmd->dptr.prp.prp2 = 0;
	} else if (i == 2) {
		cmd->dptr.prp.prp2 = tr->u.prp[0];
		SPDK_DEBUGLOG(nvme, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
	} else {
		cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
		SPDK_DEBUGLOG(nvme, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
	}

	*prp_index = i;
	return 0;
}
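/*
 * PRP construction, by example (4 KiB pages assumed): a page-aligned 8 KiB
 * buffer uses two entries, so prp1 points at the first page and prp2 is the
 * second page's address taken directly from tr->u.prp[0]. A page-aligned
 * 12 KiB buffer needs three entries, so prp2 instead points at the PRP list
 * in the tracker (tr->prp_sgl_bus_addr), which holds the remaining two page
 * addresses.
 */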
static int
nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair,
				      struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned)
{
	assert(0);
	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
	return -EINVAL;
}

/**
 * Build PRP list describing physically contiguous payload buffer.
 */
static int
nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				     struct nvme_tracker *tr, bool dword_aligned)
{
	uint32_t prp_index = 0;
	int rc;

	rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset,
				       req->payload_size, qpair->ctrlr->page_size);
	if (rc) {
		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
	}

	return rc;
}

/**
 * Build an SGL describing a physically contiguous payload buffer.
 *
 * This is more efficient than using PRP because large buffers can be
 * described this way.
 */
static int
nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
		struct nvme_tracker *tr, bool dword_aligned)
{
	void *virt_addr;
	uint64_t phys_addr, mapping_length;
	uint32_t length;
	struct spdk_nvme_sgl_descriptor *sgl;
	uint32_t nseg = 0;

	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);

	sgl = tr->u.sgl;
	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.subtype = 0;

	length = req->payload_size;
	virt_addr = req->payload.contig_or_cb_arg + req->payload_offset;

	while (length > 0) {
		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
			SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		mapping_length = length;
		phys_addr = spdk_vtophys(virt_addr, &mapping_length);
		if (phys_addr == SPDK_VTOPHYS_ERROR) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		mapping_length = spdk_min(length, mapping_length);

		length -= mapping_length;
		virt_addr += mapping_length;

		sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		sgl->unkeyed.length = mapping_length;
		sgl->address = phys_addr;
		sgl->unkeyed.subtype = 0;

		sgl++;
		nseg++;
	}

	if (nseg == 1) {
		/*
		 * The whole transfer can be described by a single SGL descriptor.
		 * Use the special case described by the spec where SGL1's type is Data Block.
		 * This means the SGL in the tracker is not used at all, so copy the first (and only)
		 * SGL element into SGL1.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* SPDK NVMe driver supports only 1 SGL segment for now; this is enough because
		 * NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
	}

	return 0;
}
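/*
 * Why the loop above may emit more than one descriptor for a virtually
 * contiguous buffer: spdk_vtophys() reports back, via its second argument,
 * how many bytes starting at virt_addr translate to one physically
 * contiguous run (it may return less than requested, typically bounded by
 * the hugepage backing the buffer). A buffer that crosses into a
 * non-adjacent physical region is therefore split into one data-block
 * descriptor per contiguous piece.
 */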
/**
 * Build SGL list describing scattered payload buffer.
 */
static int
nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				     struct nvme_tracker *tr, bool dword_aligned)
{
	int rc;
	void *virt_addr;
	uint64_t phys_addr, mapping_length;
	uint32_t remaining_transfer_len, remaining_user_sge_len, length;
	struct spdk_nvme_sgl_descriptor *sgl;
	uint32_t nseg = 0;

	/*
	 * Build scattered payloads.
	 */
	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	assert(req->payload.next_sge_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	sgl = tr->u.sgl;
	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.subtype = 0;

	remaining_transfer_len = req->payload_size;

	while (remaining_transfer_len > 0) {
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
					      &virt_addr, &remaining_user_sge_len);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		/* Bit Bucket SGL descriptor */
		if ((uint64_t)virt_addr == UINT64_MAX) {
			/* TODO: enable WRITE and COMPARE when necessary */
			if (req->cmd.opc != SPDK_NVME_OPC_READ) {
				SPDK_ERRLOG("Only READ command can be supported\n");
				goto exit;
			}
			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
				SPDK_ERRLOG("Too many SGL entries\n");
				goto exit;
			}

			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET;
			/* If the SGL describes a destination data buffer, the length of the data
			 * buffer shall be discarded by the controller, and the length is included
			 * in the Number of Logical Blocks (NLB) parameter. Otherwise, the length
			 * is not included in the NLB parameter.
			 */
			remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
			remaining_transfer_len -= remaining_user_sge_len;

			sgl->unkeyed.length = remaining_user_sge_len;
			sgl->address = 0;
			sgl->unkeyed.subtype = 0;

			sgl++;
			nseg++;

			continue;
		}

		remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
		remaining_transfer_len -= remaining_user_sge_len;
		while (remaining_user_sge_len > 0) {
			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
				SPDK_ERRLOG("Too many SGL entries\n");
				goto exit;
			}

			if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
				SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
				goto exit;
			}

			mapping_length = remaining_user_sge_len;
			phys_addr = spdk_vtophys(virt_addr, &mapping_length);
			if (phys_addr == SPDK_VTOPHYS_ERROR) {
				goto exit;
			}

			length = spdk_min(remaining_user_sge_len, mapping_length);
			remaining_user_sge_len -= length;
			virt_addr += length;

			if (nseg > 0 && phys_addr ==
			    (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
				/* extend previous entry */
				(*(sgl - 1)).unkeyed.length += length;
				continue;
			}

			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
			sgl->unkeyed.length = length;
			sgl->address = phys_addr;
			sgl->unkeyed.subtype = 0;

			sgl++;
			nseg++;
		}
	}

	if (nseg == 1) {
		/*
		 * The whole transfer can be described by a single SGL descriptor.
		 * Use the special case described by the spec where SGL1's type is Data Block.
		 * This means the SGL in the tracker is not used at all, so copy the first (and only)
		 * SGL element into SGL1.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* SPDK NVMe driver supports only 1 SGL segment for now; this is enough because
		 * NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
	}

	return 0;

exit:
	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
	return -EFAULT;
}
/**
 * Build PRP list describing scattered payload buffer.
 */
static int
nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				       struct nvme_tracker *tr, bool dword_aligned)
{
	int rc;
	void *virt_addr;
	uint32_t remaining_transfer_len, length;
	uint32_t prp_index = 0;
	uint32_t page_size = qpair->ctrlr->page_size;

	/*
	 * Build scattered payloads.
	 */
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	remaining_transfer_len = req->payload_size;
	while (remaining_transfer_len > 0) {
		assert(req->payload.next_sge_fn != NULL);
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		length = spdk_min(remaining_transfer_len, length);

		/*
		 * Any incompatible sges should have been handled up in the splitting routine,
		 * but assert here as an additional check.
		 *
		 * All SGEs except last must end on a page boundary.
		 */
		assert((length == remaining_transfer_len) ||
		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));

		rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return rc;
		}

		remaining_transfer_len -= length;
	}

	return 0;
}

typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *,
			   bool);

static build_req_fn const g_nvme_pcie_build_req_table[][2] = {
	[NVME_PAYLOAD_TYPE_INVALID] = {
		nvme_pcie_qpair_build_request_invalid,		/* PRP */
		nvme_pcie_qpair_build_request_invalid		/* SGL */
	},
	[NVME_PAYLOAD_TYPE_CONTIG] = {
		nvme_pcie_qpair_build_contig_request,		/* PRP */
		nvme_pcie_qpair_build_contig_hw_sgl_request	/* SGL */
	},
	[NVME_PAYLOAD_TYPE_SGL] = {
		nvme_pcie_qpair_build_prps_sgl_request,		/* PRP */
		nvme_pcie_qpair_build_hw_sgl_request		/* SGL */
	}
};
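/*
 * Dispatch example: the table above is indexed by [payload type][use SGL].
 * A contiguous payload on a controller without SGL support (or on the admin
 * queue, where PRPs are mandatory) goes through
 * nvme_pcie_qpair_build_contig_request(), while the same payload on an
 * SGL-capable I/O queue uses nvme_pcie_qpair_build_contig_hw_sgl_request().
 */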
static int
nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
			       bool sgl_supported, bool dword_aligned)
{
	void *md_payload;
	struct nvme_request *req = tr->req;

	if (req->payload.md) {
		md_payload = req->payload.md + req->md_offset;
		if (dword_aligned && ((uintptr_t)md_payload & 3)) {
			SPDK_ERRLOG("virt_addr %p not dword aligned\n", md_payload);
			goto exit;
		}

		if (sgl_supported && dword_aligned) {
			assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG);
			req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL;
			tr->meta_sgl.address = spdk_vtophys(md_payload, NULL);
			if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR) {
				goto exit;
			}
			tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
			tr->meta_sgl.unkeyed.length = req->md_size;
			tr->meta_sgl.unkeyed.subtype = 0;
			req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor);
		} else {
			req->cmd.mptr = spdk_vtophys(md_payload, NULL);
			if (req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
				goto exit;
			}
		}
	}

	return 0;

exit:
	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
	return -EINVAL;
}

static int
nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int rc = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	enum nvme_payload_type payload_type;
	bool sgl_supported;
	bool dword_aligned = true;

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	tr = TAILQ_FIRST(&pqpair->free_tr);

	if (tr == NULL) {
		pqpair->stat->queued_requests++;
		/* Inform the upper layer to try again later. */
		rc = -EAGAIN;
		goto exit;
	}

	pqpair->stat->submitted_requests++;
	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
	tr->req = req;
	tr->cb_fn = req->cb_fn;
	tr->cb_arg = req->cb_arg;
	req->cmd.cid = tr->cid;

	if (req->payload_size != 0) {
		payload_type = nvme_payload_type(&req->payload);
		/* According to the specification, PRPs shall be used for all
		 * Admin commands for NVMe over PCIe implementations.
		 */
		sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 &&
				!nvme_qpair_is_admin_queue(qpair);

		if (sgl_supported) {
			/* Don't use SGL for DSM command */
			if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_NO_SGL_FOR_DSM) &&
					  (req->cmd.opc == SPDK_NVME_OPC_DATASET_MANAGEMENT))) {
				sgl_supported = false;
			}
		}

		if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) {
			dword_aligned = false;
		}
		rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned);
		if (rc < 0) {
			goto exit;
		}

		rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, dword_aligned);
		if (rc < 0) {
			goto exit;
		}
	}

	nvme_pcie_qpair_submit_tracker(qpair, tr);

exit:
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return rc;
}
void
spdk_nvme_pcie_set_hotplug_filter(spdk_nvme_pcie_hotplug_filter_cb filter_cb)
{
	g_hotplug_filter_cb = filter_cb;
}

static int
nvme_pcie_poll_group_get_stats(struct spdk_nvme_transport_poll_group *tgroup,
			       struct spdk_nvme_transport_poll_group_stat **_stats)
{
	struct nvme_pcie_poll_group *group;
	struct spdk_nvme_transport_poll_group_stat *stats;

	if (tgroup == NULL || _stats == NULL) {
		SPDK_ERRLOG("Invalid stats or group pointer\n");
		return -EINVAL;
	}

	group = SPDK_CONTAINEROF(tgroup, struct nvme_pcie_poll_group, group);
	stats = calloc(1, sizeof(*stats));
	if (!stats) {
		SPDK_ERRLOG("Can't allocate memory for PCIe stats\n");
		return -ENOMEM;
	}
	stats->trtype = SPDK_NVME_TRANSPORT_PCIE;
	memcpy(&stats->pcie, &group->stats, sizeof(group->stats));

	*_stats = stats;

	return 0;
}

static void
nvme_pcie_poll_group_free_stats(struct spdk_nvme_transport_poll_group *tgroup,
				struct spdk_nvme_transport_poll_group_stat *stats)
{
	free(stats);
}

static struct spdk_pci_id nvme_pci_driver_id[] = {
	{
		.class_id = SPDK_PCI_CLASS_NVME,
		.vendor_id = SPDK_PCI_ANY_ID,
		.device_id = SPDK_PCI_ANY_ID,
		.subvendor_id = SPDK_PCI_ANY_ID,
		.subdevice_id = SPDK_PCI_ANY_ID,
	},
	{ .vendor_id = 0, /* sentinel */ },
};

SPDK_PCI_DRIVER_REGISTER(nvme, nvme_pci_driver_id,
			 SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE);
const struct spdk_nvme_transport_ops pcie_ops = {
	.name = "PCIE",
	.type = SPDK_NVME_TRANSPORT_PCIE,
	.ctrlr_construct = nvme_pcie_ctrlr_construct,
	.ctrlr_scan = nvme_pcie_ctrlr_scan,
	.ctrlr_destruct = nvme_pcie_ctrlr_destruct,
	.ctrlr_enable = nvme_pcie_ctrlr_enable,

	.ctrlr_set_reg_4 = nvme_pcie_ctrlr_set_reg_4,
	.ctrlr_set_reg_8 = nvme_pcie_ctrlr_set_reg_8,
	.ctrlr_get_reg_4 = nvme_pcie_ctrlr_get_reg_4,
	.ctrlr_get_reg_8 = nvme_pcie_ctrlr_get_reg_8,

	.ctrlr_get_max_xfer_size = nvme_pcie_ctrlr_get_max_xfer_size,
	.ctrlr_get_max_sges = nvme_pcie_ctrlr_get_max_sges,

	.ctrlr_reserve_cmb = nvme_pcie_ctrlr_reserve_cmb,
	.ctrlr_map_cmb = nvme_pcie_ctrlr_map_io_cmb,
	.ctrlr_unmap_cmb = nvme_pcie_ctrlr_unmap_io_cmb,

	.ctrlr_enable_pmr = nvme_pcie_ctrlr_enable_pmr,
	.ctrlr_disable_pmr = nvme_pcie_ctrlr_disable_pmr,
	.ctrlr_map_pmr = nvme_pcie_ctrlr_map_io_pmr,
	.ctrlr_unmap_pmr = nvme_pcie_ctrlr_unmap_io_pmr,

	.ctrlr_create_io_qpair = nvme_pcie_ctrlr_create_io_qpair,
	.ctrlr_delete_io_qpair = nvme_pcie_ctrlr_delete_io_qpair,
	.ctrlr_connect_qpair = nvme_pcie_ctrlr_connect_qpair,
	.ctrlr_disconnect_qpair = nvme_pcie_ctrlr_disconnect_qpair,

	.qpair_abort_reqs = nvme_pcie_qpair_abort_reqs,
	.qpair_reset = nvme_pcie_qpair_reset,
	.qpair_submit_request = nvme_pcie_qpair_submit_request,
	.qpair_process_completions = nvme_pcie_qpair_process_completions,
	.qpair_iterate_requests = nvme_pcie_qpair_iterate_requests,
	.admin_qpair_abort_aers = nvme_pcie_admin_qpair_abort_aers,

	.poll_group_create = nvme_pcie_poll_group_create,
	.poll_group_connect_qpair = nvme_pcie_poll_group_connect_qpair,
	.poll_group_disconnect_qpair = nvme_pcie_poll_group_disconnect_qpair,
	.poll_group_add = nvme_pcie_poll_group_add,
	.poll_group_remove = nvme_pcie_poll_group_remove,
	.poll_group_process_completions = nvme_pcie_poll_group_process_completions,
	.poll_group_destroy = nvme_pcie_poll_group_destroy,
	.poll_group_get_stats = nvme_pcie_poll_group_get_stats,
	.poll_group_free_stats = nvme_pcie_poll_group_free_stats
};

SPDK_NVME_TRANSPORT_REGISTER(pcie, &pcie_ops);