/*-
 *   BSD LICENSE
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019, Nutanix Inc. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NVMe over vfio-user transport
 */

#include <vfio-user/libvfio-user.h>
#include <vfio-user/pci_defs.h>

#include "spdk/barrier.h"
#include "spdk/stdinc.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf_transport.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"

#include "transport.h"

#include "nvmf_internal.h"

#define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
#define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
#define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 64
#define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
#define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE

#define NVMF_VFIO_USER_DOORBELLS_OFFSET 0x1000
#define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000

#define NVME_REG_CFG_SIZE 0x1000
#define NVME_REG_BAR0_SIZE 0x4000
#define NVME_IRQ_INTX_NUM 1
#define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR

struct nvmf_vfio_user_req;
struct nvmf_vfio_user_qpair;

typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);

/* 1 more for PRP2 list itself */
#define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1)

enum nvmf_vfio_user_req_state {
	VFIO_USER_REQUEST_STATE_FREE = 0,
	VFIO_USER_REQUEST_STATE_EXECUTING,
};

struct nvmf_vfio_user_req {
	struct spdk_nvmf_request req;
	struct spdk_nvme_cpl rsp;
	struct spdk_nvme_cmd cmd;

	enum nvmf_vfio_user_req_state state;
	nvmf_vfio_user_req_cb_fn cb_fn;
	void *cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register cc;

	/* placeholder for gpa_to_vva memory map table, the IO buffer doesn't use it */
	dma_sg_t *sg;
	struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t iovcnt;

	TAILQ_ENTRY(nvmf_vfio_user_req) link;
};

/*
 * An NVMe queue.
 */
struct nvme_q {
	bool is_cq;

	void *addr;

	dma_sg_t *sg;
	struct iovec iov;

	uint32_t size;
	uint64_t prp1;

	union {
		struct {
			uint32_t head;
			/* multiple SQs can be mapped to the same CQ */
			uint16_t cqid;
		};
		struct {
			uint32_t tail;
			uint16_t iv;
			bool ien;
			bool phase;
		};
	};
};

enum nvmf_vfio_user_qpair_state {
	VFIO_USER_QPAIR_UNINITIALIZED = 0,
	VFIO_USER_QPAIR_ACTIVE,
	VFIO_USER_QPAIR_DELETED,
	VFIO_USER_QPAIR_INACTIVE,
	VFIO_USER_QPAIR_ERROR,
};

struct nvmf_vfio_user_qpair {
	struct spdk_nvmf_qpair qpair;
	struct spdk_nvmf_transport_poll_group *group;
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct nvmf_vfio_user_req *reqs_internal;
	uint16_t qsize;
	struct nvme_q cq;
	struct nvme_q sq;
	enum nvmf_vfio_user_qpair_state state;

	/* Copy of Create IO SQ command */
	struct spdk_nvme_cmd create_io_sq_cmd;

	TAILQ_HEAD(, nvmf_vfio_user_req) reqs;
	TAILQ_ENTRY(nvmf_vfio_user_qpair) link;
};

struct nvmf_vfio_user_poll_group {
	struct spdk_nvmf_transport_poll_group group;
	TAILQ_HEAD(, nvmf_vfio_user_qpair) qps;
};

struct nvmf_vfio_user_ctrlr {
	struct nvmf_vfio_user_endpoint *endpoint;
	struct nvmf_vfio_user_transport *transport;

	/* Number of connected queue pairs */
	uint32_t num_connected_qps;

	struct spdk_thread *thread;
	struct spdk_poller *vfu_ctx_poller;

	uint16_t cntlid;

	struct nvmf_vfio_user_qpair *qp[NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR];

	TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link;

	volatile uint32_t *doorbells;

	/* internal CSTS.CFS register for vfio-user fatal errors */
	uint32_t cfs : 1;
};

struct nvmf_vfio_user_endpoint {
	vfu_ctx_t *vfu_ctx;
	struct msixcap *msix;
	vfu_pci_config_space_t *pci_config_space;
	int devmem_fd;
	volatile uint32_t *doorbells;

	struct spdk_nvme_transport_id trid;
	const struct spdk_nvmf_subsystem *subsystem;

	struct nvmf_vfio_user_ctrlr *ctrlr;
	pthread_mutex_t lock;

	TAILQ_ENTRY(nvmf_vfio_user_endpoint) link;
};

struct nvmf_vfio_user_transport_opts {
	bool disable_mappable_bar0;
};

struct nvmf_vfio_user_transport {
	struct spdk_nvmf_transport transport;
	struct nvmf_vfio_user_transport_opts transport_opts;
	pthread_mutex_t lock;
	TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints;
};

/*
 * function prototypes
 */
static volatile uint32_t *
hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);

static volatile uint32_t *
tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);

static int
nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);

static struct nvmf_vfio_user_req *
get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair);

static int
nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
		  uint32_t max_iovcnt, uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint64_t prp1, prp2;
	void *vva;
	uint32_t i;
	uint32_t residue_len, nents;
	uint64_t *prp_list;
	uint32_t iovcnt;

	assert(max_iovcnt > 0);

	prp1 = cmd->dptr.prp.prp1;
	prp2 = cmd->dptr.prp.prp2;

	/* PRP1 may start with an unaligned page address */
	residue_len = mps - (prp1 % mps);
	residue_len = spdk_min(len, residue_len);

	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
	if (spdk_unlikely(vva == NULL)) {
		SPDK_ERRLOG("GPA to VVA failed\n");
		return -EINVAL;
	}
	len -= residue_len;
	if (len && max_iovcnt < 2) {
		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
		return -ERANGE;
	}
	iovs[0].iov_base = vva;
	iovs[0].iov_len = residue_len;

	if (len) {
		if (spdk_unlikely(prp2 == 0)) {
			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
			return -EINVAL;
		}

		if (len <= mps) {
			/* 2 PRP used */
			iovcnt = 2;
			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n",
					    prp2, len);
				return -EINVAL;
			}
			iovs[1].iov_base = vva;
			iovs[1].iov_len = len;
		} else {
			/* PRP list used */
			nents = (len + mps - 1) / mps;
			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
				SPDK_ERRLOG("Too many page entries\n");
				return -ERANGE;
			}

			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
					    prp2, nents);
				return -EINVAL;
			}
			prp_list = vva;
			i = 0;
			while (len != 0) {
				residue_len = spdk_min(len, mps);
				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
				if (spdk_unlikely(vva == NULL)) {
					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
						    prp_list[i], residue_len);
					return -EINVAL;
				}
				iovs[i + 1].iov_base = vva;
				iovs[i + 1].iov_len = residue_len;
				len -= residue_len;
				i++;
			}
			iovcnt = i + 1;
		}
	} else {
		/* 1 PRP used */
		iovcnt = 1;
	}

	assert(iovcnt <= max_iovcnt);
	return iovcnt;
}

static int
nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
		       struct iovec *iovs, uint32_t max_iovcnt,
		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint32_t i;
	void *vva;

	if (spdk_unlikely(max_iovcnt < num_sgls)) {
		return -ERANGE;
	}

	for (i = 0; i < num_sgls; i++) {
		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
			return -EINVAL;
		}
		vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[i].iov_base = vva;
		iovs[i].iov_len = sgls[i].unkeyed.length;
	}

	return num_sgls;
}

static int
nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
		  uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	struct spdk_nvme_sgl_descriptor *sgl, *last_sgl;
	uint32_t num_sgls, seg_len;
	void *vva;
	int ret;
	uint32_t total_iovcnt = 0;

	/* SGL cases */
	sgl = &cmd->dptr.sgl1;

	/* only one SGL segment */
	if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
		assert(max_iovcnt > 0);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[0].iov_base = vva;
		iovs[0].iov_len = sgl->unkeyed.length;
		assert(sgl->unkeyed.length == len);

		return 1;
	}

	for (;;) {
		if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) &&
				  (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type);
			return -EINVAL;
		}

		seg_len = sgl->unkeyed.length;
		if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) {
			SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len);
			return -EINVAL;
		}

		num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}

		/* sgl point to the first segment */
		sgl = (struct spdk_nvme_sgl_descriptor *)vva;
		last_sgl = &sgl[num_sgls - 1];

		/* we are done */
		if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
			/* map whole sgl list */
			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt],
						     max_iovcnt - total_iovcnt, gpa_to_vva);
			if (spdk_unlikely(ret < 0)) {
				return ret;
			}
			total_iovcnt += ret;

			return total_iovcnt;
		}

		if (num_sgls > 1) {
			/* map whole sgl exclude last_sgl */
			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt],
						     max_iovcnt - total_iovcnt, gpa_to_vva);
			if (spdk_unlikely(ret < 0)) {
				return ret;
			}
			total_iovcnt += ret;
		}

		/* move to next level's segments */
		sgl = last_sgl;
	}

	return 0;
}

static int
nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
	     uint32_t len, size_t mps,
	     void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	if (cmd->psdt == SPDK_NVME_PSDT_PRP) {
		return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
	}

	return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
}

static char *
endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
{
	return endpoint->trid.traddr;
}

static char *
ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	if (!ctrlr || !ctrlr->endpoint) {
		return "Null Ctrlr";
	}

	return endpoint_id(ctrlr->endpoint);
}

static inline uint16_t
io_q_id(struct nvme_q *q)
{
	struct nvmf_vfio_user_qpair *vu_qpair;

	assert(q);

	if (q->is_cq) {
		vu_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, cq);
	} else {
		vu_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, sq);
	}
	assert(vu_qpair);
	return vu_qpair->qpair.qid;
}

static void
fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	assert(ctrlr != NULL);

	if (ctrlr->cfs == 0) {
		SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(ctrlr));
	}

	ctrlr->cfs = 1U;
}

static inline bool
ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
{
	assert(vu_ctrlr != NULL);
	assert(vu_ctrlr->endpoint != NULL);

	vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space;

	return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe);
}

static void
nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint)
{
	if (endpoint->doorbells) {
		munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
	}

	if (endpoint->devmem_fd > 0) {
		close(endpoint->devmem_fd);
	}

	vfu_destroy_ctx(endpoint->vfu_ctx);

	pthread_mutex_destroy(&endpoint->lock);
	free(endpoint);
}

/* called when process exits */
static int
nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport,
		       spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;

	SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n");

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	(void)pthread_mutex_destroy(&vu_transport->lock);

	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
		nvmf_vfio_user_destroy_endpoint(endpoint);
	}

	free(vu_transport);

	if (cb_fn) {
		cb_fn(cb_arg);
	}

	return 0;
}

static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = {
	{
		"disable-mappable-bar0",
		offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0),
		spdk_json_decode_bool, true
	},
};

static struct spdk_nvmf_transport *
nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts)
{
	struct nvmf_vfio_user_transport *vu_transport;
	int err;

	vu_transport = calloc(1, sizeof(*vu_transport));
	if (vu_transport == NULL) {
		SPDK_ERRLOG("Transport alloc fail: %m\n");
		return NULL;
	}

	err = pthread_mutex_init(&vu_transport->lock, NULL);
	if (err != 0) {
		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
		goto err;
	}

	TAILQ_INIT(&vu_transport->endpoints);

	if (opts->transport_specific != NULL &&
	    spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder,
					    SPDK_COUNTOF(vfio_user_transport_opts_decoder),
					    vu_transport)) {
		SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n");
		free(vu_transport);
		return NULL;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n",
		      vu_transport->transport_opts.disable_mappable_bar0);

	return &vu_transport->transport;

err:
	free(vu_transport);

	return NULL;
}

static uint16_t
max_queue_size(struct nvmf_vfio_user_ctrlr const *ctrlr)
{
	assert(ctrlr != NULL);
	assert(ctrlr->qp[0] != NULL);
	assert(ctrlr->qp[0]->qpair.ctrlr != NULL);

	return ctrlr->qp[0]->qpair.ctrlr->vcprop.cap.bits.mqes + 1;
}

static void *
map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov, int prot)
{
	int ret;

	assert(ctx != NULL);
	assert(sg != NULL);
	assert(iov != NULL);

	ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot);
	if (ret < 0) {
		return NULL;
	}

	ret = vfu_map_sg(ctx, sg, iov, 1, 0);
	if (ret != 0) {
		return NULL;
	}

	assert(iov->iov_base != NULL);
	return iov->iov_base;
}

static inline uint32_t
sq_head(struct nvmf_vfio_user_qpair *qpair)
{
	assert(qpair != NULL);
	return qpair->sq.head;
}

static inline void
sqhd_advance(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair)
{
	assert(ctrlr != NULL);
	assert(qpair != NULL);
	qpair->sq.head = (qpair->sq.head + 1) % qpair->sq.size;
}

static int
map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q *q, bool is_cq, bool unmap)
{
	uint64_t len;

	assert(q->size);
	assert(q->addr == NULL);

	if (is_cq) {
		len = q->size * sizeof(struct spdk_nvme_cpl);
	} else {
		len = q->size * sizeof(struct spdk_nvme_cmd);
	}

	q->addr = map_one(vu_ctrlr->endpoint->vfu_ctx, q->prp1, len, q->sg,
			  &q->iov, is_cq ? PROT_READ | PROT_WRITE : PROT_READ);
	if (q->addr == NULL) {
		return -EFAULT;
	}

	if (unmap) {
		memset(q->addr, 0, len);
	}

	return 0;
}

static int
asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	struct nvme_q *sq;
	const struct spdk_nvmf_registers *regs;
	int ret;

	assert(ctrlr != NULL);
	assert(ctrlr->qp[0] != NULL);
	assert(ctrlr->qp[0]->sq.addr == NULL);
	/* XXX ctrlr->asq == 0 is a valid memory address */

	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
	sq = &ctrlr->qp[0]->sq;
	sq->size = regs->aqa.bits.asqs + 1;
	sq->prp1 = regs->asq;
	sq->head = 0;
	sq->cqid = 0;
	sq->is_cq = false;

	ret = map_q(ctrlr, sq, false, true);
	if (ret) {
		return ret;
	}

	*tdbl(ctrlr, sq) = 0;

	return 0;
}

static inline int
queue_index(uint16_t qid, int is_cq)
{
	return (qid * 2) + is_cq;
}

static volatile uint32_t *
tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
{
	assert(ctrlr != NULL);
	assert(q != NULL);
	assert(!q->is_cq);

	return &ctrlr->doorbells[queue_index(io_q_id(q), false)];
}

static volatile uint32_t *
hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
{
	assert(ctrlr != NULL);
	assert(q != NULL);
	assert(q->is_cq);

	return &ctrlr->doorbells[queue_index(io_q_id(q), true)];
}

static inline bool
cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
{
	assert(ctrlr != NULL);
	assert(q != NULL);
	assert(q->is_cq);

	return ((q->tail + 1) % q->size) == *hdbl(ctrlr, q);
}

static inline void
cq_tail_advance(struct nvme_q *q)
{
	assert(q != NULL);
	assert(q->is_cq);

	assert(q->tail < q->size);
	q->tail++;

	if (spdk_unlikely(q->tail == q->size)) {
		q->tail = 0;
		q->phase = !q->phase;
	}
}

static int
acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	struct nvme_q *cq;
	const struct spdk_nvmf_registers *regs;
	int ret;

	assert(ctrlr != NULL);
	assert(ctrlr->qp[0] != NULL);
	assert(ctrlr->qp[0]->cq.addr == NULL);

	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
	assert(regs != NULL);
	cq = &ctrlr->qp[0]->cq;
	cq->size = regs->aqa.bits.acqs + 1;
	cq->prp1 = regs->acq;
	cq->tail = 0;
	cq->is_cq = true;
	cq->ien = true;
	cq->phase = true;

	ret = map_q(ctrlr, cq, true, true);
	if (ret) {
		return ret;
	}
	*hdbl(ctrlr, cq) = 0;

	return 0;
}

static inline dma_sg_t *
vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt)
{
	return (dma_sg_t *)((uintptr_t)vu_req->sg + iovcnt * dma_sg_size());
}

static void *
_map_one(void *prv, uint64_t addr, uint64_t len, int prot)
{
	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
	struct spdk_nvmf_qpair *qpair;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_qpair *vu_qpair;
	void *ret;

	assert(req != NULL);
	qpair = req->qpair;
	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);

	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
	ret = map_one(vu_qpair->ctrlr->endpoint->vfu_ctx, addr, len,
		      vu_req_to_sg_t(vu_req, vu_req->iovcnt),
		      &vu_req->iov[vu_req->iovcnt], prot);
	if (spdk_likely(ret != NULL)) {
		vu_req->iovcnt++;
	}
	return ret;
}

static int
vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
		  struct iovec *iov, uint32_t length)
{
	/* Map the PRP list from guest physical memory to
	 * virtual memory addresses.
	 */
	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
			    length, 4096, _map_one);
}

static struct spdk_nvmf_request *
get_nvmf_req(struct nvmf_vfio_user_qpair *qp);

static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct spdk_nvmf_request *req);

/*
 * Posts a CQE in the completion queue.
 *
 * @ctrlr: the vfio-user controller
 * @cq: the completion queue
 * @cdw0: cdw0 as reported by NVMf
 * @sqid: submission queue ID
 * @cid: command identifier in NVMe command
 * @sc: the NVMe CQE status code
 * @sct: the NVMe CQE status code type
 */
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *cq,
		uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct)
{
	struct spdk_nvme_cpl *cpl;
	const struct spdk_nvmf_registers *regs;
	int err;

	assert(ctrlr != NULL);

	if (spdk_unlikely(cq == NULL || cq->addr == NULL)) {
		return 0;
	}

	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
	if (regs->csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
		SPDK_DEBUGLOG(nvmf_vfio,
			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
			      ctrlr_id(ctrlr), sqid, cid, sc);
		return 0;
	}

	if (cq_is_full(ctrlr, cq)) {
		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
			    ctrlr_id(ctrlr), io_q_id(cq), cq->tail, *hdbl(ctrlr, cq));
		return -1;
	}

	cpl = ((struct spdk_nvme_cpl *)cq->addr) + cq->tail;

	assert(ctrlr->qp[sqid] != NULL);
	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
		      ctrlr_id(ctrlr), sqid, cid, sc, sq_head(ctrlr->qp[sqid]),
		      cq->tail);

	cpl->sqhd = sq_head(ctrlr->qp[sqid]);
	cpl->sqid = sqid;
	cpl->cid = cid;
	cpl->cdw0 = cdw0;
	cpl->status.dnr = 0x0;
	cpl->status.m = 0x0;
	cpl->status.sct = sct;
	cpl->status.p = cq->phase;
	cpl->status.sc = sc;

	cq_tail_advance(cq);

	/*
	 * This function now executes at SPDK thread context; we might be
	 * triggering interrupts from vfio-user thread context, so
	 * check for race conditions.
	 */
	if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) {
		err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
		if (err != 0) {
			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
				    ctrlr_id(ctrlr));
			return err;
		}
	}

	return 0;
}

static struct nvme_q *
lookup_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, const uint16_t qid, const bool is_cq)
{
	struct nvme_q *q;

	assert(ctrlr != NULL);

	if (qid > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
		return NULL;
	}

	if (ctrlr->qp[qid] == NULL) {
		return NULL;
	}

	if (is_cq) {
		q = &ctrlr->qp[qid]->cq;
	} else {
		q = &ctrlr->qp[qid]->sq;
	}

	if (q->addr == NULL) {
		return NULL;
	}

	return q;
}

static void
unmap_qp(struct nvmf_vfio_user_qpair *qp)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;

	if (qp->ctrlr == NULL) {
		return;
	}
	ctrlr = qp->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap QP%d\n",
		      ctrlr_id(ctrlr), qp->qpair.qid);

	if (qp->sq.addr != NULL) {
		vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->sq.sg, &qp->sq.iov, 1);
		qp->sq.addr = NULL;
	}

	if (qp->cq.addr != NULL) {
		vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->cq.sg, &qp->cq.iov, 1);
		qp->cq.addr = NULL;
	}
}

static int
remap_qp(struct nvmf_vfio_user_qpair *vu_qpair)
{
	struct nvme_q *sq, *cq;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
	int ret;

	vu_ctrlr = vu_qpair->ctrlr;
	sq = &vu_qpair->sq;
	cq = &vu_qpair->cq;

	if (sq->size) {
		ret = map_q(vu_ctrlr, sq, false, false);
		if (ret) {
			SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n",
				      io_q_id(sq), sq->prp1, sq->prp1 + sq->size * sizeof(struct spdk_nvme_cmd));
			return -EFAULT;
		}
	}

	if (cq->size) {
		ret = map_q(vu_ctrlr, cq, true, false);
		if (ret) {
			SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n",
				      io_q_id(cq), cq->prp1, cq->prp1 + cq->size * sizeof(struct spdk_nvme_cpl));
			return -EFAULT;
		}
	}

	return 0;
}

static void
free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid)
{
	struct nvmf_vfio_user_qpair *qpair;
	struct nvmf_vfio_user_req *vu_req;
	uint32_t i;

	if (ctrlr == NULL) {
		return;
	}

	qpair = ctrlr->qp[qid];
	if (qpair == NULL) {
		return;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy QP%d=%p\n", ctrlr_id(ctrlr),
		      qid, qpair);

	unmap_qp(qpair);

	for (i = 0; i < qpair->qsize; i++) {
		vu_req = &qpair->reqs_internal[i];
		free(vu_req->sg);
	}
	free(qpair->reqs_internal);

	free(qpair->sq.sg);
	free(qpair->cq.sg);
	free(qpair);

	ctrlr->qp[qid] = NULL;
}

/* This function can only fail because of memory allocation errors. */
static int
init_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
	const uint16_t qsize, const uint16_t id)
{
	uint16_t i;
	struct nvmf_vfio_user_qpair *qpair;
	struct nvmf_vfio_user_req *vu_req, *tmp;
	struct spdk_nvmf_request *req;

	assert(ctrlr != NULL);
	assert(transport != NULL);

	qpair = calloc(1, sizeof(*qpair));
	if (qpair == NULL) {
		return -ENOMEM;
	}
	qpair->sq.sg = calloc(1, dma_sg_size());
	if (qpair->sq.sg == NULL) {
		free(qpair);
		return -ENOMEM;
	}
	qpair->cq.sg = calloc(1, dma_sg_size());
	if (qpair->cq.sg == NULL) {
		free(qpair->sq.sg);
		free(qpair);
		return -ENOMEM;
	}

	qpair->qpair.qid = id;
	qpair->qpair.transport = transport;
	qpair->ctrlr = ctrlr;
	qpair->qsize = qsize;

	TAILQ_INIT(&qpair->reqs);

	qpair->reqs_internal = calloc(qsize, sizeof(struct nvmf_vfio_user_req));
	if (qpair->reqs_internal == NULL) {
		SPDK_ERRLOG("%s: error allocating reqs: %m\n", ctrlr_id(ctrlr));
		goto reqs_err;
	}

	for (i = 0; i < qsize; i++) {
		vu_req = &qpair->reqs_internal[i];
		vu_req->sg = calloc(NVMF_VFIO_USER_MAX_IOVECS, dma_sg_size());
		if (vu_req->sg == NULL) {
			goto sg_err;
		}

		req = &vu_req->req;
		req->qpair = &qpair->qpair;
		req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
		req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;

		TAILQ_INSERT_TAIL(&qpair->reqs, vu_req, link);
	}

	ctrlr->qp[id] = qpair;
	return 0;

sg_err:
	TAILQ_FOREACH_SAFE(vu_req, &qpair->reqs, link, tmp) {
		free(vu_req->sg);
	}
	free(qpair->reqs_internal);

reqs_err:
	free(qpair->sq.sg);
	free(qpair->cq.sg);
	free(qpair);
	return -ENOMEM;
}

/*
 * Creates a completion or submission I/O queue. Returns 0 on success, -errno
 * on error.
 */
static int
handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
		   struct spdk_nvme_cmd *cmd, const bool is_cq)
{
	uint16_t qid, qsize;
	uint16_t sc = SPDK_NVME_SC_SUCCESS;
	uint16_t sct = SPDK_NVME_SCT_GENERIC;
	int err = 0;
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvme_q *io_q;

	assert(ctrlr != NULL);
	assert(cmd != NULL);

	qid = cmd->cdw10_bits.create_io_q.qid;
	if (qid >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
		SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr),
			    qid, NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR);
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
		goto out;
	}

	if (lookup_io_q(ctrlr, qid, is_cq)) {
		SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr),
			    is_cq ? 'C' : 'S', qid);
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
		goto out;
	}

	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
	if (qsize > max_queue_size(ctrlr)) {
		SPDK_ERRLOG("%s: queue too big, want=%d, max=%d\n", ctrlr_id(ctrlr),
			    qsize, max_queue_size(ctrlr));
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE;
		goto out;
	}

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: create I/O %cQ%d: QSIZE=%#x\n", ctrlr_id(ctrlr),
		      is_cq ? 'C' : 'S', qid, qsize);

	if (is_cq) {
		err = init_qp(ctrlr, ctrlr->qp[0]->qpair.transport, qsize, qid);
		if (err != 0) {
			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
			goto out;
		}

		io_q = &ctrlr->qp[qid]->cq;
		if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
			SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
			goto out;
		}
		io_q->ien = cmd->cdw11_bits.create_io_cq.ien;
		io_q->iv = cmd->cdw11_bits.create_io_cq.iv;
		io_q->phase = true;
	} else {
		if (cmd->cdw11_bits.create_io_sq.cqid == 0) {
			SPDK_ERRLOG("%s: invalid CQID 0\n", ctrlr_id(ctrlr));
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
			goto out;
		}
		/* CQ must be created before SQ */
		if (!lookup_io_q(ctrlr, cmd->cdw11_bits.create_io_sq.cqid, true)) {
			SPDK_ERRLOG("%s: CQ%d does not exist\n", ctrlr_id(ctrlr),
				    cmd->cdw11_bits.create_io_sq.cqid);
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
			goto out;
		}

		if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
			SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
			goto out;
		}

		io_q = &ctrlr->qp[qid]->sq;
		io_q->cqid = cmd->cdw11_bits.create_io_sq.cqid;
		SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
			      qid, io_q->cqid);
	}

	io_q->is_cq = is_cq;
	io_q->size = qsize;
	io_q->prp1 = cmd->dptr.prp.prp1;

	err = map_q(ctrlr, io_q, is_cq, true);
	if (err) {
		sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
		goto out;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped %cQ%d IOVA=%#lx vaddr=%#llx\n",
		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
		      qid, cmd->dptr.prp.prp1, (unsigned long long)io_q->addr);

	if (is_cq) {
		*hdbl(ctrlr, io_q) = 0;
	} else {
		/*
		 * Create our new I/O qpair. This asynchronously invokes, on a
		 * suitable poll group, the nvmf_vfio_user_poll_group_add()
		 * callback, which will call spdk_nvmf_request_exec_fabrics()
		 * with a generated fabrics connect command. This command is
		 * then eventually completed via handle_queue_connect_rsp().
		 */
		vu_qpair = ctrlr->qp[qid];
		vu_qpair->create_io_sq_cmd = *cmd;
		spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt,
					&vu_qpair->qpair);
		*tdbl(ctrlr, io_q) = 0;
		return 0;
	}

out:
	return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid, sc, sct);
}

/* For ADMIN I/O DELETE COMPLETION QUEUE the NVMf library will disconnect and free
 * the queue pair, so save the command in a context.
 */
struct vfio_user_delete_cq_ctx {
	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
	struct spdk_nvme_cmd delete_io_cq_cmd;
};

static void
vfio_user_qpair_delete_cb(void *cb_arg)
{
	struct vfio_user_delete_cq_ctx *ctx = cb_arg;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr;

	post_completion(vu_ctrlr, &vu_ctrlr->qp[0]->cq, 0, 0, ctx->delete_io_cq_cmd.cid,
			SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
	free(ctx);
}

/*
 * Deletes a completion or submission I/O queue.
 */
static int
handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
		struct spdk_nvme_cmd *cmd, const bool is_cq)
{
	uint16_t sct = SPDK_NVME_SCT_GENERIC;
	uint16_t sc = SPDK_NVME_SC_SUCCESS;
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct vfio_user_delete_cq_ctx *ctx;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
		      cmd->cdw10_bits.delete_io_q.qid);

	if (lookup_io_q(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq) == NULL) {
		SPDK_ERRLOG("%s: %cQ%d does not exist\n", ctrlr_id(ctrlr),
			    is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
		goto out;
	}

	vu_qpair = ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid];
	if (is_cq) {
		/* SQ must have been deleted first */
		if (vu_qpair->state != VFIO_USER_QPAIR_DELETED) {
			SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
			goto out;
		}
		ctx = calloc(1, sizeof(*ctx));
		if (!ctx) {
			sct = SPDK_NVME_SCT_GENERIC;
			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
			goto out;
		}
		ctx->vu_ctrlr = ctrlr;
		ctx->delete_io_cq_cmd = *cmd;
		spdk_nvmf_qpair_disconnect(&vu_qpair->qpair, vfio_user_qpair_delete_cb, ctx);
		return 0;
	} else {
		/*
		 * This doesn't actually delete the SQ; we're merely telling the
		 * poll_group_poll function to skip checking this SQ. The queue
		 * pair will be disconnected by the Delete IO CQ command.
		 */
		assert(vu_qpair->state == VFIO_USER_QPAIR_ACTIVE);
		vu_qpair->state = VFIO_USER_QPAIR_DELETED;
	}

out:
	return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid, sc, sct);
}

/*
 * Returns 0 on success and -errno on error.
 */
static int
consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd)
{
	assert(ctrlr != NULL);
	assert(cmd != NULL);

	switch (cmd->opc) {
	case SPDK_NVME_OPC_CREATE_IO_CQ:
	case SPDK_NVME_OPC_CREATE_IO_SQ:
		return handle_create_io_q(ctrlr, cmd,
					  cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
	case SPDK_NVME_OPC_DELETE_IO_SQ:
	case SPDK_NVME_OPC_DELETE_IO_CQ:
		return handle_del_io_q(ctrlr, cmd,
				       cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
	default:
		return handle_cmd_req(ctrlr, cmd, get_nvmf_req(ctrlr->qp[0]));
	}
}

static int
handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg)
{
	struct nvmf_vfio_user_qpair *vu_qpair = cb_arg;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr = vu_qpair->ctrlr;
	uint16_t sqid, cqid;

	assert(vu_qpair != NULL);
	assert(vu_req != NULL);
	assert(vu_ctrlr != NULL);

	vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, vu_req->sg, vu_req->iov, vu_req->iovcnt);
	sqid = vu_qpair->qpair.qid;
	cqid = vu_ctrlr->qp[sqid]->sq.cqid;

	return post_completion(vu_ctrlr, &vu_ctrlr->qp[cqid]->cq,
			       vu_req->req.rsp->nvme_cpl.cdw0,
			       sqid,
			       vu_req->req.cmd->nvme_cmd.cid,
			       vu_req->req.rsp->nvme_cpl.status.sc,
			       vu_req->req.rsp->nvme_cpl.status.sct);
}

static int
consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair,
	    struct spdk_nvme_cmd *cmd)
{
	assert(qpair != NULL);
	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
		return consume_admin_cmd(ctrlr, cmd);
	}

	return handle_cmd_req(ctrlr, cmd, get_nvmf_req(qpair));
}

/* Returns the number of commands processed, or a negative value on error. */
static int
handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail,
		     struct nvmf_vfio_user_qpair *qpair)
{
	struct spdk_nvme_cmd *queue;
	int count = 0;

	assert(ctrlr != NULL);
	assert(qpair != NULL);

	queue = qpair->sq.addr;
	while (sq_head(qpair) != new_tail) {
		int err;
		struct spdk_nvme_cmd *cmd = &queue[sq_head(qpair)];

		count++;

		/*
		 * SQHD must contain the new head pointer, so we must increase
		 * it before we generate a completion.
		 */
		sqhd_advance(ctrlr, qpair);

		err = consume_cmd(ctrlr, qpair, cmd);
		if (err != 0) {
			return err;
		}
	}

	return count;
}

static int
enable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	int err;

	assert(ctrlr != NULL);

	err = acq_setup(ctrlr);
	if (err != 0) {
		return err;
	}

	err = asq_setup(ctrlr);
	if (err != 0) {
		return err;
	}

	return 0;
}

static void
disable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	assert(ctrlr->qp[0] != NULL);

	unmap_qp(ctrlr->qp[0]);
}

static void
memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct nvmf_vfio_user_qpair *qpair;
	int i, ret;

	/*
	 * We're not interested in any DMA regions that aren't mappable (we don't
	 * support clients that don't share their memory).
	 */
	if (!info->vaddr) {
		return;
	}

	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
	    (info->mapping.iov_len & MASK_2MB)) {
		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
			      (uintptr_t)info->mapping.iov_base,
			      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
		return;
	}

	assert(endpoint != NULL);
	if (endpoint->ctrlr == NULL) {
		return;
	}
	ctrlr = endpoint->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %#lx-%#lx\n", ctrlr_id(ctrlr),
		      (uintptr_t)info->mapping.iov_base,
		      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);

	/* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO;
	 * here we also check the protection bits before registering.
	 */
	if (info->prot == (PROT_WRITE | PROT_READ)) {
		ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len);
		if (ret) {
			SPDK_ERRLOG("Memory region register %#lx-%#lx failed, ret=%d\n",
				    (uint64_t)(uintptr_t)info->mapping.iov_base,
				    (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len,
				    ret);
		}
	}

	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
		qpair = ctrlr->qp[i];
		if (qpair == NULL) {
			continue;
		}

		if (qpair->state != VFIO_USER_QPAIR_INACTIVE) {
			continue;
		}

		ret = remap_qp(qpair);
		if (ret) {
			continue;
		}
		qpair->state = VFIO_USER_QPAIR_ACTIVE;
		SPDK_DEBUGLOG(nvmf_vfio, "Remap QP %u successfully\n", i);
	}
}

static int
memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct nvmf_vfio_user_qpair *qpair;
	void *map_start, *map_end;
	int i, ret;

	if (!info->vaddr) {
		return 0;
	}

	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
	    (info->mapping.iov_len & MASK_2MB)) {
		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
			      (uintptr_t)info->mapping.iov_base,
			      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
		return 0;
	}

	assert(endpoint != NULL);
	if (endpoint->ctrlr == NULL) {
		return 0;
	}
	ctrlr = endpoint->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %#lx-%#lx\n", ctrlr_id(ctrlr),
		      (uintptr_t)info->mapping.iov_base,
		      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);

	if (info->prot == (PROT_WRITE | PROT_READ)) {
		ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len);
		if (ret) {
			SPDK_ERRLOG("Memory region unregister %#lx-%#lx failed, ret=%d\n",
				    (uint64_t)(uintptr_t)info->mapping.iov_base,
				    (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len,
				    ret);
		}
	}

	map_start = info->mapping.iov_base;
	map_end = info->mapping.iov_base + info->mapping.iov_len;
	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
		qpair = ctrlr->qp[i];
		if (qpair == NULL) {
			continue;
		}

		if ((qpair->cq.addr >= map_start && qpair->cq.addr < map_end) ||
		    (qpair->sq.addr >= map_start && qpair->sq.addr < map_end)) {
			unmap_qp(qpair);
			qpair->state = VFIO_USER_QPAIR_INACTIVE;
		}
	}

	return 0;
}

static int
nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
{
	struct nvmf_vfio_user_qpair *vu_qpair = cb_arg;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
	bool disable_admin = false;
	int ret;

	assert(vu_qpair != NULL);
	assert(req != NULL);

	if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) {
		assert(vu_qpair->ctrlr != NULL);
		assert(req != NULL);

		memcpy(req->req.data,
		       &req->req.rsp->prop_get_rsp.value.u64,
		       req->req.length);
	} else {
		assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET);
		assert(vu_qpair->ctrlr != NULL);
		vu_ctrlr = vu_qpair->ctrlr;

		if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) {
			union spdk_nvme_cc_register cc, diff;

			cc.raw = req->req.cmd->prop_set_cmd.value.u64;
			diff.raw = cc.raw ^ req->cc.raw;

			if (diff.bits.en) {
				if (cc.bits.en) {
					SPDK_DEBUGLOG(nvmf_vfio, "%s: MAP Admin queue\n", ctrlr_id(vu_ctrlr));
					ret = enable_admin_queue(vu_ctrlr);
					if (ret) {
						SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(vu_ctrlr));
						return ret;
					}
					vu_qpair->state = VFIO_USER_QPAIR_ACTIVE;
				} else {
					disable_admin = true;
				}
			}

			if (diff.bits.shn) {
				if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) {
					disable_admin = true;
				}
			}

			if (disable_admin) {
				SPDK_DEBUGLOG(nvmf_vfio,
					      "%s: UNMAP Admin queue\n",
					      ctrlr_id(vu_ctrlr));
				vu_qpair->state = VFIO_USER_QPAIR_INACTIVE;
				disable_admin_queue(vu_ctrlr);
				/* For PCIe controller reset or shutdown, we will drop all AER responses */
				nvmf_ctrlr_abort_aer(vu_qpair->qpair.ctrlr);
			}
		}
	}

	return 0;
}

/*
 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a
 * doorbell is written via access_bar0_fn().
 *
 * DSTRD is set to fixed value 0 for NVMf.
 */
static int
handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf,
		  const size_t count, loff_t pos, const bool is_write)
{
	assert(ctrlr != NULL);
	assert(buf != NULL);

	if (count != sizeof(uint32_t)) {
		SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n",
			    ctrlr_id(ctrlr), count);
		errno = EINVAL;
		return -1;
	}

	pos -= NVMF_VFIO_USER_DOORBELLS_OFFSET;

	/* pos must be dword aligned */
	if ((pos & 0x3) != 0) {
		SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos);
		errno = EINVAL;
		return -1;
	}

	/* convert byte offset to array index */
	pos >>= 2;

	if (pos > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR * 2) {
		SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos);
		errno = EINVAL;
		return -1;
	}

	if (is_write) {
		ctrlr->doorbells[pos] = *buf;
		spdk_wmb();
	} else {
		spdk_rmb();
		*buf = ctrlr->doorbells[pos];
	}
	return 0;
}

static ssize_t
access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos,
	       bool is_write)
{
	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct nvmf_vfio_user_req *req;
	const struct spdk_nvmf_registers *regs;
	int ret;

	ctrlr = endpoint->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n",
		      endpoint_id(endpoint), is_write ? "write" : "read",
		      ctrlr, count, pos);

	if (pos >= NVMF_VFIO_USER_DOORBELLS_OFFSET) {
		/*
		 * The fact that the doorbells can be memory mapped doesn't mean
		 * that the client (VFIO in QEMU) is obliged to memory map them;
		 * it might still elect to access them via regular read/write;
		 * we might also have had disable_mappable_bar0 set.
		 */
		ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count,
					pos, is_write);
		if (ret == 0) {
			return count;
		}
		return ret;
	}

	/* Construct a Fabric Property Get/Set command and send it */
	req = get_nvmf_vfio_user_req(ctrlr->qp[0]);
	if (req == NULL) {
		errno = ENOBUFS;
		return -1;
	}
	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
	req->cc.raw = regs->cc.raw;

	req->cb_fn = nvmf_vfio_user_prop_req_rsp;
	req->cb_arg = ctrlr->qp[0];
	req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC;
	req->req.cmd->prop_set_cmd.cid = 0;
	req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1;
	req->req.cmd->prop_set_cmd.ofst = pos;
	if (is_write) {
		req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
		if (req->req.cmd->prop_set_cmd.attrib.size) {
			req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf;
		} else {
			req->req.cmd->prop_set_cmd.value.u32.high = 0;
			req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf;
		}
	} else {
		req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
	}
	req->req.length = count;
	req->req.data = buf;

	spdk_nvmf_request_exec_fabrics(&req->req);

	return count;
}

/*
 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space
 * available on PCI-X 2.0 and PCI Express buses
 */
static ssize_t
access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset,
		  bool is_write)
{
	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);

	if (is_write) {
		SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n",
			    endpoint_id(endpoint), offset, offset + count);
		errno = EINVAL;
		return -1;
	}

	if (offset + count > PCI_CFG_SPACE_EXP_SIZE) {
		SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n",
			    endpoint_id(endpoint), offset, count,
			    PCI_CFG_SPACE_EXP_SIZE);
		errno = ERANGE;
		return -1;
	}

	memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count);

	return count;
}

static void
vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg)
{
	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);

	if (level >= LOG_DEBUG) {
		SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
	} else if (level >= LOG_INFO) {
		SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
	} else if (level >= LOG_NOTICE) {
		SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg);
	} else if (level >= LOG_WARNING) {
		SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg);
	} else {
		SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg);
	}
}

static int
vfio_user_get_log_level(void)
{
	int level;

	if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) {
		return LOG_DEBUG;
	}

	level = spdk_log_to_syslog_level(spdk_log_get_level());
	if (level < 0) {
		return LOG_ERR;
	}

	return level;
}

static void
init_pci_config_space(vfu_pci_config_space_t *p)
{
	/* MLBAR */
	p->hdr.bars[0].raw = 0x0;
	/* MUBAR */
	p->hdr.bars[1].raw = 0x0;

	/* vendor specific, let's set them to zero for now */
	p->hdr.bars[3].raw = 0x0;
	p->hdr.bars[4].raw = 0x0;
	p->hdr.bars[5].raw = 0x0;

	/* enable INTx */
	p->hdr.intr.ipin = 0x1;
}

static int
vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport,
			struct nvmf_vfio_user_endpoint *endpoint)
{
	int ret;
	ssize_t cap_offset;
	vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx;

	struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 };
	struct pxcap pxcap = {
		.hdr.id = PCI_CAP_ID_EXP,
		.pxcaps.ver = 0x2,
		.pxdcap = {.rer = 0x1, .flrc = 0x1},
		.pxdcap2.ctds = 0x1
	};

	struct msixcap msixcap = {
		.hdr.id = PCI_CAP_ID_MSIX,
		.mxc.ts = NVME_IRQ_MSIX_NUM - 1,
		.mtab = {.tbir = 0x4, .to = 0x0},
		.mpba = {.pbir = 0x5, .pbao = 0x0}
	};

	static struct iovec sparse_mmap[] = {
		{
			.iov_base = (void *)NVMF_VFIO_USER_DOORBELLS_OFFSET,
			.iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE,
		},
	};

	ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx);
		return ret;
	}
	vfu_pci_set_id(vfu_ctx, 0x4e58, 0x0001, 0, 0);
	/*
	 * 0x02, controller uses the NVM Express programming interface
	 * 0x08, non-volatile memory controller
	 * 0x01, mass storage controller
	 */
	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx);
		return ret;
	}

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx);
		return ret;
	}

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
		return ret;
	}

	if (vu_transport->transport_opts.disable_mappable_bar0) {
		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
				       NULL, 0, -1, 0);
	} else {
		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
				       sparse_mmap, 1, endpoint->devmem_fd, 0);
	}

	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, PAGE_SIZE,
			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, PAGE_SIZE,
			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
		return ret;
	}

	ret = vfu_realize_ctx(vfu_ctx);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx);
		return ret;
	}

	endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx);
	assert(endpoint->pci_config_space != NULL);
	init_pci_config_space(endpoint->pci_config_space);

	assert(cap_offset != 0);
	endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset);

	return 0;
}

static void
_free_ctrlr(void *ctx)
{
	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
	int i;

	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
		free_qp(ctrlr, i);
	}

	spdk_poller_unregister(&ctrlr->vfu_ctx_poller);
	free(ctrlr);
}

static void
free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	assert(ctrlr != NULL);

	SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr));

	if (ctrlr->thread == spdk_get_thread()) {
		_free_ctrlr(ctrlr);
	} else {
		spdk_thread_send_msg(ctrlr->thread, _free_ctrlr, ctrlr);
	}
}

static void
nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport,
			    struct nvmf_vfio_user_endpoint *endpoint)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	int err;

	/* First, construct a vfio-user CUSTOM transport controller */
	ctrlr = calloc(1, sizeof(*ctrlr));
	if (ctrlr == NULL) {
		err = -ENOMEM;
		goto out;
	}
	ctrlr->cntlid = 0xffff;
	ctrlr->transport = transport;
	ctrlr->endpoint = endpoint;
	ctrlr->doorbells = endpoint->doorbells;

	/* Then, construct an admin queue pair */
	err = init_qp(ctrlr, &transport->transport, NVMF_VFIO_USER_DEFAULT_AQ_DEPTH, 0);
	if (err != 0) {
		goto out;
	}
	endpoint->ctrlr = ctrlr;

	/* Notify the generic layer about the new admin queue pair */
	spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->qp[0]->qpair);

out:
	if (err != 0) {
		SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
			    endpoint_id(endpoint), strerror(-err));
		free_ctrlr(ctrlr);
	}
}

static int
nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport,
		      const struct spdk_nvme_transport_id *trid,
		      struct spdk_nvmf_listen_opts *listen_opts)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
	char *path = NULL;
	char uuid[PATH_MAX] = {};
	int fd;
	int err;

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		/* Only compare traddr */
		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
			return -EEXIST;
		}
	}

	endpoint = calloc(1, sizeof(*endpoint));
	if (!endpoint) {
		return -ENOMEM;
	}

	endpoint->devmem_fd = -1;
	memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));

	err = asprintf(&path, "%s/bar0", endpoint_id(endpoint));
	if (err == -1) {
		goto out;
	}

	fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
	err = asprintf(&path, "%s/bar0", endpoint_id(endpoint));
	if (err == -1) {
		goto out;
	}

	fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
	if (fd == -1) {
		err = -errno;
		SPDK_ERRLOG("%s: failed to open device memory at %s: %m\n",
			    endpoint_id(endpoint), path);
		free(path);
		goto out;
	}
	free(path);

	endpoint->devmem_fd = fd;
	err = ftruncate(fd, NVMF_VFIO_USER_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
	if (err != 0) {
		goto out;
	}

	endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
				   PROT_READ | PROT_WRITE, MAP_SHARED, fd, NVMF_VFIO_USER_DOORBELLS_OFFSET);
	if (endpoint->doorbells == MAP_FAILED) {
		endpoint->doorbells = NULL;
		err = -errno;
		goto out;
	}

	snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));

	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
					   endpoint, VFU_DEV_TYPE_PCI);
	if (endpoint->vfu_ctx == NULL) {
		SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
			    endpoint_id(endpoint));
		err = -1;
		goto out;
	}
	vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, vfio_user_get_log_level());

	err = vfio_user_dev_info_fill(vu_transport, endpoint);
	if (err < 0) {
		goto out;
	}

	pthread_mutex_init(&endpoint->lock, NULL);
	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
	SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells);

out:
	if (err != 0) {
		nvmf_vfio_user_destroy_endpoint(endpoint);
	}

	return err;
}

static void
nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
			   const struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;

	assert(trid != NULL);
	assert(trid->traddr != NULL);

	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
			if (endpoint->ctrlr) {
				free_ctrlr(endpoint->ctrlr);
			}
			nvmf_vfio_user_destroy_endpoint(endpoint);
			pthread_mutex_unlock(&vu_transport->lock);

			return;
		}
	}
	pthread_mutex_unlock(&vu_transport->lock);

	SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr);
}

static void
nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport,
			  struct spdk_nvmf_subsystem *subsystem,
			  struct spdk_nvmf_ctrlr_data *cdata)
{
	memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls));
	cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED;
	/* libvfio-user can only support 1 connection for now */
	cdata->oncs.reservations = 0;
}
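
/*
 * Remember which subsystem the generic NVMf layer associated with this
 * listener; its NQN is used later when the transport issues fabric CONNECT
 * commands on behalf of the client (see nvmf_vfio_user_poll_group_add()).
 */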
static int
nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
				const struct spdk_nvmf_subsystem *subsystem,
				const struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint;

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);

	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
			break;
		}
	}

	if (endpoint == NULL) {
		return -ENOENT;
	}

	endpoint->subsystem = subsystem;

	return 0;
}

/*
 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US
 * frequency.
 *
 * For each transport endpoint (which at the libvfio-user level corresponds to
 * a socket), if we don't currently have a controller set up, peek to see if the
 * socket is able to accept a new connection.
 *
 * This poller also takes care of handling the creation of any pending new
 * qpairs.
 *
 * Returns the number of events handled.
 */
static uint32_t
nvmf_vfio_user_accept(struct spdk_nvmf_transport *transport)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint;
	uint32_t count = 0;
	int err;

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);

	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
		if (endpoint->ctrlr != NULL) {
			continue;
		}

		err = vfu_attach_ctx(endpoint->vfu_ctx);
		if (err != 0) {
			if (errno == EAGAIN || errno == EWOULDBLOCK) {
				continue;
			}

			pthread_mutex_unlock(&vu_transport->lock);
			return 1;
		}

		count++;

		/* Construct a controller */
		nvmf_vfio_user_create_ctrlr(vu_transport, endpoint);
	}

	pthread_mutex_unlock(&vu_transport->lock);

	return count;
}

static void
nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport,
			struct spdk_nvme_transport_id *trid,
			struct spdk_nvmf_discovery_log_page_entry *entry)
{ }

static struct spdk_nvmf_transport_poll_group *
nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport)
{
	struct nvmf_vfio_user_poll_group *vu_group;

	SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n");

	vu_group = calloc(1, sizeof(*vu_group));
	if (vu_group == NULL) {
		SPDK_ERRLOG("Error allocating poll group: %m\n");
		return NULL;
	}

	TAILQ_INIT(&vu_group->qps);

	return &vu_group->group;
}

/* called when process exits */
static void
nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;

	SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n");

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	free(vu_group);
}

static void
vfio_user_qpair_disconnect_cb(void *ctx)
{
	struct nvmf_vfio_user_endpoint *endpoint = ctx;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	pthread_mutex_lock(&endpoint->lock);
	ctrlr = endpoint->ctrlr;
	if (!ctrlr) {
		pthread_mutex_unlock(&endpoint->lock);
		return;
	}

	if (!ctrlr->num_connected_qps) {
		endpoint->ctrlr = NULL;
		free_ctrlr(ctrlr);
		pthread_mutex_unlock(&endpoint->lock);
		return;
	}
	pthread_mutex_unlock(&endpoint->lock);
}
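
/*
 * Tear down a controller: if it has no connected queue pairs it can be freed
 * right away, otherwise disconnect every queue pair and let
 * vfio_user_qpair_disconnect_cb() free the controller once the last one is
 * gone.
 */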
static int
vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	uint32_t i;
	struct nvmf_vfio_user_qpair *qpair;
	struct nvmf_vfio_user_endpoint *endpoint;

	SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr));

	endpoint = ctrlr->endpoint;
	assert(endpoint != NULL);

	pthread_mutex_lock(&endpoint->lock);
	if (ctrlr->num_connected_qps == 0) {
		endpoint->ctrlr = NULL;
		free_ctrlr(ctrlr);
		pthread_mutex_unlock(&endpoint->lock);
		return 0;
	}
	pthread_mutex_unlock(&endpoint->lock);

	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
		qpair = ctrlr->qp[i];
		if (qpair == NULL) {
			continue;
		}
		spdk_nvmf_qpair_disconnect(&qpair->qpair, vfio_user_qpair_disconnect_cb, endpoint);
	}

	return 0;
}

/*
 * Poll for and process any incoming vfio-user messages.
 */
static int
vfio_user_poll_vfu_ctx(void *ctx)
{
	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
	int ret;

	assert(ctrlr != NULL);

	/* This will call access_bar0_fn() if there are any writes
	 * to the portion of the BAR that is not mmap'd */
	ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
	if (spdk_unlikely(ret == -1)) {
		spdk_poller_unregister(&ctrlr->vfu_ctx_poller);

		/* The initiator shut down or reset; wait for it to reconnect. */
		if (errno == ENOTCONN) {
			vfio_user_destroy_ctrlr(ctrlr);
			return SPDK_POLLER_BUSY;
		}

		fail_ctrlr(ctrlr);
	}

	return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int
handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_qpair *qpair = cb_arg;
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct nvmf_vfio_user_endpoint *endpoint;

	assert(qpair != NULL);
	assert(req != NULL);

	ctrlr = qpair->ctrlr;
	assert(ctrlr != NULL);
	endpoint = ctrlr->endpoint;
	assert(endpoint != NULL);

	if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
		SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct);
		endpoint->ctrlr = NULL;
		free_ctrlr(ctrlr);
		return -1;
	}

	vu_group = SPDK_CONTAINEROF(qpair->group, struct nvmf_vfio_user_poll_group, group);
	TAILQ_INSERT_TAIL(&vu_group->qps, qpair, link);
	qpair->state = VFIO_USER_QPAIR_ACTIVE;

	pthread_mutex_lock(&endpoint->lock);
	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
		ctrlr->cntlid = qpair->qpair.ctrlr->cntlid;
		ctrlr->thread = spdk_get_thread();
		ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, ctrlr, 0);
	} else {
		/* For I/O queues this command was generated in response to an
		 * admin CREATE I/O SUBMISSION QUEUE command which has not yet
		 * been completed. Complete it now.
		 */
		post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0,
				qpair->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
	}
	ctrlr->num_connected_qps++;
	pthread_mutex_unlock(&endpoint->lock);

	free(req->req.data);
	req->req.data = NULL;

	return 0;
}

/*
 * Add the given qpair to the given poll group. New qpairs are added via
 * spdk_nvmf_tgt_new_qpair(), which picks a poll group, then calls back
 * here via nvmf_transport_poll_group_add().
 */
static int
nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
			      struct spdk_nvmf_qpair *qpair)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct spdk_nvmf_request *req;
	struct spdk_nvmf_fabric_connect_data *data;
	bool admin;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	vu_qpair->group = group;
	ctrlr = vu_qpair->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
		      ctrlr_id(ctrlr), vu_qpair->qpair.qid,
		      vu_qpair, qpair, group);

	admin = nvmf_qpair_is_admin_queue(&vu_qpair->qpair);

	vu_req = get_nvmf_vfio_user_req(vu_qpair);
	if (vu_req == NULL) {
		return -1;
	}

	req = &vu_req->req;
	req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
	req->cmd->connect_cmd.cid = 0;
	req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
	req->cmd->connect_cmd.recfmt = 0;
	req->cmd->connect_cmd.sqsize = vu_qpair->qsize - 1;
	req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;

	req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
	req->data = calloc(1, req->length);
	if (req->data == NULL) {
		nvmf_vfio_user_req_free(req);
		return -ENOMEM;
	}

	data = (struct spdk_nvmf_fabric_connect_data *)req->data;
	data->cntlid = admin ? 0xFFFF : ctrlr->cntlid;
	snprintf(data->subnqn, sizeof(data->subnqn), "%s",
		 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));

	vu_req->cb_fn = handle_queue_connect_rsp;
	vu_req->cb_arg = vu_qpair;

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
		      ctrlr_id(ctrlr), qpair->qid, data->cntlid);

	spdk_nvmf_request_exec_fabrics(req);
	return 0;
}

static int
nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
				 struct spdk_nvmf_qpair *qpair)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
	struct nvmf_vfio_user_endpoint *endpoint;
	struct nvmf_vfio_user_poll_group *vu_group;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	vu_ctrlr = vu_qpair->ctrlr;
	endpoint = vu_ctrlr->endpoint;

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
		      ctrlr_id(vu_qpair->ctrlr), qpair->qid, qpair, group);

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
	TAILQ_REMOVE(&vu_group->qps, vu_qpair, link);

	pthread_mutex_lock(&endpoint->lock);
	assert(vu_ctrlr->num_connected_qps);
	vu_ctrlr->num_connected_qps--;
	pthread_mutex_unlock(&endpoint->lock);

	return 0;
}

static void
_nvmf_vfio_user_req_free(struct nvmf_vfio_user_qpair *vu_qpair, struct nvmf_vfio_user_req *vu_req)
{
	memset(&vu_req->cmd, 0, sizeof(vu_req->cmd));
	memset(&vu_req->rsp, 0, sizeof(vu_req->rsp));
	vu_req->iovcnt = 0;
	vu_req->state = VFIO_USER_REQUEST_STATE_FREE;

	TAILQ_INSERT_TAIL(&vu_qpair->reqs, vu_req, link);
}

static int
nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req;

	assert(req != NULL);

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);

	_nvmf_vfio_user_req_free(vu_qpair, vu_req);

	return 0;
}

static int
nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req;

	assert(req != NULL);

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);

	if (vu_req->cb_fn != NULL) {
		if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) {
			fail_ctrlr(vu_qpair->ctrlr);
		}
	}

	_nvmf_vfio_user_req_free(vu_qpair, vu_req);

	return 0;
}

static void
nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
			   spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
{
	struct nvmf_vfio_user_qpair *vu_qpair;

	assert(qpair != NULL);
	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	free_qp(vu_qpair->ctrlr, qpair->qid);

	if (cb_fn) {
		cb_fn(cb_arg);
	}
}

/**
 * Returns a preallocated spdk_nvmf_request or NULL if there isn't one available.
 */
static struct nvmf_vfio_user_req *
get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair)
{
	struct nvmf_vfio_user_req *req;

	assert(qpair != NULL);

	if (TAILQ_EMPTY(&qpair->reqs)) {
		return NULL;
	}

	req = TAILQ_FIRST(&qpair->reqs);
	TAILQ_REMOVE(&qpair->reqs, req, link);

	return req;
}

static struct spdk_nvmf_request *
get_nvmf_req(struct nvmf_vfio_user_qpair *qpair)
{
	struct nvmf_vfio_user_req *req = get_nvmf_vfio_user_req(qpair);

	if (req == NULL) {
		return NULL;
	}
	return &req->req;
}

static int
get_nvmf_io_req_length(struct spdk_nvmf_request *req)
{
	uint32_t nlb;
	uint16_t nr;
	uint32_t nsid;
	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
	struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
	struct spdk_nvmf_ns *ns;

	nsid = cmd->nsid;
	ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
	if (ns == NULL || ns->bdev == NULL) {
		SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
		return -EINVAL;
	}

	if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
		nr = cmd->cdw10_bits.dsm.nr + 1;
		return nr * sizeof(struct spdk_nvme_dsm_range);
	}

	/* NLB is a 0's based value, so the transfer is (NLB + 1) blocks */
	nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
	return nlb * spdk_bdev_get_block_size(ns->bdev);
}
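
/*
 * Map the client buffer of an admin command into local iovecs. Only commands
 * whose transfer length is known at this point (IDENTIFY, GET LOG PAGE) are
 * mapped here; everything else is left for the generic NVMf layer to handle.
 */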
static int
map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
	uint32_t len = 0;
	int iovcnt;

	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
	req->length = 0;
	req->data = NULL;

	if (req->xfer == SPDK_NVME_DATA_NONE) {
		return 0;
	}

	switch (cmd->opc) {
	case SPDK_NVME_OPC_IDENTIFY:
		len = 4096;
		break;
	case SPDK_NVME_OPC_GET_LOG_PAGE:
		len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4;
		break;
	default:
		/*
		 * CREATE IO SQ/CQ are processed separately in handle_create_io_q().
		 * GET/SET FEATURES: no need to support Host Identifier for the vfio-user transport.
		 * Let the NVMf library decide how to handle other commands.
		 */
		return 0;
	}

	/* Admin commands do not use SGLs */
	if (req->cmd->nvme_cmd.psdt != 0) {
		return -EINVAL;
	}

	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
	if (iovcnt < 0) {
		SPDK_ERRLOG("%s: failed to map admin command opc %x\n",
			    ctrlr_id(ctrlr), cmd->opc);
		return -1;
	}
	req->length = len;
	req->data = req->iov[0].iov_base;
	req->iovcnt = iovcnt;

	return 0;
}

/*
 * Map an I/O command's buffers.
 *
 * Returns 0 on success and -errno on failure.
 */
static int
map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
	int len, iovcnt;
	struct spdk_nvme_cmd *cmd;

	assert(ctrlr != NULL);
	assert(req != NULL);

	cmd = &req->cmd->nvme_cmd;
	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
	req->length = 0;
	req->data = NULL;

	if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
		return 0;
	}

	len = get_nvmf_io_req_length(req);
	if (len < 0) {
		return -EINVAL;
	}
	req->length = len;

	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
	if (iovcnt < 0) {
		SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
		return -EFAULT;
	}
	req->data = req->iov[0].iov_base;
	req->iovcnt = iovcnt;

	return 0;
}

static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct spdk_nvmf_request *req)
{
	int err;
	struct nvmf_vfio_user_req *vu_req;

	assert(ctrlr != NULL);
	assert(cmd != NULL);

	/*
	 * TODO: this means that there are no free requests available,
	 * returning -1 will fail the controller. Theoretically this error can
	 * be avoided completely by ensuring we have as many requests as slots
	 * in the SQ, plus one for the property request.
	 */
	if (spdk_unlikely(req == NULL)) {
		return -1;
	}

	assert(req->qpair != NULL);
	SPDK_DEBUGLOG(nvmf_vfio, "%s: handle qid%u, req opc=%#x cid=%d\n",
		      ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid);

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	vu_req->cb_fn = handle_cmd_rsp;
	vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
	req->cmd->nvme_cmd = *cmd;

	if (nvmf_qpair_is_admin_queue(req->qpair)) {
		err = map_admin_cmd_req(ctrlr, req);
	} else {
		switch (cmd->opc) {
		case SPDK_NVME_OPC_RESERVATION_REGISTER:
		case SPDK_NVME_OPC_RESERVATION_REPORT:
		case SPDK_NVME_OPC_RESERVATION_ACQUIRE:
		case SPDK_NVME_OPC_RESERVATION_RELEASE:
			err = -ENOTSUP;
			break;
		default:
			err = map_io_cmd_req(ctrlr, req);
			break;
		}
	}

	if (spdk_unlikely(err < 0)) {
		SPDK_ERRLOG("%s: failed to process NVMe command opc 0x%x\n",
			    ctrlr_id(ctrlr), cmd->opc);
		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
		return handle_cmd_rsp(vu_req, vu_req->cb_arg);
	}

	vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
	spdk_nvmf_request_exec(req);

	return 0;
}
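
/*
 * Poll a single vfio-user qpair: read the submission queue tail doorbell from
 * the doorbell region and, if the client has queued new entries, process them
 * via handle_sq_tdbl_write().
 */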
/* Returns the number of commands processed, or a negative value on error. */
static int
nvmf_vfio_user_qpair_poll(struct nvmf_vfio_user_qpair *qpair)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	uint32_t new_tail;
	int count = 0;

	assert(qpair != NULL);

	ctrlr = qpair->ctrlr;

	new_tail = *tdbl(ctrlr, &qpair->sq);
	if (sq_head(qpair) == new_tail) {
		return 0;
	}

	count = handle_sq_tdbl_write(ctrlr, new_tail, qpair);
	if (count < 0) {
		fail_ctrlr(ctrlr);
	}

	return count;
}

/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active qpairs.
 *
 * Returns the number of commands processed, or a negative value on error.
 */
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_qpair *vu_qpair, *tmp;
	int count = 0;

	assert(group != NULL);

	spdk_rmb();

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	TAILQ_FOREACH_SAFE(vu_qpair, &vu_group->qps, link, tmp) {
		int ret;

		if (spdk_unlikely(vu_qpair->state != VFIO_USER_QPAIR_ACTIVE || !vu_qpair->sq.size)) {
			continue;
		}

		ret = nvmf_vfio_user_qpair_poll(vu_qpair);

		if (ret < 0) {
			return ret;
		}

		count += ret;
	}

	return count;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
				    struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	ctrlr = vu_qpair->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvme_transport_id *trid)
{
	return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
				     struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	ctrlr = vu_qpair->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req, *vu_req_to_abort = NULL;
	uint16_t i, cid;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);

	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
	for (i = 0; i < vu_qpair->qsize; i++) {
		vu_req = &vu_qpair->reqs_internal[i];
		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
			vu_req_to_abort = vu_req;
			break;
		}
	}

	if (vu_req_to_abort == NULL) {
		spdk_nvmf_request_complete(req);
		return;
	}

	req->req_to_abort = &vu_req_to_abort->req;
	nvmf_ctrlr_abort_request(req);
}
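
/*
 * Transport defaults. In-capsule data and shared buffers are left at zero
 * here; this transport accesses command payloads directly in client memory
 * through the vfio-user DMA mappings (see vfio_user_map_cmd()), so
 * transport-owned data buffers are not used.
 */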
static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
	opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
	opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
	opts->in_capsule_data_size = 0;
	opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
	opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
	opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
	opts->num_shared_buffers = 0;
	opts->buf_cache_size = 0;
	opts->association_timeout = 0;
	opts->transport_specific = NULL;
}

const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
	.name = "VFIOUSER",
	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
	.opts_init = nvmf_vfio_user_opts_init,
	.create = nvmf_vfio_user_create,
	.destroy = nvmf_vfio_user_destroy,

	.listen = nvmf_vfio_user_listen,
	.stop_listen = nvmf_vfio_user_stop_listen,
	.accept = nvmf_vfio_user_accept,
	.cdata_init = nvmf_vfio_user_cdata_init,
	.listen_associate = nvmf_vfio_user_listen_associate,

	.listener_discover = nvmf_vfio_user_discover,

	.poll_group_create = nvmf_vfio_user_poll_group_create,
	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
	.poll_group_add = nvmf_vfio_user_poll_group_add,
	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
	.poll_group_poll = nvmf_vfio_user_poll_group_poll,

	.req_free = nvmf_vfio_user_req_free,
	.req_complete = nvmf_vfio_user_req_complete,

	.qpair_fini = nvmf_vfio_user_close_qpair,
	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)