1 /*- 2 * BSD LICENSE 3 * Copyright (c) Intel Corporation. All rights reserved. 4 * Copyright (c) 2019, Nutanix Inc. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * * Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * * Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in 14 * the documentation and/or other materials provided with the 15 * distribution. 16 * * Neither the name of Intel Corporation nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * NVMe over vfio-user transport 35 */ 36 37 #include <vfio-user/libvfio-user.h> 38 #include <vfio-user/pci_defs.h> 39 40 #include "spdk/barrier.h" 41 #include "spdk/stdinc.h" 42 #include "spdk/assert.h" 43 #include "spdk/thread.h" 44 #include "spdk/nvmf_transport.h" 45 #include "spdk/sock.h" 46 #include "spdk/string.h" 47 #include "spdk/util.h" 48 #include "spdk/log.h" 49 50 #include "transport.h" 51 52 #include "nvmf_internal.h" 53 54 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 55 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 56 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 64 57 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 58 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 59 60 #define NVMF_VFIO_USER_DOORBELLS_OFFSET 0x1000 61 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 62 63 #define NVME_REG_CFG_SIZE 0x1000 64 #define NVME_REG_BAR0_SIZE 0x4000 65 #define NVME_IRQ_INTX_NUM 1 66 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 67 68 struct nvmf_vfio_user_req; 69 struct nvmf_vfio_user_qpair; 70 71 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 72 73 /* 1 more for PRP2 list itself */ 74 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 75 76 enum nvmf_vfio_user_req_state { 77 VFIO_USER_REQUEST_STATE_FREE = 0, 78 VFIO_USER_REQUEST_STATE_EXECUTING, 79 }; 80 81 struct nvmf_vfio_user_req { 82 struct spdk_nvmf_request req; 83 struct spdk_nvme_cpl rsp; 84 struct spdk_nvme_cmd cmd; 85 86 enum nvmf_vfio_user_req_state state; 87 nvmf_vfio_user_req_cb_fn cb_fn; 88 void *cb_arg; 89 90 /* old CC before prop_set_cc fabric command */ 91 union spdk_nvme_cc_register cc; 92 93 /* placeholder for gpa_to_vva memory map table, the IO buffer doesn't use it */ 94 dma_sg_t *sg; 95 struct iovec 
iov[NVMF_VFIO_USER_MAX_IOVECS]; 96 uint8_t iovcnt; 97 98 TAILQ_ENTRY(nvmf_vfio_user_req) link; 99 }; 100 101 /* 102 * A NVMe queue. 103 */ 104 struct nvme_q { 105 bool is_cq; 106 107 void *addr; 108 109 dma_sg_t *sg; 110 struct iovec iov; 111 112 uint32_t size; 113 uint64_t prp1; 114 115 union { 116 struct { 117 uint32_t head; 118 /* multiple SQs can be mapped to the same CQ */ 119 uint16_t cqid; 120 }; 121 struct { 122 uint32_t tail; 123 uint16_t iv; 124 bool ien; 125 }; 126 }; 127 }; 128 129 enum nvmf_vfio_user_qpair_state { 130 VFIO_USER_QPAIR_UNINITIALIZED = 0, 131 VFIO_USER_QPAIR_ACTIVE, 132 VFIO_USER_QPAIR_DELETED, 133 VFIO_USER_QPAIR_INACTIVE, 134 VFIO_USER_QPAIR_ERROR, 135 }; 136 137 struct nvmf_vfio_user_qpair { 138 struct spdk_nvmf_qpair qpair; 139 struct spdk_nvmf_transport_poll_group *group; 140 struct nvmf_vfio_user_ctrlr *ctrlr; 141 struct nvmf_vfio_user_req *reqs_internal; 142 uint16_t qsize; 143 struct nvme_q cq; 144 struct nvme_q sq; 145 enum nvmf_vfio_user_qpair_state state; 146 147 /* Copy of Create IO SQ command */ 148 struct spdk_nvme_cmd create_io_sq_cmd; 149 150 TAILQ_HEAD(, nvmf_vfio_user_req) reqs; 151 TAILQ_ENTRY(nvmf_vfio_user_qpair) link; 152 }; 153 154 struct nvmf_vfio_user_poll_group { 155 struct spdk_nvmf_transport_poll_group group; 156 TAILQ_HEAD(, nvmf_vfio_user_qpair) qps; 157 }; 158 159 struct nvmf_vfio_user_ctrlr { 160 struct nvmf_vfio_user_endpoint *endpoint; 161 struct nvmf_vfio_user_transport *transport; 162 163 /* Number of connected queue pairs */ 164 uint32_t num_connected_qps; 165 166 struct spdk_thread *thread; 167 struct spdk_poller *mmio_poller; 168 169 uint16_t cntlid; 170 171 struct nvmf_vfio_user_qpair *qp[NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR]; 172 173 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 174 175 volatile uint32_t *doorbells; 176 177 /* internal CSTS.CFS register for vfio-user fatal errors */ 178 uint32_t cfs : 1; 179 }; 180 181 struct nvmf_vfio_user_endpoint { 182 vfu_ctx_t *vfu_ctx; 183 struct msixcap *msix; 184 vfu_pci_config_space_t *pci_config_space; 185 int fd; 186 volatile uint32_t *doorbells; 187 188 struct spdk_nvme_transport_id trid; 189 const struct spdk_nvmf_subsystem *subsystem; 190 191 struct nvmf_vfio_user_ctrlr *ctrlr; 192 pthread_mutex_t lock; 193 194 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 195 }; 196 197 struct nvmf_vfio_user_transport_opts { 198 bool disable_mappable_bar0; 199 }; 200 201 struct nvmf_vfio_user_transport { 202 struct spdk_nvmf_transport transport; 203 struct nvmf_vfio_user_transport_opts transport_opts; 204 pthread_mutex_t lock; 205 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 206 207 TAILQ_HEAD(, nvmf_vfio_user_qpair) new_qps; 208 }; 209 210 /* 211 * function prototypes 212 */ 213 static volatile uint32_t * 214 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q); 215 216 static volatile uint32_t * 217 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q); 218 219 static int 220 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 221 222 static struct nvmf_vfio_user_req * 223 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair); 224 225 static int 226 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 227 struct nvme_q *cq, uint32_t cdw0, uint16_t sc, 228 uint16_t sct); 229 230 static int 231 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 232 uint32_t max_iovcnt, uint32_t len, size_t mps, 233 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 234 { 235 uint64_t prp1, prp2; 236 void *vva; 237 
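	/*
	 * PRP mapping works as follows: PRP1 may point into the middle of a
	 * memory page, so only the residue of that page is mapped first. Any
	 * remaining data is described by PRP2, which is either a second data
	 * pointer (when the transfer fits in two pages) or the guest physical
	 * address of a PRP list with one page-sized entry per remaining page.
	 * Each entry is translated with gpa_to_vva() and becomes one iovec.
	 */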
	uint32_t i;
	uint32_t residue_len, nents;
	uint64_t *prp_list;
	uint32_t iovcnt;

	assert(max_iovcnt > 0);

	prp1 = cmd->dptr.prp.prp1;
	prp2 = cmd->dptr.prp.prp2;

	/* PRP1 may start at an unaligned page address */
	residue_len = mps - (prp1 % mps);
	residue_len = spdk_min(len, residue_len);

	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
	if (spdk_unlikely(vva == NULL)) {
		SPDK_ERRLOG("GPA to VVA failed\n");
		return -EINVAL;
	}
	len -= residue_len;
	if (len && max_iovcnt < 2) {
		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
		return -ERANGE;
	}
	iovs[0].iov_base = vva;
	iovs[0].iov_len = residue_len;

	if (len) {
		if (spdk_unlikely(prp2 == 0)) {
			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
			return -EINVAL;
		}

		if (len <= mps) {
			/* 2 PRP entries used */
			iovcnt = 2;
			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n",
					    prp2, len);
				return -EINVAL;
			}
			iovs[1].iov_base = vva;
			iovs[1].iov_len = len;
		} else {
			/* PRP list used */
			nents = (len + mps - 1) / mps;
			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
				SPDK_ERRLOG("Too many page entries\n");
				return -ERANGE;
			}

			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
					    prp2, nents);
				return -EINVAL;
			}
			prp_list = vva;
			i = 0;
			while (len != 0) {
				residue_len = spdk_min(len, mps);
				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
				if (spdk_unlikely(vva == NULL)) {
					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
						    prp_list[i], residue_len);
					return -EINVAL;
				}
				iovs[i + 1].iov_base = vva;
				iovs[i + 1].iov_len = residue_len;
				len -= residue_len;
				i++;
			}
			iovcnt = i + 1;
		}
	} else {
		/* 1 PRP entry used */
		iovcnt = 1;
	}

	assert(iovcnt <= max_iovcnt);
	return iovcnt;
}

static int
nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
		       struct iovec *iovs, uint32_t max_iovcnt,
		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint32_t i;
	void *vva;

	if (spdk_unlikely(max_iovcnt < num_sgls)) {
		return -ERANGE;
	}

	for (i = 0; i < num_sgls; i++) {
		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
			return -EINVAL;
		}
		vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[i].iov_base = vva;
		iovs[i].iov_len = sgls[i].unkeyed.length;
	}

	return num_sgls;
}

static int
nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
		  uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	struct spdk_nvme_sgl_descriptor *sgl, *last_sgl;
	uint32_t num_sgls, seg_len;
	void *vva;
	int ret;
	uint32_t total_iovcnt = 0;

	/* SGL cases */
	sgl = &cmd->dptr.sgl1;

	/* only one SGL segment */
	if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
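		/*
		 * A single data-block descriptor in SGL1 describes the entire
		 * transfer, so it can be mapped directly without walking any
		 * SGL segments.
		 */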
assert(max_iovcnt > 0); 367 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 368 if (spdk_unlikely(vva == NULL)) { 369 SPDK_ERRLOG("GPA to VVA failed\n"); 370 return -EINVAL; 371 } 372 iovs[0].iov_base = vva; 373 iovs[0].iov_len = sgl->unkeyed.length; 374 assert(sgl->unkeyed.length == len); 375 376 return 1; 377 } 378 379 for (;;) { 380 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 381 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 382 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 383 return -EINVAL; 384 } 385 386 seg_len = sgl->unkeyed.length; 387 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 388 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 389 return -EINVAL; 390 } 391 392 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 393 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 394 if (spdk_unlikely(vva == NULL)) { 395 SPDK_ERRLOG("GPA to VVA failed\n"); 396 return -EINVAL; 397 } 398 399 /* sgl point to the first segment */ 400 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 401 last_sgl = &sgl[num_sgls - 1]; 402 403 /* we are done */ 404 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 405 /* map whole sgl list */ 406 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 407 max_iovcnt - total_iovcnt, gpa_to_vva); 408 if (spdk_unlikely(ret < 0)) { 409 return ret; 410 } 411 total_iovcnt += ret; 412 413 return total_iovcnt; 414 } 415 416 if (num_sgls > 1) { 417 /* map whole sgl exclude last_sgl */ 418 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 419 max_iovcnt - total_iovcnt, gpa_to_vva); 420 if (spdk_unlikely(ret < 0)) { 421 return ret; 422 } 423 total_iovcnt += ret; 424 } 425 426 /* move to next level's segments */ 427 sgl = last_sgl; 428 } 429 430 return 0; 431 } 432 433 static int 434 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 435 uint32_t len, size_t mps, 436 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 437 { 438 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 439 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 440 } 441 442 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 443 } 444 445 static char * 446 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 447 { 448 return endpoint->trid.traddr; 449 } 450 451 static char * 452 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 453 { 454 if (!ctrlr || !ctrlr->endpoint) { 455 return "Null Ctrlr"; 456 } 457 458 return endpoint_id(ctrlr->endpoint); 459 } 460 461 static uint16_t 462 io_q_id(struct nvme_q *q) 463 { 464 465 struct nvmf_vfio_user_qpair *vfio_user_qpair; 466 467 assert(q); 468 469 if (q->is_cq) { 470 vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, cq); 471 } else { 472 vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, sq); 473 } 474 assert(vfio_user_qpair); 475 return vfio_user_qpair->qpair.qid; 476 } 477 478 static void 479 fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 480 { 481 assert(ctrlr != NULL); 482 483 if (ctrlr->cfs == 0) { 484 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(ctrlr)); 485 } 486 487 ctrlr->cfs = 1U; 488 } 489 490 static bool 491 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *ctrlr) 492 { 493 assert(ctrlr != NULL); 494 assert(ctrlr->endpoint != NULL); 495 496 vfu_pci_config_space_t *pci = ctrlr->endpoint->pci_config_space; 497 498 return 
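	       /*
	        * INTx is usable while the Interrupt Disable bit (cmd.id) in
	        * the PCI command register is clear; MSI-X is usable once the
	        * MSI-X Enable bit (mxc.mxe) in the MSI-X capability is set.
	        */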
(!pci->hdr.cmd.id || ctrlr->endpoint->msix->mxc.mxe); 499 } 500 501 static void 502 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 503 { 504 if (endpoint->doorbells) { 505 munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 506 } 507 508 if (endpoint->fd > 0) { 509 close(endpoint->fd); 510 } 511 512 vfu_destroy_ctx(endpoint->vfu_ctx); 513 514 pthread_mutex_destroy(&endpoint->lock); 515 free(endpoint); 516 } 517 518 /* called when process exits */ 519 static int 520 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 521 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 522 { 523 struct nvmf_vfio_user_transport *vu_transport; 524 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 525 526 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 527 528 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 529 transport); 530 531 (void)pthread_mutex_destroy(&vu_transport->lock); 532 533 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 534 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 535 nvmf_vfio_user_destroy_endpoint(endpoint); 536 } 537 538 free(vu_transport); 539 540 if (cb_fn) { 541 cb_fn(cb_arg); 542 } 543 544 return 0; 545 } 546 547 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 548 { 549 "disable-mappable-bar0", 550 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 551 spdk_json_decode_bool, true 552 }, 553 }; 554 555 static struct spdk_nvmf_transport * 556 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 557 { 558 struct nvmf_vfio_user_transport *vu_transport; 559 int err; 560 561 vu_transport = calloc(1, sizeof(*vu_transport)); 562 if (vu_transport == NULL) { 563 SPDK_ERRLOG("Transport alloc fail: %m\n"); 564 return NULL; 565 } 566 567 err = pthread_mutex_init(&vu_transport->lock, NULL); 568 if (err != 0) { 569 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 570 goto err; 571 } 572 573 TAILQ_INIT(&vu_transport->endpoints); 574 TAILQ_INIT(&vu_transport->new_qps); 575 576 if (opts->transport_specific != NULL && 577 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 578 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 579 vu_transport)) { 580 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 581 free(vu_transport); 582 return NULL; 583 } 584 585 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 586 vu_transport->transport_opts.disable_mappable_bar0); 587 588 return &vu_transport->transport; 589 590 err: 591 free(vu_transport); 592 593 return NULL; 594 } 595 596 static uint16_t 597 max_queue_size(struct nvmf_vfio_user_ctrlr const *ctrlr) 598 { 599 assert(ctrlr != NULL); 600 assert(ctrlr->qp[0] != NULL); 601 assert(ctrlr->qp[0]->qpair.ctrlr != NULL); 602 603 return ctrlr->qp[0]->qpair.ctrlr->vcprop.cap.bits.mqes + 1; 604 } 605 606 static void * 607 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov, int prot) 608 { 609 int ret; 610 611 assert(ctx != NULL); 612 assert(sg != NULL); 613 assert(iov != NULL); 614 615 ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 616 if (ret < 0) { 617 return NULL; 618 } 619 620 ret = vfu_map_sg(ctx, sg, iov, 1, 0); 621 if (ret != 0) { 622 return NULL; 623 } 624 625 assert(iov->iov_base != NULL); 626 return iov->iov_base; 627 } 628 629 static uint32_t 630 sq_head(struct nvmf_vfio_user_qpair *qpair) 631 { 632 assert(qpair != NULL); 633 return 
qpair->sq.head; 634 } 635 636 static void 637 sqhd_advance(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair) 638 { 639 assert(ctrlr != NULL); 640 assert(qpair != NULL); 641 qpair->sq.head = (qpair->sq.head + 1) % qpair->sq.size; 642 } 643 644 static int 645 asq_map(struct nvmf_vfio_user_ctrlr *ctrlr) 646 { 647 struct nvme_q *sq; 648 const struct spdk_nvmf_registers *regs; 649 650 assert(ctrlr != NULL); 651 assert(ctrlr->qp[0] != NULL); 652 assert(ctrlr->qp[0]->sq.addr == NULL); 653 /* XXX ctrlr->asq == 0 is a valid memory address */ 654 655 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 656 sq = &ctrlr->qp[0]->sq; 657 sq->size = regs->aqa.bits.asqs + 1; 658 sq->head = ctrlr->doorbells[0] = 0; 659 sq->cqid = 0; 660 sq->addr = map_one(ctrlr->endpoint->vfu_ctx, regs->asq, 661 sq->size * sizeof(struct spdk_nvme_cmd), sq->sg, 662 &sq->iov, PROT_READ); 663 if (sq->addr == NULL) { 664 return -1; 665 } 666 memset(sq->addr, 0, sq->size * sizeof(struct spdk_nvme_cmd)); 667 sq->is_cq = false; 668 *tdbl(ctrlr, sq) = 0; 669 670 return 0; 671 } 672 673 static uint16_t 674 cq_next(struct nvme_q *q) 675 { 676 assert(q != NULL); 677 assert(q->is_cq); 678 return (q->tail + 1) % q->size; 679 } 680 681 static int 682 queue_index(uint16_t qid, int is_cq) 683 { 684 return (qid * 2) + is_cq; 685 } 686 687 static volatile uint32_t * 688 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 689 { 690 assert(ctrlr != NULL); 691 assert(q != NULL); 692 assert(!q->is_cq); 693 694 return &ctrlr->doorbells[queue_index(io_q_id(q), false)]; 695 } 696 697 static volatile uint32_t * 698 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 699 { 700 assert(ctrlr != NULL); 701 assert(q != NULL); 702 assert(q->is_cq); 703 704 return &ctrlr->doorbells[queue_index(io_q_id(q), true)]; 705 } 706 707 static bool 708 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 709 { 710 assert(ctrlr != NULL); 711 assert(q != NULL); 712 return cq_next(q) == *hdbl(ctrlr, q); 713 } 714 715 static void 716 cq_tail_advance(struct nvme_q *q) 717 { 718 assert(q != NULL); 719 q->tail = cq_next(q); 720 } 721 722 static int 723 acq_map(struct nvmf_vfio_user_ctrlr *ctrlr) 724 { 725 struct nvme_q *cq; 726 const struct spdk_nvmf_registers *regs; 727 728 assert(ctrlr != NULL); 729 assert(ctrlr->qp[0] != NULL); 730 assert(ctrlr->qp[0]->cq.addr == NULL); 731 732 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 733 assert(regs != NULL); 734 cq = &ctrlr->qp[0]->cq; 735 cq->size = regs->aqa.bits.acqs + 1; 736 cq->tail = 0; 737 cq->addr = map_one(ctrlr->endpoint->vfu_ctx, regs->acq, 738 cq->size * sizeof(struct spdk_nvme_cpl), cq->sg, 739 &cq->iov, PROT_READ | PROT_WRITE); 740 if (cq->addr == NULL) { 741 return -1; 742 } 743 memset(cq->addr, 0, cq->size * sizeof(struct spdk_nvme_cpl)); 744 cq->is_cq = true; 745 cq->ien = true; 746 *hdbl(ctrlr, cq) = 0; 747 748 return 0; 749 } 750 751 static inline dma_sg_t * 752 vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt) 753 { 754 return (dma_sg_t *)((uintptr_t)vu_req->sg + iovcnt * dma_sg_size()); 755 } 756 757 static void * 758 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 759 { 760 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 761 struct spdk_nvmf_qpair *qpair; 762 struct nvmf_vfio_user_req *vu_req; 763 struct nvmf_vfio_user_qpair *vu_qpair; 764 void *ret; 765 766 assert(req != NULL); 767 qpair = req->qpair; 768 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 769 vu_qpair = 
SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);

	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
	ret = map_one(vu_qpair->ctrlr->endpoint->vfu_ctx, addr, len,
		      vu_req_to_sg_t(vu_req, vu_req->iovcnt),
		      &vu_req->iov[vu_req->iovcnt], prot);
	if (spdk_likely(ret != NULL)) {
		vu_req->iovcnt++;
	}
	return ret;
}

static int
vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
		  struct iovec *iov, uint32_t length)
{
	/* Map the command's data buffers (PRP or SGL) from guest physical
	 * memory to process virtual addresses.
	 */
	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
			    length, 4096, _map_one);
}

static struct spdk_nvmf_request *
get_nvmf_req(struct nvmf_vfio_user_qpair *qp);

static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct spdk_nvmf_request *req);

/*
 * Posts a CQE in the completion queue.
 *
 * @ctrlr: the vfio-user controller
 * @cmd: the NVMe command for which the completion is posted
 * @cq: the completion queue
 * @cdw0: cdw0 as reported by NVMf
 * @sc: the NVMe CQE status code
 * @sct: the NVMe CQE status code type
 */
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
		struct nvme_q *cq, uint32_t cdw0, uint16_t sc,
		uint16_t sct)
{
	struct spdk_nvme_cpl *cpl;
	const struct spdk_nvmf_registers *regs;
	uint16_t qid;
	int err;

	assert(ctrlr != NULL);
	assert(cmd != NULL);

	if (spdk_unlikely(cq == NULL || cq->addr == NULL)) {
		return 0;
	}

	qid = io_q_id(cq);
	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
	if (regs->csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
		SPDK_DEBUGLOG(nvmf_vfio,
			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
			      ctrlr_id(ctrlr), qid, cmd->cid, sc);
		return 0;
	}

	if (cq_is_full(ctrlr, cq)) {
		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
			    ctrlr_id(ctrlr), qid, cq->tail, *hdbl(ctrlr, cq));
		return -1;
	}

	cpl = ((struct spdk_nvme_cpl *)cq->addr) + cq->tail;

	assert(ctrlr->qp[qid] != NULL);
	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
		      ctrlr_id(ctrlr), qid, cmd->cid, sc, ctrlr->qp[qid]->sq.head,
		      cq->tail);

	cpl->sqhd = ctrlr->qp[qid]->sq.head;
	cpl->cid = cmd->cid;
	cpl->cdw0 = cdw0;
	cpl->status.dnr = 0x0;
	cpl->status.m = 0x0;
	cpl->status.sct = sct;
	cpl->status.p = ~cpl->status.p;
	cpl->status.sc = sc;

	cq_tail_advance(cq);

	/*
	 * This function now executes in SPDK thread context; we might also be
	 * triggering interrupts from vfio-user thread context, so watch out
	 * for race conditions.
864 */ 865 if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) { 866 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 867 if (err != 0) { 868 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 869 ctrlr_id(ctrlr)); 870 return err; 871 } 872 } 873 874 return 0; 875 } 876 877 static struct nvme_q * 878 lookup_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, const uint16_t qid, const bool is_cq) 879 { 880 struct nvme_q *q; 881 882 assert(ctrlr != NULL); 883 884 if (qid > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) { 885 return NULL; 886 } 887 888 if (ctrlr->qp[qid] == NULL) { 889 return NULL; 890 } 891 892 if (is_cq) { 893 q = &ctrlr->qp[qid]->cq; 894 } else { 895 q = &ctrlr->qp[qid]->sq; 896 } 897 898 if (q->addr == NULL) { 899 return NULL; 900 } 901 902 return q; 903 } 904 905 static void 906 unmap_qp(struct nvmf_vfio_user_qpair *qp) 907 { 908 struct nvmf_vfio_user_ctrlr *ctrlr; 909 910 if (qp->ctrlr == NULL) { 911 return; 912 } 913 ctrlr = qp->ctrlr; 914 915 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap QP%d\n", 916 ctrlr_id(ctrlr), qp->qpair.qid); 917 918 if (qp->sq.addr != NULL) { 919 vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->sq.sg, &qp->sq.iov, 1); 920 qp->sq.addr = NULL; 921 } 922 923 if (qp->cq.addr != NULL) { 924 vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->cq.sg, &qp->cq.iov, 1); 925 qp->cq.addr = NULL; 926 } 927 } 928 929 static void 930 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 931 { 932 struct nvmf_vfio_user_qpair *qpair; 933 struct nvmf_vfio_user_req *vu_req; 934 uint32_t i; 935 936 if (ctrlr == NULL) { 937 return; 938 } 939 940 qpair = ctrlr->qp[qid]; 941 if (qpair == NULL) { 942 return; 943 } 944 945 SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy QP%d=%p\n", ctrlr_id(ctrlr), 946 qid, qpair); 947 948 unmap_qp(qpair); 949 950 for (i = 0; i < qpair->qsize; i++) { 951 vu_req = &qpair->reqs_internal[i]; 952 free(vu_req->sg); 953 } 954 free(qpair->reqs_internal); 955 956 free(qpair->sq.sg); 957 free(qpair->cq.sg); 958 free(qpair); 959 960 ctrlr->qp[qid] = NULL; 961 } 962 963 /* This function can only fail because of memory allocation errors. 
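 * It allocates the queue pair itself, one dma_sg_t for each of the SQ and
 * CQ, and a pool of qsize requests, each with room for
 * NVMF_VFIO_USER_MAX_IOVECS scatter-gather mappings.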
 */
static int
init_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
	const uint16_t qsize, const uint16_t id)
{
	uint16_t i;
	struct nvmf_vfio_user_qpair *qpair;
	struct nvmf_vfio_user_req *vu_req, *tmp;
	struct spdk_nvmf_request *req;

	assert(ctrlr != NULL);
	assert(transport != NULL);

	qpair = calloc(1, sizeof(*qpair));
	if (qpair == NULL) {
		return -ENOMEM;
	}
	qpair->sq.sg = calloc(1, dma_sg_size());
	if (qpair->sq.sg == NULL) {
		free(qpair);
		return -ENOMEM;
	}
	qpair->cq.sg = calloc(1, dma_sg_size());
	if (qpair->cq.sg == NULL) {
		free(qpair->sq.sg);
		free(qpair);
		return -ENOMEM;
	}

	qpair->qpair.qid = id;
	qpair->qpair.transport = transport;
	qpair->ctrlr = ctrlr;
	qpair->qsize = qsize;

	TAILQ_INIT(&qpair->reqs);

	qpair->reqs_internal = calloc(qsize, sizeof(struct nvmf_vfio_user_req));
	if (qpair->reqs_internal == NULL) {
		SPDK_ERRLOG("%s: error allocating reqs: %m\n", ctrlr_id(ctrlr));
		goto reqs_err;
	}

	for (i = 0; i < qsize; i++) {
		vu_req = &qpair->reqs_internal[i];
		vu_req->sg = calloc(NVMF_VFIO_USER_MAX_IOVECS, dma_sg_size());
		if (vu_req->sg == NULL) {
			goto sg_err;
		}

		req = &vu_req->req;
		req->qpair = &qpair->qpair;
		req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
		req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;

		TAILQ_INSERT_TAIL(&qpair->reqs, vu_req, link);
	}

	ctrlr->qp[id] = qpair;
	return 0;

sg_err:
	TAILQ_FOREACH_SAFE(vu_req, &qpair->reqs, link, tmp) {
		free(vu_req->sg);
	}
	free(qpair->reqs_internal);

reqs_err:
	free(qpair->sq.sg);
	free(qpair->cq.sg);
	free(qpair);
	return -ENOMEM;
}

/*
 * Creates a completion or submission I/O queue. Returns 0 on success, -errno
 * on error.
 *
 * XXX SPDK thread context.
 */
static int
handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
		   struct spdk_nvme_cmd *cmd, const bool is_cq)
{
	size_t entry_size;
	uint16_t qsize;
	uint16_t sc = SPDK_NVME_SC_SUCCESS;
	uint16_t sct = SPDK_NVME_SCT_GENERIC;
	int err = 0;
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvme_q *io_q;
	int prot;

	assert(ctrlr != NULL);
	assert(cmd != NULL);

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: create I/O %cQ%d: QSIZE=%#x\n", ctrlr_id(ctrlr),
		      is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid,
		      cmd->cdw10_bits.create_io_q.qsize);

	if (cmd->cdw10_bits.create_io_q.qid >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
		SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr),
			    cmd->cdw10_bits.create_io_q.qid,
			    NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR);
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
		goto out;
	}

	if (lookup_io_q(ctrlr, cmd->cdw10_bits.create_io_q.qid, is_cq)) {
		SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr),
			    is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid);
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
		goto out;
	}

	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
	if (qsize > max_queue_size(ctrlr)) {
		SPDK_ERRLOG("%s: queue too big, want=%d, max=%d\n", ctrlr_id(ctrlr),
			    qsize, max_queue_size(ctrlr));
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE;
		goto out;
	}

	if (is_cq) {
		err = init_qp(ctrlr, ctrlr->qp[0]->qpair.transport, qsize,
			      cmd->cdw10_bits.create_io_q.qid);
		if (err != 0) {
			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
			goto out;
		}

		io_q = &ctrlr->qp[cmd->cdw10_bits.create_io_q.qid]->cq;
		entry_size = sizeof(struct spdk_nvme_cpl);
		if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
			SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
			goto out;
		}
		io_q->ien = cmd->cdw11_bits.create_io_cq.ien;
		io_q->iv = cmd->cdw11_bits.create_io_cq.iv;
	} else {
		/* CQ must be created before SQ */
		if (!lookup_io_q(ctrlr, cmd->cdw11_bits.create_io_sq.cqid, true)) {
			SPDK_ERRLOG("%s: CQ%d does not exist\n", ctrlr_id(ctrlr),
				    cmd->cdw11_bits.create_io_sq.cqid);
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
			goto out;
		}

		io_q = &ctrlr->qp[cmd->cdw10_bits.create_io_q.qid]->sq;
		entry_size = sizeof(struct spdk_nvme_cmd);
		if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
			SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
			goto out;
		}

		io_q->cqid = cmd->cdw11_bits.create_io_sq.cqid;
		SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
			      cmd->cdw10_bits.create_io_q.qid, io_q->cqid);
	}

	io_q->is_cq = is_cq;
	io_q->size = qsize;
	prot = PROT_READ;
	if (is_cq) {
		prot |= PROT_WRITE;
	}
	io_q->addr = map_one(ctrlr->endpoint->vfu_ctx, cmd->dptr.prp.prp1,
			     io_q->size * entry_size, io_q->sg, &io_q->iov, prot);
	if (io_q->addr == NULL) {
		sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
		goto out;
	}
	io_q->prp1 = cmd->dptr.prp.prp1;
	memset(io_q->addr, 0, io_q->size * entry_size);

	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped %cQ%d IOVA=%#lx vaddr=%#llx\n",
		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
		      cmd->cdw10_bits.create_io_q.qid, cmd->dptr.prp.prp1,
		      (unsigned long long)io_q->addr);

	if (is_cq) {
		*hdbl(ctrlr, io_q) = 0;
	} else {
		/* After we return here, the next time nvmf_vfio_user_accept executes it will
		 * pick up this qpair and eventually call nvmf_vfio_user_poll_group_add, which
		 * calls spdk_nvmf_request_exec_fabrics with a generated fabrics connect command.
		 * That in turn calls handle_queue_connect_rsp, which is where we ultimately
		 * complete this command.
		 */
		vu_qpair = ctrlr->qp[cmd->cdw10_bits.create_io_q.qid];
		vu_qpair->create_io_sq_cmd = *cmd;
		TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, vu_qpair, link);
		*tdbl(ctrlr, io_q) = 0;
		return 0;
	}

out:
	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
}

/* For ADMIN I/O DELETE COMPLETION QUEUE the NVMf library will disconnect and free
 * the queue pair, so save the command in a context.
 */
struct vfio_user_delete_cq_ctx {
	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
	struct spdk_nvme_cmd delete_io_cq_cmd;
};

static void
vfio_user_qpair_delete_cb(void *cb_arg)
{
	struct vfio_user_delete_cq_ctx *ctx = cb_arg;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr;

	post_completion(vu_ctrlr, &ctx->delete_io_cq_cmd, &vu_ctrlr->qp[0]->cq, 0,
			SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
	free(ctx);
}

/*
 * Deletes a completion or submission I/O queue.
 */
static int
handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
		struct spdk_nvme_cmd *cmd, const bool is_cq)
{
	uint16_t sct = SPDK_NVME_SCT_GENERIC;
	uint16_t sc = SPDK_NVME_SC_SUCCESS;
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct vfio_user_delete_cq_ctx *ctx;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
		      cmd->cdw10_bits.delete_io_q.qid);

	if (lookup_io_q(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq) == NULL) {
		SPDK_ERRLOG("%s: %cQ%d does not exist\n", ctrlr_id(ctrlr),
			    is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
		goto out;
	}

	vu_qpair = ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid];
	if (is_cq) {
		/* SQ must have been deleted first */
		if (vu_qpair->state != VFIO_USER_QPAIR_DELETED) {
			SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
			goto out;
		}
		ctx = calloc(1, sizeof(*ctx));
		if (!ctx) {
			sct = SPDK_NVME_SCT_GENERIC;
			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
			goto out;
		}
		ctx->vu_ctrlr = ctrlr;
		ctx->delete_io_cq_cmd = *cmd;
		spdk_nvmf_qpair_disconnect(&vu_qpair->qpair, vfio_user_qpair_delete_cb, ctx);
		return 0;
	} else {
		/*
		 * This doesn't actually delete the SQ; we're merely telling the poll_group_poll
		 * function to skip checking this SQ. The queue pair will be disconnected by the
		 * Delete IO CQ command.
		 */
		assert(vu_qpair->state == VFIO_USER_QPAIR_ACTIVE);
		vu_qpair->state = VFIO_USER_QPAIR_DELETED;
	}

out:
	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
}

/*
 * Returns 0 on success and -errno on error.
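 *
 * Queue create/delete opcodes are handled locally by handle_create_io_q()
 * and handle_del_io_q(); every other admin command is forwarded to the
 * generic NVMf layer via handle_cmd_req() using a request taken from the
 * admin queue pair.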
1248 * 1249 * XXX SPDK thread context 1250 */ 1251 static int 1252 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 1253 { 1254 assert(ctrlr != NULL); 1255 assert(cmd != NULL); 1256 1257 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle admin req opc=%#x cid=%d\n", 1258 ctrlr_id(ctrlr), cmd->opc, cmd->cid); 1259 1260 switch (cmd->opc) { 1261 case SPDK_NVME_OPC_CREATE_IO_CQ: 1262 case SPDK_NVME_OPC_CREATE_IO_SQ: 1263 return handle_create_io_q(ctrlr, cmd, 1264 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 1265 case SPDK_NVME_OPC_DELETE_IO_SQ: 1266 case SPDK_NVME_OPC_DELETE_IO_CQ: 1267 return handle_del_io_q(ctrlr, cmd, 1268 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 1269 default: 1270 return handle_cmd_req(ctrlr, cmd, get_nvmf_req(ctrlr->qp[0])); 1271 } 1272 } 1273 1274 static int 1275 handle_cmd_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 1276 { 1277 struct nvmf_vfio_user_qpair *qpair = cb_arg; 1278 1279 assert(qpair != NULL); 1280 assert(req != NULL); 1281 1282 vfu_unmap_sg(qpair->ctrlr->endpoint->vfu_ctx, req->sg, req->iov, req->iovcnt); 1283 1284 return post_completion(qpair->ctrlr, &req->req.cmd->nvme_cmd, 1285 &qpair->ctrlr->qp[req->req.qpair->qid]->cq, 1286 req->req.rsp->nvme_cpl.cdw0, 1287 req->req.rsp->nvme_cpl.status.sc, 1288 req->req.rsp->nvme_cpl.status.sct); 1289 } 1290 1291 static int 1292 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair, 1293 struct spdk_nvme_cmd *cmd) 1294 { 1295 assert(qpair != NULL); 1296 if (nvmf_qpair_is_admin_queue(&qpair->qpair)) { 1297 return consume_admin_cmd(ctrlr, cmd); 1298 } 1299 1300 return handle_cmd_req(ctrlr, cmd, get_nvmf_req(qpair)); 1301 } 1302 1303 static ssize_t 1304 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 1305 struct nvmf_vfio_user_qpair *qpair) 1306 { 1307 struct spdk_nvme_cmd *queue; 1308 1309 assert(ctrlr != NULL); 1310 assert(qpair != NULL); 1311 1312 queue = qpair->sq.addr; 1313 while (sq_head(qpair) != new_tail) { 1314 int err; 1315 struct spdk_nvme_cmd *cmd = &queue[sq_head(qpair)]; 1316 1317 /* 1318 * SQHD must contain the new head pointer, so we must increase 1319 * it before we generate a completion. 1320 */ 1321 sqhd_advance(ctrlr, qpair); 1322 1323 err = consume_cmd(ctrlr, qpair, cmd); 1324 if (err != 0) { 1325 return err; 1326 } 1327 } 1328 1329 return 0; 1330 } 1331 1332 static int 1333 map_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1334 { 1335 int err; 1336 1337 assert(ctrlr != NULL); 1338 1339 err = acq_map(ctrlr); 1340 if (err != 0) { 1341 return err; 1342 } 1343 1344 err = asq_map(ctrlr); 1345 if (err != 0) { 1346 return err; 1347 } 1348 1349 return 0; 1350 } 1351 1352 static void 1353 unmap_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1354 { 1355 assert(ctrlr->qp[0] != NULL); 1356 1357 unmap_qp(ctrlr->qp[0]); 1358 } 1359 1360 static void 1361 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1362 { 1363 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1364 struct nvmf_vfio_user_ctrlr *ctrlr; 1365 struct nvmf_vfio_user_qpair *qpair; 1366 int i, ret; 1367 1368 /* 1369 * We're not interested in any DMA regions that aren't mappable (we don't 1370 * support clients that don't share their memory). 
1371 */ 1372 if (!info->vaddr) { 1373 return; 1374 } 1375 1376 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 1377 (info->mapping.iov_len & MASK_2MB)) { 1378 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr, 1379 (uintptr_t)info->mapping.iov_base, 1380 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1381 return; 1382 } 1383 1384 assert(endpoint != NULL); 1385 if (endpoint->ctrlr == NULL) { 1386 return; 1387 } 1388 ctrlr = endpoint->ctrlr; 1389 1390 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %#lx-%#lx\n", ctrlr_id(ctrlr), 1391 (uintptr_t)info->mapping.iov_base, 1392 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1393 1394 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 1395 * check the protection bits before registering. 1396 */ 1397 if ((info->prot == (PROT_WRITE | PROT_READ)) && 1398 (spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len))) { 1399 SPDK_ERRLOG("Memory region register %#lx-%#lx failed\n", 1400 (uint64_t)(uintptr_t)info->mapping.iov_base, 1401 (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1402 } 1403 1404 for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 1405 qpair = ctrlr->qp[i]; 1406 if (qpair == NULL) { 1407 continue; 1408 } 1409 1410 if (qpair->state != VFIO_USER_QPAIR_INACTIVE) { 1411 continue; 1412 } 1413 1414 if (nvmf_qpair_is_admin_queue(&qpair->qpair)) { 1415 ret = map_admin_queue(ctrlr); 1416 if (ret) { 1417 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap Admin queue\n"); 1418 continue; 1419 } 1420 qpair->state = VFIO_USER_QPAIR_ACTIVE; 1421 SPDK_DEBUGLOG(nvmf_vfio, "Remap Admin queue\n"); 1422 } else { 1423 struct nvme_q *sq = &qpair->sq; 1424 struct nvme_q *cq = &qpair->cq; 1425 1426 sq->addr = map_one(ctrlr->endpoint->vfu_ctx, sq->prp1, sq->size * 64, sq->sg, &sq->iov, 1427 PROT_READ | PROT_WRITE); 1428 if (!sq->addr) { 1429 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n", 1430 i, sq->prp1, sq->prp1 + sq->size * 64); 1431 continue; 1432 } 1433 cq->addr = map_one(ctrlr->endpoint->vfu_ctx, cq->prp1, cq->size * 16, cq->sg, &cq->iov, 1434 PROT_READ | PROT_WRITE); 1435 if (!cq->addr) { 1436 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n", 1437 i, cq->prp1, cq->prp1 + cq->size * 16); 1438 continue; 1439 } 1440 qpair->state = VFIO_USER_QPAIR_ACTIVE; 1441 SPDK_DEBUGLOG(nvmf_vfio, "Remap IO QP%u\n", i); 1442 } 1443 } 1444 } 1445 1446 static int 1447 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1448 { 1449 1450 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1451 struct nvmf_vfio_user_ctrlr *ctrlr; 1452 struct nvmf_vfio_user_qpair *qpair; 1453 void *map_start, *map_end; 1454 int i; 1455 1456 if (!info->vaddr) { 1457 return 0; 1458 } 1459 1460 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 1461 (info->mapping.iov_len & MASK_2MB)) { 1462 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr, 1463 (uintptr_t)info->mapping.iov_base, 1464 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1465 return 0; 1466 } 1467 1468 assert(endpoint != NULL); 1469 if (endpoint->ctrlr == NULL) { 1470 return 0; 1471 } 1472 ctrlr = endpoint->ctrlr; 1473 1474 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %#lx-%#lx\n", ctrlr_id(ctrlr), 1475 (uintptr_t)info->mapping.iov_base, 1476 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1477 1478 if ((info->prot == (PROT_WRITE 
| PROT_READ)) && 1479 (spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len))) { 1480 SPDK_ERRLOG("Memory region unregister %#lx-%#lx failed\n", 1481 (uint64_t)(uintptr_t)info->mapping.iov_base, 1482 (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1483 } 1484 1485 map_start = info->mapping.iov_base; 1486 map_end = info->mapping.iov_base + info->mapping.iov_len; 1487 for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 1488 qpair = ctrlr->qp[i]; 1489 if (qpair == NULL) { 1490 continue; 1491 } 1492 1493 if ((qpair->cq.addr >= map_start && qpair->cq.addr < map_end) || 1494 (qpair->sq.addr >= map_start && qpair->sq.addr < map_end)) { 1495 unmap_qp(qpair); 1496 qpair->state = VFIO_USER_QPAIR_INACTIVE; 1497 } 1498 } 1499 1500 return 0; 1501 } 1502 1503 static int 1504 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 1505 { 1506 struct nvmf_vfio_user_qpair *vu_qpair = cb_arg; 1507 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 1508 bool unmap_admin = false; 1509 int ret; 1510 1511 assert(vu_qpair != NULL); 1512 assert(req != NULL); 1513 1514 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 1515 assert(vu_qpair->ctrlr != NULL); 1516 assert(req != NULL); 1517 1518 memcpy(req->req.data, 1519 &req->req.rsp->prop_get_rsp.value.u64, 1520 req->req.length); 1521 } else { 1522 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 1523 assert(vu_qpair->ctrlr != NULL); 1524 vu_ctrlr = vu_qpair->ctrlr; 1525 1526 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 1527 union spdk_nvme_cc_register cc, diff; 1528 1529 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 1530 diff.raw = cc.raw ^ req->cc.raw; 1531 1532 if (diff.bits.en) { 1533 if (cc.bits.en) { 1534 SPDK_DEBUGLOG(nvmf_vfio, "%s: MAP Admin queue\n", ctrlr_id(vu_ctrlr)); 1535 ret = map_admin_queue(vu_ctrlr); 1536 if (ret) { 1537 SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(vu_ctrlr)); 1538 return ret; 1539 } 1540 vu_qpair->state = VFIO_USER_QPAIR_ACTIVE; 1541 } else { 1542 unmap_admin = true; 1543 } 1544 } 1545 1546 if (diff.bits.shn) { 1547 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 1548 unmap_admin = true; 1549 } 1550 } 1551 1552 if (unmap_admin) { 1553 SPDK_DEBUGLOG(nvmf_vfio, 1554 "%s: UNMAP Admin queue\n", 1555 ctrlr_id(vu_ctrlr)); 1556 unmap_admin_queue(vu_ctrlr); 1557 vu_qpair->state = VFIO_USER_QPAIR_INACTIVE; 1558 /* For PCIe controller reset or shutdown, we will drop all AER responses */ 1559 nvmf_ctrlr_abort_aer(vu_qpair->qpair.ctrlr); 1560 } 1561 } 1562 } 1563 1564 return 0; 1565 } 1566 1567 static int 1568 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 1569 const size_t count, loff_t pos, const bool is_write) 1570 { 1571 assert(ctrlr != NULL); 1572 assert(buf != NULL); 1573 1574 if (count != sizeof(uint32_t)) { 1575 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 1576 ctrlr_id(ctrlr), count); 1577 errno = EINVAL; 1578 return -1; 1579 } 1580 1581 pos -= NVMF_VFIO_USER_DOORBELLS_OFFSET; 1582 1583 /* pos must be dword aligned */ 1584 if ((pos & 0x3) != 0) { 1585 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 1586 errno = EINVAL; 1587 return -1; 1588 } 1589 1590 /* convert byte offset to array index */ 1591 pos >>= 2; 1592 1593 if (pos > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR * 2) { 1594 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 1595 errno = EINVAL; 1596 return 
-1; 1597 } 1598 1599 if (is_write) { 1600 ctrlr->doorbells[pos] = *buf; 1601 spdk_wmb(); 1602 } else { 1603 spdk_rmb(); 1604 *buf = ctrlr->doorbells[pos]; 1605 } 1606 return 0; 1607 } 1608 1609 static ssize_t 1610 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 1611 bool is_write) 1612 { 1613 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1614 struct nvmf_vfio_user_ctrlr *ctrlr; 1615 struct nvmf_vfio_user_req *req; 1616 const struct spdk_nvmf_registers *regs; 1617 int ret; 1618 1619 ctrlr = endpoint->ctrlr; 1620 1621 SPDK_DEBUGLOG(nvmf_vfio, 1622 "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n", 1623 endpoint_id(endpoint), is_write ? "write" : "read", 1624 ctrlr, count, pos); 1625 1626 if (pos >= NVMF_VFIO_USER_DOORBELLS_OFFSET) { 1627 /* 1628 * The fact that the doorbells can be memory mapped doesn't mean 1629 * that the client (VFIO in QEMU) is obliged to memory map them, 1630 * it might still elect to access them via regular read/write; 1631 * we might also have had disable_mappable_bar0 set. 1632 */ 1633 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 1634 pos, is_write); 1635 if (ret == 0) { 1636 return count; 1637 } 1638 return ret; 1639 } 1640 1641 /* Construct a Fabric Property Get/Set command and send it */ 1642 req = get_nvmf_vfio_user_req(ctrlr->qp[0]); 1643 if (req == NULL) { 1644 errno = ENOBUFS; 1645 return -1; 1646 } 1647 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 1648 req->cc.raw = regs->cc.raw; 1649 1650 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 1651 req->cb_arg = ctrlr->qp[0]; 1652 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 1653 req->req.cmd->prop_set_cmd.cid = 0; 1654 req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1; 1655 req->req.cmd->prop_set_cmd.ofst = pos; 1656 if (is_write) { 1657 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 1658 if (req->req.cmd->prop_set_cmd.attrib.size) { 1659 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 1660 } else { 1661 req->req.cmd->prop_set_cmd.value.u32.high = 0; 1662 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 1663 } 1664 } else { 1665 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 1666 } 1667 req->req.length = count; 1668 req->req.data = buf; 1669 1670 spdk_nvmf_request_exec_fabrics(&req->req); 1671 1672 return count; 1673 } 1674 1675 /* 1676 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 1677 * available on PCI-X 2.0 and PCI Express buses 1678 */ 1679 static ssize_t 1680 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 1681 bool is_write) 1682 { 1683 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1684 1685 if (is_write) { 1686 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 1687 endpoint_id(endpoint), offset, offset + count); 1688 errno = EINVAL; 1689 return -1; 1690 } 1691 1692 if (offset + count > PCI_CFG_SPACE_EXP_SIZE) { 1693 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 1694 endpoint_id(endpoint), offset, count, 1695 PCI_CFG_SPACE_EXP_SIZE); 1696 errno = ERANGE; 1697 return -1; 1698 } 1699 1700 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 1701 1702 return count; 1703 } 1704 1705 static void 1706 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 1707 { 1708 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1709 1710 if (level >= LOG_DEBUG) { 1711 
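		/*
		 * syslog severities are numerically inverted (LOG_EMERG is 0,
		 * LOG_DEBUG is 7), so ">= LOG_DEBUG" matches only the least
		 * severe messages and each branch below maps a lower value to
		 * a more severe SPDK log level.
		 */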
SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 1712 } else if (level >= LOG_INFO) { 1713 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 1714 } else if (level >= LOG_NOTICE) { 1715 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 1716 } else if (level >= LOG_WARNING) { 1717 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 1718 } else { 1719 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 1720 } 1721 } 1722 1723 static void 1724 init_pci_config_space(vfu_pci_config_space_t *p) 1725 { 1726 /* MLBAR */ 1727 p->hdr.bars[0].raw = 0x0; 1728 /* MUBAR */ 1729 p->hdr.bars[1].raw = 0x0; 1730 1731 /* vendor specific, let's set them to zero for now */ 1732 p->hdr.bars[3].raw = 0x0; 1733 p->hdr.bars[4].raw = 0x0; 1734 p->hdr.bars[5].raw = 0x0; 1735 1736 /* enable INTx */ 1737 p->hdr.intr.ipin = 0x1; 1738 } 1739 1740 static int 1741 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 1742 struct nvmf_vfio_user_endpoint *endpoint) 1743 { 1744 int ret; 1745 ssize_t cap_offset; 1746 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 1747 1748 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 1749 struct pxcap pxcap = { 1750 .hdr.id = PCI_CAP_ID_EXP, 1751 .pxcaps.ver = 0x2, 1752 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 1753 .pxdcap2.ctds = 0x1 1754 }; 1755 1756 struct msixcap msixcap = { 1757 .hdr.id = PCI_CAP_ID_MSIX, 1758 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 1759 .mtab = {.tbir = 0x4, .to = 0x0}, 1760 .mpba = {.pbir = 0x5, .pbao = 0x0} 1761 }; 1762 1763 static struct iovec sparse_mmap[] = { 1764 { 1765 .iov_base = (void *)NVMF_VFIO_USER_DOORBELLS_OFFSET, 1766 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 1767 }, 1768 }; 1769 1770 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 1771 if (ret < 0) { 1772 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 1773 return ret; 1774 } 1775 vfu_pci_set_id(vfu_ctx, 0x4e58, 0x0001, 0, 0); 1776 /* 1777 * 0x02, controller uses the NVM Express programming interface 1778 * 0x08, non-volatile memory controller 1779 * 0x01, mass storage controller 1780 */ 1781 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 1782 1783 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 1784 if (cap_offset < 0) { 1785 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 1786 return ret; 1787 } 1788 1789 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 1790 if (cap_offset < 0) { 1791 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 1792 return ret; 1793 } 1794 1795 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 1796 if (cap_offset < 0) { 1797 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 1798 return ret; 1799 } 1800 1801 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 1802 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 1803 if (ret < 0) { 1804 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 1805 return ret; 1806 } 1807 1808 if (vu_transport->transport_opts.disable_mappable_bar0) { 1809 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 1810 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 1811 NULL, 0, -1, 0); 1812 } else { 1813 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 1814 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 1815 sparse_mmap, 1, endpoint->fd, 0); 1816 } 1817 1818 if (ret < 0) { 1819 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 1820 return ret; 1821 } 1822 1823 ret = 
vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, PAGE_SIZE, 1824 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 1825 if (ret < 0) { 1826 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 1827 return ret; 1828 } 1829 1830 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, PAGE_SIZE, 1831 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 1832 if (ret < 0) { 1833 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 1834 return ret; 1835 } 1836 1837 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 1838 if (ret < 0) { 1839 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 1840 return ret; 1841 } 1842 1843 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 1844 if (ret < 0) { 1845 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 1846 return ret; 1847 } 1848 1849 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 1850 if (ret < 0) { 1851 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 1852 return ret; 1853 } 1854 1855 ret = vfu_realize_ctx(vfu_ctx); 1856 if (ret < 0) { 1857 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 1858 return ret; 1859 } 1860 1861 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 1862 assert(endpoint->pci_config_space != NULL); 1863 init_pci_config_space(endpoint->pci_config_space); 1864 1865 assert(cap_offset != 0); 1866 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 1867 1868 return 0; 1869 } 1870 1871 static void 1872 _free_ctrlr(void *ctx) 1873 { 1874 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 1875 int i; 1876 1877 for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 1878 free_qp(ctrlr, i); 1879 } 1880 1881 spdk_poller_unregister(&ctrlr->mmio_poller); 1882 free(ctrlr); 1883 } 1884 1885 static void 1886 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 1887 { 1888 assert(ctrlr != NULL); 1889 1890 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 1891 1892 if (ctrlr->thread == spdk_get_thread()) { 1893 _free_ctrlr(ctrlr); 1894 } else { 1895 spdk_thread_send_msg(ctrlr->thread, _free_ctrlr, ctrlr); 1896 } 1897 } 1898 1899 static void 1900 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 1901 struct nvmf_vfio_user_endpoint *endpoint) 1902 { 1903 struct nvmf_vfio_user_ctrlr *ctrlr; 1904 int err; 1905 1906 /* First, construct a vfio-user CUSTOM transport controller */ 1907 ctrlr = calloc(1, sizeof(*ctrlr)); 1908 if (ctrlr == NULL) { 1909 err = -ENOMEM; 1910 goto out; 1911 } 1912 ctrlr->cntlid = 0xffff; 1913 ctrlr->transport = transport; 1914 ctrlr->endpoint = endpoint; 1915 ctrlr->doorbells = endpoint->doorbells; 1916 1917 /* Then, construct an admin queue pair */ 1918 err = init_qp(ctrlr, &transport->transport, NVMF_VFIO_USER_DEFAULT_AQ_DEPTH, 0); 1919 if (err != 0) { 1920 goto out; 1921 } 1922 endpoint->ctrlr = ctrlr; 1923 1924 /* Notify the generic layer about the new admin queue pair */ 1925 TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, ctrlr->qp[0], link); 1926 1927 out: 1928 if (err != 0) { 1929 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 1930 endpoint_id(endpoint), strerror(-err)); 1931 free_ctrlr(ctrlr); 1932 } 1933 } 1934 1935 static int 1936 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 1937 const struct spdk_nvme_transport_id *trid, 1938 struct spdk_nvmf_listen_opts *listen_opts) 1939 { 1940 struct nvmf_vfio_user_transport *vu_transport; 1941 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 
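	/*
	 * Each listener is backed by two files under the traddr directory:
	 * "bar0", which is ftruncate'd and mmap'd below so the doorbell page
	 * can be shared with the client, and "cntrl", the vfio-user socket
	 * created by vfu_create_ctx().
	 */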
1942 char *path = NULL; 1943 char uuid[PATH_MAX] = {}; 1944 int fd; 1945 int err; 1946 1947 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1948 transport); 1949 1950 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1951 /* Only compare traddr */ 1952 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 1953 return -EEXIST; 1954 } 1955 } 1956 1957 endpoint = calloc(1, sizeof(*endpoint)); 1958 if (!endpoint) { 1959 return -ENOMEM; 1960 } 1961 1962 endpoint->fd = -1; 1963 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 1964 1965 err = asprintf(&path, "%s/bar0", endpoint_id(endpoint)); 1966 if (err == -1) { 1967 goto out; 1968 } 1969 1970 fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); 1971 if (fd == -1) { 1972 SPDK_ERRLOG("%s: failed to open device memory at %s: %m\n", 1973 endpoint_id(endpoint), path); 1974 err = fd; 1975 free(path); 1976 goto out; 1977 } 1978 free(path); 1979 1980 endpoint->fd = fd; 1981 err = ftruncate(fd, NVMF_VFIO_USER_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 1982 if (err != 0) { 1983 goto out; 1984 } 1985 1986 endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 1987 PROT_READ | PROT_WRITE, MAP_SHARED, fd, NVMF_VFIO_USER_DOORBELLS_OFFSET); 1988 if (endpoint->doorbells == MAP_FAILED) { 1989 endpoint->doorbells = NULL; 1990 err = -errno; 1991 goto out; 1992 } 1993 1994 snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 1995 1996 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 1997 endpoint, VFU_DEV_TYPE_PCI); 1998 if (endpoint->vfu_ctx == NULL) { 1999 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 2000 endpoint_id(endpoint)); 2001 err = -1; 2002 goto out; 2003 } 2004 vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, LOG_DEBUG); 2005 2006 err = vfio_user_dev_info_fill(vu_transport, endpoint); 2007 if (err < 0) { 2008 goto out; 2009 } 2010 2011 pthread_mutex_init(&endpoint->lock, NULL); 2012 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 2013 SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells); 2014 2015 out: 2016 if (err != 0) { 2017 nvmf_vfio_user_destroy_endpoint(endpoint); 2018 } 2019 2020 return err; 2021 } 2022 2023 static void 2024 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 2025 const struct spdk_nvme_transport_id *trid) 2026 { 2027 struct nvmf_vfio_user_transport *vu_transport; 2028 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 2029 2030 assert(trid != NULL); 2031 assert(trid->traddr != NULL); 2032 2033 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 2034 2035 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 2036 transport); 2037 2038 pthread_mutex_lock(&vu_transport->lock); 2039 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 2040 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 2041 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 2042 if (endpoint->ctrlr) { 2043 free_ctrlr(endpoint->ctrlr); 2044 } 2045 nvmf_vfio_user_destroy_endpoint(endpoint); 2046 pthread_mutex_unlock(&vu_transport->lock); 2047 2048 return; 2049 } 2050 } 2051 pthread_mutex_unlock(&vu_transport->lock); 2052 2053 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 2054 } 2055 2056 static void 2057 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 2058 struct spdk_nvmf_subsystem *subsystem, 2059 struct spdk_nvmf_ctrlr_data *cdata) 
static void
nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport,
			  struct spdk_nvmf_subsystem *subsystem,
			  struct spdk_nvmf_ctrlr_data *cdata)
{
	memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls));
	cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED;
}

static int
nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
				const struct spdk_nvmf_subsystem *subsystem,
				const struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint;

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);

	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
			break;
		}
	}

	if (endpoint == NULL) {
		return -ENOENT;
	}

	endpoint->subsystem = subsystem;

	return 0;
}

/*
 * Executed periodically.
 *
 * XXX SPDK thread context.
 */
static uint32_t
nvmf_vfio_user_accept(struct spdk_nvmf_transport *transport)
{
	int err;
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_qpair *qp, *tmp_qp;
	struct nvmf_vfio_user_endpoint *endpoint;

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);

	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
		/* try to attach a new controller */
		if (endpoint->ctrlr != NULL) {
			continue;
		}

		err = vfu_attach_ctx(endpoint->vfu_ctx);
		if (err != 0) {
			if (errno == EAGAIN || errno == EWOULDBLOCK) {
				continue;
			}

			pthread_mutex_unlock(&vu_transport->lock);
			return -EFAULT;
		}

		/* Construct a controller */
		nvmf_vfio_user_create_ctrlr(vu_transport, endpoint);
	}

	TAILQ_FOREACH_SAFE(qp, &vu_transport->new_qps, link, tmp_qp) {
		TAILQ_REMOVE(&vu_transport->new_qps, qp, link);
		spdk_nvmf_tgt_new_qpair(transport->tgt, &qp->qpair);
	}

	pthread_mutex_unlock(&vu_transport->lock);

	return 0;
}

static void
nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport,
			struct spdk_nvme_transport_id *trid,
			struct spdk_nvmf_discovery_log_page_entry *entry)
{ }

static struct spdk_nvmf_transport_poll_group *
nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport)
{
	struct nvmf_vfio_user_poll_group *vu_group;

	SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n");

	vu_group = calloc(1, sizeof(*vu_group));
	if (vu_group == NULL) {
		SPDK_ERRLOG("Error allocating poll group: %m\n");
		return NULL;
	}

	TAILQ_INIT(&vu_group->qps);

	return &vu_group->group;
}

/* called when process exits */
static void
nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;

	SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n");

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	free(vu_group);
}

static void
vfio_user_qpair_disconnect_cb(void *ctx)
{
	struct nvmf_vfio_user_endpoint *endpoint = ctx;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	pthread_mutex_lock(&endpoint->lock);
	ctrlr = endpoint->ctrlr;
	if (!ctrlr) {
		pthread_mutex_unlock(&endpoint->lock);
		return;
	}

	if (!ctrlr->num_connected_qps) {
		endpoint->ctrlr = NULL;
		free_ctrlr(ctrlr);
		pthread_mutex_unlock(&endpoint->lock);
		return;
	}
	pthread_mutex_unlock(&endpoint->lock);
}
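/*
 * Controller teardown, triggered when vfu_run_ctx() reports ENOTCONN (see
 * vfio_user_poll_mmio() below): if no queue pairs are connected, the
 * controller is freed immediately; otherwise every queue pair is
 * disconnected and vfio_user_qpair_disconnect_cb() above frees the
 * controller once the last disconnect has dropped num_connected_qps to
 * zero (the counter is decremented in nvmf_vfio_user_poll_group_remove()).
 */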
static int
vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	uint32_t i;
	struct nvmf_vfio_user_qpair *qpair;
	struct nvmf_vfio_user_endpoint *endpoint;

	SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr));

	endpoint = ctrlr->endpoint;
	assert(endpoint != NULL);

	pthread_mutex_lock(&endpoint->lock);
	if (ctrlr->num_connected_qps == 0) {
		endpoint->ctrlr = NULL;
		free_ctrlr(ctrlr);
		pthread_mutex_unlock(&endpoint->lock);
		return 0;
	}
	pthread_mutex_unlock(&endpoint->lock);

	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
		qpair = ctrlr->qp[i];
		if (qpair == NULL) {
			continue;
		}
		spdk_nvmf_qpair_disconnect(&qpair->qpair, vfio_user_qpair_disconnect_cb, endpoint);
	}

	return 0;
}

static int
vfio_user_poll_mmio(void *ctx)
{
	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
	int ret;

	assert(ctrlr != NULL);

	/* This will call access_bar0_fn() if there are any accesses to the
	 * portion of BAR0 that is not mmap'd.
	 */
	ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
	if (spdk_unlikely(ret != 0)) {
		spdk_poller_unregister(&ctrlr->mmio_poller);

		/* initiator shut down or reset; wait for it to re-connect */
		if (errno == ENOTCONN) {
			vfio_user_destroy_ctrlr(ctrlr);
			return SPDK_POLLER_BUSY;
		}

		fail_ctrlr(ctrlr);
	}

	return SPDK_POLLER_BUSY;
}
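/*
 * Completion callback for the internal Fabrics CONNECT command issued by
 * nvmf_vfio_user_poll_group_add() below. On success the queue pair is
 * marked active and added to its poll group. For the admin queue this is
 * also where the controller's SPDK thread is recorded and the MMIO poller
 * is registered; for I/O queues, the guest's original Create I/O Submission
 * Queue command (saved in create_io_sq_cmd) is completed here.
 */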
static int
handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_qpair *qpair = cb_arg;
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct nvmf_vfio_user_endpoint *endpoint;

	assert(qpair != NULL);
	assert(req != NULL);

	ctrlr = qpair->ctrlr;
	assert(ctrlr != NULL);

	endpoint = ctrlr->endpoint;
	assert(endpoint != NULL);

	if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
		SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct);
		endpoint->ctrlr = NULL;
		free_ctrlr(ctrlr);
		return -1;
	}

	vu_group = SPDK_CONTAINEROF(qpair->group, struct nvmf_vfio_user_poll_group, group);
	TAILQ_INSERT_TAIL(&vu_group->qps, qpair, link);
	qpair->state = VFIO_USER_QPAIR_ACTIVE;

	pthread_mutex_lock(&endpoint->lock);
	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
		ctrlr->cntlid = qpair->qpair.ctrlr->cntlid;
		ctrlr->thread = spdk_get_thread();
		ctrlr->mmio_poller = SPDK_POLLER_REGISTER(vfio_user_poll_mmio, ctrlr, 0);
	} else {
		/* For I/O queues this command was generated in response to a
		 * Create I/O Submission Queue admin command which has not yet
		 * been completed. Complete it now.
		 */
		post_completion(ctrlr, &qpair->create_io_sq_cmd, &ctrlr->qp[0]->cq, 0,
				SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
	}
	ctrlr->num_connected_qps++;
	pthread_mutex_unlock(&endpoint->lock);

	free(req->req.data);
	req->req.data = NULL;

	return 0;
}

/*
 * Add the given qpair to the given poll group. New qpairs are added to
 * ->new_qps; they are processed via nvmf_vfio_user_accept(), calling
 * spdk_nvmf_tgt_new_qpair(), which picks a poll group, then calls back
 * here via nvmf_transport_poll_group_add().
 */
static int
nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
			      struct spdk_nvmf_qpair *qpair)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct spdk_nvmf_request *req;
	struct spdk_nvmf_fabric_connect_data *data;
	bool admin;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	vu_qpair->group = group;
	ctrlr = vu_qpair->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
		      ctrlr_id(ctrlr), vu_qpair->qpair.qid,
		      vu_qpair, qpair, group);

	admin = nvmf_qpair_is_admin_queue(&vu_qpair->qpair);

	vu_req = get_nvmf_vfio_user_req(vu_qpair);
	if (vu_req == NULL) {
		return -1;
	}

	req = &vu_req->req;
	req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
	req->cmd->connect_cmd.cid = 0;
	req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
	req->cmd->connect_cmd.recfmt = 0;
	req->cmd->connect_cmd.sqsize = vu_qpair->qsize - 1;
	req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;

	req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
	req->data = calloc(1, req->length);
	if (req->data == NULL) {
		nvmf_vfio_user_req_free(req);
		return -ENOMEM;
	}

	data = (struct spdk_nvmf_fabric_connect_data *)req->data;
	data->cntlid = admin ? 0xFFFF : ctrlr->cntlid;
	snprintf(data->subnqn, sizeof(data->subnqn), "%s",
		 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));

	vu_req->cb_fn = handle_queue_connect_rsp;
	vu_req->cb_arg = vu_qpair;

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
		      ctrlr_id(ctrlr), qpair->qid, data->cntlid);

	spdk_nvmf_request_exec_fabrics(req);
	return 0;
}

static int
nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
				 struct spdk_nvmf_qpair *qpair)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
	struct nvmf_vfio_user_endpoint *endpoint;
	struct nvmf_vfio_user_poll_group *vu_group;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	vu_ctrlr = vu_qpair->ctrlr;
	endpoint = vu_ctrlr->endpoint;

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
		      ctrlr_id(vu_qpair->ctrlr), qpair->qid, qpair, group);

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
	TAILQ_REMOVE(&vu_group->qps, vu_qpair, link);

	pthread_mutex_lock(&endpoint->lock);
	assert(vu_ctrlr->num_connected_qps);
	vu_ctrlr->num_connected_qps--;
	pthread_mutex_unlock(&endpoint->lock);

	return 0;
}

static void
_nvmf_vfio_user_req_free(struct nvmf_vfio_user_qpair *vu_qpair, struct nvmf_vfio_user_req *vu_req)
{
	memset(&vu_req->cmd, 0, sizeof(vu_req->cmd));
	memset(&vu_req->rsp, 0, sizeof(vu_req->rsp));
	vu_req->iovcnt = 0;
	vu_req->state = VFIO_USER_REQUEST_STATE_FREE;

	TAILQ_INSERT_TAIL(&vu_qpair->reqs, vu_req, link);
}
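/* Transport .req_free callback: return a request to its queue pair's free
 * list without running its completion callback.
 */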
static int
nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req;

	assert(req != NULL);

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);

	_nvmf_vfio_user_req_free(vu_qpair, vu_req);

	return 0;
}

static int
nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req;

	assert(req != NULL);

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);

	if (vu_req->cb_fn != NULL) {
		if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) {
			fail_ctrlr(vu_qpair->ctrlr);
		}
	}

	_nvmf_vfio_user_req_free(vu_qpair, vu_req);

	return 0;
}

static void
nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
			   spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
{
	struct nvmf_vfio_user_qpair *vu_qpair;

	assert(qpair != NULL);
	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	free_qp(vu_qpair->ctrlr, qpair->qid);

	if (cb_fn) {
		cb_fn(cb_arg);
	}
}

/**
 * Returns a preallocated request, or NULL if there isn't one available.
 */
static struct nvmf_vfio_user_req *
get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair)
{
	struct nvmf_vfio_user_req *req;

	assert(qpair != NULL);

	if (TAILQ_EMPTY(&qpair->reqs)) {
		return NULL;
	}

	req = TAILQ_FIRST(&qpair->reqs);
	TAILQ_REMOVE(&qpair->reqs, req, link);

	return req;
}

static struct spdk_nvmf_request *
get_nvmf_req(struct nvmf_vfio_user_qpair *qpair)
{
	struct nvmf_vfio_user_req *req = get_nvmf_vfio_user_req(qpair);

	if (req == NULL) {
		return NULL;
	}
	return &req->req;
}

static int
get_nvmf_io_req_length(struct spdk_nvmf_request *req)
{
	uint16_t nlb, nr;
	uint32_t nsid;
	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
	struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
	struct spdk_nvmf_ns *ns;

	nsid = cmd->nsid;
	ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
	if (ns == NULL || ns->bdev == NULL) {
		SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
		return -EINVAL;
	}

	if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
		nr = cmd->cdw10_bits.dsm.nr + 1;
		return nr * sizeof(struct spdk_nvme_dsm_range);
	}

	nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
	return nlb * spdk_bdev_get_block_size(ns->bdev);
}
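/*
 * Compute and map the data buffer for an admin command. Only Identify
 * (a 4KB buffer) and Get Log Page (length derived from NUMDL) carry data
 * here; anything else is passed through with no payload. Buffers are
 * described by PRPs only and are translated into req->iov via
 * vfio_user_map_cmd().
 */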
static int
map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
	uint32_t len = 0;
	int iovcnt;

	req->xfer = cmd->opc & 0x3;
	req->length = 0;
	req->data = NULL;

	switch (cmd->opc) {
	case SPDK_NVME_OPC_IDENTIFY:
		len = 4096; /* TODO: there should be a define somewhere for this */
		break;
	case SPDK_NVME_OPC_GET_LOG_PAGE:
		len = (cmd->cdw10_bits.get_log_page.numdl + 1) * 4;
		break;
	}

	if (!cmd->dptr.prp.prp1 || !len) {
		return 0;
	}
	/* Admin commands do not use SGLs */
	assert(req->cmd->nvme_cmd.psdt == 0);
	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
	if (iovcnt < 0) {
		SPDK_ERRLOG("%s: map Admin Opc %x failed\n",
			    ctrlr_id(ctrlr), cmd->opc);
		return -1;
	}

	req->length = len;
	req->data = req->iov[0].iov_base;

	return 0;
}

/*
 * Maps an I/O command's data buffers.
 *
 * Returns 0 on success and -errno on failure.
 */
static int
map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
	int err = 0;
	struct spdk_nvme_cmd *cmd;

	assert(ctrlr != NULL);
	assert(req != NULL);

	cmd = &req->cmd->nvme_cmd;
	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);

	if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
		return 0;
	}

	err = get_nvmf_io_req_length(req);
	if (err < 0) {
		return -EINVAL;
	}

	req->length = err;
	err = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
	if (err < 0) {
		SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
		return -EFAULT;
	}

	req->data = req->iov[0].iov_base;
	req->iovcnt = err;

	return 0;
}
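/*
 * Dispatch one submission queue entry: copy the guest's SQE into the
 * internal request, map its data buffers into req->iov (admin vs. I/O
 * rules above), then hand the request to the generic NVMf layer via
 * spdk_nvmf_request_exec(). handle_cmd_rsp(), set as the callback here,
 * later completes the command back to the guest's completion queue.
 */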
static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct spdk_nvmf_request *req)
{
	int err;
	struct nvmf_vfio_user_req *vu_req;

	assert(ctrlr != NULL);
	assert(cmd != NULL);

	/*
	 * TODO: this means that there are no free requests available,
	 * returning -1 will fail the controller. Theoretically this error can
	 * be avoided completely by ensuring we have as many requests as slots
	 * in the SQ, plus one for the property request.
	 */
	if (spdk_unlikely(req == NULL)) {
		return -1;
	}

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	vu_req->cb_fn = handle_cmd_rsp;
	vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
	req->cmd->nvme_cmd = *cmd;
	if (nvmf_qpair_is_admin_queue(req->qpair)) {
		err = map_admin_cmd_req(ctrlr, req);
	} else {
		err = map_io_cmd_req(ctrlr, req);
	}

	if (spdk_unlikely(err < 0)) {
		SPDK_ERRLOG("%s: map NVMe command opc 0x%x failed\n",
			    ctrlr_id(ctrlr), cmd->opc);
		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
		return handle_cmd_rsp(vu_req, vu_req->cb_arg);
	}

	vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
	spdk_nvmf_request_exec(req);

	return 0;
}

static void
nvmf_vfio_user_qpair_poll(struct nvmf_vfio_user_qpair *qpair)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	uint32_t new_tail;

	assert(qpair != NULL);

	ctrlr = qpair->ctrlr;

	new_tail = *tdbl(ctrlr, &qpair->sq);
	if (sq_head(qpair) != new_tail) {
		int err = handle_sq_tdbl_write(ctrlr, new_tail, qpair);
		if (err != 0) {
			fail_ctrlr(ctrlr);
			return;
		}
	}
}

static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_qpair *vu_qpair, *tmp;

	assert(group != NULL);

	spdk_rmb();

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	TAILQ_FOREACH_SAFE(vu_qpair, &vu_group->qps, link, tmp) {
		if (spdk_unlikely(vu_qpair->state != VFIO_USER_QPAIR_ACTIVE || !vu_qpair->sq.size)) {
			continue;
		}
		nvmf_vfio_user_qpair_poll(vu_qpair);
	}

	return 0;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
				    struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	ctrlr = vu_qpair->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvme_transport_id *trid)
{
	return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
				     struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	ctrlr = vu_qpair->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}
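/*
 * Abort handling: scan the queue pair's preallocated requests for one that
 * is still executing and whose command ID matches the CID given in the
 * Abort command. If none is found the Abort command is simply completed;
 * otherwise the matching request is handed to nvmf_ctrlr_abort_request().
 */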
static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req, *vu_req_to_abort = NULL;
	uint16_t i, cid;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);

	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
	for (i = 0; i < vu_qpair->qsize; i++) {
		vu_req = &vu_qpair->reqs_internal[i];
		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
			vu_req_to_abort = vu_req;
			break;
		}
	}

	if (vu_req_to_abort == NULL) {
		spdk_nvmf_request_complete(req);
		return;
	}

	req->req_to_abort = &vu_req_to_abort->req;
	nvmf_ctrlr_abort_request(req);
}

static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
	opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
	opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
	opts->in_capsule_data_size = 0;
	opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
	opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
	opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
	opts->num_shared_buffers = 0;
	opts->buf_cache_size = 0;
	opts->association_timeout = 0;
	opts->transport_specific = NULL;
}

const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
	.name = "VFIOUSER",
	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
	.opts_init = nvmf_vfio_user_opts_init,
	.create = nvmf_vfio_user_create,
	.destroy = nvmf_vfio_user_destroy,

	.listen = nvmf_vfio_user_listen,
	.stop_listen = nvmf_vfio_user_stop_listen,
	.accept = nvmf_vfio_user_accept,
	.cdata_init = nvmf_vfio_user_cdata_init,
	.listen_associate = nvmf_vfio_user_listen_associate,

	.listener_discover = nvmf_vfio_user_discover,

	.poll_group_create = nvmf_vfio_user_poll_group_create,
	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
	.poll_group_add = nvmf_vfio_user_poll_group_add,
	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
	.poll_group_poll = nvmf_vfio_user_poll_group_poll,

	.req_free = nvmf_vfio_user_req_free,
	.req_complete = nvmf_vfio_user_req_complete,

	.qpair_fini = nvmf_vfio_user_close_qpair,
	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
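/*
 * Usage sketch (not part of this file): with a target running, an endpoint
 * is typically configured over JSON-RPC roughly as follows. RPC names and
 * flags follow SPDK's vfio-user documentation and may differ between SPDK
 * versions:
 *
 *   scripts/rpc.py nvmf_create_transport -t VFIOUSER
 *   scripts/rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a -s SPDK0
 *   scripts/rpc.py bdev_malloc_create 64 512 -b Malloc0
 *   scripts/rpc.py nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode0 Malloc0
 *   scripts/rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 \
 *           -t VFIOUSER -a /var/run -s 0
 *
 * The listener's traddr (/var/run in this example) is the directory in
 * which nvmf_vfio_user_listen() creates the bar0 file and the cntrl socket
 * that a vfio-user client connects to.
 */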