1 /*- 2 * BSD LICENSE 3 * Copyright (c) Intel Corporation. All rights reserved. 4 * Copyright (c) 2019, Nutanix Inc. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * * Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * * Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in 14 * the documentation and/or other materials provided with the 15 * distribution. 16 * * Neither the name of Intel Corporation nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * NVMe over vfio-user transport 35 */ 36 37 #include <vfio-user/libvfio-user.h> 38 #include <vfio-user/pci_defs.h> 39 40 #include "spdk/barrier.h" 41 #include "spdk/stdinc.h" 42 #include "spdk/assert.h" 43 #include "spdk/thread.h" 44 #include "spdk/nvmf_transport.h" 45 #include "spdk/sock.h" 46 #include "spdk/string.h" 47 #include "spdk/util.h" 48 #include "spdk/log.h" 49 50 #include "transport.h" 51 52 #include "nvmf_internal.h" 53 54 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 55 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 56 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 64 57 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 58 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 59 60 #define NVMF_VFIO_USER_DOORBELLS_OFFSET 0x1000 61 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 62 63 #define NVME_REG_CFG_SIZE 0x1000 64 #define NVME_REG_BAR0_SIZE 0x4000 65 #define NVME_IRQ_INTX_NUM 1 66 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 67 68 struct nvmf_vfio_user_req; 69 struct nvmf_vfio_user_qpair; 70 71 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 72 73 /* 1 more for PRP2 list itself */ 74 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 75 76 enum nvmf_vfio_user_req_state { 77 VFIO_USER_REQUEST_STATE_FREE = 0, 78 VFIO_USER_REQUEST_STATE_EXECUTING, 79 }; 80 81 struct nvmf_vfio_user_req { 82 struct spdk_nvmf_request req; 83 struct spdk_nvme_cpl rsp; 84 struct spdk_nvme_cmd cmd; 85 86 enum nvmf_vfio_user_req_state state; 87 nvmf_vfio_user_req_cb_fn cb_fn; 88 void *cb_arg; 89 90 /* placeholder for gpa_to_vva memory map table, the IO buffer doesn't use it */ 91 dma_sg_t *sg; 92 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 93 uint8_t iovcnt; 94 95 TAILQ_ENTRY(nvmf_vfio_user_req) 
link; 96 }; 97 98 /* 99 * A NVMe queue. 100 */ 101 struct nvme_q { 102 bool is_cq; 103 104 void *addr; 105 106 dma_sg_t *sg; 107 struct iovec iov; 108 109 uint32_t size; 110 uint64_t prp1; 111 112 union { 113 struct { 114 uint32_t head; 115 /* multiple SQs can be mapped to the same CQ */ 116 uint16_t cqid; 117 }; 118 struct { 119 uint32_t tail; 120 uint16_t iv; 121 bool ien; 122 }; 123 }; 124 }; 125 126 enum nvmf_vfio_user_qpair_state { 127 VFIO_USER_QPAIR_UNINITIALIZED = 0, 128 VFIO_USER_QPAIR_ACTIVE, 129 VFIO_USER_QPAIR_DELETED, 130 VFIO_USER_QPAIR_INACTIVE, 131 VFIO_USER_QPAIR_ERROR, 132 }; 133 134 struct nvmf_vfio_user_qpair { 135 struct spdk_nvmf_qpair qpair; 136 struct spdk_nvmf_transport_poll_group *group; 137 struct nvmf_vfio_user_ctrlr *ctrlr; 138 struct nvmf_vfio_user_req *reqs_internal; 139 uint16_t qsize; 140 struct nvme_q cq; 141 struct nvme_q sq; 142 enum nvmf_vfio_user_qpair_state state; 143 144 TAILQ_HEAD(, nvmf_vfio_user_req) reqs; 145 TAILQ_ENTRY(nvmf_vfio_user_qpair) link; 146 }; 147 148 struct nvmf_vfio_user_poll_group { 149 struct spdk_nvmf_transport_poll_group group; 150 TAILQ_HEAD(, nvmf_vfio_user_qpair) qps; 151 }; 152 153 struct nvmf_vfio_user_ctrlr { 154 struct nvmf_vfio_user_endpoint *endpoint; 155 struct nvmf_vfio_user_transport *transport; 156 157 /* Number of connected queue pairs */ 158 uint32_t num_connected_qps; 159 160 struct spdk_thread *thread; 161 struct spdk_poller *mmio_poller; 162 163 uint16_t cntlid; 164 165 struct nvmf_vfio_user_qpair *qp[NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR]; 166 167 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 168 169 volatile uint32_t *doorbells; 170 171 /* internal CSTS.CFS register for vfio-user fatal errors */ 172 uint32_t cfs : 1; 173 }; 174 175 struct nvmf_vfio_user_endpoint { 176 vfu_ctx_t *vfu_ctx; 177 struct msixcap *msix; 178 vfu_pci_config_space_t *pci_config_space; 179 int fd; 180 volatile uint32_t *doorbells; 181 182 struct spdk_nvme_transport_id trid; 183 const struct spdk_nvmf_subsystem *subsystem; 184 185 struct nvmf_vfio_user_ctrlr *ctrlr; 186 pthread_mutex_t lock; 187 188 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 189 }; 190 191 struct nvmf_vfio_user_transport_opts { 192 bool disable_mappable_bar0; 193 }; 194 195 struct nvmf_vfio_user_transport { 196 struct spdk_nvmf_transport transport; 197 struct nvmf_vfio_user_transport_opts transport_opts; 198 pthread_mutex_t lock; 199 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 200 201 TAILQ_HEAD(, nvmf_vfio_user_qpair) new_qps; 202 }; 203 204 /* 205 * function prototypes 206 */ 207 static volatile uint32_t * 208 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q); 209 210 static volatile uint32_t * 211 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q); 212 213 static int 214 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 215 216 static struct nvmf_vfio_user_req * 217 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair); 218 219 static int 220 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 221 struct nvme_q *cq, uint32_t cdw0, uint16_t sc, 222 uint16_t sct); 223 224 static int 225 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 226 uint32_t max_iovcnt, uint32_t len, size_t mps, 227 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 228 { 229 uint64_t prp1, prp2; 230 void *vva; 231 uint32_t i; 232 uint32_t residue_len, nents; 233 uint64_t *prp_list; 234 uint32_t iovcnt; 235 236 assert(max_iovcnt > 0); 237 238 prp1 = cmd->dptr.prp.prp1; 239 prp2 = 
cmd->dptr.prp.prp2;

	/* PRP1 may start with an unaligned page address */
	residue_len = mps - (prp1 % mps);
	residue_len = spdk_min(len, residue_len);

	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
	if (spdk_unlikely(vva == NULL)) {
		SPDK_ERRLOG("GPA to VVA failed\n");
		return -EINVAL;
	}
	len -= residue_len;
	if (len && max_iovcnt < 2) {
		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
		return -ERANGE;
	}
	iovs[0].iov_base = vva;
	iovs[0].iov_len = residue_len;

	if (len) {
		if (spdk_unlikely(prp2 == 0)) {
			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
			return -EINVAL;
		}

		if (len <= mps) {
			/* 2 PRP used */
			iovcnt = 2;
			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n",
					    prp2, len);
				return -EINVAL;
			}
			iovs[1].iov_base = vva;
			iovs[1].iov_len = len;
		} else {
			/* PRP list used */
			nents = (len + mps - 1) / mps;
			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
				SPDK_ERRLOG("Too many page entries\n");
				return -ERANGE;
			}

			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
					    prp2, nents);
				return -EINVAL;
			}
			prp_list = vva;
			i = 0;
			while (len != 0) {
				residue_len = spdk_min(len, mps);
				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
				if (spdk_unlikely(vva == NULL)) {
					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
						    prp_list[i], residue_len);
					return -EINVAL;
				}
				iovs[i + 1].iov_base = vva;
				iovs[i + 1].iov_len = residue_len;
				len -= residue_len;
				i++;
			}
			iovcnt = i + 1;
		}
	} else {
		/* 1 PRP used */
		iovcnt = 1;
	}

	assert(iovcnt <= max_iovcnt);
	return iovcnt;
}

static int
nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
		       struct iovec *iovs, uint32_t max_iovcnt,
		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint32_t i;
	void *vva;

	if (spdk_unlikely(max_iovcnt < num_sgls)) {
		return -ERANGE;
	}

	for (i = 0; i < num_sgls; i++) {
		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
			return -EINVAL;
		}
		vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[i].iov_base = vva;
		iovs[i].iov_len = sgls[i].unkeyed.length;
	}

	return num_sgls;
}

static int
nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
		  uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	struct spdk_nvme_sgl_descriptor *sgl, *last_sgl;
	uint32_t num_sgls, seg_len;
	void *vva;
	int ret;
	uint32_t total_iovcnt = 0;

	/* SGL cases */
	sgl = &cmd->dptr.sgl1;

	/* only one SGL segment */
	if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
		assert(max_iovcnt > 0);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA 
to VVA failed\n"); 364 return -EINVAL; 365 } 366 iovs[0].iov_base = vva; 367 iovs[0].iov_len = sgl->unkeyed.length; 368 assert(sgl->unkeyed.length == len); 369 370 return 1; 371 } 372 373 for (;;) { 374 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 375 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 376 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 377 return -EINVAL; 378 } 379 380 seg_len = sgl->unkeyed.length; 381 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 382 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 383 return -EINVAL; 384 } 385 386 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 387 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 388 if (spdk_unlikely(vva == NULL)) { 389 SPDK_ERRLOG("GPA to VVA failed\n"); 390 return -EINVAL; 391 } 392 393 /* sgl point to the first segment */ 394 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 395 last_sgl = &sgl[num_sgls - 1]; 396 397 /* we are done */ 398 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 399 /* map whole sgl list */ 400 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 401 max_iovcnt - total_iovcnt, gpa_to_vva); 402 if (spdk_unlikely(ret < 0)) { 403 return ret; 404 } 405 total_iovcnt += ret; 406 407 return total_iovcnt; 408 } 409 410 if (num_sgls > 1) { 411 /* map whole sgl exclude last_sgl */ 412 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 413 max_iovcnt - total_iovcnt, gpa_to_vva); 414 if (spdk_unlikely(ret < 0)) { 415 return ret; 416 } 417 total_iovcnt += ret; 418 } 419 420 /* move to next level's segments */ 421 sgl = last_sgl; 422 } 423 424 return 0; 425 } 426 427 static int 428 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 429 uint32_t len, size_t mps, 430 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 431 { 432 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 433 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 434 } 435 436 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 437 } 438 439 static char * 440 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 441 { 442 return endpoint->trid.traddr; 443 } 444 445 static char * 446 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 447 { 448 if (!ctrlr || !ctrlr->endpoint) { 449 return "Null Ctrlr"; 450 } 451 452 return endpoint_id(ctrlr->endpoint); 453 } 454 455 static uint16_t 456 io_q_id(struct nvme_q *q) 457 { 458 459 struct nvmf_vfio_user_qpair *vfio_user_qpair; 460 461 assert(q); 462 463 if (q->is_cq) { 464 vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, cq); 465 } else { 466 vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, sq); 467 } 468 assert(vfio_user_qpair); 469 return vfio_user_qpair->qpair.qid; 470 } 471 472 static void 473 fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 474 { 475 assert(ctrlr != NULL); 476 477 if (ctrlr->cfs == 0) { 478 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(ctrlr)); 479 } 480 481 ctrlr->cfs = 1U; 482 } 483 484 static bool 485 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *ctrlr) 486 { 487 assert(ctrlr != NULL); 488 assert(ctrlr->endpoint != NULL); 489 490 vfu_pci_config_space_t *pci = ctrlr->endpoint->pci_config_space; 491 492 return (!pci->hdr.cmd.id || ctrlr->endpoint->msix->mxc.mxe); 493 } 494 495 static void 496 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 497 { 498 if 
(endpoint->doorbells) { 499 munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 500 } 501 502 if (endpoint->fd > 0) { 503 close(endpoint->fd); 504 } 505 506 vfu_destroy_ctx(endpoint->vfu_ctx); 507 508 pthread_mutex_destroy(&endpoint->lock); 509 free(endpoint); 510 } 511 512 /* called when process exits */ 513 static int 514 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 515 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 516 { 517 struct nvmf_vfio_user_transport *vu_transport; 518 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 519 520 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 521 522 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 523 transport); 524 525 (void)pthread_mutex_destroy(&vu_transport->lock); 526 527 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 528 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 529 nvmf_vfio_user_destroy_endpoint(endpoint); 530 } 531 532 free(vu_transport); 533 534 if (cb_fn) { 535 cb_fn(cb_arg); 536 } 537 538 return 0; 539 } 540 541 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 542 { 543 "disable-mappable-bar0", 544 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 545 spdk_json_decode_bool, true 546 }, 547 }; 548 549 static struct spdk_nvmf_transport * 550 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 551 { 552 struct nvmf_vfio_user_transport *vu_transport; 553 int err; 554 555 vu_transport = calloc(1, sizeof(*vu_transport)); 556 if (vu_transport == NULL) { 557 SPDK_ERRLOG("Transport alloc fail: %m\n"); 558 return NULL; 559 } 560 561 err = pthread_mutex_init(&vu_transport->lock, NULL); 562 if (err != 0) { 563 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 564 goto err; 565 } 566 567 TAILQ_INIT(&vu_transport->endpoints); 568 TAILQ_INIT(&vu_transport->new_qps); 569 570 if (opts->transport_specific != NULL && 571 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 572 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 573 vu_transport)) { 574 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 575 free(vu_transport); 576 return NULL; 577 } 578 579 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 580 vu_transport->transport_opts.disable_mappable_bar0); 581 582 return &vu_transport->transport; 583 584 err: 585 free(vu_transport); 586 587 return NULL; 588 } 589 590 static uint16_t 591 max_queue_size(struct nvmf_vfio_user_ctrlr const *ctrlr) 592 { 593 assert(ctrlr != NULL); 594 assert(ctrlr->qp[0] != NULL); 595 assert(ctrlr->qp[0]->qpair.ctrlr != NULL); 596 597 return ctrlr->qp[0]->qpair.ctrlr->vcprop.cap.bits.mqes + 1; 598 } 599 600 static void * 601 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov, int prot) 602 { 603 int ret; 604 605 assert(ctx != NULL); 606 assert(sg != NULL); 607 assert(iov != NULL); 608 609 ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 610 if (ret < 0) { 611 return NULL; 612 } 613 614 ret = vfu_map_sg(ctx, sg, iov, 1, 0); 615 if (ret != 0) { 616 return NULL; 617 } 618 619 assert(iov->iov_base != NULL); 620 return iov->iov_base; 621 } 622 623 static uint32_t 624 sq_head(struct nvmf_vfio_user_qpair *qpair) 625 { 626 assert(qpair != NULL); 627 return qpair->sq.head; 628 } 629 630 static void 631 sqhd_advance(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair) 632 { 633 assert(ctrlr != NULL); 634 
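	/*
	 * Advance the SQ head with wrap-around; post_completion() reports the
	 * current value back to the host in CQE.SQHD.
	 */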
assert(qpair != NULL); 635 qpair->sq.head = (qpair->sq.head + 1) % qpair->sq.size; 636 } 637 638 static int 639 asq_map(struct nvmf_vfio_user_ctrlr *ctrlr) 640 { 641 struct nvme_q *sq; 642 const struct spdk_nvmf_registers *regs; 643 644 assert(ctrlr != NULL); 645 assert(ctrlr->qp[0] != NULL); 646 assert(ctrlr->qp[0]->sq.addr == NULL); 647 /* XXX ctrlr->asq == 0 is a valid memory address */ 648 649 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 650 sq = &ctrlr->qp[0]->sq; 651 sq->size = regs->aqa.bits.asqs + 1; 652 sq->head = ctrlr->doorbells[0] = 0; 653 sq->cqid = 0; 654 sq->addr = map_one(ctrlr->endpoint->vfu_ctx, regs->asq, 655 sq->size * sizeof(struct spdk_nvme_cmd), sq->sg, 656 &sq->iov, PROT_READ); 657 if (sq->addr == NULL) { 658 return -1; 659 } 660 memset(sq->addr, 0, sq->size * sizeof(struct spdk_nvme_cmd)); 661 sq->is_cq = false; 662 *tdbl(ctrlr, sq) = 0; 663 664 return 0; 665 } 666 667 static uint16_t 668 cq_next(struct nvme_q *q) 669 { 670 assert(q != NULL); 671 assert(q->is_cq); 672 return (q->tail + 1) % q->size; 673 } 674 675 static int 676 queue_index(uint16_t qid, int is_cq) 677 { 678 return (qid * 2) + is_cq; 679 } 680 681 static volatile uint32_t * 682 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 683 { 684 assert(ctrlr != NULL); 685 assert(q != NULL); 686 assert(!q->is_cq); 687 688 return &ctrlr->doorbells[queue_index(io_q_id(q), false)]; 689 } 690 691 static volatile uint32_t * 692 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 693 { 694 assert(ctrlr != NULL); 695 assert(q != NULL); 696 assert(q->is_cq); 697 698 return &ctrlr->doorbells[queue_index(io_q_id(q), true)]; 699 } 700 701 static bool 702 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 703 { 704 assert(ctrlr != NULL); 705 assert(q != NULL); 706 return cq_next(q) == *hdbl(ctrlr, q); 707 } 708 709 static void 710 cq_tail_advance(struct nvme_q *q) 711 { 712 assert(q != NULL); 713 q->tail = cq_next(q); 714 } 715 716 static int 717 acq_map(struct nvmf_vfio_user_ctrlr *ctrlr) 718 { 719 struct nvme_q *cq; 720 const struct spdk_nvmf_registers *regs; 721 722 assert(ctrlr != NULL); 723 assert(ctrlr->qp[0] != NULL); 724 assert(ctrlr->qp[0]->cq.addr == NULL); 725 726 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 727 assert(regs != NULL); 728 cq = &ctrlr->qp[0]->cq; 729 cq->size = regs->aqa.bits.acqs + 1; 730 cq->tail = 0; 731 cq->addr = map_one(ctrlr->endpoint->vfu_ctx, regs->acq, 732 cq->size * sizeof(struct spdk_nvme_cpl), cq->sg, 733 &cq->iov, PROT_READ | PROT_WRITE); 734 if (cq->addr == NULL) { 735 return -1; 736 } 737 memset(cq->addr, 0, cq->size * sizeof(struct spdk_nvme_cpl)); 738 cq->is_cq = true; 739 cq->ien = true; 740 *hdbl(ctrlr, cq) = 0; 741 742 return 0; 743 } 744 745 static inline dma_sg_t * 746 vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt) 747 { 748 return (dma_sg_t *)((uintptr_t)vu_req->sg + iovcnt * dma_sg_size()); 749 } 750 751 static void * 752 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 753 { 754 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 755 struct spdk_nvmf_qpair *qpair; 756 struct nvmf_vfio_user_req *vu_req; 757 struct nvmf_vfio_user_qpair *vu_qpair; 758 void *ret; 759 760 assert(req != NULL); 761 qpair = req->qpair; 762 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 763 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 764 765 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 766 ret = map_one(vu_qpair->ctrlr->endpoint->vfu_ctx, 
addr, len,
		      vu_req_to_sg_t(vu_req, vu_req->iovcnt),
		      &vu_req->iov[vu_req->iovcnt], prot);
	if (spdk_likely(ret != NULL)) {
		vu_req->iovcnt++;
	}
	return ret;
}

static int
vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
		  struct iovec *iov, uint32_t length)
{
	/* Map the command's data buffers (PRPs or SGLs) from guest physical
	 * memory to host virtual memory addresses.
	 */
	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
			    length, 4096, _map_one);
}

static struct spdk_nvmf_request *
get_nvmf_req(struct nvmf_vfio_user_qpair *qp);

static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct spdk_nvmf_request *req);

/*
 * Posts a CQE in the completion queue.
 *
 * @ctrlr: the vfio-user controller
 * @cmd: the NVMe command for which the completion is posted
 * @cq: the completion queue
 * @cdw0: cdw0 as reported by NVMf
 * @sc: the NVMe CQE status code
 * @sct: the NVMe CQE status code type
 */
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
		struct nvme_q *cq, uint32_t cdw0, uint16_t sc,
		uint16_t sct)
{
	struct spdk_nvme_cpl *cpl;
	uint16_t qid;
	int err;

	assert(ctrlr != NULL);
	assert(cmd != NULL);

	qid = io_q_id(cq);

	if (ctrlr->qp[0]->qpair.ctrlr->vcprop.csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
		SPDK_DEBUGLOG(nvmf_vfio,
			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
			      ctrlr_id(ctrlr), qid, cmd->cid, sc);
		return 0;
	}

	if (cq_is_full(ctrlr, cq)) {
		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
			    ctrlr_id(ctrlr), qid, cq->tail, *hdbl(ctrlr, cq));
		return -1;
	}

	cpl = ((struct spdk_nvme_cpl *)cq->addr) + cq->tail;

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
		      ctrlr_id(ctrlr), qid, cmd->cid, sc, ctrlr->qp[qid]->sq.head,
		      cq->tail);

	assert(ctrlr->qp[qid] != NULL);

	cpl->sqhd = ctrlr->qp[qid]->sq.head;
	cpl->cid = cmd->cid;
	cpl->cdw0 = cdw0;
	cpl->status.dnr = 0x0;
	cpl->status.m = 0x0;
	cpl->status.sct = sct;
	cpl->status.p = ~cpl->status.p;
	cpl->status.sc = sc;

	cq_tail_advance(cq);

	/*
	 * This function now executes in SPDK thread context; we might be
	 * triggering interrupts from vfio-user thread context, so
	 * check for race conditions.
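	 * Interrupts are triggered only when the host has them enabled:
	 * either INTx (PCI command register Interrupt Disable bit clear) or
	 * MSI-X (MXC.MXE set), see ctrlr_interrupt_enabled(), and only if
	 * this CQ was created with interrupts enabled (cq->ien).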
854 */ 855 if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) { 856 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 857 if (err != 0) { 858 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 859 ctrlr_id(ctrlr)); 860 return err; 861 } 862 } 863 864 return 0; 865 } 866 867 static struct nvme_q * 868 lookup_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, const uint16_t qid, const bool is_cq) 869 { 870 struct nvme_q *q; 871 872 assert(ctrlr != NULL); 873 874 if (qid > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) { 875 return NULL; 876 } 877 878 if (ctrlr->qp[qid] == NULL) { 879 return NULL; 880 } 881 882 if (is_cq) { 883 q = &ctrlr->qp[qid]->cq; 884 } else { 885 q = &ctrlr->qp[qid]->sq; 886 } 887 888 if (q->addr == NULL) { 889 return NULL; 890 } 891 892 return q; 893 } 894 895 static void 896 unmap_qp(struct nvmf_vfio_user_qpair *qp) 897 { 898 struct nvmf_vfio_user_ctrlr *ctrlr; 899 900 if (qp->ctrlr == NULL) { 901 return; 902 } 903 ctrlr = qp->ctrlr; 904 905 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap QP%d\n", 906 ctrlr_id(ctrlr), qp->qpair.qid); 907 908 if (qp->sq.addr != NULL) { 909 vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->sq.sg, &qp->sq.iov, 1); 910 qp->sq.addr = NULL; 911 } 912 913 if (qp->cq.addr != NULL) { 914 vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->cq.sg, &qp->cq.iov, 1); 915 qp->cq.addr = NULL; 916 } 917 } 918 919 static void 920 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 921 { 922 struct nvmf_vfio_user_qpair *qpair; 923 struct nvmf_vfio_user_req *vu_req; 924 uint32_t i; 925 926 if (ctrlr == NULL) { 927 return; 928 } 929 930 qpair = ctrlr->qp[qid]; 931 if (qpair == NULL) { 932 return; 933 } 934 935 SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy QP%d=%p\n", ctrlr_id(ctrlr), 936 qid, qpair); 937 938 unmap_qp(qpair); 939 940 for (i = 0; i < qpair->qsize; i++) { 941 vu_req = &qpair->reqs_internal[i]; 942 free(vu_req->sg); 943 } 944 free(qpair->reqs_internal); 945 946 free(qpair->sq.sg); 947 free(qpair->cq.sg); 948 free(qpair); 949 950 ctrlr->qp[qid] = NULL; 951 } 952 953 /* This function can only fail because of memory allocation errors. 
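 * It allocates the queue pair itself, one dma_sg_t each for the SQ and CQ,
 * an array of qsize internal request trackers, and per request a
 * scatter-gather list of NVMF_VFIO_USER_MAX_IOVECS entries.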
*/ 954 static int 955 init_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 956 const uint16_t qsize, const uint16_t id) 957 { 958 uint16_t i; 959 struct nvmf_vfio_user_qpair *qpair; 960 struct nvmf_vfio_user_req *vu_req, *tmp; 961 struct spdk_nvmf_request *req; 962 963 assert(ctrlr != NULL); 964 assert(transport != NULL); 965 966 qpair = calloc(1, sizeof(*qpair)); 967 if (qpair == NULL) { 968 return -ENOMEM; 969 } 970 qpair->sq.sg = calloc(1, dma_sg_size()); 971 if (qpair->sq.sg == NULL) { 972 free(qpair); 973 return -ENOMEM; 974 } 975 qpair->cq.sg = calloc(1, dma_sg_size()); 976 if (qpair->cq.sg == NULL) { 977 free(qpair->sq.sg); 978 free(qpair); 979 return -ENOMEM; 980 } 981 982 qpair->qpair.qid = id; 983 qpair->qpair.transport = transport; 984 qpair->ctrlr = ctrlr; 985 qpair->qsize = qsize; 986 987 TAILQ_INIT(&qpair->reqs); 988 989 qpair->reqs_internal = calloc(qsize, sizeof(struct nvmf_vfio_user_req)); 990 if (qpair->reqs_internal == NULL) { 991 SPDK_ERRLOG("%s: error allocating reqs: %m\n", ctrlr_id(ctrlr)); 992 goto reqs_err; 993 } 994 995 for (i = 0; i < qsize; i++) { 996 vu_req = &qpair->reqs_internal[i]; 997 vu_req->sg = calloc(NVMF_VFIO_USER_MAX_IOVECS, dma_sg_size()); 998 if (vu_req->sg == NULL) { 999 goto sg_err; 1000 } 1001 1002 req = &vu_req->req; 1003 req->qpair = &qpair->qpair; 1004 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1005 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1006 1007 TAILQ_INSERT_TAIL(&qpair->reqs, vu_req, link); 1008 } 1009 1010 ctrlr->qp[id] = qpair; 1011 return 0; 1012 1013 sg_err: 1014 TAILQ_FOREACH_SAFE(vu_req, &qpair->reqs, link, tmp) { 1015 free(vu_req->sg); 1016 } 1017 free(qpair->reqs_internal); 1018 1019 reqs_err: 1020 free(qpair->sq.sg); 1021 free(qpair->cq.sg); 1022 free(qpair); 1023 return -ENOMEM; 1024 } 1025 1026 /* 1027 * Creates a completion or sumbission I/O queue. Returns 0 on success, -errno 1028 * on error. 1029 * 1030 * XXX SPDK thread context. 1031 */ 1032 static int 1033 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 1034 struct spdk_nvme_cmd *cmd, const bool is_cq) 1035 { 1036 size_t entry_size; 1037 uint16_t qsize; 1038 uint16_t sc = SPDK_NVME_SC_SUCCESS; 1039 uint16_t sct = SPDK_NVME_SCT_GENERIC; 1040 int err = 0; 1041 struct nvme_q *io_q; 1042 int prot; 1043 1044 assert(ctrlr != NULL); 1045 assert(cmd != NULL); 1046 1047 SPDK_DEBUGLOG(nvmf_vfio, 1048 "%s: create I/O %cQ%d: QSIZE=%#x\n", ctrlr_id(ctrlr), 1049 is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid, 1050 cmd->cdw10_bits.create_io_q.qsize); 1051 1052 if (cmd->cdw10_bits.create_io_q.qid >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) { 1053 SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr), 1054 cmd->cdw10_bits.create_io_q.qid, 1055 NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR); 1056 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1057 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1058 goto out; 1059 } 1060 1061 if (lookup_io_q(ctrlr, cmd->cdw10_bits.create_io_q.qid, is_cq)) { 1062 SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr), 1063 is_cq ? 
'C' : 'S', cmd->cdw10_bits.create_io_q.qid);
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
		goto out;
	}

	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
	if (qsize > max_queue_size(ctrlr)) {
		SPDK_ERRLOG("%s: queue too big, want=%d, max=%d\n", ctrlr_id(ctrlr),
			    qsize, max_queue_size(ctrlr));
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE;
		goto out;
	}

	/* TODO break rest of this function into smaller functions */
	if (is_cq) {
		err = init_qp(ctrlr, ctrlr->qp[0]->qpair.transport, qsize,
			      cmd->cdw10_bits.create_io_q.qid);
		if (err != 0) {
			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
			goto out;
		}

		io_q = &ctrlr->qp[cmd->cdw10_bits.create_io_q.qid]->cq;
		entry_size = sizeof(struct spdk_nvme_cpl);
		if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
			/*
			 * TODO CAP.CMBS is currently set to zero, however we
			 * should zero it out explicitly when CAP is read.
			 * Support for CAP.CMBS is not mentioned in the NVMf
			 * spec.
			 */
			SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
			goto out;
		}
		io_q->ien = cmd->cdw11_bits.create_io_cq.ien;
		io_q->iv = cmd->cdw11_bits.create_io_cq.iv;
	} else {
		/* CQ must be created before SQ */
		if (!lookup_io_q(ctrlr, cmd->cdw11_bits.create_io_sq.cqid, true)) {
			SPDK_ERRLOG("%s: CQ%d does not exist\n", ctrlr_id(ctrlr),
				    cmd->cdw11_bits.create_io_sq.cqid);
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
			goto out;
		}

		io_q = &ctrlr->qp[cmd->cdw10_bits.create_io_q.qid]->sq;
		entry_size = sizeof(struct spdk_nvme_cmd);
		if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
			SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
			goto out;
		}

		io_q->cqid = cmd->cdw11_bits.create_io_sq.cqid;
		SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
			      cmd->cdw10_bits.create_io_q.qid, io_q->cqid);
	}

	io_q->is_cq = is_cq;
	io_q->size = qsize;
	prot = PROT_READ;
	if (is_cq) {
		prot |= PROT_WRITE;
	}
	io_q->addr = map_one(ctrlr->endpoint->vfu_ctx, cmd->dptr.prp.prp1,
			     io_q->size * entry_size, io_q->sg, &io_q->iov, prot);
	if (io_q->addr == NULL) {
		sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
		goto out;
	}
	io_q->prp1 = cmd->dptr.prp.prp1;
	memset(io_q->addr, 0, io_q->size * entry_size);

	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped %cQ%d IOVA=%#lx vaddr=%#llx\n",
		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
		      cmd->cdw10_bits.create_io_q.qid, cmd->dptr.prp.prp1,
		      (unsigned long long)io_q->addr);

	if (is_cq) {
		*hdbl(ctrlr, io_q) = 0;
	} else {
		/*
		 * After we've returned from the nvmf_vfio_user_poll_group_poll thread, once
		 * nvmf_vfio_user_accept executes it will pick up this QP and will eventually
		 * call nvmf_vfio_user_poll_group_add. The rest of the operation needed to
		 * complete the addition of the queue will be continued at the
		 * completion callback.
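		 * That callback is handle_queue_connect_rsp(), which adds the
		 * queue pair to its poll group, marks it active and increments
		 * num_connected_qps.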
		 */
		TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, ctrlr->qp[cmd->cdw10_bits.create_io_q.qid], link);
		*tdbl(ctrlr, io_q) = 0;

	}

out:
	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
}

/*
 * Deletes a completion or submission I/O queue.
 */
static int
handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
		struct spdk_nvme_cmd *cmd, const bool is_cq)
{
	uint16_t sct = SPDK_NVME_SCT_GENERIC;
	uint16_t sc = SPDK_NVME_SC_SUCCESS;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
		      cmd->cdw10_bits.delete_io_q.qid);

	if (lookup_io_q(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq) == NULL) {
		SPDK_ERRLOG("%s: %cQ%d does not exist\n", ctrlr_id(ctrlr),
			    is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
		goto out;
	}

	if (is_cq) {
		/* SQ must have been deleted first */
		if (ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state != VFIO_USER_QPAIR_DELETED) {
			SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
			goto out;
		}
	} else {
		/*
		 * This doesn't actually delete the I/O queue; we can't
		 * do that anyway because NVMf doesn't support it. We're merely
		 * telling the poll_group_poll function to skip checking this
		 * queue. The only workflow in which this works is when CC.EN is
		 * set to 0 and we're stopping the subsystem, so we know that the
		 * relevant callbacks to destroy the queues will be called.
		 */
		assert(ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state == VFIO_USER_QPAIR_ACTIVE);
		ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state = VFIO_USER_QPAIR_DELETED;
	}

out:
	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
}

/*
 * Returns 0 on success and -errno on error.
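 * Queue creation and deletion are handled locally (handle_create_io_q(),
 * handle_del_io_q()); all other admin commands are forwarded to the
 * generic NVMf layer through handle_cmd_req().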
1214 * 1215 * XXX SPDK thread context 1216 */ 1217 static int 1218 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 1219 { 1220 assert(ctrlr != NULL); 1221 assert(cmd != NULL); 1222 1223 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle admin req opc=%#x cid=%d\n", 1224 ctrlr_id(ctrlr), cmd->opc, cmd->cid); 1225 1226 switch (cmd->opc) { 1227 case SPDK_NVME_OPC_CREATE_IO_CQ: 1228 case SPDK_NVME_OPC_CREATE_IO_SQ: 1229 return handle_create_io_q(ctrlr, cmd, 1230 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 1231 case SPDK_NVME_OPC_DELETE_IO_SQ: 1232 case SPDK_NVME_OPC_DELETE_IO_CQ: 1233 return handle_del_io_q(ctrlr, cmd, 1234 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 1235 default: 1236 return handle_cmd_req(ctrlr, cmd, get_nvmf_req(ctrlr->qp[0])); 1237 } 1238 } 1239 1240 static int 1241 handle_cmd_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 1242 { 1243 struct nvmf_vfio_user_qpair *qpair = cb_arg; 1244 1245 assert(qpair != NULL); 1246 assert(req != NULL); 1247 1248 vfu_unmap_sg(qpair->ctrlr->endpoint->vfu_ctx, req->sg, req->iov, req->iovcnt); 1249 1250 return post_completion(qpair->ctrlr, &req->req.cmd->nvme_cmd, 1251 &qpair->ctrlr->qp[req->req.qpair->qid]->cq, 1252 req->req.rsp->nvme_cpl.cdw0, 1253 req->req.rsp->nvme_cpl.status.sc, 1254 req->req.rsp->nvme_cpl.status.sct); 1255 } 1256 1257 static int 1258 handle_admin_aer_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 1259 { 1260 struct nvmf_vfio_user_qpair *qpair = cb_arg; 1261 1262 assert(qpair != NULL); 1263 assert(req != NULL); 1264 1265 vfu_unmap_sg(qpair->ctrlr->endpoint->vfu_ctx, req->sg, req->iov, req->iovcnt); 1266 1267 if (qpair->state != VFIO_USER_QPAIR_ACTIVE) { 1268 return 0; 1269 } 1270 1271 return post_completion(qpair->ctrlr, &req->req.cmd->nvme_cmd, 1272 &qpair->ctrlr->qp[req->req.qpair->qid]->cq, 1273 req->req.rsp->nvme_cpl.cdw0, 1274 req->req.rsp->nvme_cpl.status.sc, 1275 req->req.rsp->nvme_cpl.status.sct); 1276 } 1277 1278 static int 1279 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair, 1280 struct spdk_nvme_cmd *cmd) 1281 { 1282 assert(qpair != NULL); 1283 if (nvmf_qpair_is_admin_queue(&qpair->qpair)) { 1284 return consume_admin_cmd(ctrlr, cmd); 1285 } 1286 1287 return handle_cmd_req(ctrlr, cmd, get_nvmf_req(qpair)); 1288 } 1289 1290 static ssize_t 1291 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 1292 struct nvmf_vfio_user_qpair *qpair) 1293 { 1294 struct spdk_nvme_cmd *queue; 1295 1296 assert(ctrlr != NULL); 1297 assert(qpair != NULL); 1298 1299 queue = qpair->sq.addr; 1300 while (sq_head(qpair) != new_tail) { 1301 int err; 1302 struct spdk_nvme_cmd *cmd = &queue[sq_head(qpair)]; 1303 1304 /* 1305 * SQHD must contain the new head pointer, so we must increase 1306 * it before we generate a completion. 
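		 * Queue create/delete commands complete synchronously via
		 * post_completion(), which copies sq.head into CQE.SQHD, e.g.
		 * (illustrative only):
		 *
		 *   sqhd_advance(ctrlr, qpair);     // head: n -> (n + 1) % size
		 *   consume_cmd(ctrlr, qpair, cmd); // may post a CQE with SQHD == new head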
1307 */ 1308 sqhd_advance(ctrlr, qpair); 1309 1310 err = consume_cmd(ctrlr, qpair, cmd); 1311 if (err != 0) { 1312 return err; 1313 } 1314 } 1315 1316 return 0; 1317 } 1318 1319 static int 1320 map_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1321 { 1322 int err; 1323 1324 assert(ctrlr != NULL); 1325 1326 err = acq_map(ctrlr); 1327 if (err != 0) { 1328 return err; 1329 } 1330 1331 err = asq_map(ctrlr); 1332 if (err != 0) { 1333 return err; 1334 } 1335 1336 return 0; 1337 } 1338 1339 static void 1340 unmap_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1341 { 1342 assert(ctrlr->qp[0] != NULL); 1343 1344 unmap_qp(ctrlr->qp[0]); 1345 } 1346 1347 static void 1348 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1349 { 1350 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1351 struct nvmf_vfio_user_ctrlr *ctrlr; 1352 struct nvmf_vfio_user_qpair *qpair; 1353 int i, ret; 1354 1355 /* 1356 * We're not interested in any DMA regions that aren't mappable (we don't 1357 * support clients that don't share their memory). 1358 */ 1359 if (!info->vaddr) { 1360 return; 1361 } 1362 1363 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 1364 (info->mapping.iov_len & MASK_2MB)) { 1365 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr, 1366 (uintptr_t)info->mapping.iov_base, 1367 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1368 return; 1369 } 1370 1371 assert(endpoint != NULL); 1372 if (endpoint->ctrlr == NULL) { 1373 return; 1374 } 1375 ctrlr = endpoint->ctrlr; 1376 1377 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %#lx-%#lx\n", ctrlr_id(ctrlr), 1378 (uintptr_t)info->mapping.iov_base, 1379 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1380 1381 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 1382 * check the protection bits before registering. 
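	 * Only regions mapped both readable and writable are registered with
	 * spdk_mem_register(); the 2MB alignment check above keeps the region
	 * compatible with SPDK's memory registration granularity.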
1383 */ 1384 if ((info->prot == (PROT_WRITE | PROT_READ)) && 1385 (spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len))) { 1386 SPDK_ERRLOG("Memory region register %#lx-%#lx failed\n", 1387 (uint64_t)(uintptr_t)info->mapping.iov_base, 1388 (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1389 } 1390 1391 for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 1392 qpair = ctrlr->qp[i]; 1393 if (qpair == NULL) { 1394 continue; 1395 } 1396 1397 if (qpair->state != VFIO_USER_QPAIR_INACTIVE) { 1398 continue; 1399 } 1400 1401 if (nvmf_qpair_is_admin_queue(&qpair->qpair)) { 1402 ret = map_admin_queue(ctrlr); 1403 if (ret) { 1404 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap Admin queue\n"); 1405 continue; 1406 } 1407 qpair->state = VFIO_USER_QPAIR_ACTIVE; 1408 SPDK_DEBUGLOG(nvmf_vfio, "Remap Admin queue\n"); 1409 } else { 1410 struct nvme_q *sq = &qpair->sq; 1411 struct nvme_q *cq = &qpair->cq; 1412 1413 sq->addr = map_one(ctrlr->endpoint->vfu_ctx, sq->prp1, sq->size * 64, sq->sg, &sq->iov, 1414 PROT_READ | PROT_WRITE); 1415 if (!sq->addr) { 1416 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n", 1417 i, sq->prp1, sq->prp1 + sq->size * 64); 1418 continue; 1419 } 1420 cq->addr = map_one(ctrlr->endpoint->vfu_ctx, cq->prp1, cq->size * 16, cq->sg, &cq->iov, 1421 PROT_READ | PROT_WRITE); 1422 if (!cq->addr) { 1423 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n", 1424 i, cq->prp1, cq->prp1 + cq->size * 16); 1425 continue; 1426 } 1427 qpair->state = VFIO_USER_QPAIR_ACTIVE; 1428 SPDK_DEBUGLOG(nvmf_vfio, "Remap IO QP%u\n", i); 1429 } 1430 } 1431 } 1432 1433 static int 1434 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1435 { 1436 1437 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1438 struct nvmf_vfio_user_ctrlr *ctrlr; 1439 struct nvmf_vfio_user_qpair *qpair; 1440 void *map_start, *map_end; 1441 int i; 1442 1443 if (!info->vaddr) { 1444 return 0; 1445 } 1446 1447 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 1448 (info->mapping.iov_len & MASK_2MB)) { 1449 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr, 1450 (uintptr_t)info->mapping.iov_base, 1451 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1452 return 0; 1453 } 1454 1455 assert(endpoint != NULL); 1456 if (endpoint->ctrlr == NULL) { 1457 return 0; 1458 } 1459 ctrlr = endpoint->ctrlr; 1460 1461 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %#lx-%#lx\n", ctrlr_id(ctrlr), 1462 (uintptr_t)info->mapping.iov_base, 1463 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1464 1465 if ((info->prot == (PROT_WRITE | PROT_READ)) && 1466 (spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len))) { 1467 SPDK_ERRLOG("Memory region unregister %#lx-%#lx failed\n", 1468 (uint64_t)(uintptr_t)info->mapping.iov_base, 1469 (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1470 } 1471 1472 map_start = info->mapping.iov_base; 1473 map_end = info->mapping.iov_base + info->mapping.iov_len; 1474 for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 1475 qpair = ctrlr->qp[i]; 1476 if (qpair == NULL) { 1477 continue; 1478 } 1479 1480 if ((qpair->cq.addr >= map_start && qpair->cq.addr < map_end) || 1481 (qpair->sq.addr >= map_start && qpair->sq.addr < map_end)) { 1482 unmap_qp(qpair); 1483 qpair->state = VFIO_USER_QPAIR_INACTIVE; 1484 } 1485 } 1486 1487 return 0; 1488 } 1489 1490 static int 1491 
nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 1492 { 1493 struct nvmf_vfio_user_qpair *qpair = cb_arg; 1494 int ret; 1495 1496 assert(qpair != NULL); 1497 assert(req != NULL); 1498 1499 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 1500 assert(qpair->ctrlr != NULL); 1501 assert(req != NULL); 1502 1503 memcpy(req->req.data, 1504 &req->req.rsp->prop_get_rsp.value.u64, 1505 req->req.length); 1506 } else { 1507 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 1508 assert(qpair->ctrlr != NULL); 1509 1510 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 1511 union spdk_nvme_cc_register *cc; 1512 1513 cc = (union spdk_nvme_cc_register *)&req->req.cmd->prop_set_cmd.value.u64; 1514 1515 if (cc->bits.en == 1 && cc->bits.shn == 0) { 1516 SPDK_DEBUGLOG(nvmf_vfio, 1517 "%s: MAP Admin queue\n", 1518 ctrlr_id(qpair->ctrlr)); 1519 ret = map_admin_queue(qpair->ctrlr); 1520 if (ret) { 1521 SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(qpair->ctrlr)); 1522 return ret; 1523 } 1524 qpair->state = VFIO_USER_QPAIR_ACTIVE; 1525 } else if ((cc->bits.en == 0 && cc->bits.shn == 0) || 1526 (cc->bits.en == 1 && cc->bits.shn != 0)) { 1527 SPDK_DEBUGLOG(nvmf_vfio, 1528 "%s: UNMAP Admin queue\n", 1529 ctrlr_id(qpair->ctrlr)); 1530 unmap_admin_queue(qpair->ctrlr); 1531 qpair->state = VFIO_USER_QPAIR_INACTIVE; 1532 /* For PCIe controller reset, we will drop all AER responses */ 1533 nvmf_ctrlr_abort_aer(req->req.qpair->ctrlr); 1534 } 1535 } 1536 } 1537 1538 return 0; 1539 } 1540 1541 /* 1542 * XXX Do NOT remove, see comment in access_bar0_fn. 1543 * 1544 * Handles a write at offset 0x1000 or more. 1545 * 1546 * DSTRD is set to fixed value 0 for NVMf. 1547 * 1548 */ 1549 static int 1550 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 1551 const size_t count, loff_t pos, const bool is_write) 1552 { 1553 assert(ctrlr != NULL); 1554 assert(buf != NULL); 1555 1556 if (count != sizeof(uint32_t)) { 1557 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 1558 ctrlr_id(ctrlr), count); 1559 errno = EINVAL; 1560 return -1; 1561 } 1562 1563 pos -= NVMF_VFIO_USER_DOORBELLS_OFFSET; 1564 1565 /* pos must be dword aligned */ 1566 if ((pos & 0x3) != 0) { 1567 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 1568 errno = EINVAL; 1569 return -1; 1570 } 1571 1572 /* convert byte offset to array index */ 1573 pos >>= 2; 1574 1575 if (pos > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR * 2) { 1576 /* 1577 * TODO: need to emit a "Write to Invalid Doorbell Register" 1578 * asynchronous event 1579 */ 1580 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 1581 errno = EINVAL; 1582 return -1; 1583 } 1584 1585 if (is_write) { 1586 ctrlr->doorbells[pos] = *buf; 1587 spdk_wmb(); 1588 } else { 1589 spdk_rmb(); 1590 *buf = ctrlr->doorbells[pos]; 1591 } 1592 return 0; 1593 } 1594 1595 static ssize_t 1596 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 1597 bool is_write) 1598 { 1599 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1600 struct nvmf_vfio_user_ctrlr *ctrlr; 1601 struct nvmf_vfio_user_req *req; 1602 int ret; 1603 1604 ctrlr = endpoint->ctrlr; 1605 1606 SPDK_DEBUGLOG(nvmf_vfio, 1607 "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n", 1608 endpoint_id(endpoint), is_write ? 
"write" : "read", 1609 ctrlr, count, pos); 1610 1611 if (pos >= NVMF_VFIO_USER_DOORBELLS_OFFSET) { 1612 /* 1613 * The fact that the doorbells can be memory mapped doesn't mean 1614 * that the client (VFIO in QEMU) is obliged to memory map them, 1615 * it might still elect to access them via regular read/write; 1616 * we might also have had disable_mappable_bar0 set. 1617 */ 1618 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 1619 pos, is_write); 1620 if (ret == 0) { 1621 return count; 1622 } 1623 assert(errno != 0); 1624 return ret; 1625 } 1626 1627 /* Construct a Fabric Property Get/Set command and send it */ 1628 req = get_nvmf_vfio_user_req(ctrlr->qp[0]); 1629 if (req == NULL) { 1630 errno = ENOBUFS; 1631 return -1; 1632 } 1633 1634 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 1635 req->cb_arg = ctrlr->qp[0]; 1636 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 1637 req->req.cmd->prop_set_cmd.cid = 0; 1638 req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1; 1639 req->req.cmd->prop_set_cmd.ofst = pos; 1640 if (is_write) { 1641 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 1642 if (req->req.cmd->prop_set_cmd.attrib.size) { 1643 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 1644 } else { 1645 req->req.cmd->prop_set_cmd.value.u32.high = 0; 1646 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 1647 } 1648 } else { 1649 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 1650 } 1651 req->req.length = count; 1652 req->req.data = buf; 1653 1654 spdk_nvmf_request_exec_fabrics(&req->req); 1655 1656 return count; 1657 } 1658 1659 /* 1660 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 1661 * available on PCI-X 2.0 and PCI Express buses 1662 */ 1663 static ssize_t 1664 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 1665 bool is_write) 1666 { 1667 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1668 1669 if (is_write) { 1670 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 1671 endpoint_id(endpoint), offset, offset + count); 1672 errno = EINVAL; 1673 return -1; 1674 } 1675 1676 if (offset + count > PCI_CFG_SPACE_EXP_SIZE) { 1677 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 1678 endpoint_id(endpoint), offset, count, 1679 PCI_CFG_SPACE_EXP_SIZE); 1680 errno = ERANGE; 1681 return -1; 1682 } 1683 1684 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 1685 1686 return count; 1687 } 1688 1689 static void 1690 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 1691 { 1692 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1693 1694 if (level >= LOG_DEBUG) { 1695 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 1696 } else if (level >= LOG_INFO) { 1697 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 1698 } else if (level >= LOG_NOTICE) { 1699 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 1700 } else if (level >= LOG_WARNING) { 1701 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 1702 } else { 1703 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 1704 } 1705 } 1706 1707 static void 1708 init_pci_config_space(vfu_pci_config_space_t *p) 1709 { 1710 /* MLBAR */ 1711 p->hdr.bars[0].raw = 0x0; 1712 /* MUBAR */ 1713 p->hdr.bars[1].raw = 0x0; 1714 1715 /* vendor specific, let's set them to zero for now */ 1716 p->hdr.bars[3].raw = 0x0; 1717 p->hdr.bars[4].raw = 0x0; 1718 
p->hdr.bars[5].raw = 0x0; 1719 1720 /* enable INTx */ 1721 p->hdr.intr.ipin = 0x1; 1722 } 1723 1724 static int 1725 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 1726 struct nvmf_vfio_user_endpoint *endpoint) 1727 { 1728 int ret; 1729 ssize_t cap_offset; 1730 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 1731 1732 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 1733 struct pxcap pxcap = { 1734 .hdr.id = PCI_CAP_ID_EXP, 1735 .pxcaps.ver = 0x2, 1736 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 1737 .pxdcap2.ctds = 0x1 1738 }; 1739 1740 struct msixcap msixcap = { 1741 .hdr.id = PCI_CAP_ID_MSIX, 1742 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 1743 .mtab = {.tbir = 0x4, .to = 0x0}, 1744 .mpba = {.pbir = 0x5, .pbao = 0x0} 1745 }; 1746 1747 static struct iovec sparse_mmap[] = { 1748 { 1749 .iov_base = (void *)NVMF_VFIO_USER_DOORBELLS_OFFSET, 1750 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 1751 }, 1752 }; 1753 1754 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 1755 if (ret < 0) { 1756 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 1757 return ret; 1758 } 1759 vfu_pci_set_id(vfu_ctx, 0x4e58, 0x0001, 0, 0); 1760 /* 1761 * 0x02, controller uses the NVM Express programming interface 1762 * 0x08, non-volatile memory controller 1763 * 0x01, mass storage controller 1764 */ 1765 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 1766 1767 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 1768 if (cap_offset < 0) { 1769 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 1770 return ret; 1771 } 1772 1773 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 1774 if (cap_offset < 0) { 1775 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 1776 return ret; 1777 } 1778 1779 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 1780 if (cap_offset < 0) { 1781 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 1782 return ret; 1783 } 1784 1785 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 1786 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 1787 if (ret < 0) { 1788 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 1789 return ret; 1790 } 1791 1792 if (vu_transport->transport_opts.disable_mappable_bar0) { 1793 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 1794 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 1795 NULL, 0, -1, 0); 1796 } else { 1797 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 1798 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 1799 sparse_mmap, 1, endpoint->fd, 0); 1800 } 1801 1802 if (ret < 0) { 1803 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 1804 return ret; 1805 } 1806 1807 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, PAGE_SIZE, 1808 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 1809 if (ret < 0) { 1810 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 1811 return ret; 1812 } 1813 1814 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, PAGE_SIZE, 1815 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 1816 if (ret < 0) { 1817 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 1818 return ret; 1819 } 1820 1821 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 1822 if (ret < 0) { 1823 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 1824 return ret; 1825 } 1826 1827 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 1828 if (ret < 0) { 1829 
SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 1830 return ret; 1831 } 1832 1833 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 1834 if (ret < 0) { 1835 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 1836 return ret; 1837 } 1838 1839 ret = vfu_realize_ctx(vfu_ctx); 1840 if (ret < 0) { 1841 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 1842 return ret; 1843 } 1844 1845 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 1846 assert(endpoint->pci_config_space != NULL); 1847 init_pci_config_space(endpoint->pci_config_space); 1848 1849 assert(cap_offset != 0); 1850 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 1851 1852 return 0; 1853 } 1854 1855 static void 1856 _free_ctrlr(void *ctx) 1857 { 1858 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 1859 int i; 1860 1861 for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 1862 free_qp(ctrlr, i); 1863 } 1864 1865 if (ctrlr->endpoint) { 1866 ctrlr->endpoint->ctrlr = NULL; 1867 } 1868 1869 spdk_poller_unregister(&ctrlr->mmio_poller); 1870 free(ctrlr); 1871 } 1872 1873 static int 1874 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 1875 { 1876 assert(ctrlr != NULL); 1877 1878 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 1879 1880 if (ctrlr->thread == spdk_get_thread()) { 1881 _free_ctrlr(ctrlr); 1882 } else { 1883 spdk_thread_send_msg(ctrlr->thread, _free_ctrlr, ctrlr); 1884 } 1885 1886 return 0; 1887 } 1888 1889 static void 1890 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 1891 struct nvmf_vfio_user_endpoint *endpoint) 1892 { 1893 struct nvmf_vfio_user_ctrlr *ctrlr; 1894 int err; 1895 1896 /* First, construct a vfio-user CUSTOM transport controller */ 1897 ctrlr = calloc(1, sizeof(*ctrlr)); 1898 if (ctrlr == NULL) { 1899 err = -ENOMEM; 1900 goto out; 1901 } 1902 ctrlr->cntlid = 0xffff; 1903 ctrlr->transport = transport; 1904 ctrlr->endpoint = endpoint; 1905 ctrlr->doorbells = endpoint->doorbells; 1906 1907 /* Then, construct an admin queue pair */ 1908 err = init_qp(ctrlr, &transport->transport, NVMF_VFIO_USER_DEFAULT_AQ_DEPTH, 0); 1909 if (err != 0) { 1910 goto out; 1911 } 1912 endpoint->ctrlr = ctrlr; 1913 1914 /* Notify the generic layer about the new admin queue pair */ 1915 TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, ctrlr->qp[0], link); 1916 1917 out: 1918 if (err != 0) { 1919 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 1920 endpoint_id(endpoint), strerror(-err)); 1921 if (free_ctrlr(ctrlr) != 0) { 1922 SPDK_ERRLOG("%s: failed to clean up\n", 1923 endpoint_id(endpoint)); 1924 } 1925 } 1926 } 1927 1928 static int 1929 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 1930 const struct spdk_nvme_transport_id *trid, 1931 struct spdk_nvmf_listen_opts *listen_opts) 1932 { 1933 struct nvmf_vfio_user_transport *vu_transport; 1934 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1935 char *path = NULL; 1936 char uuid[PATH_MAX] = {}; 1937 int fd; 1938 int err; 1939 1940 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1941 transport); 1942 1943 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1944 /* Only compare traddr */ 1945 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 1946 return -EEXIST; 1947 } 1948 } 1949 1950 endpoint = calloc(1, sizeof(*endpoint)); 1951 if (!endpoint) { 1952 return -ENOMEM; 1953 } 1954 1955 endpoint->fd = -1; 1956 memcpy(&endpoint->trid, 
trid, sizeof(endpoint->trid)); 1957 1958 err = asprintf(&path, "%s/bar0", endpoint_id(endpoint)); 1959 if (err == -1) { 1960 goto out; 1961 } 1962 1963 fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); 1964 if (fd == -1) { 1965 SPDK_ERRLOG("%s: failed to open device memory at %s: %m\n", 1966 endpoint_id(endpoint), path); 1967 err = fd; 1968 free(path); 1969 goto out; 1970 } 1971 free(path); 1972 1973 endpoint->fd = fd; 1974 err = ftruncate(fd, NVMF_VFIO_USER_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 1975 if (err != 0) { 1976 goto out; 1977 } 1978 1979 endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 1980 PROT_READ | PROT_WRITE, MAP_SHARED, fd, NVMF_VFIO_USER_DOORBELLS_OFFSET); 1981 if (endpoint->doorbells == MAP_FAILED) { 1982 endpoint->doorbells = NULL; 1983 err = -errno; 1984 goto out; 1985 } 1986 1987 snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 1988 1989 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 1990 endpoint, VFU_DEV_TYPE_PCI); 1991 if (endpoint->vfu_ctx == NULL) { 1992 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 1993 endpoint_id(endpoint)); 1994 err = -1; 1995 goto out; 1996 } 1997 vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, LOG_DEBUG); 1998 1999 err = vfio_user_dev_info_fill(vu_transport, endpoint); 2000 if (err < 0) { 2001 goto out; 2002 } 2003 2004 pthread_mutex_init(&endpoint->lock, NULL); 2005 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 2006 SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells); 2007 2008 out: 2009 if (err != 0) { 2010 nvmf_vfio_user_destroy_endpoint(endpoint); 2011 } 2012 2013 return err; 2014 } 2015 2016 static void 2017 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 2018 const struct spdk_nvme_transport_id *trid) 2019 { 2020 struct nvmf_vfio_user_transport *vu_transport; 2021 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 2022 int err; 2023 2024 assert(trid != NULL); 2025 assert(trid->traddr != NULL); 2026 2027 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 2028 2029 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 2030 transport); 2031 2032 pthread_mutex_lock(&vu_transport->lock); 2033 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 2034 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 2035 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 2036 if (endpoint->ctrlr) { 2037 err = free_ctrlr(endpoint->ctrlr); 2038 if (err != 0) { 2039 SPDK_ERRLOG("%s: failed destroy controller: %s\n", 2040 endpoint_id(endpoint), strerror(-err)); 2041 } 2042 } 2043 nvmf_vfio_user_destroy_endpoint(endpoint); 2044 pthread_mutex_unlock(&vu_transport->lock); 2045 2046 return; 2047 } 2048 } 2049 pthread_mutex_unlock(&vu_transport->lock); 2050 2051 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 2052 } 2053 2054 static void 2055 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 2056 struct spdk_nvmf_subsystem *subsystem, 2057 struct spdk_nvmf_ctrlr_data *cdata) 2058 { 2059 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 2060 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 2061 } 2062 2063 static int 2064 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 2065 const struct spdk_nvmf_subsystem *subsystem, 2066 const struct spdk_nvme_transport_id *trid) 2067 { 2068 struct nvmf_vfio_user_transport *vu_transport; 2069 struct 
static int
nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
				const struct spdk_nvmf_subsystem *subsystem,
				const struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint;

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);

	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
			break;
		}
	}

	if (endpoint == NULL) {
		return -ENOENT;
	}

	endpoint->subsystem = subsystem;

	return 0;
}

/*
 * Executed periodically.
 *
 * XXX SPDK thread context.
 */
static uint32_t
nvmf_vfio_user_accept(struct spdk_nvmf_transport *transport)
{
	int err;
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_qpair *qp, *tmp_qp;
	struct nvmf_vfio_user_endpoint *endpoint;

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);

	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
		/* try to attach a new controller */
		if (endpoint->ctrlr != NULL) {
			continue;
		}

		err = vfu_attach_ctx(endpoint->vfu_ctx);
		if (err != 0) {
			if (errno == EAGAIN || errno == EWOULDBLOCK) {
				continue;
			}

			pthread_mutex_unlock(&vu_transport->lock);
			return -EFAULT;
		}

		/* Construct a controller */
		nvmf_vfio_user_create_ctrlr(vu_transport, endpoint);
	}

	TAILQ_FOREACH_SAFE(qp, &vu_transport->new_qps, link, tmp_qp) {
		TAILQ_REMOVE(&vu_transport->new_qps, qp, link);
		spdk_nvmf_tgt_new_qpair(transport->tgt, &qp->qpair);
	}

	pthread_mutex_unlock(&vu_transport->lock);

	return 0;
}

static void
nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport,
			struct spdk_nvme_transport_id *trid,
			struct spdk_nvmf_discovery_log_page_entry *entry)
{ }

static struct spdk_nvmf_transport_poll_group *
nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport)
{
	struct nvmf_vfio_user_poll_group *vu_group;

	SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n");

	vu_group = calloc(1, sizeof(*vu_group));
	if (vu_group == NULL) {
		SPDK_ERRLOG("Error allocating poll group: %m");
		return NULL;
	}

	TAILQ_INIT(&vu_group->qps);

	return &vu_group->group;
}

/* called when process exits */
static void
nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;

	SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n");

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	free(vu_group);
}

static void
vfio_user_qpair_disconnect_cb(void *ctx)
{
	struct nvmf_vfio_user_endpoint *endpoint = ctx;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	pthread_mutex_lock(&endpoint->lock);
	ctrlr = endpoint->ctrlr;
	if (!ctrlr) {
		pthread_mutex_unlock(&endpoint->lock);
		return;
	}

	if (!ctrlr->num_connected_qps) {
		free_ctrlr(ctrlr);
		pthread_mutex_unlock(&endpoint->lock);
		return;
	}
	pthread_mutex_unlock(&endpoint->lock);
}
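/*
 * Controller teardown path for a client disconnect: vfio_user_poll_mmio()
 * sees vfu_run_ctx() fail with ENOTCONN and calls this function, which
 * disconnects every queue pair.  Each disconnect eventually runs
 * vfio_user_qpair_disconnect_cb() above; once num_connected_qps has dropped
 * to zero (it is decremented in nvmf_vfio_user_poll_group_remove()), the
 * controller itself is freed so that a new client can attach.
 */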
static int
vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	uint32_t i;
	struct nvmf_vfio_user_qpair *qpair;
	struct nvmf_vfio_user_endpoint *endpoint;

	SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr));

	endpoint = ctrlr->endpoint;
	assert(endpoint != NULL);

	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
		qpair = ctrlr->qp[i];
		if (qpair == NULL) {
			continue;
		}
		spdk_nvmf_qpair_disconnect(&qpair->qpair, vfio_user_qpair_disconnect_cb, endpoint);
	}

	return 0;
}

static int
vfio_user_poll_mmio(void *ctx)
{
	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
	int ret;

	assert(ctrlr != NULL);

	/* This will call access_bar0_fn() if there are any writes
	 * to the portion of the BAR that is not mmap'd */
	ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
	if (spdk_unlikely(ret != 0)) {
		spdk_poller_unregister(&ctrlr->mmio_poller);

		/* initiator shutdown or reset; wait for the next re-connect */
		if (errno == ENOTCONN) {
			vfio_user_destroy_ctrlr(ctrlr);
			return SPDK_POLLER_BUSY;
		}

		fail_ctrlr(ctrlr);
	}

	return SPDK_POLLER_BUSY;
}
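/*
 * Completion callback for the internal Fabrics CONNECT command issued in
 * nvmf_vfio_user_poll_group_add().  On success the queue pair is marked
 * active and added to its poll group's list; for the admin queue this is
 * also where the controller learns its cntlid, records its owning SPDK
 * thread and registers the MMIO poller.
 */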
static int
handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_qpair *qpair = cb_arg;
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct nvmf_vfio_user_endpoint *endpoint;

	assert(qpair != NULL);
	assert(req != NULL);

	ctrlr = qpair->ctrlr;
	assert(ctrlr != NULL);
	endpoint = ctrlr->endpoint;
	assert(endpoint != NULL);

	if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
		SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc,
			    req->req.rsp->nvme_cpl.status.sct);
		free_ctrlr(ctrlr);
		return -1;
	}

	vu_group = SPDK_CONTAINEROF(qpair->group, struct nvmf_vfio_user_poll_group, group);
	TAILQ_INSERT_TAIL(&vu_group->qps, qpair, link);
	qpair->state = VFIO_USER_QPAIR_ACTIVE;

	pthread_mutex_lock(&endpoint->lock);
	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
		ctrlr->cntlid = qpair->qpair.ctrlr->cntlid;
		ctrlr->thread = spdk_get_thread();
		ctrlr->mmio_poller = SPDK_POLLER_REGISTER(vfio_user_poll_mmio, ctrlr, 0);
	}
	ctrlr->num_connected_qps++;
	pthread_mutex_unlock(&endpoint->lock);

	free(req->req.data);
	req->req.data = NULL;

	return 0;
}

/*
 * Add the given qpair to the given poll group. New qpairs are added to
 * ->new_qps; they are processed via nvmf_vfio_user_accept(), calling
 * spdk_nvmf_tgt_new_qpair(), which picks a poll group, then calls back
 * here via nvmf_transport_poll_group_add().
 */
static int
nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
			      struct spdk_nvmf_qpair *qpair)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct spdk_nvmf_request *req;
	struct spdk_nvmf_fabric_connect_data *data;
	bool admin;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	vu_qpair->group = group;
	ctrlr = vu_qpair->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
		      ctrlr_id(ctrlr), vu_qpair->qpair.qid,
		      vu_qpair, qpair, group);

	admin = nvmf_qpair_is_admin_queue(&vu_qpair->qpair);

	vu_req = get_nvmf_vfio_user_req(vu_qpair);
	if (vu_req == NULL) {
		return -1;
	}

	req = &vu_req->req;
	req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
	req->cmd->connect_cmd.cid = 0;
	req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
	req->cmd->connect_cmd.recfmt = 0;
	req->cmd->connect_cmd.sqsize = vu_qpair->qsize - 1;
	req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;

	req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
	req->data = calloc(1, req->length);
	if (req->data == NULL) {
		nvmf_vfio_user_req_free(req);
		return -ENOMEM;
	}

	data = (struct spdk_nvmf_fabric_connect_data *)req->data;
	data->cntlid = admin ? 0xFFFF : ctrlr->cntlid;
	snprintf(data->subnqn, sizeof(data->subnqn), "%s",
		 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));

	vu_req->cb_fn = handle_queue_connect_rsp;
	vu_req->cb_arg = vu_qpair;

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
		      ctrlr_id(ctrlr), qpair->qid, data->cntlid);

	spdk_nvmf_request_exec_fabrics(req);
	return 0;
}

static int
nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
				 struct spdk_nvmf_qpair *qpair)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
	struct nvmf_vfio_user_endpoint *endpoint;
	struct nvmf_vfio_user_poll_group *vu_group;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	vu_ctrlr = vu_qpair->ctrlr;
	endpoint = vu_ctrlr->endpoint;

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
		      ctrlr_id(vu_qpair->ctrlr), qpair->qid, qpair, group);

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
	TAILQ_REMOVE(&vu_group->qps, vu_qpair, link);

	pthread_mutex_lock(&endpoint->lock);
	assert(vu_ctrlr->num_connected_qps);
	vu_ctrlr->num_connected_qps--;
	pthread_mutex_unlock(&endpoint->lock);

	return 0;
}
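/*
 * Request lifecycle: every queue pair owns a preallocated array of
 * nvmf_vfio_user_req objects (reqs_internal) that are kept on the qpair's
 * free list (reqs).  get_nvmf_vfio_user_req() pops a request off that list,
 * and the helpers below reset it and push it back once the NVMf layer is
 * done with it.
 */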
static void
_nvmf_vfio_user_req_free(struct nvmf_vfio_user_qpair *vu_qpair, struct nvmf_vfio_user_req *vu_req)
{
	memset(&vu_req->cmd, 0, sizeof(vu_req->cmd));
	memset(&vu_req->rsp, 0, sizeof(vu_req->rsp));
	vu_req->iovcnt = 0;
	vu_req->state = VFIO_USER_REQUEST_STATE_FREE;

	TAILQ_INSERT_TAIL(&vu_qpair->reqs, vu_req, link);
}

static int
nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req;

	assert(req != NULL);

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);

	_nvmf_vfio_user_req_free(vu_qpair, vu_req);

	return 0;
}

static int
nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req;

	assert(req != NULL);

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);

	if (vu_req->cb_fn != NULL) {
		if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) {
			fail_ctrlr(vu_qpair->ctrlr);
		}
	}

	_nvmf_vfio_user_req_free(vu_qpair, vu_req);

	return 0;
}

static void
nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
			   spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
{
	struct nvmf_vfio_user_qpair *vu_qpair;

	assert(qpair != NULL);
	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	free_qp(vu_qpair->ctrlr, qpair->qid);

	if (cb_fn) {
		cb_fn(cb_arg);
	}
}

/**
 * Returns a preallocated request, or NULL if there isn't one available.
 */
static struct nvmf_vfio_user_req *
get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair)
{
	struct nvmf_vfio_user_req *req;

	assert(qpair != NULL);

	if (TAILQ_EMPTY(&qpair->reqs)) {
		return NULL;
	}

	req = TAILQ_FIRST(&qpair->reqs);
	TAILQ_REMOVE(&qpair->reqs, req, link);

	return req;
}

static struct spdk_nvmf_request *
get_nvmf_req(struct nvmf_vfio_user_qpair *qpair)
{
	struct nvmf_vfio_user_req *req = get_nvmf_vfio_user_req(qpair);

	if (req == NULL) {
		return NULL;
	}
	return &req->req;
}

static int
get_nvmf_io_req_length(struct spdk_nvmf_request *req)
{
	uint16_t nlb, nr;
	uint32_t nsid;
	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
	struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
	struct spdk_nvmf_ns *ns;

	nsid = cmd->nsid;
	ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
	if (ns == NULL || ns->bdev == NULL) {
		SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
		return -EINVAL;
	}

	if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
		nr = cmd->cdw10_bits.dsm.nr + 1;
		return nr * sizeof(struct spdk_nvme_dsm_range);
	}

	nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
	return nlb * spdk_bdev_get_block_size(ns->bdev);
}
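/*
 * Maps the data buffer of an admin command, if any.  The transfer direction
 * comes from the two low bits of the opcode (the same encoding that
 * spdk_nvme_opc_get_data_transfer() uses for I/O commands); for example,
 * Identify (opcode 0x06): 0x06 & 0x3 == 0x2 == SPDK_NVME_DATA_CONTROLLER_TO_HOST.
 * Only opcodes that are given a length below (currently Identify and
 * Get Log Page) have their PRP-described payload mapped into req->iov.
 */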
static int
map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
	uint32_t len = 0;
	int iovcnt;

	req->xfer = cmd->opc & 0x3;
	req->length = 0;
	req->data = NULL;

	switch (cmd->opc) {
	case SPDK_NVME_OPC_IDENTIFY:
		len = 4096; /* TODO: there should be a define somewhere for this */
		break;
	case SPDK_NVME_OPC_GET_LOG_PAGE:
		len = (cmd->cdw10_bits.get_log_page.numdl + 1) * 4;
		break;
	}

	if (!cmd->dptr.prp.prp1 || !len) {
		return 0;
	}
	/* Admin commands do not use SGLs */
	assert(req->cmd->nvme_cmd.psdt == 0);
	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
	if (iovcnt < 0) {
		SPDK_ERRLOG("%s: map Admin Opc %x failed\n",
			    ctrlr_id(ctrlr), cmd->opc);
		return -1;
	}

	req->length = len;
	req->data = req->iov[0].iov_base;

	return 0;
}

/*
 * Handles an I/O command by mapping its data buffer.
 *
 * Returns 0 on success and -errno on failure.
 */
static int
map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
	int err = 0;
	struct spdk_nvme_cmd *cmd;

	assert(ctrlr != NULL);
	assert(req != NULL);

	cmd = &req->cmd->nvme_cmd;
	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);

	if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
		return 0;
	}

	err = get_nvmf_io_req_length(req);
	if (err < 0) {
		return -EINVAL;
	}

	req->length = err;
	err = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
	if (err < 0) {
		SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
		return -EFAULT;
	}

	req->data = req->iov[0].iov_base;
	req->iovcnt = err;

	return 0;
}

static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct spdk_nvmf_request *req)
{
	int err;
	struct nvmf_vfio_user_req *vu_req;

	assert(ctrlr != NULL);
	assert(cmd != NULL);

	/*
	 * TODO: a NULL req means that there are no free requests available.
	 * Returning -1 will fail the controller.  Theoretically this error
	 * can be avoided completely by ensuring we have as many requests as
	 * slots in the SQ, plus one for the property request.
	 */
	if (spdk_unlikely(req == NULL)) {
		return -1;
	}

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	vu_req->cb_fn = handle_cmd_rsp;
	vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
	req->cmd->nvme_cmd = *cmd;
	if (nvmf_qpair_is_admin_queue(req->qpair)) {
		err = map_admin_cmd_req(ctrlr, req);
		if (cmd->opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
			vu_req->cb_fn = handle_admin_aer_rsp;
		}
	} else {
		err = map_io_cmd_req(ctrlr, req);
	}

	if (spdk_unlikely(err < 0)) {
		SPDK_ERRLOG("%s: map NVMe command opc 0x%x failed\n",
			    ctrlr_id(ctrlr), cmd->opc);
		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
		return handle_cmd_rsp(vu_req, vu_req->cb_arg);
	}

	vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
	spdk_nvmf_request_exec(req);

	return 0;
}
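/*
 * Polls a single queue pair: reads the submission queue tail doorbell and,
 * if it has moved past the cached head, hands the new submission queue
 * entries to handle_sq_tdbl_write().  Any error there is treated as fatal
 * for the controller.
 */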
static void
nvmf_vfio_user_qpair_poll(struct nvmf_vfio_user_qpair *qpair)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	uint32_t new_tail;

	assert(qpair != NULL);

	ctrlr = qpair->ctrlr;

	new_tail = *tdbl(ctrlr, &qpair->sq);
	if (sq_head(qpair) != new_tail) {
		int err = handle_sq_tdbl_write(ctrlr, new_tail, qpair);
		if (err != 0) {
			fail_ctrlr(ctrlr);
			return;
		}
	}
}

/*
 * Called unconditionally, periodically, and very frequently from SPDK to
 * check whether there is work to be done.  New commands are picked up by
 * looking at the submission queue tail doorbell of every active queue pair
 * in the group (see nvmf_vfio_user_qpair_poll()).
 */
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_qpair *vu_qpair, *tmp;

	assert(group != NULL);

	spdk_rmb();

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	TAILQ_FOREACH_SAFE(vu_qpair, &vu_group->qps, link, tmp) {
		if (spdk_unlikely(vu_qpair->state != VFIO_USER_QPAIR_ACTIVE || !vu_qpair->sq.size)) {
			continue;
		}
		nvmf_vfio_user_qpair_poll(vu_qpair);
	}

	return 0;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
				    struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	ctrlr = vu_qpair->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvme_transport_id *trid)
{
	return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
				     struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
	ctrlr = vu_qpair->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct nvmf_vfio_user_req *vu_req, *vu_req_to_abort = NULL;
	uint16_t i, cid;

	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);

	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
	for (i = 0; i < vu_qpair->qsize; i++) {
		vu_req = &vu_qpair->reqs_internal[i];
		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
			vu_req_to_abort = vu_req;
			break;
		}
	}

	if (vu_req_to_abort == NULL) {
		spdk_nvmf_request_complete(req);
		return;
	}

	req->req_to_abort = &vu_req_to_abort->req;
	nvmf_ctrlr_abort_request(req);
}

static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
	opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
	opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
	opts->in_capsule_data_size = 0;
	opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
	opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
	opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
	opts->num_shared_buffers = 0;
	opts->buf_cache_size = 0;
	opts->association_timeout = 0;
	opts->transport_specific = NULL;
}
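/*
 * Transport operations table registered with the generic NVMf layer.  The
 * transport is selected by its "VFIOUSER" name (SPDK_NVME_TRANSPORT_VFIOUSER),
 * e.g. when a transport of that type is created through the usual
 * nvmf_create_transport RPC.
 */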
const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
	.name = "VFIOUSER",
	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
	.opts_init = nvmf_vfio_user_opts_init,
	.create = nvmf_vfio_user_create,
	.destroy = nvmf_vfio_user_destroy,

	.listen = nvmf_vfio_user_listen,
	.stop_listen = nvmf_vfio_user_stop_listen,
	.accept = nvmf_vfio_user_accept,
	.cdata_init = nvmf_vfio_user_cdata_init,
	.listen_associate = nvmf_vfio_user_listen_associate,

	.listener_discover = nvmf_vfio_user_discover,

	.poll_group_create = nvmf_vfio_user_poll_group_create,
	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
	.poll_group_add = nvmf_vfio_user_poll_group_add,
	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
	.poll_group_poll = nvmf_vfio_user_poll_group_poll,

	.req_free = nvmf_vfio_user_req_free,
	.req_complete = nvmf_vfio_user_req_complete,

	.qpair_fini = nvmf_vfio_user_close_qpair,
	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)