1 /*- 2 * BSD LICENSE 3 * Copyright (c) Intel Corporation. All rights reserved. 4 * Copyright (c) 2019, Nutanix Inc. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * * Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * * Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in 14 * the documentation and/or other materials provided with the 15 * distribution. 16 * * Neither the name of Intel Corporation nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * NVMe over vfio-user transport 35 */ 36 37 #include <vfio-user/libvfio-user.h> 38 #include <vfio-user/pci_defs.h> 39 40 #include "spdk/barrier.h" 41 #include "spdk/stdinc.h" 42 #include "spdk/assert.h" 43 #include "spdk/thread.h" 44 #include "spdk/nvmf_transport.h" 45 #include "spdk/sock.h" 46 #include "spdk/string.h" 47 #include "spdk/util.h" 48 #include "spdk/log.h" 49 50 #include "transport.h" 51 52 #include "nvmf_internal.h" 53 54 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 55 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 56 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 57 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 58 59 #define NVME_DOORBELLS_OFFSET 0x1000 60 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 61 62 #define NVME_REG_CFG_SIZE PCI_CFG_SPACE_EXP_SIZE 63 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 64 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8) 65 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 66 /* MSIX Table Size */ 67 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 68 /* MSIX Pending Bit Array Size */ 69 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000) 70 71 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 72 73 struct nvmf_vfio_user_req; 74 struct nvmf_vfio_user_qpair; 75 76 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 77 78 /* 1 more for PRP2 list itself */ 79 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 80 81 enum nvmf_vfio_user_req_state { 82 VFIO_USER_REQUEST_STATE_FREE = 0, 83 VFIO_USER_REQUEST_STATE_EXECUTING, 84 }; 85 86 struct nvmf_vfio_user_req { 87 struct spdk_nvmf_request req; 88 
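	/* Response/command buffers; init_qp() points req.rsp and req.cmd at these. */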
struct spdk_nvme_cpl rsp; 89 struct spdk_nvme_cmd cmd; 90 91 enum nvmf_vfio_user_req_state state; 92 nvmf_vfio_user_req_cb_fn cb_fn; 93 void *cb_arg; 94 95 /* old CC before prop_set_cc fabric command */ 96 union spdk_nvme_cc_register cc; 97 98 /* placeholder for gpa_to_vva memory map table, the IO buffer doesn't use it */ 99 dma_sg_t *sg; 100 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 101 uint8_t iovcnt; 102 103 TAILQ_ENTRY(nvmf_vfio_user_req) link; 104 }; 105 106 /* 107 * A NVMe queue. 108 */ 109 struct nvme_q { 110 bool is_cq; 111 112 void *addr; 113 114 dma_sg_t *sg; 115 struct iovec iov; 116 117 uint32_t size; 118 uint64_t prp1; 119 120 union { 121 struct { 122 uint32_t head; 123 /* multiple SQs can be mapped to the same CQ */ 124 uint16_t cqid; 125 }; 126 struct { 127 uint32_t tail; 128 uint16_t iv; 129 bool ien; 130 bool phase; 131 }; 132 }; 133 }; 134 135 enum nvmf_vfio_user_qpair_state { 136 VFIO_USER_QPAIR_UNINITIALIZED = 0, 137 VFIO_USER_QPAIR_ACTIVE, 138 VFIO_USER_QPAIR_SQ_DELETED, 139 VFIO_USER_QPAIR_INACTIVE, 140 VFIO_USER_QPAIR_ERROR, 141 }; 142 143 struct nvmf_vfio_user_qpair { 144 struct spdk_nvmf_qpair qpair; 145 struct spdk_nvmf_transport_poll_group *group; 146 struct nvmf_vfio_user_ctrlr *ctrlr; 147 struct nvmf_vfio_user_req *reqs_internal; 148 uint32_t qsize; 149 struct nvme_q cq; 150 struct nvme_q sq; 151 enum nvmf_vfio_user_qpair_state state; 152 153 /* Copy of Create IO SQ command */ 154 struct spdk_nvme_cmd create_io_sq_cmd; 155 156 TAILQ_HEAD(, nvmf_vfio_user_req) reqs; 157 /* Poll group entry */ 158 TAILQ_ENTRY(nvmf_vfio_user_qpair) link; 159 /* Connected queue pair entry */ 160 TAILQ_ENTRY(nvmf_vfio_user_qpair) tailq; 161 }; 162 163 struct nvmf_vfio_user_poll_group { 164 struct spdk_nvmf_transport_poll_group group; 165 TAILQ_HEAD(, nvmf_vfio_user_qpair) qps; 166 }; 167 168 struct nvmf_vfio_user_ctrlr { 169 struct nvmf_vfio_user_endpoint *endpoint; 170 struct nvmf_vfio_user_transport *transport; 171 172 /* Connected queue pairs list */ 173 TAILQ_HEAD(, nvmf_vfio_user_qpair) connected_qps; 174 175 struct spdk_thread *thread; 176 struct spdk_poller *vfu_ctx_poller; 177 178 uint16_t cntlid; 179 struct spdk_nvmf_ctrlr *ctrlr; 180 181 struct nvmf_vfio_user_qpair *qp[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 182 183 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 184 185 volatile uint32_t *doorbells; 186 187 /* internal CSTS.CFS register for vfio-user fatal errors */ 188 uint32_t cfs : 1; 189 }; 190 191 struct nvmf_vfio_user_endpoint { 192 vfu_ctx_t *vfu_ctx; 193 struct msixcap *msix; 194 vfu_pci_config_space_t *pci_config_space; 195 int devmem_fd; 196 volatile uint32_t *doorbells; 197 198 struct spdk_nvme_transport_id trid; 199 const struct spdk_nvmf_subsystem *subsystem; 200 201 struct nvmf_vfio_user_ctrlr *ctrlr; 202 pthread_mutex_t lock; 203 204 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 205 }; 206 207 struct nvmf_vfio_user_transport_opts { 208 bool disable_mappable_bar0; 209 }; 210 211 struct nvmf_vfio_user_transport { 212 struct spdk_nvmf_transport transport; 213 struct nvmf_vfio_user_transport_opts transport_opts; 214 pthread_mutex_t lock; 215 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 216 }; 217 218 /* 219 * function prototypes 220 */ 221 static volatile uint32_t * 222 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q); 223 224 static volatile uint32_t * 225 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q); 226 227 static int 228 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 229 230 static struct nvmf_vfio_user_req * 231 
get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair); 232 233 static int 234 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 235 uint32_t max_iovcnt, uint32_t len, size_t mps, 236 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 237 { 238 uint64_t prp1, prp2; 239 void *vva; 240 uint32_t i; 241 uint32_t residue_len, nents; 242 uint64_t *prp_list; 243 uint32_t iovcnt; 244 245 assert(max_iovcnt > 0); 246 247 prp1 = cmd->dptr.prp.prp1; 248 prp2 = cmd->dptr.prp.prp2; 249 250 /* PRP1 may started with unaligned page address */ 251 residue_len = mps - (prp1 % mps); 252 residue_len = spdk_min(len, residue_len); 253 254 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 255 if (spdk_unlikely(vva == NULL)) { 256 SPDK_ERRLOG("GPA to VVA failed\n"); 257 return -EINVAL; 258 } 259 len -= residue_len; 260 if (len && max_iovcnt < 2) { 261 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 262 return -ERANGE; 263 } 264 iovs[0].iov_base = vva; 265 iovs[0].iov_len = residue_len; 266 267 if (len) { 268 if (spdk_unlikely(prp2 == 0)) { 269 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 270 return -EINVAL; 271 } 272 273 if (len <= mps) { 274 /* 2 PRP used */ 275 iovcnt = 2; 276 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 277 if (spdk_unlikely(vva == NULL)) { 278 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 279 prp2, len); 280 return -EINVAL; 281 } 282 iovs[1].iov_base = vva; 283 iovs[1].iov_len = len; 284 } else { 285 /* PRP list used */ 286 nents = (len + mps - 1) / mps; 287 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 288 SPDK_ERRLOG("Too many page entries\n"); 289 return -ERANGE; 290 } 291 292 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 293 if (spdk_unlikely(vva == NULL)) { 294 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 295 prp2, nents); 296 return -EINVAL; 297 } 298 prp_list = vva; 299 i = 0; 300 while (len != 0) { 301 residue_len = spdk_min(len, mps); 302 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 303 if (spdk_unlikely(vva == NULL)) { 304 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 305 prp_list[i], residue_len); 306 return -EINVAL; 307 } 308 iovs[i + 1].iov_base = vva; 309 iovs[i + 1].iov_len = residue_len; 310 len -= residue_len; 311 i++; 312 } 313 iovcnt = i + 1; 314 } 315 } else { 316 /* 1 PRP used */ 317 iovcnt = 1; 318 } 319 320 assert(iovcnt <= max_iovcnt); 321 return iovcnt; 322 } 323 324 static int 325 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 326 struct iovec *iovs, uint32_t max_iovcnt, 327 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 328 { 329 uint32_t i; 330 void *vva; 331 332 if (spdk_unlikely(max_iovcnt < num_sgls)) { 333 return -ERANGE; 334 } 335 336 for (i = 0; i < num_sgls; i++) { 337 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 338 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 339 return -EINVAL; 340 } 341 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 342 if (spdk_unlikely(vva == NULL)) { 343 SPDK_ERRLOG("GPA to VVA failed\n"); 344 return -EINVAL; 345 } 346 iovs[i].iov_base = vva; 347 iovs[i].iov_len = sgls[i].unkeyed.length; 348 } 349 350 return num_sgls; 351 } 352 353 static int 354 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 355 uint32_t len, size_t mps, 356 void *(*gpa_to_vva)(void *prv, 
uint64_t addr, uint64_t len, int prot)) 357 { 358 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 359 uint32_t num_sgls, seg_len; 360 void *vva; 361 int ret; 362 uint32_t total_iovcnt = 0; 363 364 /* SGL cases */ 365 sgl = &cmd->dptr.sgl1; 366 367 /* only one SGL segment */ 368 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 369 assert(max_iovcnt > 0); 370 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 371 if (spdk_unlikely(vva == NULL)) { 372 SPDK_ERRLOG("GPA to VVA failed\n"); 373 return -EINVAL; 374 } 375 iovs[0].iov_base = vva; 376 iovs[0].iov_len = sgl->unkeyed.length; 377 assert(sgl->unkeyed.length == len); 378 379 return 1; 380 } 381 382 for (;;) { 383 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 384 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 385 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 386 return -EINVAL; 387 } 388 389 seg_len = sgl->unkeyed.length; 390 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 391 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 392 return -EINVAL; 393 } 394 395 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 396 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 397 if (spdk_unlikely(vva == NULL)) { 398 SPDK_ERRLOG("GPA to VVA failed\n"); 399 return -EINVAL; 400 } 401 402 /* sgl point to the first segment */ 403 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 404 last_sgl = &sgl[num_sgls - 1]; 405 406 /* we are done */ 407 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 408 /* map whole sgl list */ 409 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 410 max_iovcnt - total_iovcnt, gpa_to_vva); 411 if (spdk_unlikely(ret < 0)) { 412 return ret; 413 } 414 total_iovcnt += ret; 415 416 return total_iovcnt; 417 } 418 419 if (num_sgls > 1) { 420 /* map whole sgl exclude last_sgl */ 421 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 422 max_iovcnt - total_iovcnt, gpa_to_vva); 423 if (spdk_unlikely(ret < 0)) { 424 return ret; 425 } 426 total_iovcnt += ret; 427 } 428 429 /* move to next level's segments */ 430 sgl = last_sgl; 431 } 432 433 return 0; 434 } 435 436 static int 437 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 438 uint32_t len, size_t mps, 439 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 440 { 441 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 442 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 443 } 444 445 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 446 } 447 448 static char * 449 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 450 { 451 return endpoint->trid.traddr; 452 } 453 454 static char * 455 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 456 { 457 if (!ctrlr || !ctrlr->endpoint) { 458 return "Null Ctrlr"; 459 } 460 461 return endpoint_id(ctrlr->endpoint); 462 } 463 464 static inline uint16_t 465 io_q_id(struct nvme_q *q) 466 { 467 468 struct nvmf_vfio_user_qpair *vu_qpair; 469 470 assert(q); 471 472 if (q->is_cq) { 473 vu_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, cq); 474 } else { 475 vu_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, sq); 476 } 477 assert(vu_qpair); 478 return vu_qpair->qpair.qid; 479 } 480 481 static void 482 fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 483 { 484 assert(ctrlr != NULL); 485 486 if (ctrlr->cfs == 0) { 487 SPDK_ERRLOG(":%s failing 
controller\n", ctrlr_id(ctrlr)); 488 } 489 490 ctrlr->cfs = 1U; 491 } 492 493 static inline bool 494 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 495 { 496 assert(vu_ctrlr != NULL); 497 assert(vu_ctrlr->endpoint != NULL); 498 499 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 500 501 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 502 } 503 504 static void 505 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 506 { 507 if (endpoint->doorbells) { 508 munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 509 } 510 511 if (endpoint->devmem_fd > 0) { 512 close(endpoint->devmem_fd); 513 } 514 515 if (endpoint->vfu_ctx) { 516 vfu_destroy_ctx(endpoint->vfu_ctx); 517 } 518 519 pthread_mutex_destroy(&endpoint->lock); 520 free(endpoint); 521 } 522 523 /* called when process exits */ 524 static int 525 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 526 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 527 { 528 struct nvmf_vfio_user_transport *vu_transport; 529 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 530 531 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 532 533 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 534 transport); 535 536 (void)pthread_mutex_destroy(&vu_transport->lock); 537 538 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 539 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 540 nvmf_vfio_user_destroy_endpoint(endpoint); 541 } 542 543 free(vu_transport); 544 545 if (cb_fn) { 546 cb_fn(cb_arg); 547 } 548 549 return 0; 550 } 551 552 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 553 { 554 "disable_mappable_bar0", 555 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 556 spdk_json_decode_bool, true 557 }, 558 }; 559 560 static struct spdk_nvmf_transport * 561 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 562 { 563 struct nvmf_vfio_user_transport *vu_transport; 564 int err; 565 566 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 567 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 568 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 569 return NULL; 570 } 571 572 vu_transport = calloc(1, sizeof(*vu_transport)); 573 if (vu_transport == NULL) { 574 SPDK_ERRLOG("Transport alloc fail: %m\n"); 575 return NULL; 576 } 577 578 err = pthread_mutex_init(&vu_transport->lock, NULL); 579 if (err != 0) { 580 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 581 goto err; 582 } 583 584 TAILQ_INIT(&vu_transport->endpoints); 585 586 if (opts->transport_specific != NULL && 587 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 588 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 589 vu_transport)) { 590 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 591 free(vu_transport); 592 return NULL; 593 } 594 595 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 596 vu_transport->transport_opts.disable_mappable_bar0); 597 598 return &vu_transport->transport; 599 600 err: 601 free(vu_transport); 602 603 return NULL; 604 } 605 606 static uint32_t 607 max_queue_size(struct nvmf_vfio_user_ctrlr const *ctrlr) 608 { 609 assert(ctrlr != NULL); 610 assert(ctrlr->qp[0] != NULL); 611 assert(ctrlr->qp[0]->qpair.ctrlr != NULL); 612 613 return ctrlr->qp[0]->qpair.ctrlr->vcprop.cap.bits.mqes + 1; 614 } 615 616 static 
void * 617 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov, int prot) 618 { 619 int ret; 620 621 assert(ctx != NULL); 622 assert(sg != NULL); 623 assert(iov != NULL); 624 625 ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 626 if (ret < 0) { 627 return NULL; 628 } 629 630 ret = vfu_map_sg(ctx, sg, iov, 1, 0); 631 if (ret != 0) { 632 return NULL; 633 } 634 635 assert(iov->iov_base != NULL); 636 return iov->iov_base; 637 } 638 639 static inline uint32_t 640 sq_head(struct nvmf_vfio_user_qpair *qpair) 641 { 642 assert(qpair != NULL); 643 return qpair->sq.head; 644 } 645 646 static inline void 647 sqhd_advance(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair) 648 { 649 assert(ctrlr != NULL); 650 assert(qpair != NULL); 651 qpair->sq.head = (qpair->sq.head + 1) % qpair->sq.size; 652 } 653 654 static int 655 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q *q, bool is_cq, bool unmap) 656 { 657 uint64_t len; 658 659 assert(q->size); 660 assert(q->addr == NULL); 661 662 if (is_cq) { 663 len = q->size * sizeof(struct spdk_nvme_cpl); 664 } else { 665 len = q->size * sizeof(struct spdk_nvme_cmd); 666 } 667 668 q->addr = map_one(vu_ctrlr->endpoint->vfu_ctx, q->prp1, len, q->sg, 669 &q->iov, is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 670 if (q->addr == NULL) { 671 return -EFAULT; 672 } 673 674 if (unmap) { 675 memset(q->addr, 0, len); 676 } 677 678 return 0; 679 } 680 681 static int 682 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 683 { 684 struct nvme_q *sq; 685 const struct spdk_nvmf_registers *regs; 686 int ret; 687 688 assert(ctrlr != NULL); 689 assert(ctrlr->qp[0] != NULL); 690 assert(ctrlr->qp[0]->sq.addr == NULL); 691 /* XXX ctrlr->asq == 0 is a valid memory address */ 692 693 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 694 sq = &ctrlr->qp[0]->sq; 695 sq->size = regs->aqa.bits.asqs + 1; 696 sq->prp1 = regs->asq; 697 sq->head = 0; 698 sq->cqid = 0; 699 sq->is_cq = false; 700 701 ret = map_q(ctrlr, sq, false, true); 702 if (ret) { 703 return ret; 704 } 705 706 *tdbl(ctrlr, sq) = 0; 707 708 return 0; 709 } 710 711 static inline int 712 queue_index(uint16_t qid, int is_cq) 713 { 714 return (qid * 2) + is_cq; 715 } 716 717 static volatile uint32_t * 718 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 719 { 720 assert(ctrlr != NULL); 721 assert(q != NULL); 722 assert(!q->is_cq); 723 724 return &ctrlr->doorbells[queue_index(io_q_id(q), false)]; 725 } 726 727 static volatile uint32_t * 728 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 729 { 730 assert(ctrlr != NULL); 731 assert(q != NULL); 732 assert(q->is_cq); 733 734 return &ctrlr->doorbells[queue_index(io_q_id(q), true)]; 735 } 736 737 static inline bool 738 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 739 { 740 assert(ctrlr != NULL); 741 assert(q != NULL); 742 assert(q->is_cq); 743 744 return ((q->tail + 1) % q->size) == *hdbl(ctrlr, q); 745 } 746 747 static inline void 748 cq_tail_advance(struct nvme_q *q) 749 { 750 assert(q != NULL); 751 assert(q->is_cq); 752 753 assert(q->tail < q->size); 754 q->tail++; 755 756 if (spdk_unlikely(q->tail == q->size)) { 757 q->tail = 0; 758 q->phase = !q->phase; 759 } 760 } 761 762 static int 763 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 764 { 765 struct nvme_q *cq; 766 const struct spdk_nvmf_registers *regs; 767 int ret; 768 769 assert(ctrlr != NULL); 770 assert(ctrlr->qp[0] != NULL); 771 assert(ctrlr->qp[0]->cq.addr == NULL); 772 773 regs = 
spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 774 assert(regs != NULL); 775 cq = &ctrlr->qp[0]->cq; 776 cq->size = regs->aqa.bits.acqs + 1; 777 cq->prp1 = regs->acq; 778 cq->tail = 0; 779 cq->is_cq = true; 780 cq->ien = true; 781 cq->phase = true; 782 783 ret = map_q(ctrlr, cq, true, true); 784 if (ret) { 785 return ret; 786 } 787 *hdbl(ctrlr, cq) = 0; 788 789 return 0; 790 } 791 792 static inline dma_sg_t * 793 vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt) 794 { 795 return (dma_sg_t *)((uintptr_t)vu_req->sg + iovcnt * dma_sg_size()); 796 } 797 798 static void * 799 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 800 { 801 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 802 struct spdk_nvmf_qpair *qpair; 803 struct nvmf_vfio_user_req *vu_req; 804 struct nvmf_vfio_user_qpair *vu_qpair; 805 void *ret; 806 807 assert(req != NULL); 808 qpair = req->qpair; 809 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 810 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 811 812 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 813 ret = map_one(vu_qpair->ctrlr->endpoint->vfu_ctx, addr, len, 814 vu_req_to_sg_t(vu_req, vu_req->iovcnt), 815 &vu_req->iov[vu_req->iovcnt], prot); 816 if (spdk_likely(ret != NULL)) { 817 vu_req->iovcnt++; 818 } 819 return ret; 820 } 821 822 static int 823 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 824 struct iovec *iov, uint32_t length) 825 { 826 /* Map PRP list to from Guest physical memory to 827 * virtual memory address. 828 */ 829 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 830 length, 4096, _map_one); 831 } 832 833 static int 834 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 835 struct nvmf_vfio_user_qpair *vu_qpair); 836 837 /* 838 * Posts a CQE in the completion queue. 
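 * The entry is written at the CQ's current tail in guest memory, carrying the
 * submission queue's current head pointer (SQHD) and the CQ's phase bit; the
 * tail is then advanced, wrapping and flipping the phase when needed.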
839 * 840 * @ctrlr: the vfio-user controller 841 * @cq: the completion queue 842 * @cdw0: cdw0 as reported by NVMf 843 * @sqid: submission queue ID 844 * @cid: command identifier in NVMe command 845 * @sc: the NVMe CQE status code 846 * @sct: the NVMe CQE status code type 847 */ 848 static int 849 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *cq, 850 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 851 { 852 struct spdk_nvme_cpl *cpl; 853 const struct spdk_nvmf_registers *regs; 854 int err; 855 856 assert(ctrlr != NULL); 857 858 if (spdk_unlikely(cq == NULL || cq->addr == NULL)) { 859 return 0; 860 } 861 862 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 863 if (regs->csts.bits.shst != SPDK_NVME_SHST_NORMAL) { 864 SPDK_DEBUGLOG(nvmf_vfio, 865 "%s: ignore completion SQ%d cid=%d status=%#x\n", 866 ctrlr_id(ctrlr), sqid, cid, sc); 867 return 0; 868 } 869 870 if (cq_is_full(ctrlr, cq)) { 871 SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n", 872 ctrlr_id(ctrlr), io_q_id(cq), cq->tail, *hdbl(ctrlr, cq)); 873 return -1; 874 } 875 876 cpl = ((struct spdk_nvme_cpl *)cq->addr) + cq->tail; 877 878 assert(ctrlr->qp[sqid] != NULL); 879 SPDK_DEBUGLOG(nvmf_vfio, 880 "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n", 881 ctrlr_id(ctrlr), sqid, cid, sc, sq_head(ctrlr->qp[sqid]), 882 cq->tail); 883 884 cpl->sqhd = sq_head(ctrlr->qp[sqid]); 885 cpl->sqid = sqid; 886 cpl->cid = cid; 887 cpl->cdw0 = cdw0; 888 cpl->status.dnr = 0x0; 889 cpl->status.m = 0x0; 890 cpl->status.sct = sct; 891 cpl->status.p = cq->phase; 892 cpl->status.sc = sc; 893 894 cq_tail_advance(cq); 895 896 /* 897 * this function now executes at SPDK thread context, we 898 * might be triggering interrupts from vfio-user thread context so 899 * check for race conditions. 
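	 * An interrupt is only raised when INTx is not disabled in the PCI
	 * command register or MSI-X is enabled, and the CQ was created with
	 * IEN set.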
900 */ 901 if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) { 902 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 903 if (err != 0) { 904 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 905 ctrlr_id(ctrlr)); 906 return err; 907 } 908 } 909 910 return 0; 911 } 912 913 static bool 914 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 915 { 916 assert(vu_ctrlr != NULL); 917 918 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 919 return false; 920 } 921 922 if (vu_ctrlr->qp[qid] == NULL) { 923 return false; 924 } 925 926 if (!is_cq) { 927 if (vu_ctrlr->qp[qid]->state == VFIO_USER_QPAIR_SQ_DELETED || 928 vu_ctrlr->qp[qid]->state == VFIO_USER_QPAIR_UNINITIALIZED) { 929 return false; 930 } 931 } 932 933 return true; 934 } 935 936 static void 937 unmap_qp(struct nvmf_vfio_user_qpair *qp) 938 { 939 struct nvmf_vfio_user_ctrlr *ctrlr; 940 941 if (qp->ctrlr == NULL) { 942 return; 943 } 944 ctrlr = qp->ctrlr; 945 946 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap QP%d\n", 947 ctrlr_id(ctrlr), qp->qpair.qid); 948 949 if (qp->sq.addr != NULL) { 950 vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->sq.sg, &qp->sq.iov, 1); 951 qp->sq.addr = NULL; 952 } 953 954 if (qp->cq.addr != NULL) { 955 vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->cq.sg, &qp->cq.iov, 1); 956 qp->cq.addr = NULL; 957 } 958 } 959 960 static int 961 remap_qp(struct nvmf_vfio_user_qpair *vu_qpair) 962 { 963 struct nvme_q *sq, *cq; 964 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 965 int ret; 966 967 vu_ctrlr = vu_qpair->ctrlr; 968 sq = &vu_qpair->sq; 969 cq = &vu_qpair->cq; 970 971 if (sq->size) { 972 ret = map_q(vu_ctrlr, sq, false, false); 973 if (ret) { 974 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n", 975 io_q_id(sq), sq->prp1, sq->prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 976 return -EFAULT; 977 } 978 } 979 980 if (cq->size) { 981 ret = map_q(vu_ctrlr, cq, true, false); 982 if (ret) { 983 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n", 984 io_q_id(cq), cq->prp1, cq->prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 985 return -EFAULT; 986 } 987 988 } 989 990 return 0; 991 } 992 993 static void 994 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 995 { 996 struct nvmf_vfio_user_qpair *qpair; 997 struct nvmf_vfio_user_req *vu_req; 998 uint32_t i; 999 1000 if (ctrlr == NULL) { 1001 return; 1002 } 1003 1004 qpair = ctrlr->qp[qid]; 1005 if (qpair == NULL) { 1006 return; 1007 } 1008 1009 SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy QP%d=%p\n", ctrlr_id(ctrlr), 1010 qid, qpair); 1011 1012 unmap_qp(qpair); 1013 1014 for (i = 0; i < qpair->qsize; i++) { 1015 vu_req = &qpair->reqs_internal[i]; 1016 free(vu_req->sg); 1017 } 1018 free(qpair->reqs_internal); 1019 1020 free(qpair->sq.sg); 1021 free(qpair->cq.sg); 1022 free(qpair); 1023 1024 ctrlr->qp[qid] = NULL; 1025 } 1026 1027 /* This function can only fail because of memory allocation errors. 
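 * It allocates the queue pair, its SQ/CQ DMA scatter-gather descriptors and a
 * pool of qsize internal requests (each with its own per-iovec SG list), and
 * undoes all allocations on failure.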
*/ 1028 static int 1029 init_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1030 const uint32_t qsize, const uint16_t id) 1031 { 1032 uint32_t i; 1033 struct nvmf_vfio_user_qpair *qpair; 1034 struct nvmf_vfio_user_req *vu_req, *tmp; 1035 struct spdk_nvmf_request *req; 1036 1037 assert(ctrlr != NULL); 1038 assert(transport != NULL); 1039 1040 qpair = calloc(1, sizeof(*qpair)); 1041 if (qpair == NULL) { 1042 return -ENOMEM; 1043 } 1044 qpair->sq.sg = calloc(1, dma_sg_size()); 1045 if (qpair->sq.sg == NULL) { 1046 free(qpair); 1047 return -ENOMEM; 1048 } 1049 qpair->cq.sg = calloc(1, dma_sg_size()); 1050 if (qpair->cq.sg == NULL) { 1051 free(qpair->sq.sg); 1052 free(qpair); 1053 return -ENOMEM; 1054 } 1055 1056 qpair->qpair.qid = id; 1057 qpair->qpair.transport = transport; 1058 qpair->ctrlr = ctrlr; 1059 qpair->qsize = qsize; 1060 1061 TAILQ_INIT(&qpair->reqs); 1062 1063 qpair->reqs_internal = calloc(qsize, sizeof(struct nvmf_vfio_user_req)); 1064 if (qpair->reqs_internal == NULL) { 1065 SPDK_ERRLOG("%s: error allocating reqs: %m\n", ctrlr_id(ctrlr)); 1066 goto reqs_err; 1067 } 1068 1069 for (i = 0; i < qsize; i++) { 1070 vu_req = &qpair->reqs_internal[i]; 1071 vu_req->sg = calloc(NVMF_VFIO_USER_MAX_IOVECS, dma_sg_size()); 1072 if (vu_req->sg == NULL) { 1073 goto sg_err; 1074 } 1075 1076 req = &vu_req->req; 1077 req->qpair = &qpair->qpair; 1078 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1079 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1080 1081 TAILQ_INSERT_TAIL(&qpair->reqs, vu_req, link); 1082 } 1083 1084 ctrlr->qp[id] = qpair; 1085 return 0; 1086 1087 sg_err: 1088 TAILQ_FOREACH_SAFE(vu_req, &qpair->reqs, link, tmp) { 1089 free(vu_req->sg); 1090 } 1091 free(qpair->reqs_internal); 1092 1093 reqs_err: 1094 free(qpair->sq.sg); 1095 free(qpair->cq.sg); 1096 free(qpair); 1097 return -ENOMEM; 1098 } 1099 1100 /* 1101 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 1102 * on error. 1103 */ 1104 static int 1105 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 1106 struct spdk_nvme_cmd *cmd, const bool is_cq) 1107 { 1108 uint16_t qid, cqid; 1109 uint32_t qsize; 1110 uint16_t sc = SPDK_NVME_SC_SUCCESS; 1111 uint16_t sct = SPDK_NVME_SCT_GENERIC; 1112 int err = 0; 1113 struct nvmf_vfio_user_qpair *vu_qpair; 1114 struct nvme_q *io_q; 1115 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1116 1117 assert(ctrlr != NULL); 1118 assert(cmd != NULL); 1119 1120 qid = cmd->cdw10_bits.create_io_q.qid; 1121 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1122 SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr), 1123 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 1124 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1125 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1126 goto out; 1127 } 1128 1129 if (io_q_exists(ctrlr, qid, is_cq)) { 1130 SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr), 1131 is_cq ? 'C' : 'S', qid); 1132 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1133 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1134 goto out; 1135 } 1136 1137 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1138 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 1139 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 1140 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1141 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 1142 goto out; 1143 } 1144 1145 SPDK_DEBUGLOG(nvmf_vfio, 1146 "%s: create I/O %cQ%d: QSIZE=%#x\n", ctrlr_id(ctrlr), 1147 is_cq ? 
'C' : 'S', qid, qsize);

	if (is_cq) {
		if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
			SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
			sc = SPDK_NVME_SC_INVALID_FIELD;
			goto out;
		}
		if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) {
			SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr));
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR;
			goto out;
		}

		err = init_qp(ctrlr, ctrlr->qp[0]->qpair.transport, qsize, qid);
		if (err != 0) {
			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
			goto out;
		}

		io_q = &ctrlr->qp[qid]->cq;
		io_q->ien = cmd->cdw11_bits.create_io_cq.ien;
		io_q->iv = cmd->cdw11_bits.create_io_cq.iv;
		io_q->phase = true;
	} else {
		cqid = cmd->cdw11_bits.create_io_sq.cqid;
		if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) {
			SPDK_ERRLOG("%s: invalid CQID %u\n", ctrlr_id(ctrlr), cqid);
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
			goto out;
		}
		/* CQ must be created before SQ */
		if (!io_q_exists(ctrlr, cqid, true)) {
			SPDK_ERRLOG("%s: CQ%u does not exist\n", ctrlr_id(ctrlr), cqid);
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
			goto out;
		}

		if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
			SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
			sc = SPDK_NVME_SC_INVALID_FIELD;
			goto out;
		}
		/* TODO: support shared I/O CQs */
		if (qid != cqid) {
			SPDK_ERRLOG("%s: shared CQs are not supported\n", ctrlr_id(ctrlr));
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
			goto out;
		}

		io_q = &ctrlr->qp[qid]->sq;
		io_q->cqid = cqid;
		SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
			      qid, io_q->cqid);
	}

	io_q->is_cq = is_cq;
	io_q->size = qsize;
	io_q->prp1 = cmd->dptr.prp.prp1;

	err = map_q(ctrlr, io_q, is_cq, true);
	if (err) {
		sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
		goto out;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped %cQ%d IOVA=%#lx vaddr=%#llx\n",
		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
		      qid, cmd->dptr.prp.prp1, (unsigned long long)io_q->addr);

	if (is_cq) {
		*hdbl(ctrlr, io_q) = 0;
	} else {
		vu_qpair = ctrlr->qp[qid];
		*tdbl(ctrlr, io_q) = 0;
		vu_qpair->sq.head = 0;

		if (vu_qpair->state == VFIO_USER_QPAIR_SQ_DELETED) {
			vu_qpair->state = VFIO_USER_QPAIR_ACTIVE;
		} else {
			/*
			 * Create our new I/O qpair. This asynchronously invokes, on a
			 * suitable poll group, the nvmf_vfio_user_poll_group_add()
			 * callback, which will call spdk_nvmf_request_exec_fabrics()
			 * with a generated fabrics connect command. This command is
			 * then eventually completed via handle_queue_connect_rsp().
			 */
			vu_qpair->create_io_sq_cmd = *cmd;
			spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt,
						&vu_qpair->qpair);
			return 0;
		}
	}

out:
	return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid, sc, sct);
}

/* For ADMIN I/O DELETE COMPLETION QUEUE the NVMf library will disconnect and free
 * the queue pair, so save the command in a context.
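 * vfio_user_qpair_delete_cb() then uses the saved command's CID to post the
 * completion for the Delete I/O CQ command once the disconnect completes.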
1252 */ 1253 struct vfio_user_delete_cq_ctx { 1254 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 1255 struct spdk_nvme_cmd delete_io_cq_cmd; 1256 }; 1257 1258 static void 1259 vfio_user_qpair_delete_cb(void *cb_arg) 1260 { 1261 struct vfio_user_delete_cq_ctx *ctx = cb_arg; 1262 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 1263 1264 post_completion(vu_ctrlr, &vu_ctrlr->qp[0]->cq, 0, 0, ctx->delete_io_cq_cmd.cid, 1265 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 1266 free(ctx); 1267 } 1268 1269 /* 1270 * Deletes a completion or submission I/O queue. 1271 */ 1272 static int 1273 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 1274 struct spdk_nvme_cmd *cmd, const bool is_cq) 1275 { 1276 uint16_t sct = SPDK_NVME_SCT_GENERIC; 1277 uint16_t sc = SPDK_NVME_SC_SUCCESS; 1278 struct nvmf_vfio_user_qpair *vu_qpair; 1279 struct vfio_user_delete_cq_ctx *ctx; 1280 1281 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n", 1282 ctrlr_id(ctrlr), is_cq ? 'C' : 'S', 1283 cmd->cdw10_bits.delete_io_q.qid); 1284 1285 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 1286 SPDK_ERRLOG("%s: I/O %cQ%d does not exist\n", ctrlr_id(ctrlr), 1287 is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid); 1288 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1289 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1290 goto out; 1291 } 1292 1293 vu_qpair = ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]; 1294 if (is_cq) { 1295 if (vu_qpair->state == VFIO_USER_QPAIR_UNINITIALIZED) { 1296 free_qp(ctrlr, cmd->cdw10_bits.delete_io_q.qid); 1297 goto out; 1298 } 1299 1300 /* SQ must have been deleted first */ 1301 if (vu_qpair->state != VFIO_USER_QPAIR_SQ_DELETED) { 1302 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 1303 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1304 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 1305 goto out; 1306 } 1307 ctx = calloc(1, sizeof(*ctx)); 1308 if (!ctx) { 1309 sct = SPDK_NVME_SCT_GENERIC; 1310 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1311 goto out; 1312 } 1313 ctx->vu_ctrlr = ctrlr; 1314 ctx->delete_io_cq_cmd = *cmd; 1315 spdk_nvmf_qpair_disconnect(&vu_qpair->qpair, vfio_user_qpair_delete_cb, ctx); 1316 return 0; 1317 } else { 1318 if (vu_qpair->state == VFIO_USER_QPAIR_SQ_DELETED) { 1319 SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%u is already deleted\n", ctrlr_id(ctrlr), 1320 cmd->cdw10_bits.delete_io_q.qid); 1321 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1322 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1323 goto out; 1324 } 1325 1326 /* 1327 * This doesn't actually delete the SQ, We're merely telling the poll_group_poll 1328 * function to skip checking this SQ. The queue pair will be disconnected in Delete 1329 * IO CQ command. 1330 */ 1331 vu_qpair->state = VFIO_USER_QPAIR_SQ_DELETED; 1332 vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, vu_qpair->sq.sg, &vu_qpair->sq.iov, 1); 1333 vu_qpair->sq.addr = NULL; 1334 } 1335 1336 out: 1337 return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid, sc, sct); 1338 } 1339 1340 /* 1341 * Returns 0 on success and -errno on error. 1342 */ 1343 static int 1344 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 1345 { 1346 assert(ctrlr != NULL); 1347 assert(cmd != NULL); 1348 1349 if (cmd->fuse != 0) { 1350 /* Fused admin commands are not supported. 
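		 * Fail them with Invalid Field in Command.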
*/ 1351 return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid, 1352 SPDK_NVME_SC_INVALID_FIELD, 1353 SPDK_NVME_SCT_GENERIC); 1354 } 1355 1356 switch (cmd->opc) { 1357 case SPDK_NVME_OPC_CREATE_IO_CQ: 1358 case SPDK_NVME_OPC_CREATE_IO_SQ: 1359 return handle_create_io_q(ctrlr, cmd, 1360 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 1361 case SPDK_NVME_OPC_DELETE_IO_SQ: 1362 case SPDK_NVME_OPC_DELETE_IO_CQ: 1363 return handle_del_io_q(ctrlr, cmd, 1364 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 1365 default: 1366 return handle_cmd_req(ctrlr, cmd, ctrlr->qp[0]); 1367 } 1368 } 1369 1370 static int 1371 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 1372 { 1373 struct nvmf_vfio_user_qpair *vu_qpair = cb_arg; 1374 struct nvmf_vfio_user_ctrlr *vu_ctrlr = vu_qpair->ctrlr; 1375 uint16_t sqid, cqid; 1376 1377 assert(vu_qpair != NULL); 1378 assert(vu_req != NULL); 1379 assert(vu_ctrlr != NULL); 1380 1381 if (spdk_likely(vu_req->iovcnt)) { 1382 vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, vu_req->sg, vu_req->iov, vu_req->iovcnt); 1383 } 1384 sqid = vu_qpair->qpair.qid; 1385 cqid = vu_ctrlr->qp[sqid]->sq.cqid; 1386 1387 return post_completion(vu_ctrlr, &vu_ctrlr->qp[cqid]->cq, 1388 vu_req->req.rsp->nvme_cpl.cdw0, 1389 sqid, 1390 vu_req->req.cmd->nvme_cmd.cid, 1391 vu_req->req.rsp->nvme_cpl.status.sc, 1392 vu_req->req.rsp->nvme_cpl.status.sct); 1393 } 1394 1395 static int 1396 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair, 1397 struct spdk_nvme_cmd *cmd) 1398 { 1399 assert(qpair != NULL); 1400 if (nvmf_qpair_is_admin_queue(&qpair->qpair)) { 1401 return consume_admin_cmd(ctrlr, cmd); 1402 } 1403 1404 return handle_cmd_req(ctrlr, cmd, qpair); 1405 } 1406 1407 /* Returns the number of commands processed, or a negative value on error. */ 1408 static int 1409 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 1410 struct nvmf_vfio_user_qpair *qpair) 1411 { 1412 struct spdk_nvme_cmd *queue; 1413 int count = 0; 1414 1415 assert(ctrlr != NULL); 1416 assert(qpair != NULL); 1417 1418 queue = qpair->sq.addr; 1419 while (sq_head(qpair) != new_tail) { 1420 int err; 1421 struct spdk_nvme_cmd *cmd = &queue[sq_head(qpair)]; 1422 1423 count++; 1424 1425 /* 1426 * SQHD must contain the new head pointer, so we must increase 1427 * it before we generate a completion. 1428 */ 1429 sqhd_advance(ctrlr, qpair); 1430 1431 err = consume_cmd(ctrlr, qpair, cmd); 1432 if (err != 0) { 1433 return err; 1434 } 1435 } 1436 1437 return count; 1438 } 1439 1440 static int 1441 enable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1442 { 1443 int err; 1444 1445 assert(ctrlr != NULL); 1446 1447 err = acq_setup(ctrlr); 1448 if (err != 0) { 1449 return err; 1450 } 1451 1452 err = asq_setup(ctrlr); 1453 if (err != 0) { 1454 return err; 1455 } 1456 1457 return 0; 1458 } 1459 1460 static void 1461 disable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1462 { 1463 assert(ctrlr->qp[0] != NULL); 1464 1465 unmap_qp(ctrlr->qp[0]); 1466 } 1467 1468 static void 1469 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1470 { 1471 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1472 struct nvmf_vfio_user_ctrlr *ctrlr; 1473 struct nvmf_vfio_user_qpair *qpair; 1474 int ret; 1475 1476 /* 1477 * We're not interested in any DMA regions that aren't mappable (we don't 1478 * support clients that don't share their memory). 
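	 * Regions must also be 2MiB-aligned in address and length so that they
	 * can be registered with SPDK's memory map (see the MASK_2MB checks
	 * below).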
1479 */ 1480 if (!info->vaddr) { 1481 return; 1482 } 1483 1484 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 1485 (info->mapping.iov_len & MASK_2MB)) { 1486 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr, 1487 (uintptr_t)info->mapping.iov_base, 1488 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1489 return; 1490 } 1491 1492 assert(endpoint != NULL); 1493 if (endpoint->ctrlr == NULL) { 1494 return; 1495 } 1496 ctrlr = endpoint->ctrlr; 1497 1498 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %#lx-%#lx\n", ctrlr_id(ctrlr), 1499 (uintptr_t)info->mapping.iov_base, 1500 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1501 1502 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 1503 * check the protection bits before registering. 1504 */ 1505 if (info->prot == (PROT_WRITE | PROT_READ)) { 1506 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 1507 if (ret) { 1508 SPDK_ERRLOG("Memory region register %#lx-%#lx failed, ret=%d\n", 1509 (uint64_t)(uintptr_t)info->mapping.iov_base, 1510 (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len, 1511 ret); 1512 } 1513 } 1514 1515 pthread_mutex_lock(&endpoint->lock); 1516 TAILQ_FOREACH(qpair, &ctrlr->connected_qps, tailq) { 1517 if (qpair->state != VFIO_USER_QPAIR_INACTIVE) { 1518 continue; 1519 } 1520 1521 ret = remap_qp(qpair); 1522 if (ret) { 1523 continue; 1524 } 1525 qpair->state = VFIO_USER_QPAIR_ACTIVE; 1526 SPDK_DEBUGLOG(nvmf_vfio, "Remap QP %u successfully\n", qpair->qpair.qid); 1527 } 1528 pthread_mutex_unlock(&endpoint->lock); 1529 } 1530 1531 static int 1532 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1533 { 1534 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1535 struct nvmf_vfio_user_ctrlr *ctrlr; 1536 struct nvmf_vfio_user_qpair *qpair; 1537 void *map_start, *map_end; 1538 int ret = 0; 1539 1540 if (!info->vaddr) { 1541 return 0; 1542 } 1543 1544 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 1545 (info->mapping.iov_len & MASK_2MB)) { 1546 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr, 1547 (uintptr_t)info->mapping.iov_base, 1548 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1549 return 0; 1550 } 1551 1552 assert(endpoint != NULL); 1553 if (endpoint->ctrlr == NULL) { 1554 return 0; 1555 } 1556 ctrlr = endpoint->ctrlr; 1557 1558 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %#lx-%#lx\n", ctrlr_id(ctrlr), 1559 (uintptr_t)info->mapping.iov_base, 1560 (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); 1561 1562 map_start = info->mapping.iov_base; 1563 map_end = info->mapping.iov_base + info->mapping.iov_len; 1564 1565 pthread_mutex_lock(&endpoint->lock); 1566 TAILQ_FOREACH(qpair, &ctrlr->connected_qps, tailq) { 1567 if ((qpair->cq.addr >= map_start && qpair->cq.addr <= map_end) || 1568 (qpair->sq.addr >= map_start && qpair->sq.addr <= map_end)) { 1569 /* TODO: Ideally we should disconnect this queue pair 1570 * before returning to caller. 
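			 * For now the queue pair is only unmapped and marked inactive;
			 * memory_region_add_cb() remaps and reactivates it if the
			 * region is added back.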
1571 */ 1572 unmap_qp(qpair); 1573 qpair->state = VFIO_USER_QPAIR_INACTIVE; 1574 } 1575 } 1576 pthread_mutex_unlock(&endpoint->lock); 1577 1578 if (info->prot == (PROT_WRITE | PROT_READ)) { 1579 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 1580 if (ret) { 1581 SPDK_ERRLOG("Memory region unregister %#lx-%#lx failed, ret=%d\n", 1582 (uint64_t)(uintptr_t)info->mapping.iov_base, 1583 (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len, 1584 ret); 1585 } 1586 } 1587 1588 return 0; 1589 } 1590 1591 static int 1592 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 1593 { 1594 struct nvmf_vfio_user_qpair *vu_qpair = cb_arg; 1595 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 1596 bool disable_admin = false; 1597 int ret; 1598 1599 assert(vu_qpair != NULL); 1600 assert(req != NULL); 1601 1602 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 1603 assert(vu_qpair->ctrlr != NULL); 1604 assert(req != NULL); 1605 1606 memcpy(req->req.data, 1607 &req->req.rsp->prop_get_rsp.value.u64, 1608 req->req.length); 1609 } else { 1610 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 1611 assert(vu_qpair->ctrlr != NULL); 1612 vu_ctrlr = vu_qpair->ctrlr; 1613 1614 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 1615 union spdk_nvme_cc_register cc, diff; 1616 1617 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 1618 diff.raw = cc.raw ^ req->cc.raw; 1619 1620 if (diff.bits.en) { 1621 if (cc.bits.en) { 1622 SPDK_DEBUGLOG(nvmf_vfio, "%s: MAP Admin queue\n", ctrlr_id(vu_ctrlr)); 1623 ret = enable_admin_queue(vu_ctrlr); 1624 if (ret) { 1625 SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(vu_ctrlr)); 1626 return ret; 1627 } 1628 vu_qpair->state = VFIO_USER_QPAIR_ACTIVE; 1629 } else { 1630 disable_admin = true; 1631 } 1632 } 1633 1634 if (diff.bits.shn) { 1635 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 1636 disable_admin = true; 1637 } 1638 } 1639 1640 if (disable_admin) { 1641 SPDK_DEBUGLOG(nvmf_vfio, 1642 "%s: UNMAP Admin queue\n", 1643 ctrlr_id(vu_ctrlr)); 1644 vu_qpair->state = VFIO_USER_QPAIR_INACTIVE; 1645 disable_admin_queue(vu_ctrlr); 1646 /* For PCIe controller reset or shutdown, we will drop all AER responses */ 1647 nvmf_ctrlr_abort_aer(vu_qpair->qpair.ctrlr); 1648 } 1649 } 1650 } 1651 1652 return 0; 1653 } 1654 1655 /* 1656 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 1657 * doorbell is written via access_bar0_fn(). 1658 * 1659 * DSTRD is set to fixed value 0 for NVMf. 
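 * With DSTRD 0, consecutive doorbells are 4 bytes apart: the SQ tail doorbell
 * of queue N is at BAR0 offset 0x1000 + 8 * N and its CQ head doorbell at
 * 0x1000 + 8 * N + 4.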
1660 * 1661 */ 1662 static int 1663 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 1664 const size_t count, loff_t pos, const bool is_write) 1665 { 1666 assert(ctrlr != NULL); 1667 assert(buf != NULL); 1668 1669 if (count != sizeof(uint32_t)) { 1670 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 1671 ctrlr_id(ctrlr), count); 1672 errno = EINVAL; 1673 return -1; 1674 } 1675 1676 pos -= NVME_DOORBELLS_OFFSET; 1677 1678 /* pos must be dword aligned */ 1679 if ((pos & 0x3) != 0) { 1680 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 1681 errno = EINVAL; 1682 return -1; 1683 } 1684 1685 /* convert byte offset to array index */ 1686 pos >>= 2; 1687 1688 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 1689 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 1690 errno = EINVAL; 1691 return -1; 1692 } 1693 1694 if (is_write) { 1695 ctrlr->doorbells[pos] = *buf; 1696 spdk_wmb(); 1697 } else { 1698 spdk_rmb(); 1699 *buf = ctrlr->doorbells[pos]; 1700 } 1701 return 0; 1702 } 1703 1704 static ssize_t 1705 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 1706 bool is_write) 1707 { 1708 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1709 struct nvmf_vfio_user_ctrlr *ctrlr; 1710 struct nvmf_vfio_user_req *req; 1711 const struct spdk_nvmf_registers *regs; 1712 int ret; 1713 1714 ctrlr = endpoint->ctrlr; 1715 1716 SPDK_DEBUGLOG(nvmf_vfio, 1717 "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n", 1718 endpoint_id(endpoint), is_write ? "write" : "read", 1719 ctrlr, count, pos); 1720 1721 if (pos >= NVME_DOORBELLS_OFFSET) { 1722 /* 1723 * The fact that the doorbells can be memory mapped doesn't mean 1724 * that the client (VFIO in QEMU) is obliged to memory map them, 1725 * it might still elect to access them via regular read/write; 1726 * we might also have had disable_mappable_bar0 set. 
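		 * In that case the access lands here and is handled one dword
		 * at a time.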
1727 */ 1728 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 1729 pos, is_write); 1730 if (ret == 0) { 1731 return count; 1732 } 1733 return ret; 1734 } 1735 1736 /* Construct a Fabric Property Get/Set command and send it */ 1737 req = get_nvmf_vfio_user_req(ctrlr->qp[0]); 1738 if (req == NULL) { 1739 errno = ENOBUFS; 1740 return -1; 1741 } 1742 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 1743 req->cc.raw = regs->cc.raw; 1744 1745 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 1746 req->cb_arg = ctrlr->qp[0]; 1747 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 1748 req->req.cmd->prop_set_cmd.cid = 0; 1749 req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1; 1750 req->req.cmd->prop_set_cmd.ofst = pos; 1751 if (is_write) { 1752 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 1753 if (req->req.cmd->prop_set_cmd.attrib.size) { 1754 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 1755 } else { 1756 req->req.cmd->prop_set_cmd.value.u32.high = 0; 1757 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 1758 } 1759 } else { 1760 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 1761 } 1762 req->req.length = count; 1763 req->req.data = buf; 1764 1765 spdk_nvmf_request_exec_fabrics(&req->req); 1766 1767 return count; 1768 } 1769 1770 /* 1771 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 1772 * available on PCI-X 2.0 and PCI Express buses 1773 */ 1774 static ssize_t 1775 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 1776 bool is_write) 1777 { 1778 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1779 1780 if (is_write) { 1781 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 1782 endpoint_id(endpoint), offset, offset + count); 1783 errno = EINVAL; 1784 return -1; 1785 } 1786 1787 if (offset + count > NVME_REG_CFG_SIZE) { 1788 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 1789 endpoint_id(endpoint), offset, count, 1790 NVME_REG_CFG_SIZE); 1791 errno = ERANGE; 1792 return -1; 1793 } 1794 1795 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 1796 1797 return count; 1798 } 1799 1800 static void 1801 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 1802 { 1803 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1804 1805 if (level >= LOG_DEBUG) { 1806 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 1807 } else if (level >= LOG_INFO) { 1808 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 1809 } else if (level >= LOG_NOTICE) { 1810 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 1811 } else if (level >= LOG_WARNING) { 1812 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 1813 } else { 1814 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 1815 } 1816 } 1817 1818 static int 1819 vfio_user_get_log_level(void) 1820 { 1821 int level; 1822 1823 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 1824 return LOG_DEBUG; 1825 } 1826 1827 level = spdk_log_to_syslog_level(spdk_log_get_level()); 1828 if (level < 0) { 1829 return LOG_ERR; 1830 } 1831 1832 return level; 1833 } 1834 1835 static void 1836 init_pci_config_space(vfu_pci_config_space_t *p) 1837 { 1838 /* MLBAR */ 1839 p->hdr.bars[0].raw = 0x0; 1840 /* MUBAR */ 1841 p->hdr.bars[1].raw = 0x0; 1842 1843 /* vendor specific, let's set them to zero for now */ 1844 p->hdr.bars[3].raw = 0x0; 1845 p->hdr.bars[4].raw = 0x0; 1846 
p->hdr.bars[5].raw = 0x0; 1847 1848 /* enable INTx */ 1849 p->hdr.intr.ipin = 0x1; 1850 } 1851 1852 static int 1853 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 1854 struct nvmf_vfio_user_endpoint *endpoint) 1855 { 1856 int ret; 1857 ssize_t cap_offset; 1858 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 1859 1860 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 1861 struct pxcap pxcap = { 1862 .hdr.id = PCI_CAP_ID_EXP, 1863 .pxcaps.ver = 0x2, 1864 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 1865 .pxdcap2.ctds = 0x1 1866 }; 1867 1868 struct msixcap msixcap = { 1869 .hdr.id = PCI_CAP_ID_MSIX, 1870 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 1871 .mtab = {.tbir = 0x4, .to = 0x0}, 1872 .mpba = {.pbir = 0x5, .pbao = 0x0} 1873 }; 1874 1875 struct iovec sparse_mmap[] = { 1876 { 1877 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 1878 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 1879 }, 1880 }; 1881 1882 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 1883 if (ret < 0) { 1884 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 1885 return ret; 1886 } 1887 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 1888 /* 1889 * 0x02, controller uses the NVM Express programming interface 1890 * 0x08, non-volatile memory controller 1891 * 0x01, mass storage controller 1892 */ 1893 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 1894 1895 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 1896 if (cap_offset < 0) { 1897 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 1898 return ret; 1899 } 1900 1901 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 1902 if (cap_offset < 0) { 1903 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 1904 return ret; 1905 } 1906 1907 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 1908 if (cap_offset < 0) { 1909 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 1910 return ret; 1911 } 1912 1913 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 1914 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 1915 if (ret < 0) { 1916 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 1917 return ret; 1918 } 1919 1920 if (vu_transport->transport_opts.disable_mappable_bar0) { 1921 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 1922 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 1923 NULL, 0, -1, 0); 1924 } else { 1925 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 1926 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 1927 sparse_mmap, 1, endpoint->devmem_fd, 0); 1928 } 1929 1930 if (ret < 0) { 1931 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 1932 return ret; 1933 } 1934 1935 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 1936 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 1937 if (ret < 0) { 1938 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 1939 return ret; 1940 } 1941 1942 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 1943 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 1944 if (ret < 0) { 1945 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 1946 return ret; 1947 } 1948 1949 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 1950 if (ret < 0) { 1951 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 1952 return ret; 1953 } 1954 1955 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 
1); 1956 if (ret < 0) { 1957 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 1958 return ret; 1959 } 1960 1961 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 1962 if (ret < 0) { 1963 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 1964 return ret; 1965 } 1966 1967 ret = vfu_realize_ctx(vfu_ctx); 1968 if (ret < 0) { 1969 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 1970 return ret; 1971 } 1972 1973 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 1974 assert(endpoint->pci_config_space != NULL); 1975 init_pci_config_space(endpoint->pci_config_space); 1976 1977 assert(cap_offset != 0); 1978 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 1979 1980 return 0; 1981 } 1982 1983 static void 1984 _free_ctrlr(void *ctx) 1985 { 1986 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 1987 1988 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 1989 free(ctrlr); 1990 } 1991 1992 static void 1993 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr, bool free_qps) 1994 { 1995 int i; 1996 assert(ctrlr != NULL); 1997 1998 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 1999 2000 if (free_qps) { 2001 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 2002 free_qp(ctrlr, i); 2003 } 2004 } 2005 2006 if (ctrlr->thread == spdk_get_thread()) { 2007 _free_ctrlr(ctrlr); 2008 } else { 2009 spdk_thread_send_msg(ctrlr->thread, _free_ctrlr, ctrlr); 2010 } 2011 } 2012 2013 static void 2014 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 2015 struct nvmf_vfio_user_endpoint *endpoint) 2016 { 2017 struct nvmf_vfio_user_ctrlr *ctrlr; 2018 int err = 0; 2019 2020 /* First, construct a vfio-user CUSTOM transport controller */ 2021 ctrlr = calloc(1, sizeof(*ctrlr)); 2022 if (ctrlr == NULL) { 2023 err = -ENOMEM; 2024 goto out; 2025 } 2026 ctrlr->cntlid = 0xffff; 2027 ctrlr->transport = transport; 2028 ctrlr->endpoint = endpoint; 2029 ctrlr->doorbells = endpoint->doorbells; 2030 TAILQ_INIT(&ctrlr->connected_qps); 2031 2032 /* Then, construct an admin queue pair */ 2033 err = init_qp(ctrlr, &transport->transport, NVMF_VFIO_USER_DEFAULT_AQ_DEPTH, 0); 2034 if (err != 0) { 2035 free(ctrlr); 2036 goto out; 2037 } 2038 endpoint->ctrlr = ctrlr; 2039 2040 /* Notify the generic layer about the new admin queue pair */ 2041 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->qp[0]->qpair); 2042 2043 out: 2044 if (err != 0) { 2045 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 2046 endpoint_id(endpoint), strerror(-err)); 2047 } 2048 } 2049 2050 static int 2051 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 2052 const struct spdk_nvme_transport_id *trid, 2053 struct spdk_nvmf_listen_opts *listen_opts) 2054 { 2055 struct nvmf_vfio_user_transport *vu_transport; 2056 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 2057 char path[PATH_MAX] = {}; 2058 char uuid[PATH_MAX] = {}; 2059 int ret; 2060 2061 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 2062 transport); 2063 2064 pthread_mutex_lock(&vu_transport->lock); 2065 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 2066 /* Only compare traddr */ 2067 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 2068 pthread_mutex_unlock(&vu_transport->lock); 2069 return -EEXIST; 2070 } 2071 } 2072 pthread_mutex_unlock(&vu_transport->lock); 2073 2074 endpoint = calloc(1, sizeof(*endpoint)); 2075 if (!endpoint) { 2076 return -ENOMEM; 2077 
	}

	pthread_mutex_init(&endpoint->lock, NULL);
	endpoint->devmem_fd = -1;
	memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));

	ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to build BAR0 file path: %s.\n", endpoint_id(endpoint),
			    spdk_strerror(errno));
		ret = -1;
		goto out;
	}

	ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	if (ret == -1) {
		SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n",
			    endpoint_id(endpoint), path, spdk_strerror(errno));
		goto out;
	}

	endpoint->devmem_fd = ret;
	ret = ftruncate(endpoint->devmem_fd,
			NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
	if (ret != 0) {
		SPDK_ERRLOG("%s: failed to ftruncate file %s: %s.\n", endpoint_id(endpoint), path,
			    spdk_strerror(errno));
		goto out;
	}

	endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
				   PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET);
	if (endpoint->doorbells == MAP_FAILED) {
		SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path,
			    spdk_strerror(errno));
		endpoint->doorbells = NULL;
		ret = -1;
		goto out;
	}

	ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to build controller socket path: %s\n", endpoint_id(endpoint),
			    spdk_strerror(errno));
		ret = -1;
		goto out;
	}

	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
					   endpoint, VFU_DEV_TYPE_PCI);
	if (endpoint->vfu_ctx == NULL) {
		SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
			    endpoint_id(endpoint));
		ret = -1;
		goto out;
	}

	vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, vfio_user_get_log_level());

	ret = vfio_user_dev_info_fill(vu_transport, endpoint);
	if (ret < 0) {
		goto out;
	}

	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
	SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells);

out:
	if (ret != 0) {
		nvmf_vfio_user_destroy_endpoint(endpoint);
	}

	return ret;
}

static void
nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
			   const struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;

	assert(trid != NULL);
	assert(trid->traddr != NULL);

	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
			if (endpoint->ctrlr) {
				/* The NVMe-oF target may be shut down while a
				 * VM is still connected; free all resources.
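				 * Since the controller may still have qpairs on
				 * its connected_qps list at this point,
				 * free_ctrlr() is called with free_qps set to
				 * true so those qpairs are torn down as well.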
2169 */ 2170 free_ctrlr(endpoint->ctrlr, true); 2171 } 2172 nvmf_vfio_user_destroy_endpoint(endpoint); 2173 pthread_mutex_unlock(&vu_transport->lock); 2174 2175 return; 2176 } 2177 } 2178 pthread_mutex_unlock(&vu_transport->lock); 2179 2180 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 2181 } 2182 2183 static void 2184 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 2185 struct spdk_nvmf_subsystem *subsystem, 2186 struct spdk_nvmf_ctrlr_data *cdata) 2187 { 2188 cdata->vid = SPDK_PCI_VID_NUTANIX; 2189 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 2190 cdata->ieee[0] = 0x8d; 2191 cdata->ieee[1] = 0x6b; 2192 cdata->ieee[2] = 0x50; 2193 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 2194 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 2195 /* libvfio-user can only support 1 connection for now */ 2196 cdata->oncs.reservations = 0; 2197 } 2198 2199 static int 2200 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 2201 const struct spdk_nvmf_subsystem *subsystem, 2202 const struct spdk_nvme_transport_id *trid) 2203 { 2204 struct nvmf_vfio_user_transport *vu_transport; 2205 struct nvmf_vfio_user_endpoint *endpoint; 2206 2207 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 2208 2209 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 2210 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 2211 break; 2212 } 2213 } 2214 2215 if (endpoint == NULL) { 2216 return -ENOENT; 2217 } 2218 2219 endpoint->subsystem = subsystem; 2220 2221 return 0; 2222 } 2223 2224 /* 2225 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 2226 * frequency. 2227 * 2228 * For each transport endpoint (which at the libvfio-user level corresponds to 2229 * a socket), if we don't currently have a controller set up, peek to see if the 2230 * socket is able to accept a new connection. 2231 * 2232 * This poller also takes care of handling the creation of any pending new 2233 * qpairs. 2234 * 2235 * Returns the number of events handled. 
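 *
 * vfu_attach_ctx() does not block here: each endpoint's context is created
 * with LIBVFIO_USER_FLAG_ATTACH_NB, so EAGAIN/EWOULDBLOCK from it simply means
 * no client is currently trying to connect to that endpoint's socket.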
2236 */ 2237 static uint32_t 2238 nvmf_vfio_user_accept(struct spdk_nvmf_transport *transport) 2239 { 2240 struct nvmf_vfio_user_transport *vu_transport; 2241 struct nvmf_vfio_user_endpoint *endpoint; 2242 uint32_t count = 0; 2243 int err; 2244 2245 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 2246 transport); 2247 2248 pthread_mutex_lock(&vu_transport->lock); 2249 2250 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 2251 if (endpoint->ctrlr != NULL) { 2252 continue; 2253 } 2254 2255 err = vfu_attach_ctx(endpoint->vfu_ctx); 2256 if (err != 0) { 2257 if (errno == EAGAIN || errno == EWOULDBLOCK) { 2258 continue; 2259 } 2260 2261 pthread_mutex_unlock(&vu_transport->lock); 2262 return 1; 2263 } 2264 2265 count++; 2266 2267 /* Construct a controller */ 2268 nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 2269 } 2270 2271 pthread_mutex_unlock(&vu_transport->lock); 2272 2273 return count; 2274 } 2275 2276 static void 2277 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 2278 struct spdk_nvme_transport_id *trid, 2279 struct spdk_nvmf_discovery_log_page_entry *entry) 2280 { } 2281 2282 static struct spdk_nvmf_transport_poll_group * 2283 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport) 2284 { 2285 struct nvmf_vfio_user_poll_group *vu_group; 2286 2287 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 2288 2289 vu_group = calloc(1, sizeof(*vu_group)); 2290 if (vu_group == NULL) { 2291 SPDK_ERRLOG("Error allocating poll group: %m"); 2292 return NULL; 2293 } 2294 2295 TAILQ_INIT(&vu_group->qps); 2296 2297 return &vu_group->group; 2298 } 2299 2300 /* called when process exits */ 2301 static void 2302 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 2303 { 2304 struct nvmf_vfio_user_poll_group *vu_group; 2305 2306 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 2307 2308 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 2309 2310 free(vu_group); 2311 } 2312 2313 static void 2314 vfio_user_qpair_disconnect_cb(void *ctx) 2315 { 2316 struct nvmf_vfio_user_endpoint *endpoint = ctx; 2317 struct nvmf_vfio_user_ctrlr *ctrlr; 2318 2319 pthread_mutex_lock(&endpoint->lock); 2320 ctrlr = endpoint->ctrlr; 2321 if (!ctrlr) { 2322 pthread_mutex_unlock(&endpoint->lock); 2323 return; 2324 } 2325 2326 if (TAILQ_EMPTY(&ctrlr->connected_qps)) { 2327 endpoint->ctrlr = NULL; 2328 free_ctrlr(ctrlr, false); 2329 } 2330 pthread_mutex_unlock(&endpoint->lock); 2331 } 2332 2333 static void 2334 _vfio_user_qpair_disconnect(void *ctx) 2335 { 2336 struct nvmf_vfio_user_qpair *vu_qpair = ctx; 2337 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2338 struct nvmf_vfio_user_endpoint *endpoint; 2339 2340 vu_ctrlr = vu_qpair->ctrlr; 2341 endpoint = vu_ctrlr->endpoint; 2342 2343 spdk_nvmf_qpair_disconnect(&vu_qpair->qpair, vfio_user_qpair_disconnect_cb, endpoint); 2344 } 2345 2346 static int 2347 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 2348 { 2349 struct nvmf_vfio_user_qpair *qpair; 2350 struct nvmf_vfio_user_endpoint *endpoint; 2351 2352 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 2353 2354 endpoint = ctrlr->endpoint; 2355 assert(endpoint != NULL); 2356 2357 pthread_mutex_lock(&endpoint->lock); 2358 if (TAILQ_EMPTY(&ctrlr->connected_qps)) { 2359 endpoint->ctrlr = NULL; 2360 free_ctrlr(ctrlr, false); 2361 pthread_mutex_unlock(&endpoint->lock); 2362 return 0; 2363 } 2364 2365 TAILQ_FOREACH(qpair, &ctrlr->connected_qps, tailq) { 2366 /* add another round thread poll to 
avoid recursive endpoint lock */ 2367 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, qpair); 2368 } 2369 pthread_mutex_unlock(&endpoint->lock); 2370 2371 return 0; 2372 } 2373 2374 /* 2375 * Poll for and process any incoming vfio-user messages. 2376 */ 2377 static int 2378 vfio_user_poll_vfu_ctx(void *ctx) 2379 { 2380 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 2381 int ret; 2382 2383 assert(ctrlr != NULL); 2384 2385 /* This will call access_bar0_fn() if there are any writes 2386 * to the portion of the BAR that is not mmap'd */ 2387 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 2388 if (spdk_unlikely(ret == -1)) { 2389 if (errno == EBUSY) { 2390 return SPDK_POLLER_BUSY; 2391 } 2392 2393 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 2394 2395 /* initiator shutdown or reset, waiting for another re-connect */ 2396 if (errno == ENOTCONN) { 2397 vfio_user_destroy_ctrlr(ctrlr); 2398 return SPDK_POLLER_BUSY; 2399 } 2400 2401 fail_ctrlr(ctrlr); 2402 } 2403 2404 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 2405 } 2406 2407 static int 2408 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2409 { 2410 struct nvmf_vfio_user_poll_group *vu_group; 2411 struct nvmf_vfio_user_qpair *vu_qpair = cb_arg; 2412 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2413 struct nvmf_vfio_user_endpoint *endpoint; 2414 2415 assert(vu_qpair != NULL); 2416 assert(req != NULL); 2417 2418 vu_ctrlr = vu_qpair->ctrlr; 2419 assert(vu_ctrlr != NULL); 2420 endpoint = vu_ctrlr->endpoint; 2421 assert(endpoint != NULL); 2422 2423 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 2424 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 2425 endpoint->ctrlr = NULL; 2426 free_ctrlr(vu_ctrlr, true); 2427 return -1; 2428 } 2429 2430 vu_group = SPDK_CONTAINEROF(vu_qpair->group, struct nvmf_vfio_user_poll_group, group); 2431 TAILQ_INSERT_TAIL(&vu_group->qps, vu_qpair, link); 2432 vu_qpair->state = VFIO_USER_QPAIR_ACTIVE; 2433 2434 pthread_mutex_lock(&endpoint->lock); 2435 if (nvmf_qpair_is_admin_queue(&vu_qpair->qpair)) { 2436 vu_ctrlr->cntlid = vu_qpair->qpair.ctrlr->cntlid; 2437 vu_ctrlr->thread = spdk_get_thread(); 2438 vu_ctrlr->ctrlr = vu_qpair->qpair.ctrlr; 2439 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, vu_ctrlr, 0); 2440 } else { 2441 /* For I/O queues this command was generated in response to an 2442 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 2443 * been completed. Complete it now. 2444 */ 2445 post_completion(vu_ctrlr, &vu_ctrlr->qp[0]->cq, 0, 0, 2446 vu_qpair->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2447 } 2448 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_qps, vu_qpair, tailq); 2449 pthread_mutex_unlock(&endpoint->lock); 2450 2451 free(req->req.data); 2452 req->req.data = NULL; 2453 2454 return 0; 2455 } 2456 2457 /* 2458 * Add the given qpair to the given poll group. New qpairs are added via 2459 * spdk_nvmf_tgt_new_qpair(), which picks a poll group, then calls back 2460 * here via nvmf_transport_poll_group_add(). 
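 *
 * A vfio-user host never sends an NVMe-oF Fabrics CONNECT command itself, so
 * this function fabricates one on the qpair's behalf and executes it with
 * spdk_nvmf_request_exec_fabrics(); handle_queue_connect_rsp() then completes
 * the setup once the generic NVMe-oF layer has processed it.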
2461 */ 2462 static int 2463 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 2464 struct spdk_nvmf_qpair *qpair) 2465 { 2466 struct nvmf_vfio_user_qpair *vu_qpair; 2467 struct nvmf_vfio_user_req *vu_req; 2468 struct nvmf_vfio_user_ctrlr *ctrlr; 2469 struct spdk_nvmf_request *req; 2470 struct spdk_nvmf_fabric_connect_data *data; 2471 bool admin; 2472 2473 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2474 vu_qpair->group = group; 2475 ctrlr = vu_qpair->ctrlr; 2476 2477 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 2478 ctrlr_id(ctrlr), vu_qpair->qpair.qid, 2479 vu_qpair, qpair, group); 2480 2481 admin = nvmf_qpair_is_admin_queue(&vu_qpair->qpair); 2482 2483 vu_req = get_nvmf_vfio_user_req(vu_qpair); 2484 if (vu_req == NULL) { 2485 return -1; 2486 } 2487 2488 req = &vu_req->req; 2489 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2490 req->cmd->connect_cmd.cid = 0; 2491 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 2492 req->cmd->connect_cmd.recfmt = 0; 2493 req->cmd->connect_cmd.sqsize = vu_qpair->qsize - 1; 2494 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; 2495 2496 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 2497 req->data = calloc(1, req->length); 2498 if (req->data == NULL) { 2499 nvmf_vfio_user_req_free(req); 2500 return -ENOMEM; 2501 } 2502 2503 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 2504 data->cntlid = admin ? 0xFFFF : ctrlr->cntlid; 2505 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 2506 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 2507 2508 vu_req->cb_fn = handle_queue_connect_rsp; 2509 vu_req->cb_arg = vu_qpair; 2510 2511 SPDK_DEBUGLOG(nvmf_vfio, 2512 "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n", 2513 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 2514 2515 spdk_nvmf_request_exec_fabrics(req); 2516 return 0; 2517 } 2518 2519 static int 2520 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 2521 struct spdk_nvmf_qpair *qpair) 2522 { 2523 struct nvmf_vfio_user_qpair *vu_qpair; 2524 struct nvmf_vfio_user_poll_group *vu_group; 2525 2526 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2527 2528 SPDK_DEBUGLOG(nvmf_vfio, 2529 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 2530 ctrlr_id(vu_qpair->ctrlr), qpair->qid, qpair, group); 2531 2532 2533 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 2534 TAILQ_REMOVE(&vu_group->qps, vu_qpair, link); 2535 2536 return 0; 2537 } 2538 2539 static void 2540 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_qpair *vu_qpair, struct nvmf_vfio_user_req *vu_req) 2541 { 2542 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 2543 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 2544 vu_req->iovcnt = 0; 2545 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 2546 2547 TAILQ_INSERT_TAIL(&vu_qpair->reqs, vu_req, link); 2548 } 2549 2550 static int 2551 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 2552 { 2553 struct nvmf_vfio_user_qpair *vu_qpair; 2554 struct nvmf_vfio_user_req *vu_req; 2555 2556 assert(req != NULL); 2557 2558 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 2559 vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair); 2560 2561 _nvmf_vfio_user_req_free(vu_qpair, vu_req); 2562 2563 return 0; 2564 } 2565 2566 static int 2567 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 2568 { 2569 struct nvmf_vfio_user_qpair *vu_qpair; 2570 struct 
nvmf_vfio_user_req *vu_req; 2571 2572 assert(req != NULL); 2573 2574 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 2575 vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair); 2576 2577 if (vu_req->cb_fn != NULL) { 2578 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 2579 fail_ctrlr(vu_qpair->ctrlr); 2580 } 2581 } 2582 2583 _nvmf_vfio_user_req_free(vu_qpair, vu_req); 2584 2585 return 0; 2586 } 2587 2588 static void 2589 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 2590 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 2591 { 2592 struct nvmf_vfio_user_qpair *vu_qpair; 2593 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2594 2595 assert(qpair != NULL); 2596 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2597 vu_ctrlr = vu_qpair->ctrlr; 2598 2599 pthread_mutex_lock(&vu_ctrlr->endpoint->lock); 2600 TAILQ_REMOVE(&vu_ctrlr->connected_qps, vu_qpair, tailq); 2601 pthread_mutex_unlock(&vu_ctrlr->endpoint->lock); 2602 2603 free_qp(vu_ctrlr, qpair->qid); 2604 2605 if (cb_fn) { 2606 cb_fn(cb_arg); 2607 } 2608 } 2609 2610 /** 2611 * Returns a preallocated spdk_nvmf_request or NULL if there isn't one available. 2612 */ 2613 static struct nvmf_vfio_user_req * 2614 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair) 2615 { 2616 struct nvmf_vfio_user_req *req; 2617 2618 assert(qpair != NULL); 2619 2620 if (TAILQ_EMPTY(&qpair->reqs)) { 2621 return NULL; 2622 } 2623 2624 req = TAILQ_FIRST(&qpair->reqs); 2625 TAILQ_REMOVE(&qpair->reqs, req, link); 2626 2627 return req; 2628 } 2629 2630 static int 2631 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 2632 { 2633 uint16_t nr; 2634 uint32_t nlb, nsid; 2635 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 2636 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 2637 struct spdk_nvmf_ns *ns; 2638 2639 nsid = cmd->nsid; 2640 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 2641 if (ns == NULL || ns->bdev == NULL) { 2642 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 2643 return -EINVAL; 2644 } 2645 2646 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 2647 nr = cmd->cdw10_bits.dsm.nr + 1; 2648 return nr * sizeof(struct spdk_nvme_dsm_range); 2649 } 2650 2651 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 2652 return nlb * spdk_bdev_get_block_size(ns->bdev); 2653 } 2654 2655 static int 2656 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 2657 { 2658 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 2659 uint32_t len = 0; 2660 uint8_t fid; 2661 int iovcnt; 2662 2663 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 2664 req->length = 0; 2665 req->data = NULL; 2666 2667 if (req->xfer == SPDK_NVME_DATA_NONE) { 2668 return 0; 2669 } 2670 2671 switch (cmd->opc) { 2672 case SPDK_NVME_OPC_IDENTIFY: 2673 len = 4096; 2674 break; 2675 case SPDK_NVME_OPC_GET_LOG_PAGE: 2676 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 2677 break; 2678 case SPDK_NVME_OPC_GET_FEATURES: 2679 case SPDK_NVME_OPC_SET_FEATURES: 2680 fid = cmd->cdw10_bits.set_features.fid; 2681 switch (fid) { 2682 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 2683 len = 4096; 2684 break; 2685 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 2686 len = 256; 2687 break; 2688 case SPDK_NVME_FEAT_TIMESTAMP: 2689 len = 8; 2690 break; 2691 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 2692 len = 512; 2693 break; 2694 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 2695 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 2696 len = 16; 2697 } else { 2698 
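				/* EXHID clear: 8-byte (non-extended) host identifier */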
len = 8; 2699 } 2700 break; 2701 default: 2702 return 0; 2703 } 2704 break; 2705 default: 2706 return 0; 2707 } 2708 2709 /* ADMIN command will not use SGL */ 2710 if (cmd->psdt != 0) { 2711 return -EINVAL; 2712 } 2713 2714 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 2715 if (iovcnt < 0) { 2716 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 2717 ctrlr_id(ctrlr), cmd->opc); 2718 return -1; 2719 } 2720 req->length = len; 2721 req->data = req->iov[0].iov_base; 2722 req->iovcnt = iovcnt; 2723 2724 return 0; 2725 } 2726 2727 /* 2728 * Map an I/O command's buffers. 2729 * 2730 * Returns 0 on success and -errno on failure. 2731 */ 2732 static int 2733 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 2734 { 2735 int len, iovcnt; 2736 struct spdk_nvme_cmd *cmd; 2737 2738 assert(ctrlr != NULL); 2739 assert(req != NULL); 2740 2741 cmd = &req->cmd->nvme_cmd; 2742 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 2743 req->length = 0; 2744 req->data = NULL; 2745 2746 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 2747 return 0; 2748 } 2749 2750 len = get_nvmf_io_req_length(req); 2751 if (len < 0) { 2752 return -EINVAL; 2753 } 2754 req->length = len; 2755 2756 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 2757 if (iovcnt < 0) { 2758 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 2759 return -EFAULT; 2760 } 2761 req->data = req->iov[0].iov_base; 2762 req->iovcnt = iovcnt; 2763 2764 return 0; 2765 } 2766 2767 static int 2768 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 2769 struct nvmf_vfio_user_qpair *vu_qpair) 2770 { 2771 int err; 2772 struct nvmf_vfio_user_req *vu_req; 2773 struct spdk_nvmf_request *req; 2774 2775 assert(ctrlr != NULL); 2776 assert(cmd != NULL); 2777 2778 vu_req = get_nvmf_vfio_user_req(vu_qpair); 2779 if (spdk_unlikely(vu_req == NULL)) { 2780 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 2781 return post_completion(ctrlr, &vu_qpair->cq, 0, 0, cmd->cid, 2782 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 2783 2784 } 2785 req = &vu_req->req; 2786 2787 assert(req->qpair != NULL); 2788 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle qid%u, req opc=%#x cid=%d\n", 2789 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 2790 2791 vu_req->cb_fn = handle_cmd_rsp; 2792 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair); 2793 req->cmd->nvme_cmd = *cmd; 2794 2795 if (nvmf_qpair_is_admin_queue(req->qpair)) { 2796 err = map_admin_cmd_req(ctrlr, req); 2797 } else { 2798 switch (cmd->opc) { 2799 case SPDK_NVME_OPC_RESERVATION_REGISTER: 2800 case SPDK_NVME_OPC_RESERVATION_REPORT: 2801 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 2802 case SPDK_NVME_OPC_RESERVATION_RELEASE: 2803 err = -ENOTSUP; 2804 break; 2805 default: 2806 err = map_io_cmd_req(ctrlr, req); 2807 break; 2808 } 2809 } 2810 2811 if (spdk_unlikely(err < 0)) { 2812 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 2813 ctrlr_id(ctrlr), cmd->opc); 2814 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2815 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 2816 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 2817 _nvmf_vfio_user_req_free(vu_qpair, vu_req); 2818 return err; 2819 } 2820 2821 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 2822 spdk_nvmf_request_exec(req); 2823 2824 return 0; 2825 } 2826 2827 /* Returns the number of commands processed, or a negative value on error. 
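 *
 * This is the submission side of the shared-memory queue protocol: read the
 * SQ tail doorbell written by the host and pass any new entries between the
 * cached SQ head and that tail to handle_sq_tdbl_write(). The expected
 * pairing of barriers (an illustrative sketch, not the guest driver's actual
 * code) is:
 *
 *   guest:  sq[tail] = cmd; wmb(); *sq_tail_doorbell = tail + 1;
 *   target: new_tail = *sq_tail_doorbell; rmb(); process sq[head..new_tail)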
 */
static int
nvmf_vfio_user_qpair_poll(struct nvmf_vfio_user_qpair *qpair)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	uint32_t new_tail;
	int count = 0;

	assert(qpair != NULL);

	ctrlr = qpair->ctrlr;

	/* On aarch64 platforms, doorbell updates from the guest VM may not be
	 * visible to the SPDK target. The guest maps the doorbells as device
	 * memory while the SPDK target maps them as normal (cacheable) memory,
	 * and this memory-type mismatch is a problem on ARM; refer to
	 * "https://developer.arm.com/documentation/102376/0100/
	 * Memory-aliasing-and-mismatched-memory-types". A barrier such as
	 * spdk_mb() alone cannot fix this; invalidating the cache line with
	 * "dc civac" does, hence the spdk_ivdt_dcache() call below.
	 */
	spdk_ivdt_dcache(tdbl(ctrlr, &qpair->sq));

	/* Load-Acquire. */
	new_tail = *tdbl(ctrlr, &qpair->sq);

	/*
	 * Ensure that changes to the queue are visible to us.
	 * The host driver should write the queue first, do a wmb(), and then
	 * update the SQ tail doorbell (their Store-Release).
	 */
	spdk_rmb();

	new_tail = new_tail & 0xffffu;
	if (spdk_unlikely(new_tail >= qpair->sq.size)) {
		union spdk_nvme_async_event_completion event = {};

		SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid SQ%u doorbell value %u\n", ctrlr_id(ctrlr),
			      qpair->qpair.qid, new_tail);
		event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR;
		event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE;
		nvmf_ctrlr_async_event_error_event(qpair->qpair.ctrlr, event);

		return 0;
	}

	if (sq_head(qpair) == new_tail) {
		return 0;
	}

	count = handle_sq_tdbl_write(ctrlr, new_tail, qpair);
	if (count < 0) {
		fail_ctrlr(ctrlr);
	}

	return count;
}

/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active qpairs.
 *
 * Returns the number of commands processed, or a negative value on error.
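 *
 * Only qpairs in the VFIO_USER_QPAIR_ACTIVE state with a non-zero SQ size are
 * polled; qpairs that are still being set up, or whose SQ has been deleted,
 * are skipped.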
2892 */ 2893 static int 2894 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 2895 { 2896 struct nvmf_vfio_user_poll_group *vu_group; 2897 struct nvmf_vfio_user_qpair *vu_qpair, *tmp; 2898 int count = 0; 2899 2900 assert(group != NULL); 2901 2902 spdk_rmb(); 2903 2904 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 2905 2906 TAILQ_FOREACH_SAFE(vu_qpair, &vu_group->qps, link, tmp) { 2907 int ret; 2908 2909 if (spdk_unlikely(vu_qpair->state != VFIO_USER_QPAIR_ACTIVE || !vu_qpair->sq.size)) { 2910 continue; 2911 } 2912 2913 ret = nvmf_vfio_user_qpair_poll(vu_qpair); 2914 2915 if (ret < 0) { 2916 return ret; 2917 } 2918 2919 count += ret; 2920 } 2921 2922 return count; 2923 } 2924 2925 static int 2926 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 2927 struct spdk_nvme_transport_id *trid) 2928 { 2929 struct nvmf_vfio_user_qpair *vu_qpair; 2930 struct nvmf_vfio_user_ctrlr *ctrlr; 2931 2932 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2933 ctrlr = vu_qpair->ctrlr; 2934 2935 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 2936 return 0; 2937 } 2938 2939 static int 2940 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 2941 struct spdk_nvme_transport_id *trid) 2942 { 2943 return 0; 2944 } 2945 2946 static int 2947 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 2948 struct spdk_nvme_transport_id *trid) 2949 { 2950 struct nvmf_vfio_user_qpair *vu_qpair; 2951 struct nvmf_vfio_user_ctrlr *ctrlr; 2952 2953 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2954 ctrlr = vu_qpair->ctrlr; 2955 2956 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 2957 return 0; 2958 } 2959 2960 static void 2961 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 2962 struct spdk_nvmf_request *req) 2963 { 2964 struct nvmf_vfio_user_qpair *vu_qpair; 2965 struct nvmf_vfio_user_req *vu_req, *vu_req_to_abort = NULL; 2966 uint32_t i; 2967 uint16_t cid; 2968 2969 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2970 2971 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 2972 for (i = 0; i < vu_qpair->qsize; i++) { 2973 vu_req = &vu_qpair->reqs_internal[i]; 2974 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 2975 vu_req_to_abort = vu_req; 2976 break; 2977 } 2978 } 2979 2980 if (vu_req_to_abort == NULL) { 2981 spdk_nvmf_request_complete(req); 2982 return; 2983 } 2984 2985 req->req_to_abort = &vu_req_to_abort->req; 2986 nvmf_ctrlr_abort_request(req); 2987 } 2988 2989 static void 2990 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 2991 { 2992 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 2993 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 2994 opts->in_capsule_data_size = 0; 2995 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 2996 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 2997 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 2998 opts->num_shared_buffers = 0; 2999 opts->buf_cache_size = 0; 3000 opts->association_timeout = 0; 3001 opts->transport_specific = NULL; 3002 } 3003 3004 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 3005 .name = "VFIOUSER", 3006 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 3007 .opts_init = nvmf_vfio_user_opts_init, 3008 .create = nvmf_vfio_user_create, 3009 .destroy = nvmf_vfio_user_destroy, 3010 3011 .listen = nvmf_vfio_user_listen, 3012 .stop_listen = 
nvmf_vfio_user_stop_listen, 3013 .accept = nvmf_vfio_user_accept, 3014 .cdata_init = nvmf_vfio_user_cdata_init, 3015 .listen_associate = nvmf_vfio_user_listen_associate, 3016 3017 .listener_discover = nvmf_vfio_user_discover, 3018 3019 .poll_group_create = nvmf_vfio_user_poll_group_create, 3020 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 3021 .poll_group_add = nvmf_vfio_user_poll_group_add, 3022 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 3023 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 3024 3025 .req_free = nvmf_vfio_user_req_free, 3026 .req_complete = nvmf_vfio_user_req_complete, 3027 3028 .qpair_fini = nvmf_vfio_user_close_qpair, 3029 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 3030 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 3031 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 3032 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 3033 }; 3034 3035 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 3036 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 3037
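
/*
 * Example usage (an illustrative sketch; the RPC names and flags below come
 * from SPDK's scripts/rpc.py and may differ between SPDK versions):
 *
 *   scripts/rpc.py nvmf_create_transport -t VFIOUSER
 *   scripts/rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a
 *   scripts/rpc.py bdev_malloc_create -b Malloc0 64 512
 *   scripts/rpc.py nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode0 Malloc0
 *   scripts/rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 \
 *           -t VFIOUSER -a /var/run/vfio-user -s 0
 *
 * The traddr passed to nvmf_subsystem_add_listener is used as a directory:
 * this transport creates the vfio-user socket at <traddr>/cntrl and the
 * mmap'able BAR0 doorbell file at <traddr>/bar0, which a vfio-user client
 * (for example a VM with a vfio-user NVMe device) then attaches to.
 */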