/*-
 *   BSD LICENSE
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019, Nutanix Inc. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NVMe over vfio-user transport
 */

#include <vfio-user/libvfio-user.h>
#include <vfio-user/pci_defs.h>

#include "spdk/barrier.h"
#include "spdk/stdinc.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf_transport.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"

#include "transport.h"

#include "nvmf_internal.h"

#define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
#define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
#define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
#define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE

#define NVME_DOORBELLS_OFFSET	0x1000
#define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000

/*
 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space
 * available on PCI-X 2.0 and PCI Express buses
 */
#define NVME_REG_CFG_SIZE	0x1000
#define NVME_REG_BAR0_SIZE	(NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE)
#define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8)
#define NVME_IRQ_MSIX_NUM	NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR
/* MSIX Table Size */
#define NVME_BAR4_SIZE		SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000)
/* MSIX Pending Bit Array Size */
#define NVME_BAR5_SIZE		SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000)

#define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4)

struct nvmf_vfio_user_req;
struct nvmf_vfio_user_qpair;

typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);

/* 1 more for PRP2 list itself */
#define NVMF_VFIO_USER_MAX_IOVECS	(NVMF_REQ_MAX_BUFFERS + 1)

enum nvmf_vfio_user_req_state {
VFIO_USER_REQUEST_STATE_FREE = 0, 87 VFIO_USER_REQUEST_STATE_EXECUTING, 88 }; 89 90 struct nvmf_vfio_user_req { 91 struct spdk_nvmf_request req; 92 struct spdk_nvme_cpl rsp; 93 struct spdk_nvme_cmd cmd; 94 95 enum nvmf_vfio_user_req_state state; 96 nvmf_vfio_user_req_cb_fn cb_fn; 97 void *cb_arg; 98 99 /* old CC before prop_set_cc fabric command */ 100 union spdk_nvme_cc_register cc; 101 102 /* placeholder for gpa_to_vva memory map table, the IO buffer doesn't use it */ 103 dma_sg_t *sg; 104 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 105 uint8_t iovcnt; 106 107 TAILQ_ENTRY(nvmf_vfio_user_req) link; 108 }; 109 110 /* 111 * A NVMe queue. 112 */ 113 struct nvme_q { 114 bool is_cq; 115 116 void *addr; 117 118 dma_sg_t *sg; 119 struct iovec iov; 120 121 uint32_t size; 122 uint64_t prp1; 123 124 union { 125 struct { 126 uint32_t head; 127 /* multiple SQs can be mapped to the same CQ */ 128 uint16_t cqid; 129 }; 130 struct { 131 uint32_t tail; 132 uint16_t iv; 133 bool ien; 134 bool phase; 135 }; 136 }; 137 }; 138 139 enum nvmf_vfio_user_qpair_state { 140 VFIO_USER_QPAIR_UNINITIALIZED = 0, 141 VFIO_USER_QPAIR_ACTIVE, 142 VFIO_USER_QPAIR_SQ_DELETED, 143 VFIO_USER_QPAIR_INACTIVE, 144 VFIO_USER_QPAIR_ERROR, 145 }; 146 147 struct nvmf_vfio_user_qpair { 148 struct spdk_nvmf_qpair qpair; 149 struct spdk_nvmf_transport_poll_group *group; 150 struct nvmf_vfio_user_ctrlr *ctrlr; 151 struct nvmf_vfio_user_req *reqs_internal; 152 uint32_t qsize; 153 struct nvme_q cq; 154 struct nvme_q sq; 155 enum nvmf_vfio_user_qpair_state state; 156 157 /* Copy of Create IO SQ command */ 158 struct spdk_nvme_cmd create_io_sq_cmd; 159 160 TAILQ_HEAD(, nvmf_vfio_user_req) reqs; 161 /* Poll group entry */ 162 TAILQ_ENTRY(nvmf_vfio_user_qpair) link; 163 /* Connected queue pair entry */ 164 TAILQ_ENTRY(nvmf_vfio_user_qpair) tailq; 165 }; 166 167 struct nvmf_vfio_user_poll_group { 168 struct spdk_nvmf_transport_poll_group group; 169 TAILQ_HEAD(, nvmf_vfio_user_qpair) qps; 170 }; 171 172 struct nvmf_vfio_user_ctrlr { 173 struct nvmf_vfio_user_endpoint *endpoint; 174 struct nvmf_vfio_user_transport *transport; 175 176 /* Connected queue pairs list */ 177 TAILQ_HEAD(, nvmf_vfio_user_qpair) connected_qps; 178 179 struct spdk_thread *thread; 180 struct spdk_poller *vfu_ctx_poller; 181 182 uint16_t cntlid; 183 struct spdk_nvmf_ctrlr *ctrlr; 184 185 struct nvmf_vfio_user_qpair *qp[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 186 187 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 188 189 volatile uint32_t *doorbells; 190 191 /* internal CSTS.CFS register for vfio-user fatal errors */ 192 uint32_t cfs : 1; 193 }; 194 195 struct nvmf_vfio_user_endpoint { 196 vfu_ctx_t *vfu_ctx; 197 struct msixcap *msix; 198 vfu_pci_config_space_t *pci_config_space; 199 int devmem_fd; 200 volatile uint32_t *doorbells; 201 202 struct spdk_nvme_transport_id trid; 203 const struct spdk_nvmf_subsystem *subsystem; 204 205 struct nvmf_vfio_user_ctrlr *ctrlr; 206 pthread_mutex_t lock; 207 208 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 209 }; 210 211 struct nvmf_vfio_user_transport_opts { 212 bool disable_mappable_bar0; 213 }; 214 215 struct nvmf_vfio_user_transport { 216 struct spdk_nvmf_transport transport; 217 struct nvmf_vfio_user_transport_opts transport_opts; 218 struct spdk_poller *accept_poller; 219 pthread_mutex_t lock; 220 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 221 }; 222 223 /* 224 * function prototypes 225 */ 226 static volatile uint32_t * 227 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q); 228 229 static volatile uint32_t * 230 tdbl(struct 
nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);

static int
nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);

static struct nvmf_vfio_user_req *
get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair);

static int
nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
		  uint32_t max_iovcnt, uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint64_t prp1, prp2;
	void *vva;
	uint32_t i;
	uint32_t residue_len, nents;
	uint64_t *prp_list;
	uint32_t iovcnt;

	assert(max_iovcnt > 0);

	prp1 = cmd->dptr.prp.prp1;
	prp2 = cmd->dptr.prp.prp2;

	/* PRP1 may start at an unaligned page address */
	residue_len = mps - (prp1 % mps);
	residue_len = spdk_min(len, residue_len);

	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
	if (spdk_unlikely(vva == NULL)) {
		SPDK_ERRLOG("GPA to VVA failed\n");
		return -EINVAL;
	}
	len -= residue_len;
	if (len && max_iovcnt < 2) {
		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
		return -ERANGE;
	}
	iovs[0].iov_base = vva;
	iovs[0].iov_len = residue_len;

	if (len) {
		if (spdk_unlikely(prp2 == 0)) {
			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
			return -EINVAL;
		}

		if (len <= mps) {
			/* 2 PRP used */
			iovcnt = 2;
			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len=%#x\n",
					    prp2, len);
				return -EINVAL;
			}
			iovs[1].iov_base = vva;
			iovs[1].iov_len = len;
		} else {
			/* PRP list used */
			nents = (len + mps - 1) / mps;
			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
				SPDK_ERRLOG("Too many page entries\n");
				return -ERANGE;
			}

			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
					    prp2, nents);
				return -EINVAL;
			}
			prp_list = vva;
			i = 0;
			while (len != 0) {
				residue_len = spdk_min(len, mps);
				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
				if (spdk_unlikely(vva == NULL)) {
					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
						    prp_list[i], residue_len);
					return -EINVAL;
				}
				iovs[i + 1].iov_base = vva;
				iovs[i + 1].iov_len = residue_len;
				len -= residue_len;
				i++;
			}
			iovcnt = i + 1;
		}
	} else {
		/* 1 PRP used */
		iovcnt = 1;
	}

	assert(iovcnt <= max_iovcnt);
	return iovcnt;
}

static int
nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
		       struct iovec *iovs, uint32_t max_iovcnt,
		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint32_t i;
	void *vva;

	if (spdk_unlikely(max_iovcnt < num_sgls)) {
		return -ERANGE;
	}

	for (i = 0; i < num_sgls; i++) {
		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
			return -EINVAL;
		}
		vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[i].iov_base = vva;
		iovs[i].iov_len = sgls[i].unkeyed.length;
	}

	return num_sgls;
}

static int
nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
		  uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	struct spdk_nvme_sgl_descriptor *sgl, *last_sgl;
	uint32_t num_sgls, seg_len;
	void *vva;
	int ret;
	uint32_t total_iovcnt = 0;

	/* SGL cases */
	sgl = &cmd->dptr.sgl1;

	/* only one SGL segment */
	if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
		assert(max_iovcnt > 0);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[0].iov_base = vva;
		iovs[0].iov_len = sgl->unkeyed.length;
		assert(sgl->unkeyed.length == len);

		return 1;
	}

	for (;;) {
		if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) &&
				  (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type);
			return -EINVAL;
		}

		seg_len = sgl->unkeyed.length;
		if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) {
			SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len);
			return -EINVAL;
		}

		num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}

		/* sgl now points to the first descriptor of this segment */
		sgl = (struct spdk_nvme_sgl_descriptor *)vva;
		last_sgl = &sgl[num_sgls - 1];

		/* we are done */
		if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
			/* map the whole SGL segment */
			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt],
						     max_iovcnt - total_iovcnt, gpa_to_vva);
			if (spdk_unlikely(ret < 0)) {
				return ret;
			}
			total_iovcnt += ret;

			return total_iovcnt;
		}

		if (num_sgls > 1) {
			/* map the whole SGL segment except last_sgl */
			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt],
						     max_iovcnt - total_iovcnt, gpa_to_vva);
			if (spdk_unlikely(ret < 0)) {
				return ret;
			}
			total_iovcnt += ret;
		}

		/* move to the next level's segments */
		sgl = last_sgl;
	}

	return 0;
}

static int
nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
	     uint32_t len, size_t mps,
	     void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	if (cmd->psdt == SPDK_NVME_PSDT_PRP) {
		return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
	}

	return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
}

static char *
endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
{
	return endpoint->trid.traddr;
}

static char *
ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	if (!ctrlr || !ctrlr->endpoint) {
		return "Null Ctrlr";
	}

	return endpoint_id(ctrlr->endpoint);
}

static inline uint16_t
io_q_id(struct nvme_q *q)
{
	struct nvmf_vfio_user_qpair *vu_qpair;

	assert(q);

	if (q->is_cq) {
		vu_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, cq);
	} else {
		vu_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, sq);
	}
	assert(vu_qpair);
	return vu_qpair->qpair.qid;
}

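/*
 * Worked example of the PRP walk above (illustrative values; mps = 4096 as
 * passed by vfio_user_map_cmd()): a 10240-byte transfer with PRP1 = 0x1000200
 * yields iov[0] = 3584 bytes (the unaligned head up to the page boundary);
 * the remaining 6656 bytes require a PRP list via PRP2, producing
 * iov[1] = 4096 bytes and iov[2] = 2560 bytes, so nvme_cmd_map_prps()
 * returns iovcnt = 3.
 */
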
static void 487 fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 488 { 489 assert(ctrlr != NULL); 490 491 if (ctrlr->cfs == 0) { 492 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(ctrlr)); 493 } 494 495 ctrlr->cfs = 1U; 496 } 497 498 static inline bool 499 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 500 { 501 assert(vu_ctrlr != NULL); 502 assert(vu_ctrlr->endpoint != NULL); 503 504 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 505 506 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 507 } 508 509 static void 510 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 511 { 512 if (endpoint->doorbells) { 513 munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 514 } 515 516 if (endpoint->devmem_fd > 0) { 517 close(endpoint->devmem_fd); 518 } 519 520 if (endpoint->vfu_ctx) { 521 vfu_destroy_ctx(endpoint->vfu_ctx); 522 } 523 524 pthread_mutex_destroy(&endpoint->lock); 525 free(endpoint); 526 } 527 528 /* called when process exits */ 529 static int 530 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 531 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 532 { 533 struct nvmf_vfio_user_transport *vu_transport; 534 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 535 536 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 537 538 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 539 transport); 540 541 spdk_poller_unregister(&vu_transport->accept_poller); 542 (void)pthread_mutex_destroy(&vu_transport->lock); 543 544 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 545 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 546 nvmf_vfio_user_destroy_endpoint(endpoint); 547 } 548 549 free(vu_transport); 550 551 if (cb_fn) { 552 cb_fn(cb_arg); 553 } 554 555 return 0; 556 } 557 558 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 559 { 560 "disable_mappable_bar0", 561 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 562 spdk_json_decode_bool, true 563 }, 564 }; 565 566 static int 567 nvmf_vfio_user_accept(void *ctx); 568 569 static struct spdk_nvmf_transport * 570 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 571 { 572 struct nvmf_vfio_user_transport *vu_transport; 573 int err; 574 575 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 576 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 577 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 578 return NULL; 579 } 580 581 vu_transport = calloc(1, sizeof(*vu_transport)); 582 if (vu_transport == NULL) { 583 SPDK_ERRLOG("Transport alloc fail: %m\n"); 584 return NULL; 585 } 586 587 err = pthread_mutex_init(&vu_transport->lock, NULL); 588 if (err != 0) { 589 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 590 goto err; 591 } 592 593 TAILQ_INIT(&vu_transport->endpoints); 594 595 if (opts->transport_specific != NULL && 596 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 597 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 598 vu_transport)) { 599 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 600 free(vu_transport); 601 return NULL; 602 } 603 604 vu_transport->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, &vu_transport->transport, 605 vu_transport->transport.opts.acceptor_poll_rate); 606 if (!vu_transport->accept_poller) { 607 free(vu_transport); 608 return NULL; 609 } 610 611 
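	/*
	 * Note: for this transport, "accepting" in nvmf_vfio_user_accept()
	 * means polling libvfio-user (vfu_attach_ctx()) for a client such as
	 * QEMU attaching to each endpoint's UNIX domain socket.
	 */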
	SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n",
		      vu_transport->transport_opts.disable_mappable_bar0);

	return &vu_transport->transport;

err:
	free(vu_transport);

	return NULL;
}

static uint32_t
max_queue_size(struct nvmf_vfio_user_ctrlr const *ctrlr)
{
	assert(ctrlr != NULL);
	assert(ctrlr->qp[0] != NULL);
	assert(ctrlr->qp[0]->qpair.ctrlr != NULL);

	return ctrlr->qp[0]->qpair.ctrlr->vcprop.cap.bits.mqes + 1;
}

static void *
map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov, int prot)
{
	int ret;

	assert(ctx != NULL);
	assert(sg != NULL);
	assert(iov != NULL);

	ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot);
	if (ret < 0) {
		return NULL;
	}

	ret = vfu_map_sg(ctx, sg, iov, 1, 0);
	if (ret != 0) {
		return NULL;
	}

	assert(iov->iov_base != NULL);
	return iov->iov_base;
}

static inline uint32_t
sq_head(struct nvmf_vfio_user_qpair *qpair)
{
	assert(qpair != NULL);
	return qpair->sq.head;
}

static inline void
sqhd_advance(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair)
{
	assert(ctrlr != NULL);
	assert(qpair != NULL);
	qpair->sq.head = (qpair->sq.head + 1) % qpair->sq.size;
}

static int
map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q *q, bool is_cq, bool unmap)
{
	uint64_t len;

	assert(q->size);
	assert(q->addr == NULL);

	if (is_cq) {
		len = q->size * sizeof(struct spdk_nvme_cpl);
	} else {
		len = q->size * sizeof(struct spdk_nvme_cmd);
	}

	q->addr = map_one(vu_ctrlr->endpoint->vfu_ctx, q->prp1, len, q->sg,
			  &q->iov, is_cq ?
PROT_READ | PROT_WRITE : PROT_READ); 686 if (q->addr == NULL) { 687 return -EFAULT; 688 } 689 690 if (unmap) { 691 memset(q->addr, 0, len); 692 } 693 694 return 0; 695 } 696 697 static int 698 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 699 { 700 struct nvme_q *sq; 701 const struct spdk_nvmf_registers *regs; 702 int ret; 703 704 assert(ctrlr != NULL); 705 assert(ctrlr->qp[0] != NULL); 706 assert(ctrlr->qp[0]->sq.addr == NULL); 707 /* XXX ctrlr->asq == 0 is a valid memory address */ 708 709 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 710 sq = &ctrlr->qp[0]->sq; 711 sq->size = regs->aqa.bits.asqs + 1; 712 sq->prp1 = regs->asq; 713 sq->head = 0; 714 sq->cqid = 0; 715 sq->is_cq = false; 716 717 ret = map_q(ctrlr, sq, false, true); 718 if (ret) { 719 return ret; 720 } 721 722 *tdbl(ctrlr, sq) = 0; 723 724 return 0; 725 } 726 727 static inline int 728 queue_index(uint16_t qid, int is_cq) 729 { 730 return (qid * 2) + is_cq; 731 } 732 733 static volatile uint32_t * 734 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 735 { 736 assert(ctrlr != NULL); 737 assert(q != NULL); 738 assert(!q->is_cq); 739 740 return &ctrlr->doorbells[queue_index(io_q_id(q), false)]; 741 } 742 743 static volatile uint32_t * 744 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 745 { 746 assert(ctrlr != NULL); 747 assert(q != NULL); 748 assert(q->is_cq); 749 750 return &ctrlr->doorbells[queue_index(io_q_id(q), true)]; 751 } 752 753 static inline bool 754 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q) 755 { 756 assert(ctrlr != NULL); 757 assert(q != NULL); 758 assert(q->is_cq); 759 760 return ((q->tail + 1) % q->size) == *hdbl(ctrlr, q); 761 } 762 763 static inline void 764 cq_tail_advance(struct nvme_q *q) 765 { 766 assert(q != NULL); 767 assert(q->is_cq); 768 769 assert(q->tail < q->size); 770 q->tail++; 771 772 if (spdk_unlikely(q->tail == q->size)) { 773 q->tail = 0; 774 q->phase = !q->phase; 775 } 776 } 777 778 static int 779 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 780 { 781 struct nvme_q *cq; 782 const struct spdk_nvmf_registers *regs; 783 int ret; 784 785 assert(ctrlr != NULL); 786 assert(ctrlr->qp[0] != NULL); 787 assert(ctrlr->qp[0]->cq.addr == NULL); 788 789 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 790 assert(regs != NULL); 791 cq = &ctrlr->qp[0]->cq; 792 cq->size = regs->aqa.bits.acqs + 1; 793 cq->prp1 = regs->acq; 794 cq->tail = 0; 795 cq->is_cq = true; 796 cq->ien = true; 797 cq->phase = true; 798 799 ret = map_q(ctrlr, cq, true, true); 800 if (ret) { 801 return ret; 802 } 803 *hdbl(ctrlr, cq) = 0; 804 805 return 0; 806 } 807 808 static inline dma_sg_t * 809 vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt) 810 { 811 return (dma_sg_t *)((uintptr_t)vu_req->sg + iovcnt * dma_sg_size()); 812 } 813 814 static void * 815 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 816 { 817 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 818 struct spdk_nvmf_qpair *qpair; 819 struct nvmf_vfio_user_req *vu_req; 820 struct nvmf_vfio_user_qpair *vu_qpair; 821 void *ret; 822 823 assert(req != NULL); 824 qpair = req->qpair; 825 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 826 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 827 828 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 829 ret = map_one(vu_qpair->ctrlr->endpoint->vfu_ctx, addr, len, 830 vu_req_to_sg_t(vu_req, vu_req->iovcnt), 831 &vu_req->iov[vu_req->iovcnt], prot); 832 if (spdk_likely(ret != NULL)) { 833 
		vu_req->iovcnt++;
	}
	return ret;
}

static int
vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
		  struct iovec *iov, uint32_t length)
{
	/* Map the command's data buffers from guest physical memory to
	 * host virtual memory addresses.
	 */
	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
			    length, 4096, _map_one);
}

static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct nvmf_vfio_user_qpair *vu_qpair);

/*
 * Posts a CQE in the completion queue.
 *
 * @ctrlr: the vfio-user controller
 * @cq: the completion queue
 * @cdw0: cdw0 as reported by NVMf
 * @sqid: submission queue ID
 * @cid: command identifier in NVMe command
 * @sc: the NVMe CQE status code
 * @sct: the NVMe CQE status code type
 */
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *cq,
		uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct)
{
	struct spdk_nvme_cpl *cpl;
	const struct spdk_nvmf_registers *regs;
	int err;

	assert(ctrlr != NULL);

	if (spdk_unlikely(cq == NULL || cq->addr == NULL)) {
		return 0;
	}

	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
	if (regs->csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
		SPDK_DEBUGLOG(nvmf_vfio,
			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
			      ctrlr_id(ctrlr), sqid, cid, sc);
		return 0;
	}

	if (cq_is_full(ctrlr, cq)) {
		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
			    ctrlr_id(ctrlr), io_q_id(cq), cq->tail, *hdbl(ctrlr, cq));
		return -1;
	}

	cpl = ((struct spdk_nvme_cpl *)cq->addr) + cq->tail;

	assert(ctrlr->qp[sqid] != NULL);
	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
		      ctrlr_id(ctrlr), sqid, cid, sc, sq_head(ctrlr->qp[sqid]),
		      cq->tail);

	cpl->sqhd = sq_head(ctrlr->qp[sqid]);
	cpl->sqid = sqid;
	cpl->cid = cid;
	cpl->cdw0 = cdw0;
	cpl->status.dnr = 0x0;
	cpl->status.m = 0x0;
	cpl->status.sct = sct;
	cpl->status.p = cq->phase;
	cpl->status.sc = sc;

	cq_tail_advance(cq);

	/*
	 * This function currently executes in SPDK thread context, but we
	 * might be triggering interrupts from vfio-user thread context, so
	 * check for race conditions.
916 */ 917 if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) { 918 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 919 if (err != 0) { 920 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 921 ctrlr_id(ctrlr)); 922 return err; 923 } 924 } 925 926 return 0; 927 } 928 929 static bool 930 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 931 { 932 assert(vu_ctrlr != NULL); 933 934 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 935 return false; 936 } 937 938 if (vu_ctrlr->qp[qid] == NULL) { 939 return false; 940 } 941 942 if (!is_cq) { 943 if (vu_ctrlr->qp[qid]->state == VFIO_USER_QPAIR_SQ_DELETED || 944 vu_ctrlr->qp[qid]->state == VFIO_USER_QPAIR_UNINITIALIZED) { 945 return false; 946 } 947 } 948 949 return true; 950 } 951 952 static void 953 unmap_qp(struct nvmf_vfio_user_qpair *qp) 954 { 955 struct nvmf_vfio_user_ctrlr *ctrlr; 956 957 if (qp->ctrlr == NULL) { 958 return; 959 } 960 ctrlr = qp->ctrlr; 961 962 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap QP%d\n", 963 ctrlr_id(ctrlr), qp->qpair.qid); 964 965 if (qp->sq.addr != NULL) { 966 vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->sq.sg, &qp->sq.iov, 1); 967 qp->sq.addr = NULL; 968 } 969 970 if (qp->cq.addr != NULL) { 971 vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->cq.sg, &qp->cq.iov, 1); 972 qp->cq.addr = NULL; 973 } 974 } 975 976 static int 977 remap_qp(struct nvmf_vfio_user_qpair *vu_qpair) 978 { 979 struct nvme_q *sq, *cq; 980 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 981 int ret; 982 983 vu_ctrlr = vu_qpair->ctrlr; 984 sq = &vu_qpair->sq; 985 cq = &vu_qpair->cq; 986 987 if (sq->size) { 988 ret = map_q(vu_ctrlr, sq, false, false); 989 if (ret) { 990 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n", 991 io_q_id(sq), sq->prp1, sq->prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 992 return -EFAULT; 993 } 994 } 995 996 if (cq->size) { 997 ret = map_q(vu_ctrlr, cq, true, false); 998 if (ret) { 999 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n", 1000 io_q_id(cq), cq->prp1, cq->prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 1001 return -EFAULT; 1002 } 1003 1004 } 1005 1006 return 0; 1007 } 1008 1009 static void 1010 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1011 { 1012 struct nvmf_vfio_user_qpair *qpair; 1013 struct nvmf_vfio_user_req *vu_req; 1014 uint32_t i; 1015 1016 if (ctrlr == NULL) { 1017 return; 1018 } 1019 1020 qpair = ctrlr->qp[qid]; 1021 if (qpair == NULL) { 1022 return; 1023 } 1024 1025 SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy QP%d=%p\n", ctrlr_id(ctrlr), 1026 qid, qpair); 1027 1028 unmap_qp(qpair); 1029 1030 for (i = 0; i < qpair->qsize; i++) { 1031 vu_req = &qpair->reqs_internal[i]; 1032 free(vu_req->sg); 1033 } 1034 free(qpair->reqs_internal); 1035 1036 free(qpair->sq.sg); 1037 free(qpair->cq.sg); 1038 free(qpair); 1039 1040 ctrlr->qp[qid] = NULL; 1041 } 1042 1043 /* This function can only fail because of memory allocation errors. 
*/ 1044 static int 1045 init_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1046 const uint32_t qsize, const uint16_t id) 1047 { 1048 uint32_t i; 1049 struct nvmf_vfio_user_qpair *qpair; 1050 struct nvmf_vfio_user_req *vu_req, *tmp; 1051 struct spdk_nvmf_request *req; 1052 1053 assert(ctrlr != NULL); 1054 assert(transport != NULL); 1055 1056 qpair = calloc(1, sizeof(*qpair)); 1057 if (qpair == NULL) { 1058 return -ENOMEM; 1059 } 1060 qpair->sq.sg = calloc(1, dma_sg_size()); 1061 if (qpair->sq.sg == NULL) { 1062 free(qpair); 1063 return -ENOMEM; 1064 } 1065 qpair->cq.sg = calloc(1, dma_sg_size()); 1066 if (qpair->cq.sg == NULL) { 1067 free(qpair->sq.sg); 1068 free(qpair); 1069 return -ENOMEM; 1070 } 1071 1072 qpair->qpair.qid = id; 1073 qpair->qpair.transport = transport; 1074 qpair->ctrlr = ctrlr; 1075 qpair->qsize = qsize; 1076 1077 TAILQ_INIT(&qpair->reqs); 1078 1079 qpair->reqs_internal = calloc(qsize, sizeof(struct nvmf_vfio_user_req)); 1080 if (qpair->reqs_internal == NULL) { 1081 SPDK_ERRLOG("%s: error allocating reqs: %m\n", ctrlr_id(ctrlr)); 1082 goto reqs_err; 1083 } 1084 1085 for (i = 0; i < qsize; i++) { 1086 vu_req = &qpair->reqs_internal[i]; 1087 vu_req->sg = calloc(NVMF_VFIO_USER_MAX_IOVECS, dma_sg_size()); 1088 if (vu_req->sg == NULL) { 1089 goto sg_err; 1090 } 1091 1092 req = &vu_req->req; 1093 req->qpair = &qpair->qpair; 1094 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1095 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1096 1097 TAILQ_INSERT_TAIL(&qpair->reqs, vu_req, link); 1098 } 1099 1100 ctrlr->qp[id] = qpair; 1101 return 0; 1102 1103 sg_err: 1104 TAILQ_FOREACH_SAFE(vu_req, &qpair->reqs, link, tmp) { 1105 free(vu_req->sg); 1106 } 1107 free(qpair->reqs_internal); 1108 1109 reqs_err: 1110 free(qpair->sq.sg); 1111 free(qpair->cq.sg); 1112 free(qpair); 1113 return -ENOMEM; 1114 } 1115 1116 /* 1117 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 1118 * on error. 1119 */ 1120 static int 1121 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 1122 struct spdk_nvme_cmd *cmd, const bool is_cq) 1123 { 1124 uint16_t qid, cqid; 1125 uint32_t qsize; 1126 uint16_t sc = SPDK_NVME_SC_SUCCESS; 1127 uint16_t sct = SPDK_NVME_SCT_GENERIC; 1128 int err = 0; 1129 struct nvmf_vfio_user_qpair *vu_qpair; 1130 struct nvme_q *io_q; 1131 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1132 1133 assert(ctrlr != NULL); 1134 assert(cmd != NULL); 1135 1136 qid = cmd->cdw10_bits.create_io_q.qid; 1137 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1138 SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr), 1139 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 1140 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1141 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1142 goto out; 1143 } 1144 1145 if (io_q_exists(ctrlr, qid, is_cq)) { 1146 SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr), 1147 is_cq ? 'C' : 'S', qid); 1148 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1149 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1150 goto out; 1151 } 1152 1153 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1154 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 1155 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 1156 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1157 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 1158 goto out; 1159 } 1160 1161 SPDK_DEBUGLOG(nvmf_vfio, 1162 "%s: create I/O %cQ%d: QSIZE=%#x\n", ctrlr_id(ctrlr), 1163 is_cq ? 
		      'C' : 'S', qid, qsize);

	if (is_cq) {
		if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
			SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
			sc = SPDK_NVME_SC_INVALID_FIELD;
			goto out;
		}
		if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) {
			SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr));
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR;
			goto out;
		}

		err = init_qp(ctrlr, ctrlr->qp[0]->qpair.transport, qsize, qid);
		if (err != 0) {
			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
			goto out;
		}

		io_q = &ctrlr->qp[qid]->cq;
		io_q->ien = cmd->cdw11_bits.create_io_cq.ien;
		io_q->iv = cmd->cdw11_bits.create_io_cq.iv;
		io_q->phase = true;
	} else {
		cqid = cmd->cdw11_bits.create_io_sq.cqid;
		if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) {
			SPDK_ERRLOG("%s: invalid CQID %u\n", ctrlr_id(ctrlr), cqid);
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
			goto out;

		}
		/* CQ must be created before SQ */
		if (!io_q_exists(ctrlr, cqid, true)) {
			SPDK_ERRLOG("%s: CQ%u does not exist\n", ctrlr_id(ctrlr), cqid);
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
			goto out;
		}

		if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
			SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
			sc = SPDK_NVME_SC_INVALID_FIELD;
			goto out;
		}
		/* TODO: support shared IO CQ */
		if (qid != cqid) {
			SPDK_ERRLOG("%s: shared CQ is not supported yet\n", ctrlr_id(ctrlr));
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
		}

		io_q = &ctrlr->qp[qid]->sq;
		io_q->cqid = cqid;
		SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
			      qid, io_q->cqid);
	}

	io_q->is_cq = is_cq;
	io_q->size = qsize;
	io_q->prp1 = cmd->dptr.prp.prp1;

	err = map_q(ctrlr, io_q, is_cq, true);
	if (err) {
		sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
		goto out;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped %cQ%d IOVA=%#lx vaddr=%#llx\n",
		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
		      qid, cmd->dptr.prp.prp1, (unsigned long long)io_q->addr);

	if (is_cq) {
		*hdbl(ctrlr, io_q) = 0;
	} else {
		vu_qpair = ctrlr->qp[qid];
		*tdbl(ctrlr, io_q) = 0;
		vu_qpair->sq.head = 0;

		if (vu_qpair->state == VFIO_USER_QPAIR_SQ_DELETED) {
			vu_qpair->state = VFIO_USER_QPAIR_ACTIVE;
		} else {
			/*
			 * Create our new I/O qpair. This asynchronously invokes, on a
			 * suitable poll group, the nvmf_vfio_user_poll_group_add()
			 * callback, which will call spdk_nvmf_request_exec_fabrics()
			 * with a generated fabrics connect command. This command is
			 * then eventually completed via handle_queue_connect_rsp().
			 */
			vu_qpair->create_io_sq_cmd = *cmd;
			spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt,
						&vu_qpair->qpair);
			return 0;
		}
	}

out:
	return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid, sc, sct);
}

/* For the admin Delete I/O Completion Queue command, the NVMf library will
 * disconnect and free the queue pair, so save the command in a context.
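 * The CQE for the Delete I/O CQ command can only be posted once the qpair
 * disconnect completes (in vfio_user_qpair_delete_cb() below), by which time
 * the original command in the admin SQ may already have been overwritten,
 * hence the copy.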
 */
struct vfio_user_delete_cq_ctx {
	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
	struct spdk_nvme_cmd delete_io_cq_cmd;
};

static void
vfio_user_qpair_delete_cb(void *cb_arg)
{
	struct vfio_user_delete_cq_ctx *ctx = cb_arg;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr;

	post_completion(vu_ctrlr, &vu_ctrlr->qp[0]->cq, 0, 0, ctx->delete_io_cq_cmd.cid,
			SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
	free(ctx);
}

/*
 * Deletes a completion or submission I/O queue.
 */
static int
handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
		struct spdk_nvme_cmd *cmd, const bool is_cq)
{
	uint16_t sct = SPDK_NVME_SCT_GENERIC;
	uint16_t sc = SPDK_NVME_SC_SUCCESS;
	struct nvmf_vfio_user_qpair *vu_qpair;
	struct vfio_user_delete_cq_ctx *ctx;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
		      cmd->cdw10_bits.delete_io_q.qid);

	if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) {
		SPDK_ERRLOG("%s: I/O %cQ%d does not exist\n", ctrlr_id(ctrlr),
			    is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
		goto out;
	}

	vu_qpair = ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid];
	if (is_cq) {
		if (vu_qpair->state == VFIO_USER_QPAIR_UNINITIALIZED) {
			free_qp(ctrlr, cmd->cdw10_bits.delete_io_q.qid);
			goto out;
		}

		/* SQ must have been deleted first */
		if (vu_qpair->state != VFIO_USER_QPAIR_SQ_DELETED) {
			SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
			goto out;
		}
		ctx = calloc(1, sizeof(*ctx));
		if (!ctx) {
			sct = SPDK_NVME_SCT_GENERIC;
			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
			goto out;
		}
		ctx->vu_ctrlr = ctrlr;
		ctx->delete_io_cq_cmd = *cmd;
		spdk_nvmf_qpair_disconnect(&vu_qpair->qpair, vfio_user_qpair_delete_cb, ctx);
		return 0;
	} else {
		if (vu_qpair->state == VFIO_USER_QPAIR_SQ_DELETED) {
			SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%u is already deleted\n", ctrlr_id(ctrlr),
				      cmd->cdw10_bits.delete_io_q.qid);
			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
			sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
			goto out;
		}

		/*
		 * This doesn't actually delete the SQ; we're merely telling
		 * poll_group_poll to skip checking this SQ. The queue pair
		 * will be disconnected by the Delete I/O CQ command.
		 */
		vu_qpair->state = VFIO_USER_QPAIR_SQ_DELETED;
		vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, vu_qpair->sq.sg, &vu_qpair->sq.iov, 1);
		vu_qpair->sq.addr = NULL;
	}

out:
	return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid, sc, sct);
}

/*
 * Returns 0 on success and -errno on error.
 */
static int
consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd)
{
	assert(ctrlr != NULL);
	assert(cmd != NULL);

	if (cmd->fuse != 0) {
		/* Fused admin commands are not supported.
*/ 1367 return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid, 1368 SPDK_NVME_SC_INVALID_FIELD, 1369 SPDK_NVME_SCT_GENERIC); 1370 } 1371 1372 switch (cmd->opc) { 1373 case SPDK_NVME_OPC_CREATE_IO_CQ: 1374 case SPDK_NVME_OPC_CREATE_IO_SQ: 1375 return handle_create_io_q(ctrlr, cmd, 1376 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 1377 case SPDK_NVME_OPC_DELETE_IO_SQ: 1378 case SPDK_NVME_OPC_DELETE_IO_CQ: 1379 return handle_del_io_q(ctrlr, cmd, 1380 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 1381 default: 1382 return handle_cmd_req(ctrlr, cmd, ctrlr->qp[0]); 1383 } 1384 } 1385 1386 static int 1387 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 1388 { 1389 struct nvmf_vfio_user_qpair *vu_qpair = cb_arg; 1390 struct nvmf_vfio_user_ctrlr *vu_ctrlr = vu_qpair->ctrlr; 1391 uint16_t sqid, cqid; 1392 1393 assert(vu_qpair != NULL); 1394 assert(vu_req != NULL); 1395 assert(vu_ctrlr != NULL); 1396 1397 if (spdk_likely(vu_req->iovcnt)) { 1398 vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, vu_req->sg, vu_req->iov, vu_req->iovcnt); 1399 } 1400 sqid = vu_qpair->qpair.qid; 1401 cqid = vu_ctrlr->qp[sqid]->sq.cqid; 1402 1403 return post_completion(vu_ctrlr, &vu_ctrlr->qp[cqid]->cq, 1404 vu_req->req.rsp->nvme_cpl.cdw0, 1405 sqid, 1406 vu_req->req.cmd->nvme_cmd.cid, 1407 vu_req->req.rsp->nvme_cpl.status.sc, 1408 vu_req->req.rsp->nvme_cpl.status.sct); 1409 } 1410 1411 static int 1412 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair, 1413 struct spdk_nvme_cmd *cmd) 1414 { 1415 assert(qpair != NULL); 1416 if (nvmf_qpair_is_admin_queue(&qpair->qpair)) { 1417 return consume_admin_cmd(ctrlr, cmd); 1418 } 1419 1420 return handle_cmd_req(ctrlr, cmd, qpair); 1421 } 1422 1423 /* Returns the number of commands processed, or a negative value on error. */ 1424 static int 1425 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 1426 struct nvmf_vfio_user_qpair *qpair) 1427 { 1428 struct spdk_nvme_cmd *queue; 1429 int count = 0; 1430 1431 assert(ctrlr != NULL); 1432 assert(qpair != NULL); 1433 1434 queue = qpair->sq.addr; 1435 while (sq_head(qpair) != new_tail) { 1436 int err; 1437 struct spdk_nvme_cmd *cmd = &queue[sq_head(qpair)]; 1438 1439 count++; 1440 1441 /* 1442 * SQHD must contain the new head pointer, so we must increase 1443 * it before we generate a completion. 1444 */ 1445 sqhd_advance(ctrlr, qpair); 1446 1447 err = consume_cmd(ctrlr, qpair, cmd); 1448 if (err != 0) { 1449 return err; 1450 } 1451 } 1452 1453 return count; 1454 } 1455 1456 static int 1457 enable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1458 { 1459 int err; 1460 1461 assert(ctrlr != NULL); 1462 1463 err = acq_setup(ctrlr); 1464 if (err != 0) { 1465 return err; 1466 } 1467 1468 err = asq_setup(ctrlr); 1469 if (err != 0) { 1470 return err; 1471 } 1472 1473 return 0; 1474 } 1475 1476 static void 1477 disable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1478 { 1479 assert(ctrlr->qp[0] != NULL); 1480 1481 unmap_qp(ctrlr->qp[0]); 1482 } 1483 1484 static void 1485 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1486 { 1487 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1488 struct nvmf_vfio_user_ctrlr *ctrlr; 1489 struct nvmf_vfio_user_qpair *qpair; 1490 int ret; 1491 1492 /* 1493 * We're not interested in any DMA regions that aren't mappable (we don't 1494 * support clients that don't share their memory). 
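	 * libvfio-user indicates this by passing info->vaddr == NULL when the
	 * client did not provide a file descriptor we could mmap().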
	 */
	if (!info->vaddr) {
		return;
	}

	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
	    (info->mapping.iov_len & MASK_2MB)) {
		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
			      (uintptr_t)info->mapping.iov_base,
			      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
		return;
	}

	assert(endpoint != NULL);
	if (endpoint->ctrlr == NULL) {
		return;
	}
	ctrlr = endpoint->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %#lx-%#lx\n", ctrlr_id(ctrlr),
		      (uintptr_t)info->mapping.iov_base,
		      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);

	/* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when
	 * registering to VFIO; here we also check the protection bits before
	 * registering.
	 */
	if (info->prot == (PROT_WRITE | PROT_READ)) {
		ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len);
		if (ret) {
			SPDK_ERRLOG("Memory region register %#lx-%#lx failed, ret=%d\n",
				    (uint64_t)(uintptr_t)info->mapping.iov_base,
				    (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len,
				    ret);
		}
	}

	pthread_mutex_lock(&endpoint->lock);
	TAILQ_FOREACH(qpair, &ctrlr->connected_qps, tailq) {
		if (qpair->state != VFIO_USER_QPAIR_INACTIVE) {
			continue;
		}

		ret = remap_qp(qpair);
		if (ret) {
			continue;
		}
		qpair->state = VFIO_USER_QPAIR_ACTIVE;
		SPDK_DEBUGLOG(nvmf_vfio, "Remapped QP %u successfully\n", qpair->qpair.qid);
	}
	pthread_mutex_unlock(&endpoint->lock);
}

static int
memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct nvmf_vfio_user_qpair *qpair;
	void *map_start, *map_end;
	int ret = 0;

	if (!info->vaddr) {
		return 0;
	}

	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
	    (info->mapping.iov_len & MASK_2MB)) {
		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
			      (uintptr_t)info->mapping.iov_base,
			      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
		return 0;
	}

	assert(endpoint != NULL);
	if (endpoint->ctrlr == NULL) {
		return 0;
	}
	ctrlr = endpoint->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %#lx-%#lx\n", ctrlr_id(ctrlr),
		      (uintptr_t)info->mapping.iov_base,
		      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);

	map_start = info->mapping.iov_base;
	map_end = info->mapping.iov_base + info->mapping.iov_len;

	pthread_mutex_lock(&endpoint->lock);
	TAILQ_FOREACH(qpair, &ctrlr->connected_qps, tailq) {
		if ((qpair->cq.addr >= map_start && qpair->cq.addr <= map_end) ||
		    (qpair->sq.addr >= map_start && qpair->sq.addr <= map_end)) {
			/* TODO: Ideally we should disconnect this queue pair
			 * before returning to caller.
1587 */ 1588 unmap_qp(qpair); 1589 qpair->state = VFIO_USER_QPAIR_INACTIVE; 1590 } 1591 } 1592 pthread_mutex_unlock(&endpoint->lock); 1593 1594 if (info->prot == (PROT_WRITE | PROT_READ)) { 1595 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 1596 if (ret) { 1597 SPDK_ERRLOG("Memory region unregister %#lx-%#lx failed, ret=%d\n", 1598 (uint64_t)(uintptr_t)info->mapping.iov_base, 1599 (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len, 1600 ret); 1601 } 1602 } 1603 1604 return 0; 1605 } 1606 1607 static int 1608 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 1609 { 1610 struct nvmf_vfio_user_qpair *vu_qpair = cb_arg; 1611 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 1612 bool disable_admin = false; 1613 int ret; 1614 1615 assert(vu_qpair != NULL); 1616 assert(req != NULL); 1617 1618 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 1619 assert(vu_qpair->ctrlr != NULL); 1620 assert(req != NULL); 1621 1622 memcpy(req->req.data, 1623 &req->req.rsp->prop_get_rsp.value.u64, 1624 req->req.length); 1625 } else { 1626 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 1627 assert(vu_qpair->ctrlr != NULL); 1628 vu_ctrlr = vu_qpair->ctrlr; 1629 1630 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 1631 union spdk_nvme_cc_register cc, diff; 1632 1633 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 1634 diff.raw = cc.raw ^ req->cc.raw; 1635 1636 if (diff.bits.en) { 1637 if (cc.bits.en) { 1638 SPDK_DEBUGLOG(nvmf_vfio, "%s: MAP Admin queue\n", ctrlr_id(vu_ctrlr)); 1639 ret = enable_admin_queue(vu_ctrlr); 1640 if (ret) { 1641 SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(vu_ctrlr)); 1642 return ret; 1643 } 1644 vu_qpair->state = VFIO_USER_QPAIR_ACTIVE; 1645 } else { 1646 disable_admin = true; 1647 } 1648 } 1649 1650 if (diff.bits.shn) { 1651 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 1652 disable_admin = true; 1653 } 1654 } 1655 1656 if (disable_admin) { 1657 SPDK_DEBUGLOG(nvmf_vfio, 1658 "%s: UNMAP Admin queue\n", 1659 ctrlr_id(vu_ctrlr)); 1660 vu_qpair->state = VFIO_USER_QPAIR_INACTIVE; 1661 disable_admin_queue(vu_ctrlr); 1662 /* For PCIe controller reset or shutdown, we will drop all AER responses */ 1663 nvmf_ctrlr_abort_aer(vu_qpair->qpair.ctrlr); 1664 } 1665 } 1666 } 1667 1668 return 0; 1669 } 1670 1671 /* 1672 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 1673 * doorbell is written via access_bar0_fn(). 1674 * 1675 * DSTRD is set to fixed value 0 for NVMf. 
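 *
 * With DSTRD = 0 the standard doorbell layout applies: the SQ y tail doorbell
 * lives at BAR0 offset 0x1000 + (2 * y) * 4 and the CQ y head doorbell at
 * 0x1000 + (2 * y + 1) * 4. For example, a 4-byte write at offset 0x1008 is
 * the SQ1 tail doorbell and ends up in ctrlr->doorbells[2] below.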
 *
 */
static int
handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf,
		  const size_t count, loff_t pos, const bool is_write)
{
	assert(ctrlr != NULL);
	assert(buf != NULL);

	if (count != sizeof(uint32_t)) {
		SPDK_ERRLOG("%s: bad doorbell buffer size %zu\n",
			    ctrlr_id(ctrlr), count);
		errno = EINVAL;
		return -1;
	}

	pos -= NVME_DOORBELLS_OFFSET;

	/* pos must be dword aligned */
	if ((pos & 0x3) != 0) {
		SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos);
		errno = EINVAL;
		return -1;
	}

	/* convert byte offset to array index */
	pos >>= 2;

	if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) {
		SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos);
		errno = EINVAL;
		return -1;
	}

	if (is_write) {
		ctrlr->doorbells[pos] = *buf;
		spdk_wmb();
	} else {
		spdk_rmb();
		*buf = ctrlr->doorbells[pos];
	}
	return 0;
}

static ssize_t
access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos,
	       bool is_write)
{
	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct nvmf_vfio_user_req *req;
	const struct spdk_nvmf_registers *regs;
	int ret;

	ctrlr = endpoint->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n",
		      endpoint_id(endpoint), is_write ? "write" : "read",
		      ctrlr, count, pos);

	if (pos >= NVME_DOORBELLS_OFFSET) {
		/*
		 * The fact that the doorbells can be memory mapped doesn't mean
		 * that the client (VFIO in QEMU) is obliged to memory map them;
		 * it might still elect to access them via regular read/write;
		 * we might also have had disable_mappable_bar0 set.
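		 * Either way the write lands in ctrlr->doorbells[], the same
		 * memory the mappable path exposes via the sparse mmap region,
		 * so the polling side sees a consistent view.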
1743 */ 1744 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 1745 pos, is_write); 1746 if (ret == 0) { 1747 return count; 1748 } 1749 return ret; 1750 } 1751 1752 /* Construct a Fabric Property Get/Set command and send it */ 1753 req = get_nvmf_vfio_user_req(ctrlr->qp[0]); 1754 if (req == NULL) { 1755 errno = ENOBUFS; 1756 return -1; 1757 } 1758 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr); 1759 req->cc.raw = regs->cc.raw; 1760 1761 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 1762 req->cb_arg = ctrlr->qp[0]; 1763 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 1764 req->req.cmd->prop_set_cmd.cid = 0; 1765 req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1; 1766 req->req.cmd->prop_set_cmd.ofst = pos; 1767 if (is_write) { 1768 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 1769 if (req->req.cmd->prop_set_cmd.attrib.size) { 1770 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 1771 } else { 1772 req->req.cmd->prop_set_cmd.value.u32.high = 0; 1773 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 1774 } 1775 } else { 1776 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 1777 } 1778 req->req.length = count; 1779 req->req.data = buf; 1780 1781 spdk_nvmf_request_exec_fabrics(&req->req); 1782 1783 return count; 1784 } 1785 1786 static ssize_t 1787 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 1788 bool is_write) 1789 { 1790 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1791 1792 if (is_write) { 1793 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 1794 endpoint_id(endpoint), offset, offset + count); 1795 errno = EINVAL; 1796 return -1; 1797 } 1798 1799 if (offset + count > NVME_REG_CFG_SIZE) { 1800 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 1801 endpoint_id(endpoint), offset, count, 1802 NVME_REG_CFG_SIZE); 1803 errno = ERANGE; 1804 return -1; 1805 } 1806 1807 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 1808 1809 return count; 1810 } 1811 1812 static void 1813 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 1814 { 1815 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1816 1817 if (level >= LOG_DEBUG) { 1818 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 1819 } else if (level >= LOG_INFO) { 1820 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 1821 } else if (level >= LOG_NOTICE) { 1822 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 1823 } else if (level >= LOG_WARNING) { 1824 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 1825 } else { 1826 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 1827 } 1828 } 1829 1830 static int 1831 vfio_user_get_log_level(void) 1832 { 1833 int level; 1834 1835 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 1836 return LOG_DEBUG; 1837 } 1838 1839 level = spdk_log_to_syslog_level(spdk_log_get_level()); 1840 if (level < 0) { 1841 return LOG_ERR; 1842 } 1843 1844 return level; 1845 } 1846 1847 static void 1848 init_pci_config_space(vfu_pci_config_space_t *p) 1849 { 1850 /* MLBAR */ 1851 p->hdr.bars[0].raw = 0x0; 1852 /* MUBAR */ 1853 p->hdr.bars[1].raw = 0x0; 1854 1855 /* vendor specific, let's set them to zero for now */ 1856 p->hdr.bars[3].raw = 0x0; 1857 p->hdr.bars[4].raw = 0x0; 1858 p->hdr.bars[5].raw = 0x0; 1859 1860 /* enable INTx */ 1861 p->hdr.intr.ipin = 0x1; 1862 } 1863 1864 static int 1865 vfio_user_dev_info_fill(struct 
nvmf_vfio_user_transport *vu_transport,
			struct nvmf_vfio_user_endpoint *endpoint)
{
	int ret;
	ssize_t cap_offset;
	vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx;

	struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 };
	struct pxcap pxcap = {
		.hdr.id = PCI_CAP_ID_EXP,
		.pxcaps.ver = 0x2,
		.pxdcap = {.rer = 0x1, .flrc = 0x1},
		.pxdcap2.ctds = 0x1
	};

	struct msixcap msixcap = {
		.hdr.id = PCI_CAP_ID_MSIX,
		.mxc.ts = NVME_IRQ_MSIX_NUM - 1,
		.mtab = {.tbir = 0x4, .to = 0x0},
		.mpba = {.pbir = 0x5, .pbao = 0x0}
	};

	struct iovec sparse_mmap[] = {
		{
			.iov_base = (void *)NVME_DOORBELLS_OFFSET,
			.iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE,
		},
	};

	ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx);
		return ret;
	}
	vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0);
	/*
	 * 0x02, controller uses the NVM Express programming interface
	 * 0x08, non-volatile memory controller
	 * 0x01, mass storage controller
	 */
	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx);
		return ret;
	}

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx);
		return ret;
	}

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
		return ret;
	}

	if (vu_transport->transport_opts.disable_mappable_bar0) {
		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
				       NULL, 0, -1, 0);
	} else {
		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
				       sparse_mmap, 1, endpoint->devmem_fd, 0);
	}

	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE,
			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE,
			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
		return ret;
	}

	ret =
vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 1974 if (ret < 0) { 1975 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 1976 return ret; 1977 } 1978 1979 ret = vfu_realize_ctx(vfu_ctx); 1980 if (ret < 0) { 1981 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 1982 return ret; 1983 } 1984 1985 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 1986 assert(endpoint->pci_config_space != NULL); 1987 init_pci_config_space(endpoint->pci_config_space); 1988 1989 assert(cap_offset != 0); 1990 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 1991 1992 return 0; 1993 } 1994 1995 static void 1996 _free_ctrlr(void *ctx) 1997 { 1998 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 1999 2000 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 2001 free(ctrlr); 2002 } 2003 2004 static void 2005 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr, bool free_qps) 2006 { 2007 int i; 2008 assert(ctrlr != NULL); 2009 2010 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 2011 2012 if (free_qps) { 2013 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 2014 free_qp(ctrlr, i); 2015 } 2016 } 2017 2018 if (ctrlr->thread == spdk_get_thread()) { 2019 _free_ctrlr(ctrlr); 2020 } else { 2021 spdk_thread_send_msg(ctrlr->thread, _free_ctrlr, ctrlr); 2022 } 2023 } 2024 2025 static void 2026 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 2027 struct nvmf_vfio_user_endpoint *endpoint) 2028 { 2029 struct nvmf_vfio_user_ctrlr *ctrlr; 2030 int err = 0; 2031 2032 /* First, construct a vfio-user CUSTOM transport controller */ 2033 ctrlr = calloc(1, sizeof(*ctrlr)); 2034 if (ctrlr == NULL) { 2035 err = -ENOMEM; 2036 goto out; 2037 } 2038 /* We can only support one connection for now */ 2039 ctrlr->cntlid = 0x1; 2040 ctrlr->transport = transport; 2041 ctrlr->endpoint = endpoint; 2042 ctrlr->doorbells = endpoint->doorbells; 2043 TAILQ_INIT(&ctrlr->connected_qps); 2044 2045 /* Then, construct an admin queue pair */ 2046 err = init_qp(ctrlr, &transport->transport, NVMF_VFIO_USER_DEFAULT_AQ_DEPTH, 0); 2047 if (err != 0) { 2048 free(ctrlr); 2049 goto out; 2050 } 2051 endpoint->ctrlr = ctrlr; 2052 2053 /* Notify the generic layer about the new admin queue pair */ 2054 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->qp[0]->qpair); 2055 2056 out: 2057 if (err != 0) { 2058 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 2059 endpoint_id(endpoint), strerror(-err)); 2060 } 2061 } 2062 2063 static int 2064 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 2065 const struct spdk_nvme_transport_id *trid, 2066 struct spdk_nvmf_listen_opts *listen_opts) 2067 { 2068 struct nvmf_vfio_user_transport *vu_transport; 2069 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 2070 char path[PATH_MAX] = {}; 2071 char uuid[PATH_MAX] = {}; 2072 int ret; 2073 2074 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 2075 transport); 2076 2077 pthread_mutex_lock(&vu_transport->lock); 2078 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 2079 /* Only compare traddr */ 2080 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 2081 pthread_mutex_unlock(&vu_transport->lock); 2082 return -EEXIST; 2083 } 2084 } 2085 pthread_mutex_unlock(&vu_transport->lock); 2086 2087 endpoint = calloc(1, sizeof(*endpoint)); 2088 if (!endpoint) { 2089 return -ENOMEM; 2090 } 2091 2092 pthread_mutex_init(&endpoint->lock, NULL); 2093 
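/* The doorbell area of BAR0 is backed by a per-endpoint "bar0" file so that,
 * unless disable_mappable_bar0 is set, it can be exposed to the client as a
 * sparse mmap region and doorbell writes do not have to travel through
 * vfio-user region-access messages.
 */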
endpoint->devmem_fd = -1; 2094 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 2095 2096 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 2097 if (ret < 0 || ret >= PATH_MAX) { 2098 SPDK_ERRLOG("%s: failed to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 2099 ret = -1; 2100 goto out; 2101 } 2102 2103 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 2104 if (ret == -1) { 2105 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 2106 endpoint_id(endpoint), path, spdk_strerror(errno)); 2107 goto out; 2108 } 2109 2110 endpoint->devmem_fd = ret; 2111 ret = ftruncate(endpoint->devmem_fd, 2112 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 2113 if (ret != 0) { 2114 SPDK_ERRLOG("%s: failed to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 2115 spdk_strerror(errno)); 2116 goto out; 2117 } 2118 2119 endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 2120 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 2121 if (endpoint->doorbells == MAP_FAILED) { 2122 SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 2123 endpoint->doorbells = NULL; 2124 ret = -1; 2125 goto out; 2126 } 2127 2128 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 2129 if (ret < 0 || ret >= PATH_MAX) { 2130 SPDK_ERRLOG("%s: failed to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 2131 ret = -1; 2132 goto out; 2133 } 2134 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 2135 endpoint, VFU_DEV_TYPE_PCI); 2136 if (endpoint->vfu_ctx == NULL) { 2137 SPDK_ERRLOG("%s: failed to create libvfio-user context: %m\n", 2138 endpoint_id(endpoint)); 2139 ret = -1; 2140 goto out; 2141 } 2142 vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, vfio_user_get_log_level()); 2143 2144 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 2145 if (ret < 0) { 2146 goto out; 2147 } 2148 2149 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 2150 SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells); 2151 2152 out: 2153 if (ret != 0) { 2154 nvmf_vfio_user_destroy_endpoint(endpoint); 2155 } 2156 2157 return ret; 2158 } 2159 2160 static void 2161 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 2162 const struct spdk_nvme_transport_id *trid) 2163 { 2164 struct nvmf_vfio_user_transport *vu_transport; 2165 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 2166 2167 assert(trid != NULL); 2168 assert(trid->traddr != NULL); 2169 2170 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 2171 2172 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 2173 transport); 2174 2175 pthread_mutex_lock(&vu_transport->lock); 2176 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 2177 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 2178 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 2179 if (endpoint->ctrlr) { 2180 /* Users may kill the NVMeoF target while the VM 2181 * is still connected; free all resources.
2182 */ 2183 free_ctrlr(endpoint->ctrlr, true); 2184 } 2185 nvmf_vfio_user_destroy_endpoint(endpoint); 2186 pthread_mutex_unlock(&vu_transport->lock); 2187 2188 return; 2189 } 2190 } 2191 pthread_mutex_unlock(&vu_transport->lock); 2192 2193 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 2194 } 2195 2196 static void 2197 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 2198 struct spdk_nvmf_subsystem *subsystem, 2199 struct spdk_nvmf_ctrlr_data *cdata) 2200 { 2201 cdata->vid = SPDK_PCI_VID_NUTANIX; 2202 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 2203 cdata->ieee[0] = 0x8d; 2204 cdata->ieee[1] = 0x6b; 2205 cdata->ieee[2] = 0x50; 2206 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 2207 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 2208 /* libvfio-user can only support 1 connection for now */ 2209 cdata->oncs.reservations = 0; 2210 } 2211 2212 static int 2213 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 2214 const struct spdk_nvmf_subsystem *subsystem, 2215 const struct spdk_nvme_transport_id *trid) 2216 { 2217 struct nvmf_vfio_user_transport *vu_transport; 2218 struct nvmf_vfio_user_endpoint *endpoint; 2219 2220 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 2221 2222 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 2223 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 2224 break; 2225 } 2226 } 2227 2228 if (endpoint == NULL) { 2229 return -ENOENT; 2230 } 2231 2232 endpoint->subsystem = subsystem; 2233 2234 return 0; 2235 } 2236 2237 /* 2238 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 2239 * frequency. 2240 * 2241 * For each transport endpoint (which at the libvfio-user level corresponds to 2242 * a socket), if we don't currently have a controller set up, peek to see if the 2243 * socket is able to accept a new connection. 2244 * 2245 * This poller also takes care of handling the creation of any pending new 2246 * qpairs. 2247 */ 2248 static int 2249 nvmf_vfio_user_accept(void *ctx) 2250 { 2251 struct spdk_nvmf_transport *transport = ctx; 2252 struct nvmf_vfio_user_transport *vu_transport; 2253 struct nvmf_vfio_user_endpoint *endpoint; 2254 uint32_t count = 0; 2255 int err; 2256 2257 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 2258 transport); 2259 2260 pthread_mutex_lock(&vu_transport->lock); 2261 2262 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 2263 if (endpoint->ctrlr != NULL) { 2264 continue; 2265 } 2266 2267 err = vfu_attach_ctx(endpoint->vfu_ctx); 2268 if (err != 0) { 2269 if (errno == EAGAIN || errno == EWOULDBLOCK) { 2270 continue; 2271 } 2272 2273 pthread_mutex_unlock(&vu_transport->lock); 2274 return SPDK_POLLER_BUSY; 2275 } 2276 2277 count++; 2278 2279 /* Construct a controller */ 2280 nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 2281 } 2282 2283 pthread_mutex_unlock(&vu_transport->lock); 2284 2285 return count > 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 2286 } 2287 2288 static void 2289 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 2290 struct spdk_nvme_transport_id *trid, 2291 struct spdk_nvmf_discovery_log_page_entry *entry) 2292 { } 2293 2294 static struct spdk_nvmf_transport_poll_group * 2295 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport) 2296 { 2297 struct nvmf_vfio_user_poll_group *vu_group; 2298 2299 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 2300 2301 vu_group = calloc(1, sizeof(*vu_group)); 2302 if (vu_group == NULL) { 2303 SPDK_ERRLOG("Error allocating poll group: %m\n"); 2304 return NULL; 2305 } 2306 2307 TAILQ_INIT(&vu_group->qps); 2308 2309 return &vu_group->group; 2310 } 2311 2312 /* called when process exits */ 2313 static void 2314 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 2315 { 2316 struct nvmf_vfio_user_poll_group *vu_group; 2317 2318 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 2319 2320 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 2321 2322 free(vu_group); 2323 } 2324 2325 static void 2326 vfio_user_qpair_disconnect_cb(void *ctx) 2327 { 2328 struct nvmf_vfio_user_endpoint *endpoint = ctx; 2329 struct nvmf_vfio_user_ctrlr *ctrlr; 2330 2331 pthread_mutex_lock(&endpoint->lock); 2332 ctrlr = endpoint->ctrlr; 2333 if (!ctrlr) { 2334 pthread_mutex_unlock(&endpoint->lock); 2335 return; 2336 } 2337 2338 if (TAILQ_EMPTY(&ctrlr->connected_qps)) { 2339 endpoint->ctrlr = NULL; 2340 free_ctrlr(ctrlr, false); 2341 } 2342 pthread_mutex_unlock(&endpoint->lock); 2343 } 2344 2345 static void 2346 _vfio_user_qpair_disconnect(void *ctx) 2347 { 2348 struct nvmf_vfio_user_qpair *vu_qpair = ctx; 2349 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2350 struct nvmf_vfio_user_endpoint *endpoint; 2351 2352 vu_ctrlr = vu_qpair->ctrlr; 2353 endpoint = vu_ctrlr->endpoint; 2354 2355 spdk_nvmf_qpair_disconnect(&vu_qpair->qpair, vfio_user_qpair_disconnect_cb, endpoint); 2356 } 2357 2358 static int 2359 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 2360 { 2361 struct nvmf_vfio_user_qpair *qpair; 2362 struct nvmf_vfio_user_endpoint *endpoint; 2363 2364 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 2365 2366 endpoint = ctrlr->endpoint; 2367 assert(endpoint != NULL); 2368 2369 pthread_mutex_lock(&endpoint->lock); 2370 if (TAILQ_EMPTY(&ctrlr->connected_qps)) { 2371 endpoint->ctrlr = NULL; 2372 free_ctrlr(ctrlr, false); 2373 pthread_mutex_unlock(&endpoint->lock); 2374 return 0; 2375 } 2376 2377 TAILQ_FOREACH(qpair, &ctrlr->connected_qps, tailq) { 2378 /* disconnect in a deferred message so the endpoint lock is not taken recursively */ 2379 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, qpair); 2380 } 2381 pthread_mutex_unlock(&endpoint->lock); 2382 2383 return 0; 2384 } 2385 2386 /* 2387 * Poll for and process any incoming vfio-user messages.
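 *
 * If the client disconnects (vfu_run_ctx() fails with ENOTCONN), the
 * controller is torn down and the endpoint goes back to waiting for a new
 * connection; any other failure marks the controller as failed (CSTS.CFS).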
2388 */ 2389 static int 2390 vfio_user_poll_vfu_ctx(void *ctx) 2391 { 2392 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 2393 int ret; 2394 2395 assert(ctrlr != NULL); 2396 2397 /* This will call access_bar0_fn() if there are any writes 2398 * to the portion of the BAR that is not mmap'd */ 2399 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 2400 if (spdk_unlikely(ret == -1)) { 2401 if (errno == EBUSY) { 2402 return SPDK_POLLER_BUSY; 2403 } 2404 2405 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 2406 2407 /* initiator shutdown or reset, waiting for another re-connect */ 2408 if (errno == ENOTCONN) { 2409 vfio_user_destroy_ctrlr(ctrlr); 2410 return SPDK_POLLER_BUSY; 2411 } 2412 2413 fail_ctrlr(ctrlr); 2414 } 2415 2416 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 2417 } 2418 2419 static int 2420 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2421 { 2422 struct nvmf_vfio_user_poll_group *vu_group; 2423 struct nvmf_vfio_user_qpair *vu_qpair = cb_arg; 2424 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2425 struct nvmf_vfio_user_endpoint *endpoint; 2426 2427 assert(vu_qpair != NULL); 2428 assert(req != NULL); 2429 2430 vu_ctrlr = vu_qpair->ctrlr; 2431 assert(vu_ctrlr != NULL); 2432 endpoint = vu_ctrlr->endpoint; 2433 assert(endpoint != NULL); 2434 2435 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 2436 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 2437 endpoint->ctrlr = NULL; 2438 free_ctrlr(vu_ctrlr, true); 2439 return -1; 2440 } 2441 2442 vu_group = SPDK_CONTAINEROF(vu_qpair->group, struct nvmf_vfio_user_poll_group, group); 2443 TAILQ_INSERT_TAIL(&vu_group->qps, vu_qpair, link); 2444 vu_qpair->state = VFIO_USER_QPAIR_ACTIVE; 2445 2446 pthread_mutex_lock(&endpoint->lock); 2447 if (nvmf_qpair_is_admin_queue(&vu_qpair->qpair)) { 2448 vu_ctrlr->cntlid = vu_qpair->qpair.ctrlr->cntlid; 2449 vu_ctrlr->thread = spdk_get_thread(); 2450 vu_ctrlr->ctrlr = vu_qpair->qpair.ctrlr; 2451 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, vu_ctrlr, 0); 2452 } else { 2453 /* For I/O queues this command was generated in response to an 2454 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 2455 * been completed. Complete it now. 2456 */ 2457 post_completion(vu_ctrlr, &vu_ctrlr->qp[0]->cq, 0, 0, 2458 vu_qpair->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2459 } 2460 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_qps, vu_qpair, tailq); 2461 pthread_mutex_unlock(&endpoint->lock); 2462 2463 free(req->req.data); 2464 req->req.data = NULL; 2465 2466 return 0; 2467 } 2468 2469 /* 2470 * Add the given qpair to the given poll group. New qpairs are added via 2471 * spdk_nvmf_tgt_new_qpair(), which picks a poll group, then calls back 2472 * here via nvmf_transport_poll_group_add(). 
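 *
 * vfio-user guests do not issue a Fabric CONNECT command themselves, so this
 * function fabricates one on the guest's behalf and executes it; the response
 * is handled in handle_queue_connect_rsp().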
2473 */ 2474 static int 2475 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 2476 struct spdk_nvmf_qpair *qpair) 2477 { 2478 struct nvmf_vfio_user_qpair *vu_qpair; 2479 struct nvmf_vfio_user_req *vu_req; 2480 struct nvmf_vfio_user_ctrlr *ctrlr; 2481 struct spdk_nvmf_request *req; 2482 struct spdk_nvmf_fabric_connect_data *data; 2483 bool admin; 2484 2485 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2486 vu_qpair->group = group; 2487 ctrlr = vu_qpair->ctrlr; 2488 2489 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 2490 ctrlr_id(ctrlr), vu_qpair->qpair.qid, 2491 vu_qpair, qpair, group); 2492 2493 admin = nvmf_qpair_is_admin_queue(&vu_qpair->qpair); 2494 2495 vu_req = get_nvmf_vfio_user_req(vu_qpair); 2496 if (vu_req == NULL) { 2497 return -1; 2498 } 2499 2500 req = &vu_req->req; 2501 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2502 req->cmd->connect_cmd.cid = 0; 2503 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 2504 req->cmd->connect_cmd.recfmt = 0; 2505 req->cmd->connect_cmd.sqsize = vu_qpair->qsize - 1; 2506 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; 2507 2508 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 2509 req->data = calloc(1, req->length); 2510 if (req->data == NULL) { 2511 nvmf_vfio_user_req_free(req); 2512 return -ENOMEM; 2513 } 2514 2515 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 2516 data->cntlid = ctrlr->cntlid; 2517 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 2518 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 2519 2520 vu_req->cb_fn = handle_queue_connect_rsp; 2521 vu_req->cb_arg = vu_qpair; 2522 2523 SPDK_DEBUGLOG(nvmf_vfio, 2524 "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n", 2525 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 2526 2527 spdk_nvmf_request_exec_fabrics(req); 2528 return 0; 2529 } 2530 2531 static int 2532 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 2533 struct spdk_nvmf_qpair *qpair) 2534 { 2535 struct nvmf_vfio_user_qpair *vu_qpair; 2536 struct nvmf_vfio_user_poll_group *vu_group; 2537 2538 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2539 2540 SPDK_DEBUGLOG(nvmf_vfio, 2541 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 2542 ctrlr_id(vu_qpair->ctrlr), qpair->qid, qpair, group); 2543 2544 2545 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 2546 TAILQ_REMOVE(&vu_group->qps, vu_qpair, link); 2547 2548 return 0; 2549 } 2550 2551 static void 2552 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_qpair *vu_qpair, struct nvmf_vfio_user_req *vu_req) 2553 { 2554 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 2555 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 2556 vu_req->iovcnt = 0; 2557 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 2558 2559 TAILQ_INSERT_TAIL(&vu_qpair->reqs, vu_req, link); 2560 } 2561 2562 static int 2563 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 2564 { 2565 struct nvmf_vfio_user_qpair *vu_qpair; 2566 struct nvmf_vfio_user_req *vu_req; 2567 2568 assert(req != NULL); 2569 2570 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 2571 vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair); 2572 2573 _nvmf_vfio_user_req_free(vu_qpair, vu_req); 2574 2575 return 0; 2576 } 2577 2578 static int 2579 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 2580 { 2581 struct nvmf_vfio_user_qpair *vu_qpair; 2582 struct nvmf_vfio_user_req *vu_req; 
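/* Invoked by the NVMf layer once request processing completes: run the
 * transport-level callback (e.g. handle_cmd_rsp(), which posts the completion
 * into the guest CQ) and then return the request to the qpair's free list.
 */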
2583 2584 assert(req != NULL); 2585 2586 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 2587 vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair); 2588 2589 if (vu_req->cb_fn != NULL) { 2590 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 2591 fail_ctrlr(vu_qpair->ctrlr); 2592 } 2593 } 2594 2595 _nvmf_vfio_user_req_free(vu_qpair, vu_req); 2596 2597 return 0; 2598 } 2599 2600 static void 2601 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 2602 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 2603 { 2604 struct nvmf_vfio_user_qpair *vu_qpair; 2605 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2606 2607 assert(qpair != NULL); 2608 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2609 vu_ctrlr = vu_qpair->ctrlr; 2610 2611 pthread_mutex_lock(&vu_ctrlr->endpoint->lock); 2612 TAILQ_REMOVE(&vu_ctrlr->connected_qps, vu_qpair, tailq); 2613 pthread_mutex_unlock(&vu_ctrlr->endpoint->lock); 2614 2615 free_qp(vu_ctrlr, qpair->qid); 2616 2617 if (cb_fn) { 2618 cb_fn(cb_arg); 2619 } 2620 } 2621 2622 /** 2623 * Returns a preallocated spdk_nvmf_request or NULL if there isn't one available. 2624 */ 2625 static struct nvmf_vfio_user_req * 2626 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair) 2627 { 2628 struct nvmf_vfio_user_req *req; 2629 2630 assert(qpair != NULL); 2631 2632 if (TAILQ_EMPTY(&qpair->reqs)) { 2633 return NULL; 2634 } 2635 2636 req = TAILQ_FIRST(&qpair->reqs); 2637 TAILQ_REMOVE(&qpair->reqs, req, link); 2638 2639 return req; 2640 } 2641 2642 static int 2643 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 2644 { 2645 uint16_t nr; 2646 uint32_t nlb, nsid; 2647 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 2648 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 2649 struct spdk_nvmf_ns *ns; 2650 2651 nsid = cmd->nsid; 2652 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 2653 if (ns == NULL || ns->bdev == NULL) { 2654 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 2655 return -EINVAL; 2656 } 2657 2658 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 2659 nr = cmd->cdw10_bits.dsm.nr + 1; 2660 return nr * sizeof(struct spdk_nvme_dsm_range); 2661 } 2662 2663 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 2664 return nlb * spdk_bdev_get_block_size(ns->bdev); 2665 } 2666 2667 static int 2668 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 2669 { 2670 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 2671 uint32_t len = 0; 2672 uint8_t fid; 2673 int iovcnt; 2674 2675 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 2676 req->length = 0; 2677 req->data = NULL; 2678 2679 if (req->xfer == SPDK_NVME_DATA_NONE) { 2680 return 0; 2681 } 2682 2683 switch (cmd->opc) { 2684 case SPDK_NVME_OPC_IDENTIFY: 2685 len = 4096; 2686 break; 2687 case SPDK_NVME_OPC_GET_LOG_PAGE: 2688 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 2689 break; 2690 case SPDK_NVME_OPC_GET_FEATURES: 2691 case SPDK_NVME_OPC_SET_FEATURES: 2692 fid = cmd->cdw10_bits.set_features.fid; 2693 switch (fid) { 2694 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 2695 len = 4096; 2696 break; 2697 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 2698 len = 256; 2699 break; 2700 case SPDK_NVME_FEAT_TIMESTAMP: 2701 len = 8; 2702 break; 2703 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 2704 len = 512; 2705 break; 2706 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 2707 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 2708 len = 16; 2709 } else { 2710 len = 8; 2711 } 2712 break; 
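/* The remaining features are completed without mapping a guest data buffer. */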
2713 default: 2714 return 0; 2715 } 2716 break; 2717 default: 2718 return 0; 2719 } 2720 2721 /* Admin commands are expected to use PRPs, not SGLs */ 2722 if (cmd->psdt != 0) { 2723 return -EINVAL; 2724 } 2725 2726 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 2727 if (iovcnt < 0) { 2728 SPDK_ERRLOG("%s: failed to map Admin opc %x\n", 2729 ctrlr_id(ctrlr), cmd->opc); 2730 return -1; 2731 } 2732 req->length = len; 2733 req->data = req->iov[0].iov_base; 2734 req->iovcnt = iovcnt; 2735 2736 return 0; 2737 } 2738 2739 /* 2740 * Map an I/O command's buffers. 2741 * 2742 * Returns 0 on success and -errno on failure. 2743 */ 2744 static int 2745 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 2746 { 2747 int len, iovcnt; 2748 struct spdk_nvme_cmd *cmd; 2749 2750 assert(ctrlr != NULL); 2751 assert(req != NULL); 2752 2753 cmd = &req->cmd->nvme_cmd; 2754 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 2755 req->length = 0; 2756 req->data = NULL; 2757 2758 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 2759 return 0; 2760 } 2761 2762 len = get_nvmf_io_req_length(req); 2763 if (len < 0) { 2764 return -EINVAL; 2765 } 2766 req->length = len; 2767 2768 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 2769 if (iovcnt < 0) { 2770 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 2771 return -EFAULT; 2772 } 2773 req->data = req->iov[0].iov_base; 2774 req->iovcnt = iovcnt; 2775 2776 return 0; 2777 } 2778 2779 static int 2780 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 2781 struct nvmf_vfio_user_qpair *vu_qpair) 2782 { 2783 int err; 2784 struct nvmf_vfio_user_req *vu_req; 2785 struct spdk_nvmf_request *req; 2786 2787 assert(ctrlr != NULL); 2788 assert(cmd != NULL); 2789 2790 vu_req = get_nvmf_vfio_user_req(vu_qpair); 2791 if (spdk_unlikely(vu_req == NULL)) { 2792 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 2793 return post_completion(ctrlr, &vu_qpair->cq, 0, 0, cmd->cid, 2794 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 2795 2796 } 2797 req = &vu_req->req; 2798 2799 assert(req->qpair != NULL); 2800 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle qid%u, req opc=%#x cid=%d\n", 2801 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 2802 2803 vu_req->cb_fn = handle_cmd_rsp; 2804 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair); 2805 req->cmd->nvme_cmd = *cmd; 2806 2807 if (nvmf_qpair_is_admin_queue(req->qpair)) { 2808 err = map_admin_cmd_req(ctrlr, req); 2809 } else { 2810 switch (cmd->opc) { 2811 case SPDK_NVME_OPC_RESERVATION_REGISTER: 2812 case SPDK_NVME_OPC_RESERVATION_REPORT: 2813 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 2814 case SPDK_NVME_OPC_RESERVATION_RELEASE: 2815 err = -ENOTSUP; 2816 break; 2817 default: 2818 err = map_io_cmd_req(ctrlr, req); 2819 break; 2820 } 2821 } 2822 2823 if (spdk_unlikely(err < 0)) { 2824 SPDK_ERRLOG("%s: failed to process NVMe command opc 0x%x\n", 2825 ctrlr_id(ctrlr), cmd->opc); 2826 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2827 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 2828 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 2829 _nvmf_vfio_user_req_free(vu_qpair, vu_req); 2830 return err; 2831 } 2832 2833 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 2834 spdk_nvmf_request_exec(req); 2835 2836 return 0; 2837 } 2838 2839 /* Returns the number of commands processed, or a negative value on error.
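 *
 * Called for each active qpair by the poll group poller: read the SQ tail
 * doorbell and, if it has moved, consume the newly submitted entries via
 * handle_sq_tdbl_write().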
*/ 2840 static int 2841 nvmf_vfio_user_qpair_poll(struct nvmf_vfio_user_qpair *qpair) 2842 { 2843 struct nvmf_vfio_user_ctrlr *ctrlr; 2844 uint32_t new_tail; 2845 int count = 0; 2846 2847 assert(qpair != NULL); 2848 2849 ctrlr = qpair->ctrlr; 2850 2851 /* On aarch64 platforms, doorbell updates written by the guest VM may not 2852 * become visible to the SPDK target because of a memory type mismatch: 2853 * the guest maps the doorbell page as device memory, while the SPDK 2854 * target maps the same backing memory as normal (cacheable) memory, 2855 * which is problematic on ARM platforms. 2856 * Refer to "https://developer.arm.com/documentation/102376/0100/ 2857 * Memory-aliasing-and-mismatched-memory-types". A barrier such as 2858 * spdk_mb() alone cannot fix this; invalidating the cache line with 2859 * "dc civac" (spdk_ivdt_dcache()) before reading the doorbell does. 2860 */ 2861 spdk_ivdt_dcache(tdbl(ctrlr, &qpair->sq)); 2862 2863 /* Load-Acquire. */ 2864 new_tail = *tdbl(ctrlr, &qpair->sq); 2865 2866 /* 2867 * Ensure that changes to the queue are visible to us. 2868 * The host driver should write the queue first, do a wmb(), and then 2869 * update the SQ tail doorbell (their Store-Release). 2870 */ 2871 spdk_rmb(); 2872 2873 new_tail = new_tail & 0xffffu; 2874 if (spdk_unlikely(new_tail >= qpair->sq.size)) { 2875 union spdk_nvme_async_event_completion event = {}; 2876 2877 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid SQ%u doorbell value %u\n", ctrlr_id(ctrlr), qpair->qpair.qid, 2878 new_tail); 2879 event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR; 2880 event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE; 2881 nvmf_ctrlr_async_event_error_event(qpair->qpair.ctrlr, event); 2882 2883 return 0; 2884 } 2885 2886 if (sq_head(qpair) == new_tail) { 2887 return 0; 2888 } 2889 2890 count = handle_sq_tdbl_write(ctrlr, new_tail, qpair); 2891 if (count < 0) { 2892 fail_ctrlr(ctrlr); 2893 } 2894 2895 return count; 2896 } 2897 2898 /* 2899 * vfio-user transport poll handler. Note that the library context is polled in 2900 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 2901 * active qpairs. 2902 * 2903 * Returns the number of commands processed, or a negative value on error.
2904 */ 2905 static int 2906 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 2907 { 2908 struct nvmf_vfio_user_poll_group *vu_group; 2909 struct nvmf_vfio_user_qpair *vu_qpair, *tmp; 2910 int count = 0; 2911 2912 assert(group != NULL); 2913 2914 spdk_rmb(); 2915 2916 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 2917 2918 TAILQ_FOREACH_SAFE(vu_qpair, &vu_group->qps, link, tmp) { 2919 int ret; 2920 2921 if (spdk_unlikely(vu_qpair->state != VFIO_USER_QPAIR_ACTIVE || !vu_qpair->sq.size)) { 2922 continue; 2923 } 2924 2925 ret = nvmf_vfio_user_qpair_poll(vu_qpair); 2926 2927 if (ret < 0) { 2928 return ret; 2929 } 2930 2931 count += ret; 2932 } 2933 2934 return count; 2935 } 2936 2937 static int 2938 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 2939 struct spdk_nvme_transport_id *trid) 2940 { 2941 struct nvmf_vfio_user_qpair *vu_qpair; 2942 struct nvmf_vfio_user_ctrlr *ctrlr; 2943 2944 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2945 ctrlr = vu_qpair->ctrlr; 2946 2947 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 2948 return 0; 2949 } 2950 2951 static int 2952 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 2953 struct spdk_nvme_transport_id *trid) 2954 { 2955 return 0; 2956 } 2957 2958 static int 2959 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 2960 struct spdk_nvme_transport_id *trid) 2961 { 2962 struct nvmf_vfio_user_qpair *vu_qpair; 2963 struct nvmf_vfio_user_ctrlr *ctrlr; 2964 2965 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2966 ctrlr = vu_qpair->ctrlr; 2967 2968 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 2969 return 0; 2970 } 2971 2972 static void 2973 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 2974 struct spdk_nvmf_request *req) 2975 { 2976 struct nvmf_vfio_user_qpair *vu_qpair; 2977 struct nvmf_vfio_user_req *vu_req, *vu_req_to_abort = NULL; 2978 uint32_t i; 2979 uint16_t cid; 2980 2981 vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair); 2982 2983 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 2984 for (i = 0; i < vu_qpair->qsize; i++) { 2985 vu_req = &vu_qpair->reqs_internal[i]; 2986 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 2987 vu_req_to_abort = vu_req; 2988 break; 2989 } 2990 } 2991 2992 if (vu_req_to_abort == NULL) { 2993 spdk_nvmf_request_complete(req); 2994 return; 2995 } 2996 2997 req->req_to_abort = &vu_req_to_abort->req; 2998 nvmf_ctrlr_abort_request(req); 2999 } 3000 3001 static void 3002 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 3003 { 3004 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 3005 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 3006 opts->in_capsule_data_size = 0; 3007 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 3008 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 3009 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 3010 opts->num_shared_buffers = 0; 3011 opts->buf_cache_size = 0; 3012 opts->association_timeout = 0; 3013 opts->transport_specific = NULL; 3014 } 3015 3016 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 3017 .name = "VFIOUSER", 3018 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 3019 .opts_init = nvmf_vfio_user_opts_init, 3020 .create = nvmf_vfio_user_create, 3021 .destroy = nvmf_vfio_user_destroy, 3022 3023 .listen = nvmf_vfio_user_listen, 3024 .stop_listen = 
nvmf_vfio_user_stop_listen, 3025 .cdata_init = nvmf_vfio_user_cdata_init, 3026 .listen_associate = nvmf_vfio_user_listen_associate, 3027 3028 .listener_discover = nvmf_vfio_user_discover, 3029 3030 .poll_group_create = nvmf_vfio_user_poll_group_create, 3031 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 3032 .poll_group_add = nvmf_vfio_user_poll_group_add, 3033 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 3034 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 3035 3036 .req_free = nvmf_vfio_user_req_free, 3037 .req_complete = nvmf_vfio_user_req_complete, 3038 3039 .qpair_fini = nvmf_vfio_user_close_qpair, 3040 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 3041 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 3042 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 3043 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 3044 }; 3045 3046 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 3047 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 3048
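
/*
 * Example usage (illustrative sketch only; the subsystem NQN, bdev name and
 * socket directory below are assumptions, not defined by this file):
 *
 *   scripts/rpc.py nvmf_create_transport -t VFIOUSER
 *   scripts/rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a -s SPDK0
 *   scripts/rpc.py bdev_malloc_create 64 512 -b Malloc0
 *   scripts/rpc.py nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode0 Malloc0
 *   scripts/rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 \
 *           -t VFIOUSER -a /var/run/vfio-user -s 0
 *
 * The listener traddr is a directory: the transport creates the "cntrl"
 * vfio-user socket and the "bar0" doorbell file underneath it, and a
 * vfio-user capable VMM (for example a QEMU build with vfio-user client
 * support) can then attach to that socket.
 */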