/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2021 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NVMe over PCIe common library
 */

#include "spdk/stdinc.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "nvme_internal.h"
#include "nvme_pcie_internal.h"

__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;

static inline uint64_t
nvme_pcie_vtophys(struct spdk_nvme_ctrlr *ctrlr, const void *buf, uint64_t *size)
{
	if (spdk_likely(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE)) {
		return spdk_vtophys(buf, size);
	} else {
		/* vfio-user address translation with IOVA=VA mode */
		return (uint64_t)(uintptr_t)buf;
	}
}

int
nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	uint32_t i;

	/* all head/tail vals are set to 0 */
	pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0;

	/*
	 * First time through the completion queue, HW will set the phase
	 * bit on completions to 1.  So set this to 1 here, indicating
	 * we're looking for a 1 to know which entries have completed.
	 * We'll toggle the bit each time the completion queue rolls over.
	 */
	pqpair->flags.phase = 1;
	for (i = 0; i < pqpair->num_entries; i++) {
		pqpair->cpl[i].status.p = 0;
	}

	return 0;
}

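/*
 * Each nvme_tracker embeds a PRP list / SGL segment (tr->u).  prp_sgl_bus_addr caches the
 * bus address of that embedded area so PRP2 or SGL1 in a command can point at it on the
 * submission path without an extra vtophys translation.
 */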
static void
nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
{
	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
	tr->cid = cid;
	tr->req = NULL;
}

static void *
nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t size, uint64_t alignment,
			  uint64_t *phys_addr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	uintptr_t addr;

	if (pctrlr->cmb.mem_register_addr != NULL) {
		/* BAR is mapped for data */
		return NULL;
	}

	addr = (uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset;
	addr = (addr + (alignment - 1)) & ~(alignment - 1);

	/* CMB may only consume part of the BAR, calculate accordingly */
	if (addr + size > ((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.size)) {
		SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
		return NULL;
	}
	*phys_addr = pctrlr->cmb.bar_pa + addr - (uintptr_t)pctrlr->cmb.bar_va;

	pctrlr->cmb.current_offset = (addr + size) - (uintptr_t)pctrlr->cmb.bar_va;

	return (void *)addr;
}

int
nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
			  const struct spdk_nvme_io_qpair_opts *opts)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;
	uint16_t i;
	uint16_t num_trackers;
	size_t page_align = sysconf(_SC_PAGESIZE);
	size_t queue_align, queue_len;
	uint32_t flags = SPDK_MALLOC_DMA;
	uint64_t sq_paddr = 0;
	uint64_t cq_paddr = 0;

	if (opts) {
		pqpair->sq_vaddr = opts->sq.vaddr;
		pqpair->cq_vaddr = opts->cq.vaddr;
		sq_paddr = opts->sq.paddr;
		cq_paddr = opts->cq.paddr;
	}

	pqpair->retry_count = ctrlr->opts.transport_retry_count;

	/*
	 * Limit the maximum number of completions to return per call to prevent wraparound,
	 * and calculate how many trackers can be submitted at once without overflowing the
	 * completion queue.
	 */
	pqpair->max_completions_cap = pqpair->num_entries / 4;
	pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
	pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
	num_trackers = pqpair->num_entries - pqpair->max_completions_cap;

	SPDK_INFOLOG(nvme, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
		     pqpair->max_completions_cap, num_trackers);

	assert(num_trackers != 0);

	pqpair->sq_in_cmb = false;

	if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
		flags |= SPDK_MALLOC_SHARE;
	}

	/* cmd and cpl rings must be aligned on page size boundaries. */
	if (ctrlr->opts.use_cmb_sqs) {
		pqpair->cmd = nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
							page_align, &pqpair->cmd_bus_addr);
		if (pqpair->cmd != NULL) {
			pqpair->sq_in_cmb = true;
		}
	}

	if (pqpair->sq_in_cmb == false) {
		if (pqpair->sq_vaddr) {
			pqpair->cmd = pqpair->sq_vaddr;
		} else {
			/* To ensure physical address contiguity we make each ring occupy
			 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
			 */
			queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cmd);
			queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
			pqpair->cmd = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags);
			if (pqpair->cmd == NULL) {
				SPDK_ERRLOG("alloc qpair_cmd failed\n");
				return -ENOMEM;
			}
		}
		if (sq_paddr) {
			assert(pqpair->sq_vaddr != NULL);
			pqpair->cmd_bus_addr = sq_paddr;
		} else {
			pqpair->cmd_bus_addr = nvme_pcie_vtophys(ctrlr, pqpair->cmd, NULL);
			if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
				SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
				return -EFAULT;
			}
		}
	}

	if (pqpair->cq_vaddr) {
		pqpair->cpl = pqpair->cq_vaddr;
	} else {
		queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cpl);
		queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
		pqpair->cpl = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags);
		if (pqpair->cpl == NULL) {
			SPDK_ERRLOG("alloc qpair_cpl failed\n");
			return -ENOMEM;
		}
	}
	if (cq_paddr) {
		assert(pqpair->cq_vaddr != NULL);
		pqpair->cpl_bus_addr = cq_paddr;
	} else {
		pqpair->cpl_bus_addr = nvme_pcie_vtophys(ctrlr, pqpair->cpl, NULL);
		if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
			SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
			return -EFAULT;
		}
	}

	/* Each queue pair gets a submission queue tail doorbell and a completion queue head
	 * doorbell, laid out at consecutive stride-sized offsets from the doorbell base
	 * (SQ at slot 2*id, CQ at slot 2*id + 1).
	 */
	pqpair->sq_tdbl = pctrlr->doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
	pqpair->cq_hdbl = pctrlr->doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;

	/*
	 * Reserve space for all of the trackers in a single allocation.
	 * struct nvme_tracker must be padded so that its size is already a power of 2.
	 * This ensures the PRP list embedded in the nvme_tracker object will not span a
	 * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
	 */
	pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
	if (pqpair->tr == NULL) {
		SPDK_ERRLOG("nvme_tr failed\n");
		return -ENOMEM;
	}

	TAILQ_INIT(&pqpair->free_tr);
	TAILQ_INIT(&pqpair->outstanding_tr);

	for (i = 0; i < num_trackers; i++) {
		tr = &pqpair->tr[i];
		nvme_qpair_construct_tracker(tr, i, nvme_pcie_vtophys(ctrlr, tr, NULL));
		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
	}

	nvme_pcie_qpair_reset(qpair);

	return 0;
}

int
nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries)
{
	struct nvme_pcie_qpair *pqpair;
	int rc;

	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
	if (pqpair == NULL) {
		return -ENOMEM;
	}

	pqpair->num_entries = num_entries;
	pqpair->flags.delay_cmd_submit = 0;

	ctrlr->adminq = &pqpair->qpair;

	rc = nvme_qpair_init(ctrlr->adminq,
			     0, /* qpair ID */
			     ctrlr,
			     SPDK_NVME_QPRIO_URGENT,
			     num_entries);
	if (rc != 0) {
		return rc;
	}

	pqpair->stat = spdk_zmalloc(sizeof(*pqpair->stat), 64, NULL, SPDK_ENV_SOCKET_ID_ANY,
				    SPDK_MALLOC_SHARE);
	if (!pqpair->stat) {
		SPDK_ERRLOG("Failed to allocate admin qpair statistics\n");
		return -ENOMEM;
	}

	return nvme_pcie_qpair_construct(ctrlr->adminq, NULL);
}

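/*
 * The admin queue may be shared by several processes.  Completions that belong to another
 * process are not finished inline; they are parked on that process's active_reqs list and
 * completed the next time that process polls the admin queue.
 */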
/**
 * Note: the ctrlr_lock must be held when calling this function.
 */
void
nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
		struct nvme_request *req, struct spdk_nvme_cpl *cpl)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_request *active_req = req;
	struct spdk_nvme_ctrlr_process *active_proc;

	/*
	 * The admin request is from another process. Move it to the per-process
	 * list for that process to handle it later.
	 */
	assert(nvme_qpair_is_admin_queue(qpair));
	assert(active_req->pid != getpid());

	active_proc = nvme_ctrlr_get_process(ctrlr, active_req->pid);
	if (active_proc) {
		/* Save the original completion information */
		memcpy(&active_req->cpl, cpl, sizeof(*cpl));
		STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
	} else {
		SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
			    active_req->pid);

		nvme_free_request(active_req);
	}
}

/**
 * Note: the ctrlr_lock must be held when calling this function.
 */
void
nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_request *req, *tmp_req;
	pid_t pid = getpid();
	struct spdk_nvme_ctrlr_process *proc;

	/*
	 * Check whether there is any pending admin request from
	 * other active processes.
	 */
	assert(nvme_qpair_is_admin_queue(qpair));

	proc = nvme_ctrlr_get_current_process(ctrlr);
	if (!proc) {
		SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
		assert(proc);
		return;
	}

	STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
		STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);

		assert(req->pid == pid);

		nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl);
		nvme_free_request(req);
	}
}

int
nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
				 void *cb_arg)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;

	cmd->cdw10_bits.create_io_q.qid = io_que->id;
	cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1; /* 0's based queue size */

	cmd->cdw11_bits.create_io_cq.pc = 1;
	cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

int
nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;

	cmd->cdw10_bits.create_io_q.qid = io_que->id;
	cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1; /* 0's based queue size */
	cmd->cdw11_bits.create_io_sq.pc = 1;
	cmd->cdw11_bits.create_io_sq.qprio = io_que->qprio;
	cmd->cdw11_bits.create_io_sq.cqid = io_que->id;
	cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

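/*
 * Queue deletion is the reverse of creation: the I/O submission queue is deleted (and its
 * outstanding commands drained) before the completion queue it posts to.  See
 * nvme_pcie_ctrlr_delete_io_qpair() below for the full sequence.
 */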
int
nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
	cmd->cdw10_bits.delete_io_q.qid = qpair->id;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

int
nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
	cmd->cdw10_bits.delete_io_q.qid = qpair->id;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static int
_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 uint16_t qid)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_completion_poll_status *status;
	int rc;

	status = calloc(1, sizeof(*status));
	if (!status) {
		SPDK_ERRLOG("Failed to allocate status tracker\n");
		return -ENOMEM;
	}

	/* Statistics may already be allocated in the case of controller reset */
	if (!pqpair->stat) {
		if (qpair->poll_group) {
			struct nvme_pcie_poll_group *group = SPDK_CONTAINEROF(qpair->poll_group,
							     struct nvme_pcie_poll_group, group);

			pqpair->stat = &group->stats;
			pqpair->shared_stats = true;
		} else {
			pqpair->stat = calloc(1, sizeof(*pqpair->stat));
			if (!pqpair->stat) {
				SPDK_ERRLOG("Failed to allocate qpair statistics\n");
				free(status);
				return -ENOMEM;
			}
		}
	}

	rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status);
	if (rc != 0) {
		free(status);
		return rc;
	}

	if (nvme_wait_for_completion(ctrlr->adminq, status)) {
		SPDK_ERRLOG("nvme_create_io_cq failed!\n");
		if (!status->timed_out) {
			free(status);
		}
		return -1;
	}

	memset(status, 0, sizeof(*status));
	rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status);
	if (rc != 0) {
		free(status);
		return rc;
	}

	if (nvme_wait_for_completion(ctrlr->adminq, status)) {
		SPDK_ERRLOG("nvme_create_io_sq failed!\n");
		if (status->timed_out) {
			/* The request is still queued and its memory will be freed in the
			 * completion callback, so allocate a new status object here. */
			status = calloc(1, sizeof(*status));
			if (!status) {
				SPDK_ERRLOG("Failed to allocate status tracker\n");
				return -ENOMEM;
			}
		}

		memset(status, 0, sizeof(*status));
		/* Attempt to delete the completion queue */
		rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status);
		if (rc != 0) {
			/* The original or newly allocated status structure can be freed since
			 * the corresponding request has been completed or failed to submit. */
			free(status);
			return -1;
		}
		nvme_wait_for_completion(ctrlr->adminq, status);
		if (!status->timed_out) {
			/* status can be freed regardless of nvme_wait_for_completion return value */
			free(status);
		}
		return -1;
	}

	if (ctrlr->shadow_doorbell) {
		/* The controller supports the shadow doorbell buffer feature; record this
		 * queue's shadow doorbell and eventidx slots so MMIO doorbell writes can be
		 * skipped when the controller does not need them.
		 */
		pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
						  pctrlr->doorbell_stride_u32;
		pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
						  pctrlr->doorbell_stride_u32;
		pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
						      pctrlr->doorbell_stride_u32;
		pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
						      pctrlr->doorbell_stride_u32;
		pqpair->flags.has_shadow_doorbell = 1;
	} else {
		pqpair->flags.has_shadow_doorbell = 0;
	}
	nvme_pcie_qpair_reset(qpair);
	free(status);

	return 0;
}

int
nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	if (nvme_qpair_is_admin_queue(qpair)) {
		return 0;
	} else {
		return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
	}
}

void
nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
}

/* Used when dst points to MMIO (i.e. CMB) in a virtual machine - in these cases we must
 * not use wide instructions because QEMU will not emulate such instructions to MMIO space.
 * So this function ensures we only copy 8 bytes at a time.
 */
static inline void
nvme_pcie_copy_command_mmio(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
{
	uint64_t *dst64 = (uint64_t *)dst;
	const uint64_t *src64 = (const uint64_t *)src;
	uint32_t i;

	for (i = 0; i < sizeof(*dst) / 8; i++) {
		dst64[i] = src64[i];
	}
}

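/* Fast path for writing a 64-byte command into host memory.  On x86 this compiles to four
 * 16-byte non-temporal stores, so the submission queue entry is written without pulling
 * the queue memory into the CPU cache.
 */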
static inline void
nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
{
	/* dst and src are known to be non-overlapping and 64-byte aligned. */
#if defined(__SSE2__)
	__m128i *d128 = (__m128i *)dst;
	const __m128i *s128 = (const __m128i *)src;

	_mm_stream_si128(&d128[0], _mm_load_si128(&s128[0]));
	_mm_stream_si128(&d128[1], _mm_load_si128(&s128[1]));
	_mm_stream_si128(&d128[2], _mm_load_si128(&s128[2]));
	_mm_stream_si128(&d128[3], _mm_load_si128(&s128[3]));
#else
	*dst = *src;
#endif
}

void
nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
{
	struct nvme_request *req;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;

	req = tr->req;
	assert(req != NULL);

	if (req->cmd.fuse == SPDK_NVME_IO_FLAGS_FUSE_FIRST) {
		/* This is the first of two fused commands - don't ring the doorbell yet */
		qpair->first_fused_submitted = 1;
	}

	/* Don't use wide instructions to copy the NVMe command when the target is the CMB of
	 * QEMU's virtual NVMe controller; it limits the access width to 8 bytes per access.
	 */
	if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) {
		nvme_pcie_copy_command_mmio(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
	} else {
		/* Copy the command from the tracker to the submission queue. */
		nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
	}

	if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
		pqpair->sq_tail = 0;
	}

	if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
		SPDK_ERRLOG("sq_tail is passing sq_head!\n");
	}

	if (!pqpair->flags.delay_cmd_submit) {
		nvme_pcie_qpair_ring_sq_doorbell(qpair);
	}
}

void
nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
				 struct spdk_nvme_cpl *cpl, bool print_on_error)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_request *req;
	bool retry, error;
	bool req_from_current_proc = true;

	req = tr->req;

	assert(req != NULL);

	error = spdk_nvme_cpl_is_error(cpl);
	retry = error && nvme_completion_is_retry(cpl) &&
		req->retries < pqpair->retry_count;

	if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) {
		spdk_nvme_qpair_print_command(qpair, &req->cmd);
		spdk_nvme_qpair_print_completion(qpair, cpl);
	}

	assert(cpl->cid == req->cmd.cid);

	if (retry) {
		req->retries++;
		nvme_pcie_qpair_submit_tracker(qpair, tr);
	} else {
		TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);

		/* Only check admin requests from different processes. */
		if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
			req_from_current_proc = false;
			nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
		} else {
			nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl);
		}

		if (req_from_current_proc == true) {
			nvme_qpair_free_request(qpair, req);
		}

		tr->req = NULL;

		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
	}
}

void
nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
					struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
					bool print_on_error)
{
	struct spdk_nvme_cpl cpl;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
	cpl.status.sct = sct;
	cpl.status.sc = sc;
	cpl.status.dnr = dnr;
	nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
}

void
nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr, *temp, *last;

	last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head);

	/* Abort previously submitted (outstanding) trs */
	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
		if (!qpair->ctrlr->opts.disable_error_logging) {
			SPDK_ERRLOG("aborting outstanding command\n");
		}
		nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
							SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);

		if (tr == last) {
			break;
		}
	}
}

void
nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;

	tr = TAILQ_FIRST(&pqpair->outstanding_tr);
	while (tr != NULL) {
		assert(tr->req != NULL);
		if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
			nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
								SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
								false);
			tr = TAILQ_FIRST(&pqpair->outstanding_tr);
		} else {
			tr = TAILQ_NEXT(tr, tq_list);
		}
	}
}

void
nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
{
	nvme_pcie_admin_qpair_abort_aers(qpair);
}

void
nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
{
	nvme_pcie_qpair_abort_trackers(qpair, dnr);
}

static void
nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
{
	uint64_t t02;
	struct nvme_tracker *tr, *tmp;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct spdk_nvme_ctrlr_process *active_proc;

	/* Don't check timeouts during controller initialization. */
	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
		return;
	}

	if (nvme_qpair_is_admin_queue(qpair)) {
		active_proc = nvme_ctrlr_get_current_process(ctrlr);
	} else {
		active_proc = qpair->active_proc;
	}

	/* Only check timeouts if the current process has a timeout callback. */
	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
		return;
	}

	t02 = spdk_get_ticks();
	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
		assert(tr->req != NULL);

		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
			/*
			 * The requests are in order, so as soon as one has not timed out,
			 * stop iterating.
			 */
			break;
		}
	}
}

int32_t
nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;
	struct spdk_nvme_cpl *cpl, *next_cpl;
	uint32_t num_completions = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	uint16_t next_cq_head;
	uint8_t next_phase;
	bool next_is_valid = false;

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
		/*
		 * max_completions == 0 means unlimited, but complete at most
		 * max_completions_cap batch of I/O at a time so that the completion
		 * queue doorbells don't wrap around.
		 */
		max_completions = pqpair->max_completions_cap;
	}

	pqpair->stat->polls++;

	while (1) {
		cpl = &pqpair->cpl[pqpair->cq_head];

		if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
			break;
		}

		if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
			next_cq_head = pqpair->cq_head + 1;
			next_phase = pqpair->flags.phase;
		} else {
			next_cq_head = 0;
			next_phase = !pqpair->flags.phase;
		}
		next_cpl = &pqpair->cpl[next_cq_head];
		next_is_valid = (next_cpl->status.p == next_phase);
		if (next_is_valid) {
			__builtin_prefetch(&pqpair->tr[next_cpl->cid]);
		}

#ifdef __PPC64__
		/*
		 * This memory barrier prevents reordering of:
		 * - load after store from/to tr
		 * - load after load cpl phase and cpl cid
		 */
		spdk_mb();
#elif defined(__aarch64__)
		__asm volatile("dmb oshld" ::: "memory");
#endif

		if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
			pqpair->cq_head = 0;
			pqpair->flags.phase = !pqpair->flags.phase;
		}

		tr = &pqpair->tr[cpl->cid];
		/* Prefetch the req's STAILQ_ENTRY since we'll need to access it
		 * as part of putting the req back on the qpair's free list.
		 */
		__builtin_prefetch(&tr->req->stailq);
		pqpair->sq_head = cpl->sqhd;

		if (tr->req) {
			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
		} else {
			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
			spdk_nvme_qpair_print_completion(qpair, cpl);
			assert(0);
		}

		if (++num_completions == max_completions) {
			break;
		}
	}

	if (num_completions > 0) {
		pqpair->stat->completions += num_completions;
		nvme_pcie_qpair_ring_cq_doorbell(qpair);
	} else {
		pqpair->stat->idle_polls++;
	}

	if (pqpair->flags.delay_cmd_submit) {
		if (pqpair->last_sq_tail != pqpair->sq_tail) {
			nvme_pcie_qpair_ring_sq_doorbell(qpair);
			pqpair->last_sq_tail = pqpair->sq_tail;
		}
	}

	if (spdk_unlikely(ctrlr->timeout_enabled)) {
		/*
		 * User registered for timeout callback
		 */
		nvme_pcie_qpair_check_timeout(qpair);
	}

	/* Before returning, complete any pending admin request. */
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_pcie_qpair_complete_pending_admin_request(qpair);

		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return num_completions;
}

int
nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	if (nvme_qpair_is_admin_queue(qpair)) {
		nvme_pcie_admin_qpair_destroy(qpair);
	}
	/*
	 * We check sq_vaddr and cq_vaddr to see if the user specified the memory
	 * buffers when creating the I/O queue.
	 * If the user specified them, we cannot free that memory.
	 * Nor do we free it if it's in the CMB.
	 */
	if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) {
		spdk_free(pqpair->cmd);
	}
	if (!pqpair->cq_vaddr && pqpair->cpl) {
		spdk_free(pqpair->cpl);
	}
	if (pqpair->tr) {
		spdk_free(pqpair->tr);
	}

	nvme_qpair_deinit(qpair);

	if (!pqpair->shared_stats) {
		if (qpair->id) {
			free(pqpair->stat);
		} else {
			/* Statistics for the admin qpair are allocated from huge pages because
			 * the admin qpair is shared across processes. */
			spdk_free(pqpair->stat);
		}
	}

	spdk_free(pqpair);

	return 0;
}

struct spdk_nvme_qpair *
nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
				const struct spdk_nvme_io_qpair_opts *opts)
{
	struct nvme_pcie_qpair *pqpair;
	struct spdk_nvme_qpair *qpair;
	int rc;

	assert(ctrlr != NULL);

	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
			      SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
	if (pqpair == NULL) {
		return NULL;
	}

	pqpair->num_entries = opts->io_queue_size;
	pqpair->flags.delay_cmd_submit = opts->delay_cmd_submit;

	qpair = &pqpair->qpair;

	rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests);
	if (rc != 0) {
		nvme_pcie_qpair_destroy(qpair);
		return NULL;
	}

	rc = nvme_pcie_qpair_construct(qpair, opts);

	if (rc != 0) {
		nvme_pcie_qpair_destroy(qpair);
		return NULL;
	}

	return qpair;
}

int
nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_completion_poll_status *status;
	int rc;

	assert(ctrlr != NULL);

	if (ctrlr->is_removed) {
		goto free;
	}

	status = calloc(1, sizeof(*status));
	if (!status) {
		SPDK_ERRLOG("Failed to allocate status tracker\n");
		return -ENOMEM;
	}

	/* Delete the I/O submission queue */
	rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc);
		free(status);
		return rc;
	}
	if (nvme_wait_for_completion(ctrlr->adminq, status)) {
		if (!status->timed_out) {
			free(status);
		}
		return -1;
	}

	/* Now that the submission queue is deleted, the device is supposed to have
	 * completed any outstanding I/O. Try to complete them. If they don't complete,
	 * they'll be marked as aborted and completed below. */
	nvme_pcie_qpair_process_completions(qpair, 0);

	memset(status, 0, sizeof(*status));
	/* Delete the completion queue */
	rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
		free(status);
		return rc;
	}
	if (nvme_wait_for_completion(ctrlr->adminq, status)) {
		if (!status->timed_out) {
			free(status);
		}
		return -1;
	}
	free(status);

	if (pqpair->flags.has_shadow_doorbell) {
		*pqpair->shadow_doorbell.sq_tdbl = 0;
		*pqpair->shadow_doorbell.cq_hdbl = 0;
		*pqpair->shadow_doorbell.sq_eventidx = 0;
		*pqpair->shadow_doorbell.cq_eventidx = 0;
	}

free:
	if (qpair->no_deletion_notification_needed == 0) {
		/* Abort the rest of the I/O */
		nvme_pcie_qpair_abort_trackers(qpair, 1);
	}

	nvme_pcie_qpair_destroy(qpair);
	return 0;
}

static void
nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
{
	/*
	 * Bad vtophys translation, so abort this request and return
	 * immediately.
	 */
	nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
						SPDK_NVME_SC_INVALID_FIELD,
						1 /* do not retry */, true);
}

/*
 * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
 *
 * *prp_index will be updated to account for the number of PRP entries used.
 */
static inline int
nvme_pcie_prp_list_append(struct spdk_nvme_ctrlr *ctrlr, struct nvme_tracker *tr,
			  uint32_t *prp_index, void *virt_addr, size_t len,
			  uint32_t page_size)
{
	struct spdk_nvme_cmd *cmd = &tr->req->cmd;
	uintptr_t page_mask = page_size - 1;
	uint64_t phys_addr;
	uint32_t i;

	SPDK_DEBUGLOG(nvme, "prp_index:%u virt_addr:%p len:%u\n",
		      *prp_index, virt_addr, (uint32_t)len);

	if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
		SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
		return -EFAULT;
	}

	i = *prp_index;
	while (len) {
		uint32_t seg_len;

		/*
		 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
		 * so prp_index == count is valid.
		 */
		if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
			SPDK_ERRLOG("out of PRP entries\n");
			return -EFAULT;
		}

		phys_addr = nvme_pcie_vtophys(ctrlr, virt_addr, NULL);
		if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
			SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
			return -EFAULT;
		}

		if (i == 0) {
			SPDK_DEBUGLOG(nvme, "prp1 = %p\n", (void *)phys_addr);
			cmd->dptr.prp.prp1 = phys_addr;
			seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
		} else {
			if ((phys_addr & page_mask) != 0) {
				SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
				return -EFAULT;
			}

			SPDK_DEBUGLOG(nvme, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
			tr->u.prp[i - 1] = phys_addr;
			seg_len = page_size;
		}

		seg_len = spdk_min(seg_len, len);
		virt_addr += seg_len;
		len -= seg_len;
		i++;
	}

	cmd->psdt = SPDK_NVME_PSDT_PRP;
	if (i <= 1) {
		cmd->dptr.prp.prp2 = 0;
	} else if (i == 2) {
		cmd->dptr.prp.prp2 = tr->u.prp[0];
		SPDK_DEBUGLOG(nvme, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
	} else {
		cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
		SPDK_DEBUGLOG(nvme, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
	}

	*prp_index = i;
	return 0;
}

static int
nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair,
				      struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned)
{
	assert(0);
	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
	return -EINVAL;
}

/**
 * Build PRP list describing physically contiguous payload buffer.
 */
static int
nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				     struct nvme_tracker *tr, bool dword_aligned)
{
	uint32_t prp_index = 0;
	int rc;

	rc = nvme_pcie_prp_list_append(qpair->ctrlr, tr, &prp_index,
				       req->payload.contig_or_cb_arg + req->payload_offset,
				       req->payload_size, qpair->ctrlr->page_size);
	if (rc) {
		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
	}

	return rc;
}

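/*
 * Illustrative example of the PRP layout built above, assuming a 4KiB page size: a
 * contiguous 16KiB buffer that starts 1KiB into a page needs five PRP entries.  PRP1
 * covers the first 3KiB up to the page boundary, and since more than two entries are
 * needed, PRP2 points at the PRP list embedded in the tracker, which holds the remaining
 * four page-aligned entries.
 */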
/**
 * Build an SGL describing a physically contiguous payload buffer.
 *
 * This is more efficient than using PRP because large buffers can be
 * described this way.
 */
static int
nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
		struct nvme_tracker *tr, bool dword_aligned)
{
	void *virt_addr;
	uint64_t phys_addr, mapping_length;
	uint32_t length;
	struct spdk_nvme_sgl_descriptor *sgl;
	uint32_t nseg = 0;

	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);

	sgl = tr->u.sgl;
	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.subtype = 0;

	length = req->payload_size;
	virt_addr = req->payload.contig_or_cb_arg + req->payload_offset;

	while (length > 0) {
		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
			SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		mapping_length = length;
		phys_addr = nvme_pcie_vtophys(qpair->ctrlr, virt_addr, &mapping_length);
		if (phys_addr == SPDK_VTOPHYS_ERROR) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		mapping_length = spdk_min(length, mapping_length);

		length -= mapping_length;
		virt_addr += mapping_length;

		sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		sgl->unkeyed.length = mapping_length;
		sgl->address = phys_addr;
		sgl->unkeyed.subtype = 0;

		sgl++;
		nseg++;
	}

	if (nseg == 1) {
		/*
		 * The whole transfer can be described by a single SGL descriptor.
		 * Use the special case described by the spec where SGL1's type is Data Block.
		 * This means the SGL in the tracker is not used at all, so copy the first (and only)
		 * SGL element into SGL1.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* The SPDK NVMe driver supports only one SGL segment for now; that is enough
		 * because NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
	}

	return 0;
}

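/*
 * The scattered-payload SGL builder below additionally handles bit bucket SGEs
 * (virt_addr == UINT64_MAX, READ only) and merges user SGEs that turn out to be
 * physically contiguous into a single descriptor.
 */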
/**
 * Build SGL list describing scattered payload buffer.
 */
static int
nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				     struct nvme_tracker *tr, bool dword_aligned)
{
	int rc;
	void *virt_addr;
	uint64_t phys_addr, mapping_length;
	uint32_t remaining_transfer_len, remaining_user_sge_len, length;
	struct spdk_nvme_sgl_descriptor *sgl;
	uint32_t nseg = 0;

	/*
	 * Build scattered payloads.
	 */
	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	assert(req->payload.next_sge_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	sgl = tr->u.sgl;
	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.subtype = 0;

	remaining_transfer_len = req->payload_size;

	while (remaining_transfer_len > 0) {
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
					      &virt_addr, &remaining_user_sge_len);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		/* Bit Bucket SGL descriptor */
		if ((uint64_t)virt_addr == UINT64_MAX) {
			/* TODO: enable WRITE and COMPARE when necessary */
			if (req->cmd.opc != SPDK_NVME_OPC_READ) {
				SPDK_ERRLOG("Only the READ command is supported with bit bucket SGLs\n");
				goto exit;
			}
			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
				SPDK_ERRLOG("Too many SGL entries\n");
				goto exit;
			}

			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET;
			/* If the SGL describes a destination data buffer, the length of the data
			 * buffer shall be discarded by the controller, and the length is included
			 * in the Number of Logical Blocks (NLB) parameter. Otherwise, the length
			 * is not included in the NLB parameter.
			 */
			remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
			remaining_transfer_len -= remaining_user_sge_len;

			sgl->unkeyed.length = remaining_user_sge_len;
			sgl->address = 0;
			sgl->unkeyed.subtype = 0;

			sgl++;
			nseg++;

			continue;
		}

		remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
		remaining_transfer_len -= remaining_user_sge_len;
		while (remaining_user_sge_len > 0) {
			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
				SPDK_ERRLOG("Too many SGL entries\n");
				goto exit;
			}

			if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
				SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
				goto exit;
			}

			mapping_length = remaining_user_sge_len;
			phys_addr = nvme_pcie_vtophys(qpair->ctrlr, virt_addr, &mapping_length);
			if (phys_addr == SPDK_VTOPHYS_ERROR) {
				goto exit;
			}

			length = spdk_min(remaining_user_sge_len, mapping_length);
			remaining_user_sge_len -= length;
			virt_addr += length;

			if (nseg > 0 && phys_addr ==
			    (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
				/* extend previous entry */
				(*(sgl - 1)).unkeyed.length += length;
				continue;
			}

			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
			sgl->unkeyed.length = length;
			sgl->address = phys_addr;
			sgl->unkeyed.subtype = 0;

			sgl++;
			nseg++;
		}
	}

	if (nseg == 1) {
		/*
		 * The whole transfer can be described by a single SGL descriptor.
		 * Use the special case described by the spec where SGL1's type is Data Block.
		 * This means the SGL in the tracker is not used at all, so copy the first (and only)
		 * SGL element into SGL1.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* The SPDK NVMe driver supports only one SGL segment for now; that is enough
		 * because NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
	}

	return 0;

exit:
	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
	return -EFAULT;
}

/**
 * Build PRP list describing scattered payload buffer.
 */
static int
nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				       struct nvme_tracker *tr, bool dword_aligned)
{
	int rc;
	void *virt_addr;
	uint32_t remaining_transfer_len, length;
	uint32_t prp_index = 0;
	uint32_t page_size = qpair->ctrlr->page_size;

	/*
	 * Build scattered payloads.
	 */
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	remaining_transfer_len = req->payload_size;
	while (remaining_transfer_len > 0) {
		assert(req->payload.next_sge_fn != NULL);
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -EFAULT;
		}

		length = spdk_min(remaining_transfer_len, length);

		/*
		 * Any incompatible sges should have been handled up in the splitting routine,
		 * but assert here as an additional check.
		 *
		 * All SGEs except the last must end on a page boundary.
		 */
		assert((length == remaining_transfer_len) ||
		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));

		rc = nvme_pcie_prp_list_append(qpair->ctrlr, tr, &prp_index, virt_addr, length, page_size);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return rc;
		}

		remaining_transfer_len -= length;
	}

	return 0;
}

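/*
 * Request builders are dispatched from the table below, indexed first by payload type and
 * then by whether hardware SGLs are used for the request: column 0 is the PRP builder,
 * column 1 the SGL builder.
 */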
typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *,
			   bool);

static build_req_fn const g_nvme_pcie_build_req_table[][2] = {
	[NVME_PAYLOAD_TYPE_INVALID] = {
		nvme_pcie_qpair_build_request_invalid,		/* PRP */
		nvme_pcie_qpair_build_request_invalid		/* SGL */
	},
	[NVME_PAYLOAD_TYPE_CONTIG] = {
		nvme_pcie_qpair_build_contig_request,		/* PRP */
		nvme_pcie_qpair_build_contig_hw_sgl_request	/* SGL */
	},
	[NVME_PAYLOAD_TYPE_SGL] = {
		nvme_pcie_qpair_build_prps_sgl_request,		/* PRP */
		nvme_pcie_qpair_build_hw_sgl_request		/* SGL */
	}
};

static int
nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
			       bool sgl_supported, bool dword_aligned)
{
	void *md_payload;
	struct nvme_request *req = tr->req;

	if (req->payload.md) {
		md_payload = req->payload.md + req->md_offset;
		if (dword_aligned && ((uintptr_t)md_payload & 3)) {
			SPDK_ERRLOG("virt_addr %p not dword aligned\n", md_payload);
			goto exit;
		}

		if (sgl_supported && dword_aligned) {
			assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG);
			req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL;
			tr->meta_sgl.address = nvme_pcie_vtophys(qpair->ctrlr, md_payload, NULL);
			if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR) {
				goto exit;
			}
			tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
			tr->meta_sgl.unkeyed.length = req->md_size;
			tr->meta_sgl.unkeyed.subtype = 0;
			req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor);
		} else {
			req->cmd.mptr = nvme_pcie_vtophys(qpair->ctrlr, md_payload, NULL);
			if (req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
				goto exit;
			}
		}
	}

	return 0;

exit:
	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
	return -EINVAL;
}

int
nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int rc = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	enum nvme_payload_type payload_type;
	bool sgl_supported;
	bool dword_aligned = true;

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	tr = TAILQ_FIRST(&pqpair->free_tr);

	if (tr == NULL) {
		pqpair->stat->queued_requests++;
		/* Inform the upper layer to try again later. */
		rc = -EAGAIN;
		goto exit;
	}

	pqpair->stat->submitted_requests++;
	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
	tr->req = req;
	tr->cb_fn = req->cb_fn;
	tr->cb_arg = req->cb_arg;
	req->cmd.cid = tr->cid;

	if (req->payload_size != 0) {
		payload_type = nvme_payload_type(&req->payload);
		/* According to the specification, PRPs shall be used for all
		 * Admin commands for NVMe over PCIe implementations.
		 */
		sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 &&
				!nvme_qpair_is_admin_queue(qpair);

		if (sgl_supported) {
			/* Don't use SGL for DSM command */
			if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_NO_SGL_FOR_DSM) &&
					  (req->cmd.opc == SPDK_NVME_OPC_DATASET_MANAGEMENT))) {
				sgl_supported = false;
			}
		}

		if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) {
			dword_aligned = false;
		}
		rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned);
		if (rc < 0) {
			goto exit;
		}

		rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, dword_aligned);
		if (rc < 0) {
			goto exit;
		}
	}

	nvme_pcie_qpair_submit_tracker(qpair, tr);

exit:
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return rc;
}

struct spdk_nvme_transport_poll_group *
nvme_pcie_poll_group_create(void)
{
	struct nvme_pcie_poll_group *group = calloc(1, sizeof(*group));

	if (group == NULL) {
		SPDK_ERRLOG("Unable to allocate poll group.\n");
		return NULL;
	}

	return &group->group;
}

int
nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
{
	return 0;
}

int
nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
{
	return 0;
}

int
nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
			 struct spdk_nvme_qpair *qpair)
{
	return 0;
}

int
nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
			    struct spdk_nvme_qpair *qpair)
{
	return 0;
}

int64_t
nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
		uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
{
	struct spdk_nvme_qpair *qpair, *tmp_qpair;
	int32_t local_completions = 0;
	int64_t total_completions = 0;

	STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
		disconnected_qpair_cb(qpair, tgroup->group->ctx);
	}

	STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
		local_completions = spdk_nvme_qpair_process_completions(qpair, completions_per_qpair);
		if (local_completions < 0) {
			disconnected_qpair_cb(qpair, tgroup->group->ctx);
			local_completions = 0;
		}
		total_completions += local_completions;
	}

	return total_completions;
}

int
nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
{
	if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
		return -EBUSY;
	}

	free(tgroup);

	return 0;
}