/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NVMe over RDMA transport
 */

#include "spdk/stdinc.h"

#include "spdk/assert.h"
#include "spdk/log.h"
#include "spdk/trace.h"
#include "spdk/queue.h"
#include "spdk/nvme.h"
#include "spdk/nvmf_spec.h"
#include "spdk/string.h"
#include "spdk/endian.h"
#include "spdk/likely.h"
#include "spdk/config.h"

#include "nvme_internal.h"
#include "spdk_internal/rdma.h"

#define NVME_RDMA_TIME_OUT_IN_MS	2000
#define NVME_RDMA_RW_BUFFER_SIZE	131072

/*
 * NVME RDMA qpair Resource Defaults
 */
#define NVME_RDMA_DEFAULT_TX_SGE	2
#define NVME_RDMA_DEFAULT_RX_SGE	1

/* Max number of NVMe-oF SGL descriptors supported by the host */
#define NVME_RDMA_MAX_SGL_DESCRIPTORS	16

/* number of STAILQ entries for holding pending RDMA CM events. */
#define NVME_RDMA_NUM_CM_EVENTS		256

/* CM event processing timeout */
#define NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US	1000000

/* The default size for a shared rdma completion queue. */
#define DEFAULT_NVME_RDMA_CQ_SIZE	4096

/*
 * In the special case of a stale connection we don't expose a mechanism
 * for the user to retry the connection so we need to handle it internally.
 */
#define NVME_RDMA_STALE_CONN_RETRY_MAX		5
#define NVME_RDMA_STALE_CONN_RETRY_DELAY_US	10000

/*
 * Maximum value of transport_retry_count used by RDMA controller
 */
#define NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT	7

/*
 * Maximum value of transport_ack_timeout used by RDMA controller
 */
#define NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT	31
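/*
 * Note (explanatory, added for context): the RDMA CM ack timeout option is an
 * exponent rather than a time value; per the InfiniBand spec the effective
 * local ACK timeout is roughly 4.096 us * 2^transport_ack_timeout.  The field
 * is 5 bits wide, which is why 31 is the largest value that can be passed down.
 */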
/*
 * Number of poller cycles to keep a pointer to destroyed qpairs
 * in the poll group.
 */
#define NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES	50

/*
 * The max length of keyed SGL data block (3 bytes)
 */
#define NVME_RDMA_MAX_KEYED_SGL_LENGTH ((1u << 24u) - 1)

#define WC_PER_QPAIR(queue_depth)	(queue_depth * 2)

enum nvme_rdma_wr_type {
	RDMA_WR_TYPE_RECV,
	RDMA_WR_TYPE_SEND,
};

struct nvme_rdma_wr {
	/* Using this instead of the enum allows this struct to only occupy one byte. */
	uint8_t type;
};

struct spdk_nvmf_cmd {
	struct spdk_nvme_cmd cmd;
	struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS];
};

struct spdk_nvme_rdma_hooks g_nvme_hooks = {};

/* STAILQ wrapper for cm events. */
struct nvme_rdma_cm_event_entry {
	struct rdma_cm_event *evt;
	STAILQ_ENTRY(nvme_rdma_cm_event_entry) link;
};

/* NVMe RDMA transport extensions for spdk_nvme_ctrlr */
struct nvme_rdma_ctrlr {
	struct spdk_nvme_ctrlr ctrlr;

	struct ibv_pd *pd;

	uint16_t max_sge;

	struct rdma_event_channel *cm_channel;

	STAILQ_HEAD(, nvme_rdma_cm_event_entry) pending_cm_events;

	STAILQ_HEAD(, nvme_rdma_cm_event_entry) free_cm_events;

	struct nvme_rdma_cm_event_entry *cm_events;
};

struct nvme_rdma_destroyed_qpair {
	struct nvme_rdma_qpair *destroyed_qpair_tracker;
	uint32_t completed_cycles;
	STAILQ_ENTRY(nvme_rdma_destroyed_qpair) link;
};

struct nvme_rdma_poller {
	struct ibv_context *device;
	struct ibv_cq *cq;
	int required_num_wc;
	int current_num_wc;
	STAILQ_ENTRY(nvme_rdma_poller) link;
};

struct nvme_rdma_poll_group {
	struct spdk_nvme_transport_poll_group group;
	STAILQ_HEAD(, nvme_rdma_poller) pollers;
	int num_pollers;
	STAILQ_HEAD(, nvme_rdma_destroyed_qpair) destroyed_qpairs;
};

struct spdk_nvme_recv_wr_list {
	struct ibv_recv_wr *first;
	struct ibv_recv_wr *last;
};

/* Memory regions */
union nvme_rdma_mr {
	struct ibv_mr *mr;
	uint64_t key;
};

/* NVMe RDMA qpair extensions for spdk_nvme_qpair */
struct nvme_rdma_qpair {
	struct spdk_nvme_qpair qpair;

	struct spdk_rdma_qp *rdma_qp;
	struct rdma_cm_id *cm_id;
	struct ibv_cq *cq;

	struct spdk_nvme_rdma_req *rdma_reqs;

	uint32_t max_send_sge;

	uint32_t max_recv_sge;

	uint16_t num_entries;

	bool delay_cmd_submit;

	bool poll_group_disconnect_in_progress;

	uint32_t num_completions;

	/* Parallel arrays of response buffers + response SGLs of size num_entries */
	struct ibv_sge *rsp_sgls;
	struct spdk_nvme_rdma_rsp *rsps;

	struct ibv_recv_wr *rsp_recv_wrs;

	struct spdk_nvme_recv_wr_list recvs_to_post;

	/* Memory region describing all rsps for this qpair */
	union nvme_rdma_mr rsp_mr;

	/*
	 * Array of num_entries NVMe commands registered as RDMA message buffers.
	 * Indexed by rdma_req->id.
	 */
	struct spdk_nvmf_cmd *cmds;

	/* Memory region describing all cmds for this qpair */
	union nvme_rdma_mr cmd_mr;

	struct spdk_rdma_mem_map *mr_map;

	TAILQ_HEAD(, spdk_nvme_rdma_req) free_reqs;
	TAILQ_HEAD(, spdk_nvme_rdma_req) outstanding_reqs;

	/* Counts of outstanding send and recv objects */
	uint16_t current_num_recvs;
	uint16_t current_num_sends;

	/* Placed at the end of the struct since it is not used frequently */
	struct rdma_cm_event *evt;

	/* Used by poll group to keep the qpair around until it is ready to remove it. */
	bool defer_deletion_to_pg;
};

enum NVME_RDMA_COMPLETION_FLAGS {
	NVME_RDMA_SEND_COMPLETED = 1u << 0,
	NVME_RDMA_RECV_COMPLETED = 1u << 1,
};

struct spdk_nvme_rdma_req {
	uint16_t id;
	uint16_t completion_flags: 2;
	uint16_t reserved: 14;
	/* if completion of RDMA_RECV received before RDMA_SEND, we will complete nvme request
	 * during processing of RDMA_SEND. To complete the request we must know the index
	 * of nvme_cpl received in RDMA_RECV, so store it in this field */
	uint16_t rsp_idx;

	struct nvme_rdma_wr rdma_wr;

	struct ibv_send_wr send_wr;

	struct nvme_request *req;

	struct ibv_sge send_sgl[NVME_RDMA_DEFAULT_TX_SGE];

	TAILQ_ENTRY(spdk_nvme_rdma_req) link;
};

struct spdk_nvme_rdma_rsp {
	struct spdk_nvme_cpl cpl;
	struct nvme_rdma_qpair *rqpair;
	uint16_t idx;
	struct nvme_rdma_wr rdma_wr;
};

static const char *rdma_cm_event_str[] = {
	"RDMA_CM_EVENT_ADDR_RESOLVED",
	"RDMA_CM_EVENT_ADDR_ERROR",
	"RDMA_CM_EVENT_ROUTE_RESOLVED",
	"RDMA_CM_EVENT_ROUTE_ERROR",
	"RDMA_CM_EVENT_CONNECT_REQUEST",
	"RDMA_CM_EVENT_CONNECT_RESPONSE",
	"RDMA_CM_EVENT_CONNECT_ERROR",
	"RDMA_CM_EVENT_UNREACHABLE",
	"RDMA_CM_EVENT_REJECTED",
	"RDMA_CM_EVENT_ESTABLISHED",
	"RDMA_CM_EVENT_DISCONNECTED",
	"RDMA_CM_EVENT_DEVICE_REMOVAL",
	"RDMA_CM_EVENT_MULTICAST_JOIN",
	"RDMA_CM_EVENT_MULTICAST_ERROR",
	"RDMA_CM_EVENT_ADDR_CHANGE",
	"RDMA_CM_EVENT_TIMEWAIT_EXIT"
};

struct nvme_rdma_qpair *nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group,
		uint32_t qp_num);

static inline void *
nvme_rdma_calloc(size_t nmemb, size_t size)
{
	if (!nmemb || !size) {
		return NULL;
	}

	if (!g_nvme_hooks.get_rkey) {
		return calloc(nmemb, size);
	} else {
		return spdk_zmalloc(nmemb * size, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	}
}

static inline void
nvme_rdma_free(void *buf)
{
	if (!g_nvme_hooks.get_rkey) {
		free(buf);
	} else {
		spdk_free(buf);
	}
}

static int nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr,
		struct spdk_nvme_qpair *qpair);

static inline struct nvme_rdma_qpair *
nvme_rdma_qpair(struct spdk_nvme_qpair *qpair)
{
	assert(qpair->trtype == SPDK_NVME_TRANSPORT_RDMA);
	return SPDK_CONTAINEROF(qpair, struct nvme_rdma_qpair, qpair);
}

static inline struct nvme_rdma_poll_group *
nvme_rdma_poll_group(struct spdk_nvme_transport_poll_group *group)
{
	return (SPDK_CONTAINEROF(group, struct nvme_rdma_poll_group, group));
}

static inline struct nvme_rdma_ctrlr *
nvme_rdma_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
	assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA);
	return SPDK_CONTAINEROF(ctrlr, struct nvme_rdma_ctrlr, ctrlr);
}
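/*
 * Illustrative sketch (not part of this transport): g_nvme_hooks above is
 * normally all zeroes, so the helpers fall back to calloc()/free() and
 * rdma_reg_msgs() based registration.  An application that manages its own
 * protection domains and memory keys can install hooks before creating any
 * controller, for example:
 *
 *	struct spdk_nvme_rdma_hooks hooks = {
 *		.get_ibv_pd = my_get_pd,
 *		.get_rkey   = my_get_rkey,
 *		.put_rkey   = my_put_rkey,
 *	};
 *	spdk_nvme_rdma_init_hooks(&hooks);
 *
 * my_get_pd/my_get_rkey/my_put_rkey are hypothetical application callbacks,
 * shown only to indicate how the hook struct is wired up.
 */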
static struct spdk_nvme_rdma_req *
nvme_rdma_req_get(struct nvme_rdma_qpair *rqpair)
{
	struct spdk_nvme_rdma_req *rdma_req;

	rdma_req = TAILQ_FIRST(&rqpair->free_reqs);
	if (rdma_req) {
		TAILQ_REMOVE(&rqpair->free_reqs, rdma_req, link);
		TAILQ_INSERT_TAIL(&rqpair->outstanding_reqs, rdma_req, link);
	}

	return rdma_req;
}

static void
nvme_rdma_req_put(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req)
{
	rdma_req->completion_flags = 0;
	rdma_req->req = NULL;
	TAILQ_INSERT_HEAD(&rqpair->free_reqs, rdma_req, link);
}

static void
nvme_rdma_req_complete(struct spdk_nvme_rdma_req *rdma_req,
		       struct spdk_nvme_cpl *rsp)
{
	struct nvme_request *req = rdma_req->req;
	struct nvme_rdma_qpair *rqpair;

	assert(req != NULL);

	rqpair = nvme_rdma_qpair(req->qpair);
	TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link);

	nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp);
	nvme_free_request(req);
}

static const char *
nvme_rdma_cm_event_str_get(uint32_t event)
{
	if (event < SPDK_COUNTOF(rdma_cm_event_str)) {
		return rdma_cm_event_str[event];
	} else {
		return "Undefined";
	}
}


static int
nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair)
{
	struct rdma_cm_event *event = rqpair->evt;
	struct spdk_nvmf_rdma_accept_private_data *accept_data;
	int rc = 0;

	if (event) {
		switch (event->event) {
		case RDMA_CM_EVENT_ADDR_RESOLVED:
		case RDMA_CM_EVENT_ADDR_ERROR:
		case RDMA_CM_EVENT_ROUTE_RESOLVED:
		case RDMA_CM_EVENT_ROUTE_ERROR:
			break;
		case RDMA_CM_EVENT_CONNECT_REQUEST:
			break;
		case RDMA_CM_EVENT_CONNECT_ERROR:
			break;
		case RDMA_CM_EVENT_UNREACHABLE:
		case RDMA_CM_EVENT_REJECTED:
			break;
		case RDMA_CM_EVENT_CONNECT_RESPONSE:
			rc = spdk_rdma_qp_complete_connect(rqpair->rdma_qp);
		/* fall through */
		case RDMA_CM_EVENT_ESTABLISHED:
			accept_data = (struct spdk_nvmf_rdma_accept_private_data *)event->param.conn.private_data;
			if (accept_data == NULL) {
				rc = -1;
			} else {
				SPDK_DEBUGLOG(nvme, "Requested queue depth %d. Actually got queue depth %d.\n",
					      rqpair->num_entries, accept_data->crqsize);
				rqpair->num_entries = spdk_min(rqpair->num_entries, accept_data->crqsize);
			}
			break;
		case RDMA_CM_EVENT_DISCONNECTED:
			rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE;
			break;
		case RDMA_CM_EVENT_DEVICE_REMOVAL:
			rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
			break;
		case RDMA_CM_EVENT_MULTICAST_JOIN:
		case RDMA_CM_EVENT_MULTICAST_ERROR:
			break;
		case RDMA_CM_EVENT_ADDR_CHANGE:
			rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
			break;
		case RDMA_CM_EVENT_TIMEWAIT_EXIT:
			break;
		default:
			SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event);
			break;
		}
		rqpair->evt = NULL;
		rdma_ack_cm_event(event);
	}

	return rc;
}

/*
 * This function must be called under the nvme controller's lock
 * because it touches global controller variables. The lock is taken
 * by the generic transport code before invoking a few of the functions
 * in this file: nvme_rdma_ctrlr_connect_qpair, nvme_rdma_ctrlr_delete_io_qpair,
 * and conditionally nvme_rdma_qpair_process_completions when it is calling
 * completions on the admin qpair.
 * When adding a new call to this function, please
 * verify that it is in a situation where it falls under the lock.
 */
static int
nvme_rdma_poll_events(struct nvme_rdma_ctrlr *rctrlr)
{
	struct nvme_rdma_cm_event_entry *entry, *tmp;
	struct nvme_rdma_qpair *event_qpair;
	struct rdma_cm_event *event;
	struct rdma_event_channel *channel = rctrlr->cm_channel;

	STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) {
		event_qpair = nvme_rdma_qpair(entry->evt->id->context);
		if (event_qpair->evt == NULL) {
			event_qpair->evt = entry->evt;
			STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link);
			STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link);
		}
	}

	while (rdma_get_cm_event(channel, &event) == 0) {
		event_qpair = nvme_rdma_qpair(event->id->context);
		if (event_qpair->evt == NULL) {
			event_qpair->evt = event;
		} else {
			assert(rctrlr == nvme_rdma_ctrlr(event_qpair->qpair.ctrlr));
			entry = STAILQ_FIRST(&rctrlr->free_cm_events);
			if (entry == NULL) {
				rdma_ack_cm_event(event);
				return -ENOMEM;
			}
			STAILQ_REMOVE(&rctrlr->free_cm_events, entry, nvme_rdma_cm_event_entry, link);
			entry->evt = event;
			STAILQ_INSERT_TAIL(&rctrlr->pending_cm_events, entry, link);
		}
	}

	if (errno == EAGAIN || errno == EWOULDBLOCK) {
		return 0;
	} else {
		return errno;
	}
}

static int
nvme_rdma_validate_cm_event(enum rdma_cm_event_type expected_evt_type,
			    struct rdma_cm_event *reaped_evt)
{
	int rc = -EBADMSG;

	if (expected_evt_type == reaped_evt->event) {
		return 0;
	}

	switch (expected_evt_type) {
	case RDMA_CM_EVENT_ESTABLISHED:
		/*
		 * There is an enum ib_cm_rej_reason in the kernel headers that sets 10 as
		 * IB_CM_REJ_STALE_CONN. I can't find the corresponding userspace but we get
		 * the same values here.
		 */
		if (reaped_evt->event == RDMA_CM_EVENT_REJECTED && reaped_evt->status == 10) {
			rc = -ESTALE;
		} else if (reaped_evt->event == RDMA_CM_EVENT_CONNECT_RESPONSE) {
			/*
			 * If we are using a qpair which is not created using rdma cm API
			 * then we will receive RDMA_CM_EVENT_CONNECT_RESPONSE instead of
			 * RDMA_CM_EVENT_ESTABLISHED.
			 */
			return 0;
		}
		break;
	default:
		break;
	}

	SPDK_ERRLOG("Expected %s but received %s (%d) from CM event channel (status = %d)\n",
		    nvme_rdma_cm_event_str_get(expected_evt_type),
		    nvme_rdma_cm_event_str_get(reaped_evt->event), reaped_evt->event,
		    reaped_evt->status);
	return rc;
}

static int
nvme_rdma_process_event(struct nvme_rdma_qpair *rqpair,
			struct rdma_event_channel *channel,
			enum rdma_cm_event_type evt)
{
	struct nvme_rdma_ctrlr *rctrlr;
	uint64_t timeout_ticks;
	int rc = 0, rc2;

	if (rqpair->evt != NULL) {
		rc = nvme_rdma_qpair_process_cm_event(rqpair);
		if (rc) {
			return rc;
		}
	}

	timeout_ticks = (NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US * spdk_get_ticks_hz()) / SPDK_SEC_TO_USEC +
			spdk_get_ticks();
	rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr);
	assert(rctrlr != NULL);

	while (!rqpair->evt && spdk_get_ticks() < timeout_ticks && rc == 0) {
		rc = nvme_rdma_poll_events(rctrlr);
	}

	if (rc) {
		return rc;
	}

	if (rqpair->evt == NULL) {
		return -EADDRNOTAVAIL;
	}

	rc = nvme_rdma_validate_cm_event(evt, rqpair->evt);

	rc2 = nvme_rdma_qpair_process_cm_event(rqpair);
	/* bad message takes precedence over the other error codes from processing the event. */
	return rc == 0 ? rc2 : rc;
}

static int
nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair)
{
	int rc;
	struct spdk_rdma_qp_init_attr attr = {};
	struct ibv_device_attr dev_attr;
	struct nvme_rdma_ctrlr *rctrlr;

	rc = ibv_query_device(rqpair->cm_id->verbs, &dev_attr);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
		return -1;
	}

	if (rqpair->qpair.poll_group) {
		assert(!rqpair->cq);
		rc = nvme_poll_group_connect_qpair(&rqpair->qpair);
		if (rc) {
			SPDK_ERRLOG("Unable to activate the rdmaqpair.\n");
			return -1;
		}
		assert(rqpair->cq);
	} else {
		rqpair->cq = ibv_create_cq(rqpair->cm_id->verbs, rqpair->num_entries * 2, rqpair, NULL, 0);
		if (!rqpair->cq) {
			SPDK_ERRLOG("Unable to create completion queue: errno %d: %s\n", errno, spdk_strerror(errno));
			return -1;
		}
	}

	rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr);
	if (g_nvme_hooks.get_ibv_pd) {
		rctrlr->pd = g_nvme_hooks.get_ibv_pd(&rctrlr->ctrlr.trid, rqpair->cm_id->verbs);
	} else {
		rctrlr->pd = NULL;
	}

	attr.pd = rctrlr->pd;
	attr.send_cq = rqpair->cq;
	attr.recv_cq = rqpair->cq;
	attr.cap.max_send_wr = rqpair->num_entries; /* SEND operations */
	attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */
	attr.cap.max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, dev_attr.max_sge);
	attr.cap.max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, dev_attr.max_sge);

	rqpair->rdma_qp = spdk_rdma_qp_create(rqpair->cm_id, &attr);

	if (!rqpair->rdma_qp) {
		return -1;
	}

	/* ibv_create_qp will change the values in attr.cap. Make sure we store the proper value.
	 */
	rqpair->max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, attr.cap.max_send_sge);
	rqpair->max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, attr.cap.max_recv_sge);
	rqpair->current_num_recvs = 0;
	rqpair->current_num_sends = 0;

	rctrlr->pd = rqpair->rdma_qp->qp->pd;

	rqpair->cm_id->context = &rqpair->qpair;

	return 0;
}

static inline int
nvme_rdma_qpair_submit_sends(struct nvme_rdma_qpair *rqpair)
{
	struct ibv_send_wr *bad_send_wr = NULL;
	int rc;

	rc = spdk_rdma_qp_flush_send_wrs(rqpair->rdma_qp, &bad_send_wr);

	if (spdk_unlikely(rc)) {
		SPDK_ERRLOG("Failed to post WRs on send queue, errno %d (%s), bad_wr %p\n",
			    rc, spdk_strerror(rc), bad_send_wr);
		while (bad_send_wr != NULL) {
			assert(rqpair->current_num_sends > 0);
			rqpair->current_num_sends--;
			bad_send_wr = bad_send_wr->next;
		}
		return rc;
	}

	return 0;
}

static inline int
nvme_rdma_qpair_submit_recvs(struct nvme_rdma_qpair *rqpair)
{
	struct ibv_recv_wr *bad_recv_wr;
	int rc = 0;

	if (rqpair->recvs_to_post.first) {
		rc = ibv_post_recv(rqpair->rdma_qp->qp, rqpair->recvs_to_post.first, &bad_recv_wr);
		if (spdk_unlikely(rc)) {
			SPDK_ERRLOG("Failed to post WRs on receive queue, errno %d (%s), bad_wr %p\n",
				    rc, spdk_strerror(rc), bad_recv_wr);
			while (bad_recv_wr != NULL) {
				assert(rqpair->current_num_recvs > 0);
				rqpair->current_num_recvs--;
				bad_recv_wr = bad_recv_wr->next;
			}
		}

		rqpair->recvs_to_post.first = NULL;
	}
	return rc;
}

/* Append the given send wr structure to the qpair's outstanding sends list. */
/* This function accepts only a single wr. */
static inline int
nvme_rdma_qpair_queue_send_wr(struct nvme_rdma_qpair *rqpair, struct ibv_send_wr *wr)
{
	assert(wr->next == NULL);

	assert(rqpair->current_num_sends < rqpair->num_entries);

	rqpair->current_num_sends++;
	spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, wr);

	if (!rqpair->delay_cmd_submit) {
		return nvme_rdma_qpair_submit_sends(rqpair);
	}

	return 0;
}
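/*
 * Note on delay_cmd_submit (hedged summary, not authoritative): when it is
 * disabled, every WR queued above and below is flushed to the NIC immediately;
 * when it is enabled, WRs accumulate on the qpair and are flushed in batches
 * by the later submission/completion processing path, trading a little
 * per-command latency for fewer doorbell writes.
 */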
/* Append the given recv wr structure to the qpair's outstanding recvs list. */
/* This function accepts only a single wr. */
static inline int
nvme_rdma_qpair_queue_recv_wr(struct nvme_rdma_qpair *rqpair, struct ibv_recv_wr *wr)
{

	assert(wr->next == NULL);
	assert(rqpair->current_num_recvs < rqpair->num_entries);

	rqpair->current_num_recvs++;
	if (rqpair->recvs_to_post.first == NULL) {
		rqpair->recvs_to_post.first = wr;
	} else {
		rqpair->recvs_to_post.last->next = wr;
	}

	rqpair->recvs_to_post.last = wr;

	if (!rqpair->delay_cmd_submit) {
		return nvme_rdma_qpair_submit_recvs(rqpair);
	}

	return 0;
}

#define nvme_rdma_trace_ibv_sge(sg_list) \
	if (sg_list) { \
		SPDK_DEBUGLOG(nvme, "local addr %p length 0x%x lkey 0x%x\n", \
			      (void *)(sg_list)->addr, (sg_list)->length, (sg_list)->lkey); \
	}

static int
nvme_rdma_post_recv(struct nvme_rdma_qpair *rqpair, uint16_t rsp_idx)
{
	struct ibv_recv_wr *wr;

	wr = &rqpair->rsp_recv_wrs[rsp_idx];
	wr->next = NULL;
	nvme_rdma_trace_ibv_sge(wr->sg_list);
	return nvme_rdma_qpair_queue_recv_wr(rqpair, wr);
}

static int
nvme_rdma_reg_mr(struct rdma_cm_id *cm_id, union nvme_rdma_mr *mr, void *mem, size_t length)
{
	if (!g_nvme_hooks.get_rkey) {
		mr->mr = rdma_reg_msgs(cm_id, mem, length);
		if (mr->mr == NULL) {
			SPDK_ERRLOG("Unable to register mr: %s (%d)\n",
				    spdk_strerror(errno), errno);
			return -1;
		}
	} else {
		mr->key = g_nvme_hooks.get_rkey(cm_id->pd, mem, length);
	}

	return 0;
}

static void
nvme_rdma_dereg_mr(union nvme_rdma_mr *mr)
{
	if (!g_nvme_hooks.get_rkey) {
		if (mr->mr && rdma_dereg_mr(mr->mr)) {
			SPDK_ERRLOG("Unable to de-register mr\n");
		}
	} else {
		if (mr->key) {
			g_nvme_hooks.put_rkey(mr->key);
		}
	}
	memset(mr, 0, sizeof(*mr));
}

static uint32_t
nvme_rdma_mr_get_lkey(union nvme_rdma_mr *mr)
{
	uint32_t lkey;

	if (!g_nvme_hooks.get_rkey) {
		lkey = mr->mr->lkey;
	} else {
		lkey = *((uint64_t *) mr->key);
	}

	return lkey;
}

static void
nvme_rdma_unregister_rsps(struct nvme_rdma_qpair *rqpair)
{
	nvme_rdma_dereg_mr(&rqpair->rsp_mr);
}

static void
nvme_rdma_free_rsps(struct nvme_rdma_qpair *rqpair)
{
	nvme_rdma_free(rqpair->rsps);
	rqpair->rsps = NULL;
	nvme_rdma_free(rqpair->rsp_sgls);
	rqpair->rsp_sgls = NULL;
	nvme_rdma_free(rqpair->rsp_recv_wrs);
	rqpair->rsp_recv_wrs = NULL;
}

static int
nvme_rdma_alloc_rsps(struct nvme_rdma_qpair *rqpair)
{
	rqpair->rsps = NULL;
	rqpair->rsp_recv_wrs = NULL;

	rqpair->rsp_sgls = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsp_sgls));
	if (!rqpair->rsp_sgls) {
		SPDK_ERRLOG("Failed to allocate rsp_sgls\n");
		goto fail;
	}

	rqpair->rsp_recv_wrs = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsp_recv_wrs));
	if (!rqpair->rsp_recv_wrs) {
		SPDK_ERRLOG("Failed to allocate rsp_recv_wrs\n");
		goto fail;
	}

	rqpair->rsps = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsps));
	if (!rqpair->rsps) {
		SPDK_ERRLOG("can not allocate rdma rsps\n");
		goto fail;
	}

	return 0;
fail:
	nvme_rdma_free_rsps(rqpair);
	return -ENOMEM;
}

static int
nvme_rdma_register_rsps(struct nvme_rdma_qpair *rqpair)
{
	uint16_t i;
	int rc;
	uint32_t lkey;

	rc = nvme_rdma_reg_mr(rqpair->cm_id, &rqpair->rsp_mr,
			      rqpair->rsps, rqpair->num_entries *
			      sizeof(*rqpair->rsps));

	if (rc < 0) {
		goto fail;
	}

	lkey = nvme_rdma_mr_get_lkey(&rqpair->rsp_mr);

	for (i = 0; i < rqpair->num_entries; i++) {
		struct ibv_sge *rsp_sgl = &rqpair->rsp_sgls[i];
		struct spdk_nvme_rdma_rsp *rsp = &rqpair->rsps[i];

		rsp->rqpair = rqpair;
		rsp->rdma_wr.type = RDMA_WR_TYPE_RECV;
		rsp->idx = i;
		rsp_sgl->addr = (uint64_t)&rqpair->rsps[i];
		rsp_sgl->length = sizeof(struct spdk_nvme_cpl);
		rsp_sgl->lkey = lkey;

		rqpair->rsp_recv_wrs[i].wr_id = (uint64_t)&rsp->rdma_wr;
		rqpair->rsp_recv_wrs[i].next = NULL;
		rqpair->rsp_recv_wrs[i].sg_list = rsp_sgl;
		rqpair->rsp_recv_wrs[i].num_sge = 1;

		rc = nvme_rdma_post_recv(rqpair, i);
		if (rc) {
			goto fail;
		}
	}

	rc = nvme_rdma_qpair_submit_recvs(rqpair);
	if (rc) {
		goto fail;
	}

	return 0;

fail:
	nvme_rdma_unregister_rsps(rqpair);
	return rc;
}

static void
nvme_rdma_unregister_reqs(struct nvme_rdma_qpair *rqpair)
{
	nvme_rdma_dereg_mr(&rqpair->cmd_mr);
}

static void
nvme_rdma_free_reqs(struct nvme_rdma_qpair *rqpair)
{
	if (!rqpair->rdma_reqs) {
		return;
	}

	nvme_rdma_free(rqpair->cmds);
	rqpair->cmds = NULL;

	nvme_rdma_free(rqpair->rdma_reqs);
	rqpair->rdma_reqs = NULL;
}

static int
nvme_rdma_alloc_reqs(struct nvme_rdma_qpair *rqpair)
{
	uint16_t i;

	rqpair->rdma_reqs = nvme_rdma_calloc(rqpair->num_entries, sizeof(struct spdk_nvme_rdma_req));
	if (rqpair->rdma_reqs == NULL) {
		SPDK_ERRLOG("Failed to allocate rdma_reqs\n");
		goto fail;
	}

	rqpair->cmds = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->cmds));
	if (!rqpair->cmds) {
		SPDK_ERRLOG("Failed to allocate RDMA cmds\n");
		goto fail;
	}


	TAILQ_INIT(&rqpair->free_reqs);
	TAILQ_INIT(&rqpair->outstanding_reqs);
	for (i = 0; i < rqpair->num_entries; i++) {
		struct spdk_nvme_rdma_req *rdma_req;
		struct spdk_nvmf_cmd *cmd;

		rdma_req = &rqpair->rdma_reqs[i];
		rdma_req->rdma_wr.type = RDMA_WR_TYPE_SEND;
		cmd = &rqpair->cmds[i];

		rdma_req->id = i;

		/* The first RDMA sgl element will always point
		 * at this data structure. Depending on whether
		 * an NVMe-oF SGL is required, the length of
		 * this element may change.
		 */
		rdma_req->send_sgl[0].addr = (uint64_t)cmd;
		rdma_req->send_wr.wr_id = (uint64_t)&rdma_req->rdma_wr;
		rdma_req->send_wr.next = NULL;
		rdma_req->send_wr.opcode = IBV_WR_SEND;
		rdma_req->send_wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->send_wr.sg_list = rdma_req->send_sgl;
		rdma_req->send_wr.imm_data = 0;

		TAILQ_INSERT_TAIL(&rqpair->free_reqs, rdma_req, link);
	}

	return 0;
fail:
	nvme_rdma_free_reqs(rqpair);
	return -ENOMEM;
}

static int
nvme_rdma_register_reqs(struct nvme_rdma_qpair *rqpair)
{
	int i;
	int rc;
	uint32_t lkey;

	rc = nvme_rdma_reg_mr(rqpair->cm_id, &rqpair->cmd_mr,
			      rqpair->cmds, rqpair->num_entries * sizeof(*rqpair->cmds));

	if (rc < 0) {
		goto fail;
	}

	lkey = nvme_rdma_mr_get_lkey(&rqpair->cmd_mr);

	for (i = 0; i < rqpair->num_entries; i++) {
		rqpair->rdma_reqs[i].send_sgl[0].lkey = lkey;
	}

	return 0;

fail:
	nvme_rdma_unregister_reqs(rqpair);
	return -ENOMEM;
}

static int
nvme_rdma_resolve_addr(struct nvme_rdma_qpair *rqpair,
		       struct sockaddr *src_addr,
		       struct sockaddr *dst_addr,
		       struct rdma_event_channel *cm_channel)
{
	int ret;

	ret = rdma_resolve_addr(rqpair->cm_id, src_addr, dst_addr,
				NVME_RDMA_TIME_OUT_IN_MS);
	if (ret) {
		SPDK_ERRLOG("rdma_resolve_addr, %d\n", errno);
		return ret;
	}

	ret = nvme_rdma_process_event(rqpair, cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED);
	if (ret) {
		SPDK_ERRLOG("RDMA address resolution error\n");
		return -1;
	}

	if (rqpair->qpair.ctrlr->opts.transport_ack_timeout != SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED) {
#ifdef SPDK_CONFIG_RDMA_SET_ACK_TIMEOUT
		uint8_t timeout = rqpair->qpair.ctrlr->opts.transport_ack_timeout;
		ret = rdma_set_option(rqpair->cm_id, RDMA_OPTION_ID,
				      RDMA_OPTION_ID_ACK_TIMEOUT,
				      &timeout, sizeof(timeout));
		if (ret) {
			SPDK_NOTICELOG("Can't apply RDMA_OPTION_ID_ACK_TIMEOUT %d, ret %d\n", timeout, ret);
		}
#else
		SPDK_DEBUGLOG(nvme, "transport_ack_timeout is not supported\n");
#endif
	}


	ret = rdma_resolve_route(rqpair->cm_id, NVME_RDMA_TIME_OUT_IN_MS);
	if (ret) {
		SPDK_ERRLOG("rdma_resolve_route\n");
		return ret;
	}

	ret = nvme_rdma_process_event(rqpair, cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED);
	if (ret) {
		SPDK_ERRLOG("RDMA route resolution error\n");
		return -1;
	}

	return 0;
}

static int
nvme_rdma_connect(struct nvme_rdma_qpair *rqpair)
{
	struct rdma_conn_param param = {};
	struct spdk_nvmf_rdma_request_private_data request_data = {};
	struct ibv_device_attr attr;
	int ret;
	struct spdk_nvme_ctrlr *ctrlr;
	struct nvme_rdma_ctrlr *rctrlr;

	ret = ibv_query_device(rqpair->cm_id->verbs, &attr);
	if (ret != 0) {
		SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
		return ret;
	}

	param.responder_resources = spdk_min(rqpair->num_entries, attr.max_qp_rd_atom);

	ctrlr = rqpair->qpair.ctrlr;
	if (!ctrlr) {
		return -1;
	}
	rctrlr = nvme_rdma_ctrlr(ctrlr);
	assert(rctrlr != NULL);

	request_data.qid = rqpair->qpair.id;
	request_data.hrqsize = rqpair->num_entries;
	request_data.hsqsize = rqpair->num_entries - 1;
	request_data.cntlid = ctrlr->cntlid;

	param.private_data = &request_data;
	param.private_data_len =
		sizeof(request_data);
	param.retry_count = ctrlr->opts.transport_retry_count;
	param.rnr_retry_count = 7;

	/* Fields below are ignored by rdma cm if qpair has been
	 * created using rdma cm API. */
	param.srq = 0;
	param.qp_num = rqpair->rdma_qp->qp->qp_num;

	ret = rdma_connect(rqpair->cm_id, &param);
	if (ret) {
		SPDK_ERRLOG("nvme rdma connect error\n");
		return ret;
	}

	ret = nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_ESTABLISHED);
	if (ret == -ESTALE) {
		SPDK_NOTICELOG("Received a stale connection notice during connection.\n");
		return -EAGAIN;
	} else if (ret) {
		SPDK_ERRLOG("RDMA connect error %d\n", ret);
		return ret;
	} else {
		return 0;
	}
}

static int
nvme_rdma_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service)
{
	struct addrinfo *res;
	struct addrinfo hints;
	int ret;

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = family;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_protocol = 0;

	ret = getaddrinfo(addr, service, &hints, &res);
	if (ret) {
		SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret);
		return ret;
	}

	if (res->ai_addrlen > sizeof(*sa)) {
		SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen);
		ret = EINVAL;
	} else {
		memcpy(sa, res->ai_addr, res->ai_addrlen);
	}

	freeaddrinfo(res);
	return ret;
}

static int
_nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	struct sockaddr_storage dst_addr;
	struct sockaddr_storage src_addr;
	bool src_addr_specified;
	int rc;
	struct nvme_rdma_ctrlr *rctrlr;
	struct nvme_rdma_qpair *rqpair;
	int family;

	rqpair = nvme_rdma_qpair(qpair);
	rctrlr = nvme_rdma_ctrlr(ctrlr);
	assert(rctrlr != NULL);

	switch (ctrlr->trid.adrfam) {
	case SPDK_NVMF_ADRFAM_IPV4:
		family = AF_INET;
		break;
	case SPDK_NVMF_ADRFAM_IPV6:
		family = AF_INET6;
		break;
	default:
		SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam);
		return -1;
	}

	SPDK_DEBUGLOG(nvme, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family);

	memset(&dst_addr, 0, sizeof(dst_addr));

	SPDK_DEBUGLOG(nvme, "trsvcid is %s\n", ctrlr->trid.trsvcid);
	rc = nvme_rdma_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid);
	if (rc != 0) {
		SPDK_ERRLOG("dst_addr nvme_rdma_parse_addr() failed\n");
		return -1;
	}

	if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) {
		memset(&src_addr, 0, sizeof(src_addr));
		rc = nvme_rdma_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid);
		if (rc != 0) {
			SPDK_ERRLOG("src_addr nvme_rdma_parse_addr() failed\n");
			return -1;
		}
		src_addr_specified = true;
	} else {
		src_addr_specified = false;
	}

	rc = rdma_create_id(rctrlr->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP);
	if (rc < 0) {
		SPDK_ERRLOG("rdma_create_id() failed\n");
		return -1;
	}

	rc = nvme_rdma_resolve_addr(rqpair,
				    src_addr_specified ?
				    (struct sockaddr *)&src_addr : NULL,
				    (struct sockaddr *)&dst_addr, rctrlr->cm_channel);
	if (rc < 0) {
		SPDK_ERRLOG("nvme_rdma_resolve_addr() failed\n");
		return -1;
	}

	rc = nvme_rdma_qpair_init(rqpair);
	if (rc < 0) {
		SPDK_ERRLOG("nvme_rdma_qpair_init() failed\n");
		return -1;
	}

	rc = nvme_rdma_connect(rqpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect the rqpair\n");
		return rc;
	}

	rc = nvme_rdma_register_reqs(rqpair);
	SPDK_DEBUGLOG(nvme, "rc =%d\n", rc);
	if (rc) {
		SPDK_ERRLOG("Unable to register rqpair RDMA requests\n");
		return -1;
	}
	SPDK_DEBUGLOG(nvme, "RDMA requests registered\n");

	rc = nvme_rdma_register_rsps(rqpair);
	SPDK_DEBUGLOG(nvme, "rc =%d\n", rc);
	if (rc < 0) {
		SPDK_ERRLOG("Unable to register rqpair RDMA responses\n");
		return -1;
	}
	SPDK_DEBUGLOG(nvme, "RDMA responses registered\n");

	rqpair->mr_map = spdk_rdma_create_mem_map(rqpair->rdma_qp->qp->pd, &g_nvme_hooks);
	if (!rqpair->mr_map) {
		SPDK_ERRLOG("Unable to register RDMA memory translation map\n");
		return -1;
	}

	rc = nvme_fabric_qpair_connect(&rqpair->qpair, rqpair->num_entries);
	if (rc < 0) {
		rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN;
		SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n");
		return rc;
	}

	return 0;
}

static int
nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	int rc;
	int retry_count = 0;

	rc = _nvme_rdma_ctrlr_connect_qpair(ctrlr, qpair);

	/*
	 * -EAGAIN represents the special case where the target side still thought it was connected.
	 * Most NICs will fail the first connection attempt, and the NICs will clean up whatever
	 * state they need to. After that, subsequent connection attempts will succeed.
	 */
	if (rc == -EAGAIN) {
		SPDK_NOTICELOG("Detected stale connection on Target side for qpid: %d\n", qpair->id);
		do {
			nvme_delay(NVME_RDMA_STALE_CONN_RETRY_DELAY_US);
			nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
			rc = _nvme_rdma_ctrlr_connect_qpair(ctrlr, qpair);
			retry_count++;
		} while (rc == -EAGAIN && retry_count < NVME_RDMA_STALE_CONN_RETRY_MAX);
	}

	return rc;
}

/*
 * Build SGL describing empty payload.
 */
static int
nvme_rdma_build_null_request(struct spdk_nvme_rdma_req *rdma_req)
{
	struct nvme_request *req = rdma_req->req;

	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;

	/* The first element of this SGL is pointing at an
	 * spdk_nvmf_cmd object. For this particular command,
	 * we only need the first 64 bytes corresponding to
	 * the NVMe command. */
	rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);

	/* The RDMA SGL needs one element describing the NVMe command. */
	rdma_req->send_wr.num_sge = 1;

	req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
	req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
	req->cmd.dptr.sgl1.keyed.length = 0;
	req->cmd.dptr.sgl1.keyed.key = 0;
	req->cmd.dptr.sgl1.address = 0;

	return 0;
}
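/*
 * For reference, the builders below fill the command's dptr.sgl1 in one of
 * two ways (a summary derived from the code in this file, not a spec quote):
 *
 *   keyed data block - address = payload virtual address, key = rkey,
 *                      length = payload size; the target moves the data with
 *                      RDMA READ/WRITE.
 *   unkeyed offset   - length = payload size, address = 0; the payload is
 *                      carried in-capsule right after the 64-byte command
 *                      (only used when icdoff == 0).
 */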
/*
 * Build inline SGL describing contiguous payload buffer.
 */
static int
nvme_rdma_build_contig_inline_request(struct nvme_rdma_qpair *rqpair,
				      struct spdk_nvme_rdma_req *rdma_req)
{
	struct nvme_request *req = rdma_req->req;
	struct spdk_rdma_memory_translation mem_translation;
	void *payload;
	int rc;

	payload = req->payload.contig_or_cb_arg + req->payload_offset;
	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);

	rc = spdk_rdma_get_translation(rqpair->mr_map, payload, req->payload_size, &mem_translation);
	if (spdk_unlikely(rc)) {
		SPDK_ERRLOG("Memory translation failed, rc %d\n", rc);
		return -1;
	}

	rdma_req->send_sgl[1].lkey = spdk_rdma_memory_translation_get_lkey(&mem_translation);

	/* The first element of this SGL is pointing at an
	 * spdk_nvmf_cmd object. For this particular command,
	 * we only need the first 64 bytes corresponding to
	 * the NVMe command. */
	rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);

	rdma_req->send_sgl[1].addr = (uint64_t)payload;
	rdma_req->send_sgl[1].length = (uint32_t)req->payload_size;

	/* The RDMA SGL contains two elements. The first describes
	 * the NVMe command and the second describes the data
	 * payload. */
	rdma_req->send_wr.num_sge = 2;

	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
	req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
	req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size;
	/* Inline only supported for icdoff == 0 currently. This function will
	 * not get called for controllers with other values. */
	req->cmd.dptr.sgl1.address = (uint64_t)0;

	return 0;
}

/*
 * Build SGL describing contiguous payload buffer.
 */
static int
nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair,
			       struct spdk_nvme_rdma_req *rdma_req)
{
	struct nvme_request *req = rdma_req->req;
	void *payload = req->payload.contig_or_cb_arg + req->payload_offset;
	struct spdk_rdma_memory_translation mem_translation;
	int rc;

	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);

	if (spdk_unlikely(req->payload_size > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) {
		SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n",
			    req->payload_size, NVME_RDMA_MAX_KEYED_SGL_LENGTH);
		return -1;
	}

	rc = spdk_rdma_get_translation(rqpair->mr_map, payload, req->payload_size, &mem_translation);
	if (spdk_unlikely(rc)) {
		SPDK_ERRLOG("Memory translation failed, rc %d\n", rc);
		return -1;
	}

	req->cmd.dptr.sgl1.keyed.key = spdk_rdma_memory_translation_get_rkey(&mem_translation);

	/* The first element of this SGL is pointing at an
	 * spdk_nvmf_cmd object. For this particular command,
	 * we only need the first 64 bytes corresponding to
	 * the NVMe command. */
	rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);

	/* The RDMA SGL needs one element describing the NVMe command.
	 */
	rdma_req->send_wr.num_sge = 1;

	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
	req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
	req->cmd.dptr.sgl1.keyed.length = req->payload_size;
	req->cmd.dptr.sgl1.address = (uint64_t)payload;

	return 0;
}

/*
 * Build SGL describing scattered payload buffer.
 */
static int
nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair,
			    struct spdk_nvme_rdma_req *rdma_req)
{
	struct nvme_request *req = rdma_req->req;
	struct spdk_nvmf_cmd *cmd = &rqpair->cmds[rdma_req->id];
	struct spdk_rdma_memory_translation mem_translation;
	void *virt_addr;
	uint32_t remaining_size;
	uint32_t sge_length;
	int rc, max_num_sgl, num_sgl_desc;

	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	assert(req->payload.next_sge_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	max_num_sgl = req->qpair->ctrlr->max_sges;

	remaining_size = req->payload_size;
	num_sgl_desc = 0;
	do {
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &sge_length);
		if (rc) {
			return -1;
		}

		sge_length = spdk_min(remaining_size, sge_length);

		if (spdk_unlikely(sge_length > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) {
			SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n",
				    sge_length, NVME_RDMA_MAX_KEYED_SGL_LENGTH);
			return -1;
		}
		rc = spdk_rdma_get_translation(rqpair->mr_map, virt_addr, sge_length, &mem_translation);
		if (spdk_unlikely(rc)) {
			SPDK_ERRLOG("Memory translation failed, rc %d\n", rc);
			return -1;
		}

		cmd->sgl[num_sgl_desc].keyed.key = spdk_rdma_memory_translation_get_rkey(&mem_translation);
		cmd->sgl[num_sgl_desc].keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
		cmd->sgl[num_sgl_desc].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
		cmd->sgl[num_sgl_desc].keyed.length = sge_length;
		cmd->sgl[num_sgl_desc].address = (uint64_t)virt_addr;

		remaining_size -= sge_length;
		num_sgl_desc++;
	} while (remaining_size > 0 && num_sgl_desc < max_num_sgl);


	/* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */
	if (remaining_size > 0) {
		return -1;
	}

	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;

	/* The RDMA SGL needs one element describing some portion
	 * of the spdk_nvmf_cmd structure. */
	rdma_req->send_wr.num_sge = 1;

	/*
	 * If only one SGL descriptor is required, it can be embedded directly in the command
	 * as a data block descriptor.
	 */
	if (num_sgl_desc == 1) {
		/* The first element of this SGL is pointing at an
		 * spdk_nvmf_cmd object. For this particular command,
		 * we only need the first 64 bytes corresponding to
		 * the NVMe command.
		 */
		rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);

		req->cmd.dptr.sgl1.keyed.type = cmd->sgl[0].keyed.type;
		req->cmd.dptr.sgl1.keyed.subtype = cmd->sgl[0].keyed.subtype;
		req->cmd.dptr.sgl1.keyed.length = cmd->sgl[0].keyed.length;
		req->cmd.dptr.sgl1.keyed.key = cmd->sgl[0].keyed.key;
		req->cmd.dptr.sgl1.address = cmd->sgl[0].address;
	} else {
		/*
		 * Otherwise, The SGL descriptor embedded in the command must point to the list of
		 * SGL descriptors used to describe the operation. In that case it is a last segment descriptor.
		 */
		uint32_t descriptors_size = sizeof(struct spdk_nvme_sgl_descriptor) * num_sgl_desc;

		if (spdk_unlikely(descriptors_size > rqpair->qpair.ctrlr->ioccsz_bytes)) {
			SPDK_ERRLOG("Size of SGL descriptors (%u) exceeds ICD (%u)\n",
				    descriptors_size, rqpair->qpair.ctrlr->ioccsz_bytes);
			return -1;
		}
		rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + descriptors_size;

		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
		req->cmd.dptr.sgl1.unkeyed.length = descriptors_size;
		req->cmd.dptr.sgl1.address = (uint64_t)0;
	}

	return 0;
}

/*
 * Build inline SGL describing sgl payload buffer.
 */
static int
nvme_rdma_build_sgl_inline_request(struct nvme_rdma_qpair *rqpair,
				   struct spdk_nvme_rdma_req *rdma_req)
{
	struct nvme_request *req = rdma_req->req;
	struct spdk_rdma_memory_translation mem_translation;
	uint32_t length;
	void *virt_addr;
	int rc;

	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	assert(req->payload.next_sge_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
	if (rc) {
		return -1;
	}

	if (length < req->payload_size) {
		SPDK_DEBUGLOG(nvme, "Inline SGL request split so sending separately.\n");
		return nvme_rdma_build_sgl_request(rqpair, rdma_req);
	}

	if (length > req->payload_size) {
		length = req->payload_size;
	}

	rc = spdk_rdma_get_translation(rqpair->mr_map, virt_addr, length, &mem_translation);
	if (spdk_unlikely(rc)) {
		SPDK_ERRLOG("Memory translation failed, rc %d\n", rc);
		return -1;
	}

	rdma_req->send_sgl[1].addr = (uint64_t)virt_addr;
	rdma_req->send_sgl[1].length = length;
	rdma_req->send_sgl[1].lkey = spdk_rdma_memory_translation_get_lkey(&mem_translation);

	rdma_req->send_wr.num_sge = 2;

	/* The first element of this SGL is pointing at an
	 * spdk_nvmf_cmd object. For this particular command,
	 * we only need the first 64 bytes corresponding to
	 * the NVMe command. */
	rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);

	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
	req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
	req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size;
	/* Inline only supported for icdoff == 0 currently. This function will
	 * not get called for controllers with other values. */
	req->cmd.dptr.sgl1.address = (uint64_t)0;

	return 0;
}
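/*
 * Quick reference (derived from the code in nvme_rdma_req_init() below):
 *
 *   payload_size == 0              -> nvme_rdma_build_null_request()
 *   CONTIG payload, in-capsule OK  -> nvme_rdma_build_contig_inline_request()
 *   CONTIG payload, otherwise      -> nvme_rdma_build_contig_request()
 *   SGL payload, in-capsule OK     -> nvme_rdma_build_sgl_inline_request()
 *   SGL payload, otherwise         -> nvme_rdma_build_sgl_request()
 *
 * "in-capsule OK" means a host-to-controller transfer that fits within
 * ioccsz_bytes on a controller reporting icdoff == 0.
 */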
static int
nvme_rdma_req_init(struct nvme_rdma_qpair *rqpair, struct nvme_request *req,
		   struct spdk_nvme_rdma_req *rdma_req)
{
	struct spdk_nvme_ctrlr *ctrlr = rqpair->qpair.ctrlr;
	enum nvme_payload_type payload_type;
	bool icd_supported;
	int rc;

	assert(rdma_req->req == NULL);
	rdma_req->req = req;
	req->cmd.cid = rdma_req->id;
	payload_type = nvme_payload_type(&req->payload);
	/*
	 * Check if icdoff is non zero, to avoid interop conflicts with
	 * targets with non-zero icdoff. Both SPDK and the Linux kernel
	 * targets use icdoff = 0. For targets with non-zero icdoff, we
	 * will currently just not use inline data for now.
	 */
	icd_supported = spdk_nvme_opc_get_data_transfer(req->cmd.opc) == SPDK_NVME_DATA_HOST_TO_CONTROLLER
			&& req->payload_size <= ctrlr->ioccsz_bytes && ctrlr->icdoff == 0;

	if (req->payload_size == 0) {
		rc = nvme_rdma_build_null_request(rdma_req);
	} else if (payload_type == NVME_PAYLOAD_TYPE_CONTIG) {
		if (icd_supported) {
			rc = nvme_rdma_build_contig_inline_request(rqpair, rdma_req);
		} else {
			rc = nvme_rdma_build_contig_request(rqpair, rdma_req);
		}
	} else if (payload_type == NVME_PAYLOAD_TYPE_SGL) {
		if (icd_supported) {
			rc = nvme_rdma_build_sgl_inline_request(rqpair, rdma_req);
		} else {
			rc = nvme_rdma_build_sgl_request(rqpair, rdma_req);
		}
	} else {
		rc = -1;
	}

	if (rc) {
		rdma_req->req = NULL;
		return rc;
	}

	memcpy(&rqpair->cmds[rdma_req->id], &req->cmd, sizeof(req->cmd));
	return 0;
}

static struct spdk_nvme_qpair *
nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr,
			     uint16_t qid, uint32_t qsize,
			     enum spdk_nvme_qprio qprio,
			     uint32_t num_requests,
			     bool delay_cmd_submit)
{
	struct nvme_rdma_qpair *rqpair;
	struct spdk_nvme_qpair *qpair;
	int rc;

	rqpair = nvme_rdma_calloc(1, sizeof(struct nvme_rdma_qpair));
	if (!rqpair) {
		SPDK_ERRLOG("failed to allocate rqpair\n");
		return NULL;
	}

	rqpair->num_entries = qsize;
	rqpair->delay_cmd_submit = delay_cmd_submit;
	qpair = &rqpair->qpair;
	rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests);
	if (rc != 0) {
		return NULL;
	}

	rc = nvme_rdma_alloc_reqs(rqpair);
	SPDK_DEBUGLOG(nvme, "rc =%d\n", rc);
	if (rc) {
		SPDK_ERRLOG("Unable to allocate rqpair RDMA requests\n");
		nvme_rdma_free(rqpair);
		return NULL;
	}
	SPDK_DEBUGLOG(nvme, "RDMA requests allocated\n");

	rc = nvme_rdma_alloc_rsps(rqpair);
	SPDK_DEBUGLOG(nvme, "rc =%d\n", rc);
	if (rc < 0) {
		SPDK_ERRLOG("Unable to allocate rqpair RDMA responses\n");
		nvme_rdma_free_reqs(rqpair);
		nvme_rdma_free(rqpair);
		return NULL;
	}
	SPDK_DEBUGLOG(nvme, "RDMA responses allocated\n");

	return qpair;
}

static void
nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
	struct nvme_rdma_ctrlr *rctrlr = NULL;
	struct nvme_rdma_cm_event_entry *entry, *tmp;

	spdk_rdma_free_mem_map(&rqpair->mr_map);
	nvme_rdma_unregister_reqs(rqpair);
	nvme_rdma_unregister_rsps(rqpair);

	if (rqpair->evt) {
		rdma_ack_cm_event(rqpair->evt);
		rqpair->evt = NULL;
	}

	/*
	 * This works because we have the controller lock both in
	 * this function and in the function where we add new events.
	 */
	if (qpair->ctrlr != NULL) {
		rctrlr = nvme_rdma_ctrlr(qpair->ctrlr);
		STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) {
			if (nvme_rdma_qpair(entry->evt->id->context) == rqpair) {
				STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link);
				rdma_ack_cm_event(entry->evt);
				STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link);
			}
		}
	}

	if (rqpair->cm_id) {
		if (rqpair->rdma_qp) {
			spdk_rdma_qp_disconnect(rqpair->rdma_qp);
			if (rctrlr != NULL) {
				if (nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_DISCONNECTED)) {
					SPDK_DEBUGLOG(nvme, "Target did not respond to qpair disconnect.\n");
				}
			}
			spdk_rdma_qp_destroy(rqpair->rdma_qp);
			rqpair->rdma_qp = NULL;
		}

		rdma_destroy_id(rqpair->cm_id);
		rqpair->cm_id = NULL;
	}

	if (rqpair->cq) {
		ibv_destroy_cq(rqpair->cq);
		rqpair->cq = NULL;
	}
}

static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);

static int
nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	struct nvme_rdma_qpair *rqpair;

	rqpair = nvme_rdma_qpair(qpair);
	nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
	if (rqpair->defer_deletion_to_pg) {
		nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING);
		return 0;
	}

	nvme_rdma_qpair_abort_reqs(qpair, 1);
	nvme_qpair_deinit(qpair);

	nvme_rdma_free_reqs(rqpair);
	nvme_rdma_free_rsps(rqpair);
	nvme_rdma_free(rqpair);

	return 0;
}

static struct spdk_nvme_qpair *
nvme_rdma_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
				const struct spdk_nvme_io_qpair_opts *opts)
{
	return nvme_rdma_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio,
					    opts->io_queue_requests,
					    opts->delay_cmd_submit);
}

static int
nvme_rdma_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
{
	/* do nothing here */
	return 0;
}

static int nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr);

static struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
		const struct spdk_nvme_ctrlr_opts *opts,
		void *devhandle)
{
	struct nvme_rdma_ctrlr *rctrlr;
	union spdk_nvme_cap_register cap;
	union spdk_nvme_vs_register vs;
	struct ibv_context **contexts;
	struct ibv_device_attr dev_attr;
	int i, flag, rc;

	rctrlr = nvme_rdma_calloc(1, sizeof(struct nvme_rdma_ctrlr));
	if (rctrlr == NULL) {
		SPDK_ERRLOG("could not allocate ctrlr\n");
		return NULL;
	}

	rctrlr->ctrlr.opts = *opts;
	rctrlr->ctrlr.trid = *trid;

	if (opts->transport_retry_count > NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT) {
		SPDK_NOTICELOG("transport_retry_count exceeds max value %d, use max value\n",
			       NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT);
		rctrlr->ctrlr.opts.transport_retry_count = NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT;
	}

	if (opts->transport_ack_timeout > NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT) {
		SPDK_NOTICELOG("transport_ack_timeout exceeds max value %d, use max value\n",
			       NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT);
		rctrlr->ctrlr.opts.transport_ack_timeout = NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT;
	}

	contexts = rdma_get_devices(NULL);
	if (contexts == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		nvme_rdma_free(rctrlr);
		return NULL;
	}

	i = 0;
	rctrlr->max_sge = NVME_RDMA_MAX_SGL_DESCRIPTORS;

	while (contexts[i] != NULL) {
		rc = ibv_query_device(contexts[i], &dev_attr);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
			rdma_free_devices(contexts);
			nvme_rdma_free(rctrlr);
			return NULL;
		}
		rctrlr->max_sge = spdk_min(rctrlr->max_sge, (uint16_t)dev_attr.max_sge);
		i++;
	}

	rdma_free_devices(contexts);

	rc = nvme_ctrlr_construct(&rctrlr->ctrlr);
	if (rc != 0) {
		nvme_rdma_free(rctrlr);
		return NULL;
	}

	STAILQ_INIT(&rctrlr->pending_cm_events);
	STAILQ_INIT(&rctrlr->free_cm_events);
	rctrlr->cm_events = nvme_rdma_calloc(NVME_RDMA_NUM_CM_EVENTS, sizeof(*rctrlr->cm_events));
	if (rctrlr->cm_events == NULL) {
		SPDK_ERRLOG("unable to allocate buffers to hold CM events.\n");
		goto destruct_ctrlr;
	}

	for (i = 0; i < NVME_RDMA_NUM_CM_EVENTS; i++) {
		STAILQ_INSERT_TAIL(&rctrlr->free_cm_events, &rctrlr->cm_events[i], link);
	}

	rctrlr->cm_channel = rdma_create_event_channel();
	if (rctrlr->cm_channel == NULL) {
		SPDK_ERRLOG("rdma_create_event_channel() failed\n");
		goto destruct_ctrlr;
	}

	flag = fcntl(rctrlr->cm_channel->fd, F_GETFL);
	if (fcntl(rctrlr->cm_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
		SPDK_ERRLOG("Cannot set event channel to non blocking\n");
		goto destruct_ctrlr;
	}

	rctrlr->ctrlr.adminq = nvme_rdma_ctrlr_create_qpair(&rctrlr->ctrlr, 0,
			       rctrlr->ctrlr.opts.admin_queue_size, 0,
			       rctrlr->ctrlr.opts.admin_queue_size, false);
	if (!rctrlr->ctrlr.adminq) {
		SPDK_ERRLOG("failed to create admin qpair\n");
		goto destruct_ctrlr;
	}

	rc = nvme_transport_ctrlr_connect_qpair(&rctrlr->ctrlr, rctrlr->ctrlr.adminq);
	if (rc < 0) {
		SPDK_ERRLOG("failed to connect admin qpair\n");
		goto destruct_ctrlr;
	}

	if (nvme_ctrlr_get_cap(&rctrlr->ctrlr, &cap)) {
		SPDK_ERRLOG("get_cap() failed\n");
		goto destruct_ctrlr;
	}

	if (nvme_ctrlr_get_vs(&rctrlr->ctrlr, &vs)) {
		SPDK_ERRLOG("get_vs() failed\n");
		goto destruct_ctrlr;
	}

	if (nvme_ctrlr_add_process(&rctrlr->ctrlr, 0) != 0) {
		SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n");
		goto destruct_ctrlr;
	}

	nvme_ctrlr_init_cap(&rctrlr->ctrlr, &cap, &vs);

	SPDK_DEBUGLOG(nvme, "successfully initialized the nvmf ctrlr\n");
	return &rctrlr->ctrlr;

destruct_ctrlr:
	nvme_ctrlr_destruct(&rctrlr->ctrlr);
	return NULL;
}

static int
nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr);
	struct nvme_rdma_cm_event_entry *entry;

	if (ctrlr->adminq) {
		nvme_rdma_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq);
	}

	STAILQ_FOREACH(entry, &rctrlr->pending_cm_events, link) {
		rdma_ack_cm_event(entry->evt);
	}

	STAILQ_INIT(&rctrlr->free_cm_events);
	STAILQ_INIT(&rctrlr->pending_cm_events);
	nvme_rdma_free(rctrlr->cm_events);

	if
static int
nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr);
	struct nvme_rdma_cm_event_entry *entry;

	if (ctrlr->adminq) {
		nvme_rdma_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq);
	}

	STAILQ_FOREACH(entry, &rctrlr->pending_cm_events, link) {
		rdma_ack_cm_event(entry->evt);
	}

	STAILQ_INIT(&rctrlr->free_cm_events);
	STAILQ_INIT(&rctrlr->pending_cm_events);
	nvme_rdma_free(rctrlr->cm_events);

	if (rctrlr->cm_channel) {
		rdma_destroy_event_channel(rctrlr->cm_channel);
		rctrlr->cm_channel = NULL;
	}

	nvme_ctrlr_destruct_finish(ctrlr);

	nvme_rdma_free(rctrlr);

	return 0;
}

static int
nvme_rdma_qpair_submit_request(struct spdk_nvme_qpair *qpair,
			       struct nvme_request *req)
{
	struct nvme_rdma_qpair *rqpair;
	struct spdk_nvme_rdma_req *rdma_req;
	struct ibv_send_wr *wr;

	rqpair = nvme_rdma_qpair(qpair);
	assert(rqpair != NULL);
	assert(req != NULL);

	rdma_req = nvme_rdma_req_get(rqpair);
	if (!rdma_req) {
		/* Inform the upper layer to try again later. */
		return -EAGAIN;
	}

	if (nvme_rdma_req_init(rqpair, req, rdma_req)) {
		SPDK_ERRLOG("nvme_rdma_req_init() failed\n");
		TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link);
		nvme_rdma_req_put(rqpair, rdma_req);
		return -1;
	}

	wr = &rdma_req->send_wr;
	wr->next = NULL;
	nvme_rdma_trace_ibv_sge(wr->sg_list);
	return nvme_rdma_qpair_queue_send_wr(rqpair, wr);
}

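/*
 * Illustrative sketch (not part of the driver): I/O submitted through the
 * public namespace API ends up in nvme_rdma_qpair_submit_request() above. If
 * the qpair has no free spdk_nvme_rdma_req, the -EAGAIN return is handled by
 * the generic qpair layer, which queues the request and resubmits it as
 * completions free up resources. Buffer, LBA, and callback names below are
 * placeholders.
 *
 *	rc = spdk_nvme_ns_cmd_read(ns, qpair, payload_buf, start_lba, num_blocks,
 *				   read_complete_cb, cb_arg, 0);
 *	if (rc != 0) {
 *		// the request could not even be queued by the generic layer
 *	}
 */
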
static int
nvme_rdma_qpair_reset(struct spdk_nvme_qpair *qpair)
{
	/* Currently, doing nothing here */
	return 0;
}

static void
nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
{
	struct spdk_nvme_rdma_req *rdma_req, *tmp;
	struct spdk_nvme_cpl cpl;
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);

	cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
	cpl.status.sct = SPDK_NVME_SCT_GENERIC;
	cpl.status.dnr = dnr;

	/*
	 * We cannot abort requests at the RDMA layer without
	 * unregistering them. If we do, we can still get error
	 * free completions on the shared completion queue.
	 */
	if (nvme_qpair_get_state(qpair) > NVME_QPAIR_DISCONNECTING &&
	    nvme_qpair_get_state(qpair) != NVME_QPAIR_DESTROYING) {
		nvme_ctrlr_disconnect_qpair(qpair);
	}

	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
		nvme_rdma_req_complete(rdma_req, &cpl);
		nvme_rdma_req_put(rqpair, rdma_req);
	}
}

static void
nvme_rdma_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
{
	uint64_t t02;
	struct spdk_nvme_rdma_req *rdma_req, *tmp;
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct spdk_nvme_ctrlr_process *active_proc;

	/* Don't check timeouts during controller initialization. */
	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
		return;
	}

	if (nvme_qpair_is_admin_queue(qpair)) {
		active_proc = nvme_ctrlr_get_current_process(ctrlr);
	} else {
		active_proc = qpair->active_proc;
	}

	/* Only check timeouts if the current process has a timeout callback. */
	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
		return;
	}

	t02 = spdk_get_ticks();
	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
		assert(rdma_req->req != NULL);

		if (nvme_request_check_timeout(rdma_req->req, rdma_req->id, active_proc, t02)) {
			/*
			 * The requests are in order, so as soon as one has not timed out,
			 * stop iterating.
			 */
			break;
		}
	}
}

static inline int
nvme_rdma_request_ready(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req)
{
	nvme_rdma_req_complete(rdma_req, &rqpair->rsps[rdma_req->rsp_idx].cpl);
	nvme_rdma_req_put(rqpair, rdma_req);
	return nvme_rdma_post_recv(rqpair, rdma_req->rsp_idx);
}

#define MAX_COMPLETIONS_PER_POLL 128

static void
nvme_rdma_fail_qpair(struct spdk_nvme_qpair *qpair, int failure_reason)
{
	if (failure_reason == IBV_WC_RETRY_EXC_ERR) {
		qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE;
	} else if (qpair->transport_failure_reason == SPDK_NVME_QPAIR_FAILURE_NONE) {
		qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN;
	}

	nvme_ctrlr_disconnect_qpair(qpair);
}

static void
nvme_rdma_conditional_fail_qpair(struct nvme_rdma_qpair *rqpair, struct nvme_rdma_poll_group *group)
{
	struct nvme_rdma_destroyed_qpair *qpair_tracker;

	assert(rqpair);
	if (group) {
		STAILQ_FOREACH(qpair_tracker, &group->destroyed_qpairs, link) {
			if (qpair_tracker->destroyed_qpair_tracker == rqpair) {
				return;
			}
		}
	}
	nvme_rdma_fail_qpair(&rqpair->qpair, 0);
}

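/*
 * Added note: each request generates two work completions on the CQ, one for
 * the send of the command capsule (RDMA_WR_TYPE_SEND) and one for the receive
 * of the response capsule (RDMA_WR_TYPE_RECV). The request is only completed
 * back to the caller, and its receive buffer re-posted, once both halves have
 * been observed, which is what the NVME_RDMA_SEND_COMPLETED and
 * NVME_RDMA_RECV_COMPLETED flag checks below implement.
 */
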
static int
nvme_rdma_cq_process_completions(struct ibv_cq *cq, uint32_t batch_size,
				 struct nvme_rdma_poll_group *group,
				 struct nvme_rdma_qpair *rdma_qpair)
{
	struct ibv_wc wc[MAX_COMPLETIONS_PER_POLL];
	struct nvme_rdma_qpair *rqpair;
	struct spdk_nvme_rdma_req *rdma_req;
	struct spdk_nvme_rdma_rsp *rdma_rsp;
	struct nvme_rdma_wr *rdma_wr;
	uint32_t reaped = 0;
	int completion_rc = 0;
	int rc, i;

	rc = ibv_poll_cq(cq, batch_size, wc);
	if (rc < 0) {
		SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
			    errno, spdk_strerror(errno));
		return -ECANCELED;
	} else if (rc == 0) {
		return 0;
	}

	for (i = 0; i < rc; i++) {
		rdma_wr = (struct nvme_rdma_wr *)wc[i].wr_id;
		switch (rdma_wr->type) {
		case RDMA_WR_TYPE_RECV:
			rdma_rsp = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_rsp, rdma_wr);
			rqpair = rdma_rsp->rqpair;
			assert(rqpair->current_num_recvs > 0);
			rqpair->current_num_recvs--;

			if (wc[i].status) {
				SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n",
					    rqpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status));
				nvme_rdma_conditional_fail_qpair(rqpair, group);
				completion_rc = -ENXIO;
				continue;
			}

			SPDK_DEBUGLOG(nvme, "CQ recv completion\n");

			if (wc[i].byte_len < sizeof(struct spdk_nvme_cpl)) {
				SPDK_ERRLOG("recv length %u less than expected response size\n", wc[i].byte_len);
				nvme_rdma_conditional_fail_qpair(rqpair, group);
				completion_rc = -ENXIO;
				continue;
			}
			rdma_req = &rqpair->rdma_reqs[rdma_rsp->cpl.cid];
			rdma_req->completion_flags |= NVME_RDMA_RECV_COMPLETED;
			rdma_req->rsp_idx = rdma_rsp->idx;

			if ((rdma_req->completion_flags & NVME_RDMA_SEND_COMPLETED) != 0) {
				if (spdk_unlikely(nvme_rdma_request_ready(rqpair, rdma_req))) {
					SPDK_ERRLOG("Unable to re-post rx descriptor\n");
					nvme_rdma_conditional_fail_qpair(rqpair, group);
					completion_rc = -ENXIO;
					continue;
				}
				reaped++;
				rqpair->num_completions++;
			}
			break;

		case RDMA_WR_TYPE_SEND:
			rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_req, rdma_wr);

			/* If we are flushing I/O */
			if (wc[i].status) {
				rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL;
				if (!rqpair) {
					rqpair = rdma_qpair != NULL ? rdma_qpair : nvme_rdma_poll_group_get_qpair_by_id(group,
							wc[i].qp_num);
				}
				assert(rqpair);
				assert(rqpair->current_num_sends > 0);
				rqpair->current_num_sends--;
				nvme_rdma_conditional_fail_qpair(rqpair, group);
				SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n",
					    rqpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status));
				completion_rc = -ENXIO;
				continue;
			}

			rqpair = nvme_rdma_qpair(rdma_req->req->qpair);
			rdma_req->completion_flags |= NVME_RDMA_SEND_COMPLETED;
			rqpair->current_num_sends--;

			if ((rdma_req->completion_flags & NVME_RDMA_RECV_COMPLETED) != 0) {
				if (spdk_unlikely(nvme_rdma_request_ready(rqpair, rdma_req))) {
					SPDK_ERRLOG("Unable to re-post rx descriptor\n");
					nvme_rdma_conditional_fail_qpair(rqpair, group);
					completion_rc = -ENXIO;
					continue;
				}
				reaped++;
				rqpair->num_completions++;
			}
			break;

		default:
			SPDK_ERRLOG("Received an unexpected opcode on the CQ: %d\n", rdma_wr->type);
			return -ECANCELED;
		}
	}

	if (completion_rc) {
		return completion_rc;
	}

	return reaped;
}

static void
dummy_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{

}

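/*
 * Illustrative sketch (not part of the driver): completions are not delivered
 * by interrupt. An application that created the qpair on its own (without a
 * poll group) is expected to poll it, which drives
 * nvme_rdma_qpair_process_completions() below.
 *
 *	while (!done) {
 *		// 0 lets the transport pick its default batch size for this qpair
 *		int32_t rc = spdk_nvme_qpair_process_completions(qpair, 0);
 *		if (rc < 0) {
 *			// the qpair failed; reconnect or tear it down
 *		}
 *	}
 */
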
static int
nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair,
				    uint32_t max_completions)
{
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
	int rc = 0, batch_size;
	struct ibv_cq *cq;
	struct nvme_rdma_ctrlr *rctrlr;

	/*
	 * This is used during the connection phase. It's possible that we are still reaping error completions
	 * from other qpairs so we need to call the poll group function. Also, it's more correct since the cq
	 * is shared.
	 */
	if (qpair->poll_group != NULL) {
		return spdk_nvme_poll_group_process_completions(qpair->poll_group->group, max_completions,
				dummy_disconnected_qpair_cb);
	}

	if (max_completions == 0) {
		max_completions = rqpair->num_entries;
	} else {
		max_completions = spdk_min(max_completions, rqpair->num_entries);
	}

	if (nvme_qpair_is_admin_queue(&rqpair->qpair)) {
		rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr);
		nvme_rdma_poll_events(rctrlr);
	}
	nvme_rdma_qpair_process_cm_event(rqpair);

	if (spdk_unlikely(qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE)) {
		nvme_rdma_fail_qpair(qpair, 0);
		return -ENXIO;
	}

	cq = rqpair->cq;

	rqpair->num_completions = 0;
	do {
		batch_size = spdk_min((max_completions - rqpair->num_completions), MAX_COMPLETIONS_PER_POLL);
		rc = nvme_rdma_cq_process_completions(cq, batch_size, NULL, rqpair);

		if (rc == 0) {
			break;
		} else if (rc == -ECANCELED) {
			/* Handle the case where we failed to poll the cq. */
			nvme_rdma_fail_qpair(qpair, 0);
			return -ENXIO;
		} else if (rc == -ENXIO) {
			return rc;
		}
	} while (rqpair->num_completions < max_completions);

	if (spdk_unlikely(nvme_rdma_qpair_submit_sends(rqpair) ||
			  nvme_rdma_qpair_submit_recvs(rqpair))) {
		nvme_rdma_fail_qpair(qpair, 0);
		return -ENXIO;
	}

	if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) {
		nvme_rdma_qpair_check_timeout(qpair);
	}

	return rqpair->num_completions;
}

static uint32_t
nvme_rdma_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
{
	/* max_mr_size by ibv_query_device indicates the largest value that we can
	 * set for a registered memory region. It is independent from the actual
	 * I/O size and is very likely to be larger than 2 MiB which is the
	 * granularity we currently register memory regions. Hence return
	 * UINT32_MAX here and let the generic layer use the controller data to
	 * moderate this value.
	 */
	return UINT32_MAX;
}

static uint16_t
nvme_rdma_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr);

	return rctrlr->max_sge;
}

static int
nvme_rdma_qpair_iterate_requests(struct spdk_nvme_qpair *qpair,
				 int (*iter_fn)(struct nvme_request *req, void *arg),
				 void *arg)
{
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
	struct spdk_nvme_rdma_req *rdma_req, *tmp;
	int rc;

	assert(iter_fn != NULL);

	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
		assert(rdma_req->req != NULL);

		rc = iter_fn(rdma_req->req, arg);
		if (rc != 0) {
			return rc;
		}
	}

	return 0;
}

static void
nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_rdma_req *rdma_req, *tmp;
	struct spdk_nvme_cpl cpl;
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);

	cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
	cpl.status.sct = SPDK_NVME_SCT_GENERIC;

	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
		assert(rdma_req->req != NULL);

		if (rdma_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
			continue;
		}

		nvme_rdma_req_complete(rdma_req, &cpl);
		nvme_rdma_req_put(rqpair, rdma_req);
	}
}

static int
nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *ctx)
{
	struct nvme_rdma_poller *poller;

	poller = calloc(1, sizeof(*poller));
	if (poller == NULL) {
		SPDK_ERRLOG("Unable to allocate poller.\n");
		return -ENOMEM;
	}

	poller->device = ctx;
	poller->cq = ibv_create_cq(poller->device, DEFAULT_NVME_RDMA_CQ_SIZE, group, NULL, 0);

	if (poller->cq == NULL) {
		free(poller);
		return -EINVAL;
	}

	STAILQ_INSERT_HEAD(&group->pollers, poller, link);
	group->num_pollers++;
	poller->current_num_wc = DEFAULT_NVME_RDMA_CQ_SIZE;
	poller->required_num_wc = 0;
	return 0;
}

static void
nvme_rdma_poll_group_free_pollers(struct nvme_rdma_poll_group *group)
{
	struct nvme_rdma_poller *poller, *tmp_poller;

	STAILQ_FOREACH_SAFE(poller, &group->pollers, link, tmp_poller) {
		if (poller->cq) {
			ibv_destroy_cq(poller->cq);
		}

		STAILQ_REMOVE(&group->pollers, poller, nvme_rdma_poller, link);
		free(poller);
	}
}

static struct spdk_nvme_transport_poll_group *
nvme_rdma_poll_group_create(void)
{
	struct nvme_rdma_poll_group *group;
	struct ibv_context **contexts;
	int i = 0;

	group = calloc(1, sizeof(*group));
	if (group == NULL) {
		SPDK_ERRLOG("Unable to allocate poll group.\n");
		return NULL;
	}

	STAILQ_INIT(&group->pollers);

	contexts = rdma_get_devices(NULL);
	if (contexts == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(group);
		return NULL;
	}

	while (contexts[i] != NULL) {
		if (nvme_rdma_poller_create(group, contexts[i])) {
			nvme_rdma_poll_group_free_pollers(group);
			free(group);
			rdma_free_devices(contexts);
			return NULL;
		}
		i++;
	}

	rdma_free_devices(contexts);
	STAILQ_INIT(&group->destroyed_qpairs);
	return &group->group;
}

struct nvme_rdma_qpair *
nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group, uint32_t qp_num)
{
	struct spdk_nvme_qpair *qpair;
	struct nvme_rdma_destroyed_qpair *rqpair_tracker;
	struct nvme_rdma_qpair *rqpair;

	STAILQ_FOREACH(qpair, &group->group.disconnected_qpairs, poll_group_stailq) {
		rqpair = nvme_rdma_qpair(qpair);
		if (rqpair->rdma_qp->qp->qp_num == qp_num) {
			return rqpair;
		}
	}

	STAILQ_FOREACH(qpair, &group->group.connected_qpairs, poll_group_stailq) {
		rqpair = nvme_rdma_qpair(qpair);
		if (rqpair->rdma_qp->qp->qp_num == qp_num) {
			return rqpair;
		}
	}

	STAILQ_FOREACH(rqpair_tracker, &group->destroyed_qpairs, link) {
		rqpair = rqpair_tracker->destroyed_qpair_tracker;
		if (rqpair->rdma_qp->qp->qp_num == qp_num) {
			return rqpair;
		}
	}

	return NULL;
}

static int
nvme_rdma_resize_cq(struct nvme_rdma_qpair *rqpair, struct nvme_rdma_poller *poller)
{
	int current_num_wc, required_num_wc;

	required_num_wc = poller->required_num_wc + WC_PER_QPAIR(rqpair->num_entries);
	current_num_wc = poller->current_num_wc;
	if (current_num_wc < required_num_wc) {
		current_num_wc = spdk_max(current_num_wc * 2, required_num_wc);
	}

	if (poller->current_num_wc != current_num_wc) {
		SPDK_DEBUGLOG(nvme, "Resize RDMA CQ from %d to %d\n", poller->current_num_wc,
			      current_num_wc);
		if (ibv_resize_cq(poller->cq, current_num_wc)) {
			SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno));
			return -1;
		}

		poller->current_num_wc = current_num_wc;
	}

	poller->required_num_wc = required_num_wc;
	return 0;
}

static int
nvme_rdma_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
{
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
	struct nvme_rdma_poll_group *group = nvme_rdma_poll_group(qpair->poll_group);
	struct nvme_rdma_poller *poller;

	assert(rqpair->cq == NULL);

	STAILQ_FOREACH(poller, &group->pollers, link) {
		if (poller->device == rqpair->cm_id->verbs) {
			if (nvme_rdma_resize_cq(rqpair, poller)) {
				return -EPROTO;
			}
			rqpair->cq = poller->cq;
			break;
		}
	}

	if (rqpair->cq == NULL) {
		SPDK_ERRLOG("Unable to find a cq for qpair %p on poll group %p\n", qpair, qpair->poll_group);
		return -EINVAL;
	}

	return 0;
}

static int
nvme_rdma_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
{
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
	struct nvme_rdma_poll_group *group;
	struct nvme_rdma_destroyed_qpair *destroyed_qpair;
	enum nvme_qpair_state state;

	if (rqpair->poll_group_disconnect_in_progress) {
		return -EINPROGRESS;
	}

	rqpair->poll_group_disconnect_in_progress = true;
	state = nvme_qpair_get_state(qpair);
	group = nvme_rdma_poll_group(qpair->poll_group);
	rqpair->cq = NULL;

	/*
	 * We want to guard against an endless recursive loop while making
	 * sure the qpair is disconnected before we remove it from the poll group.
	 */
	if (state > NVME_QPAIR_DISCONNECTING && state != NVME_QPAIR_DESTROYING) {
		nvme_ctrlr_disconnect_qpair(qpair);
	}

	/*
	 * If this fails, the system is in serious trouble,
	 * just let the qpair get cleaned up immediately.
	 */
	destroyed_qpair = calloc(1, sizeof(*destroyed_qpair));
	if (destroyed_qpair == NULL) {
		return 0;
	}

	destroyed_qpair->destroyed_qpair_tracker = rqpair;
	destroyed_qpair->completed_cycles = 0;
	STAILQ_INSERT_TAIL(&group->destroyed_qpairs, destroyed_qpair, link);

	rqpair->defer_deletion_to_pg = true;

	rqpair->poll_group_disconnect_in_progress = false;
	return 0;
}

static int
nvme_rdma_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
			 struct spdk_nvme_qpair *qpair)
{
	return 0;
}

static int
nvme_rdma_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
			    struct spdk_nvme_qpair *qpair)
{
	if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) {
		return nvme_poll_group_disconnect_qpair(qpair);
	}

	return 0;
}

static void
nvme_rdma_poll_group_delete_qpair(struct nvme_rdma_poll_group *group,
				  struct nvme_rdma_destroyed_qpair *qpair_tracker)
{
	struct nvme_rdma_qpair *rqpair = qpair_tracker->destroyed_qpair_tracker;

	rqpair->defer_deletion_to_pg = false;
	if (nvme_qpair_get_state(&rqpair->qpair) == NVME_QPAIR_DESTROYING) {
		nvme_rdma_ctrlr_delete_io_qpair(rqpair->qpair.ctrlr, &rqpair->qpair);
	}
	STAILQ_REMOVE(&group->destroyed_qpairs, qpair_tracker, nvme_rdma_destroyed_qpair, link);
	free(qpair_tracker);
}

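/*
 * Illustrative sketch (not part of the driver): a poll group lets one thread
 * poll many qpairs, sharing one CQ per RDMA device through the pollers above.
 * In this sketch the qpair is created with create_only set and connected only
 * after it has been added to the group; names are placeholders.
 *
 *	struct spdk_nvme_poll_group *pg = spdk_nvme_poll_group_create(app_ctx);
 *	struct spdk_nvme_io_qpair_opts opts;
 *
 *	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
 *	opts.create_only = true;
 *	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
 *	spdk_nvme_poll_group_add(pg, qpair);
 *	spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
 *
 *	// in the polling loop:
 *	spdk_nvme_poll_group_process_completions(pg, 0, disconnected_qpair_cb);
 */
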
static int64_t
nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
		uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
{
	struct spdk_nvme_qpair *qpair, *tmp_qpair;
	struct nvme_rdma_destroyed_qpair *qpair_tracker, *tmp_qpair_tracker;
	struct nvme_rdma_qpair *rqpair;
	struct nvme_rdma_poll_group *group;
	struct nvme_rdma_poller *poller;
	int num_qpairs = 0, batch_size, rc;
	int64_t total_completions = 0;
	uint64_t completions_allowed = 0;
	uint64_t completions_per_poller = 0;
	uint64_t poller_completions = 0;

	if (completions_per_qpair == 0) {
		completions_per_qpair = MAX_COMPLETIONS_PER_POLL;
	}

	group = nvme_rdma_poll_group(tgroup);
	STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
		disconnected_qpair_cb(qpair, tgroup->group->ctx);
	}

	STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
		rqpair = nvme_rdma_qpair(qpair);
		rqpair->num_completions = 0;
		nvme_rdma_qpair_process_cm_event(rqpair);

		if (spdk_unlikely(qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE)) {
			nvme_rdma_fail_qpair(qpair, 0);
			disconnected_qpair_cb(qpair, tgroup->group->ctx);
			continue;
		}
		num_qpairs++;
	}

	completions_allowed = completions_per_qpair * num_qpairs;
	completions_per_poller = spdk_max(completions_allowed / group->num_pollers, 1);

	STAILQ_FOREACH(poller, &group->pollers, link) {
		poller_completions = 0;
		do {
			batch_size = spdk_min((completions_per_poller - poller_completions), MAX_COMPLETIONS_PER_POLL);
			rc = nvme_rdma_cq_process_completions(poller->cq, batch_size, group, NULL);
			if (rc <= 0) {
				if (rc == -ECANCELED) {
					return -EIO;
				}
				break;
			}

			poller_completions += rc;
		} while (poller_completions < completions_per_poller);
		total_completions += poller_completions;
	}

	STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
		rqpair = nvme_rdma_qpair(qpair);
		if (spdk_unlikely(qpair->ctrlr->timeout_enabled)) {
			nvme_rdma_qpair_check_timeout(qpair);
		}

		nvme_rdma_qpair_submit_sends(rqpair);
		nvme_rdma_qpair_submit_recvs(rqpair);
		nvme_qpair_resubmit_requests(&rqpair->qpair, rqpair->num_completions);
	}

	/*
	 * Once a qpair is disconnected, we can still get flushed completions for those disconnected qpairs.
	 * For most pieces of hardware, those requests will complete immediately. However, there are certain
	 * cases where flushed requests will linger. The default is to destroy the qpair once all of its
	 * completions have been returned, with a cycle-count fallback for cases where we never get them all back.
	 */
	STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) {
		qpair_tracker->completed_cycles++;
		rqpair = qpair_tracker->destroyed_qpair_tracker;
		if ((rqpair->current_num_sends == 0 && rqpair->current_num_recvs == 0) ||
		    qpair_tracker->completed_cycles > NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES) {
			nvme_rdma_poll_group_delete_qpair(group, qpair_tracker);
		}
	}

	return total_completions;
}

static int
nvme_rdma_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
{
	struct nvme_rdma_poll_group *group = nvme_rdma_poll_group(tgroup);
	struct nvme_rdma_destroyed_qpair *qpair_tracker, *tmp_qpair_tracker;
	struct nvme_rdma_qpair *rqpair;

	if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
		return -EBUSY;
	}

	STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) {
		rqpair = qpair_tracker->destroyed_qpair_tracker;
		if (nvme_qpair_get_state(&rqpair->qpair) == NVME_QPAIR_DESTROYING) {
			rqpair->defer_deletion_to_pg = false;
			nvme_rdma_ctrlr_delete_io_qpair(rqpair->qpair.ctrlr, &rqpair->qpair);
		}

		STAILQ_REMOVE(&group->destroyed_qpairs, qpair_tracker, nvme_rdma_destroyed_qpair, link);
		free(qpair_tracker);
	}

	nvme_rdma_poll_group_free_pollers(group);
	free(group);

	return 0;
}

void
spdk_nvme_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks)
{
	g_nvme_hooks = *hooks;
}

const struct spdk_nvme_transport_ops rdma_ops = {
	.name = "RDMA",
	.type = SPDK_NVME_TRANSPORT_RDMA,
	.ctrlr_construct = nvme_rdma_ctrlr_construct,
	.ctrlr_scan = nvme_fabric_ctrlr_scan,
	.ctrlr_destruct = nvme_rdma_ctrlr_destruct,
	.ctrlr_enable = nvme_rdma_ctrlr_enable,

	.ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4,
	.ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8,
	.ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4,
	.ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8,

	.ctrlr_get_max_xfer_size = nvme_rdma_ctrlr_get_max_xfer_size,
	.ctrlr_get_max_sges = nvme_rdma_ctrlr_get_max_sges,

	.ctrlr_create_io_qpair = nvme_rdma_ctrlr_create_io_qpair,
	.ctrlr_delete_io_qpair = nvme_rdma_ctrlr_delete_io_qpair,
	.ctrlr_connect_qpair = nvme_rdma_ctrlr_connect_qpair,
	.ctrlr_disconnect_qpair = nvme_rdma_ctrlr_disconnect_qpair,

	.qpair_abort_reqs = nvme_rdma_qpair_abort_reqs,
	.qpair_reset = nvme_rdma_qpair_reset,
	.qpair_submit_request = nvme_rdma_qpair_submit_request,
	.qpair_process_completions = nvme_rdma_qpair_process_completions,
	.qpair_iterate_requests = nvme_rdma_qpair_iterate_requests,
	.admin_qpair_abort_aers = nvme_rdma_admin_qpair_abort_aers,

	.poll_group_create = nvme_rdma_poll_group_create,
	.poll_group_connect_qpair = nvme_rdma_poll_group_connect_qpair,
	.poll_group_disconnect_qpair = nvme_rdma_poll_group_disconnect_qpair,
	.poll_group_add = nvme_rdma_poll_group_add,
	.poll_group_remove = nvme_rdma_poll_group_remove,
	.poll_group_process_completions = nvme_rdma_poll_group_process_completions,
	.poll_group_destroy = nvme_rdma_poll_group_destroy,
};

SPDK_NVME_TRANSPORT_REGISTER(rdma, &rdma_ops);