/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

#include "nvmf_internal.h"
#include "transport.h"

#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf.h"
#include "spdk/nvmf_spec.h"
#include "spdk/string.h"
#include "spdk/trace.h"
#include "spdk/util.h"

#include "spdk_internal/log.h"

/*
 RDMA Connection Resource Defaults
 */
#define NVMF_DEFAULT_TX_SGE		1
#define NVMF_DEFAULT_RX_SGE		2
#define NVMF_DEFAULT_DATA_SGE		16

/* The RDMA completion queue size */
#define NVMF_RDMA_CQ_SIZE	4096

/* AIO backend requires block size aligned data buffers,
 * extra 4KiB aligned data buffer should work for most devices.
 */
#define SHIFT_4KB			12
#define NVMF_DATA_BUFFER_ALIGNMENT	(1 << SHIFT_4KB)
#define NVMF_DATA_BUFFER_MASK		(NVMF_DATA_BUFFER_ALIGNMENT - 1)

/* States stored in spdk_nvmf_rdma_request.state. FREE is explicitly 0, so a
 * zero-initialized request object starts out in the FREE state.
 */
enum spdk_nvmf_rdma_request_state {
	/* The request is not currently in use */
	RDMA_REQUEST_STATE_FREE = 0,

	/* Initial state when request first received */
	RDMA_REQUEST_STATE_NEW,

	/* The request is queued until a data buffer is available. */
	RDMA_REQUEST_STATE_NEED_BUFFER,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the host to the controller.
	 */
	RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER,

	/* The request is currently transferring data from the host to the controller. */
	RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,

	/* The request is ready to execute at the block device */
	RDMA_REQUEST_STATE_READY_TO_EXECUTE,

	/* The request is currently executing at the block device */
	RDMA_REQUEST_STATE_EXECUTING,

	/* The request finished executing at the block device */
	RDMA_REQUEST_STATE_EXECUTED,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the controller to the host.
	 */
	RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST,

	/* The request is ready to send a completion */
	RDMA_REQUEST_STATE_READY_TO_COMPLETE,

	/* The request currently has a completion outstanding */
	RDMA_REQUEST_STATE_COMPLETING,

	/* The request completed and can be marked free. */
	RDMA_REQUEST_STATE_COMPLETED,
};
/* This structure holds commands as they are received off the wire.
 * It must be dynamically paired with a full request object
 * (spdk_nvmf_rdma_request) to service a request. It is separate
 * from the request because RDMA does not appear to order
 * completions, so occasionally we'll get a new incoming
 * command when there aren't any free request objects.
 */
struct spdk_nvmf_rdma_recv {
	/* Receive work request; qpair initialization sets wr_id to this object
	 * and points sg_list at sgl below.
	 */
	struct ibv_recv_wr		wr;
	/* sgl[0] covers the command capsule, sgl[1] the in-capsule data buffer
	 * (as set up in spdk_nvmf_rdma_qpair_initialize).
	 */
	struct ibv_sge			sgl[NVMF_DEFAULT_RX_SGE];

	/* Owning queue pair */
	struct spdk_nvmf_rdma_qpair	*qpair;

	/* In-capsule data buffer */
	uint8_t				*buf;

	TAILQ_ENTRY(spdk_nvmf_rdma_recv) link;
};

/* Per-command request object. One is allocated per queue slot and paired with
 * an spdk_nvmf_rdma_recv while a command is being serviced.
 */
struct spdk_nvmf_rdma_request {
	struct spdk_nvmf_request		req;
	/* true when req.iov was filled from the transport's data buffer pool,
	 * false when the data lives in the in-capsule buffer.
	 */
	bool					data_from_pool;

	enum spdk_nvmf_rdma_request_state	state;

	/* The recv currently paired with this request; set back to NULL once the
	 * recv has been re-posted in request_transfer_out.
	 */
	struct spdk_nvmf_rdma_recv		*recv;

	/* Work request used to SEND the completion back to the host */
	struct {
		struct	ibv_send_wr		wr;
		struct	ibv_sge			sgl[NVMF_DEFAULT_TX_SGE];
	} rsp;

	/* Work request used for the RDMA READ/WRITE data transfer */
	struct {
		struct ibv_send_wr		wr;
		struct ibv_sge			sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
		/* Pool elements exactly as returned by spdk_mempool_get(); the
		 * addresses placed in sgl/iov are aligned up from these.
		 */
		void				*buffers[SPDK_NVMF_MAX_SGL_ENTRIES];
	} data;

	TAILQ_ENTRY(spdk_nvmf_rdma_request)	link;
};

/* RDMA-specific state for one NVMe-oF queue pair (one RDMA connection). */
struct spdk_nvmf_rdma_qpair {
	struct spdk_nvmf_qpair			qpair;

	struct spdk_nvmf_rdma_port		*port;
	struct spdk_nvmf_rdma_poller		*poller;

	struct rdma_cm_id			*cm_id;

	/* The maximum number of I/O outstanding on this connection at one time */
	uint16_t				max_queue_depth;

	/* The maximum number of active RDMA READ and WRITE operations at one time */
	uint16_t				max_rw_depth;

	/* The current number of I/O outstanding on this connection. This number
	 * includes all I/O from the time the capsule is first received until it is
	 * completed.
	 */
	uint16_t				cur_queue_depth;

	/* The number of RDMA READ and WRITE requests that are outstanding */
	uint16_t				cur_rdma_rw_depth;

	/* Receives that are waiting for a request object */
	TAILQ_HEAD(, spdk_nvmf_rdma_recv)	incoming_queue;

	/* Requests that are not in use */
	TAILQ_HEAD(, spdk_nvmf_rdma_request)	free_queue;

	/* Requests that are waiting to perform an RDMA READ or WRITE */
	TAILQ_HEAD(, spdk_nvmf_rdma_request)	pending_rdma_rw_queue;

	/* Array of size "max_queue_depth" containing RDMA requests. */
	struct spdk_nvmf_rdma_request		*reqs;

	/* Array of size "max_queue_depth" containing RDMA recvs. */
	struct spdk_nvmf_rdma_recv		*recvs;

	/* Array of size "max_queue_depth" containing 64 byte capsules
	 * used for receive.
	 */
	union nvmf_h2c_msg			*cmds;
	struct ibv_mr				*cmds_mr;

	/* Array of size "max_queue_depth" containing 16 byte completions
	 * to be sent back to the user.
	 */
	union nvmf_c2h_msg			*cpls;
	struct ibv_mr				*cpls_mr;

	/* Array of size "max_queue_depth * InCapsuleDataSize" containing
	 * buffers to be used for in capsule data.
	 */
	void					*bufs;
	struct ibv_mr				*bufs_mr;

	/* link: membership in spdk_nvmf_rdma_poller.qpairs */
	TAILQ_ENTRY(spdk_nvmf_rdma_qpair)	link;
	TAILQ_ENTRY(spdk_nvmf_rdma_qpair)	pending_link;

	/* Mgmt channel */
	struct spdk_io_channel			*mgmt_channel;
	struct spdk_nvmf_rdma_mgmt_channel	*ch;
};

/* Polls one completion queue on one device for a set of queue pairs. */
struct spdk_nvmf_rdma_poller {
	struct spdk_nvmf_rdma_device		*device;
	struct spdk_nvmf_rdma_poll_group	*group;

	/* Used as both send and recv CQ for every qpair on this poller */
	struct ibv_cq				*cq;

	TAILQ_HEAD(, spdk_nvmf_rdma_qpair)	qpairs;

	TAILQ_ENTRY(spdk_nvmf_rdma_poller)	link;
};

struct spdk_nvmf_rdma_poll_group {
	struct spdk_nvmf_transport_poll_group	group;

	TAILQ_HEAD(, spdk_nvmf_rdma_poller)	pollers;
};

/* Assuming rdma_cm uses just one protection domain per ibv_context. */
struct spdk_nvmf_rdma_device {
	struct ibv_device_attr			attr;
	struct ibv_context			*context;

	/* Memory map whose translations are struct ibv_mr pointers
	 * (see spdk_nvmf_rdma_mem_notify).
	 */
	struct spdk_mem_map			*map;
	struct ibv_pd				*pd;

	TAILQ_ENTRY(spdk_nvmf_rdma_device)	link;
};

/* A listening address. The rdma_cm listen id's context points back at this
 * structure (read in nvmf_rdma_connect).
 */
struct spdk_nvmf_rdma_port {
	struct spdk_nvme_transport_id		trid;
	struct rdma_cm_id			*id;
	struct spdk_nvmf_rdma_device		*device;
	uint32_t				ref;
	TAILQ_ENTRY(spdk_nvmf_rdma_port)	link;
};

struct spdk_nvmf_rdma_transport {
	struct spdk_nvmf_transport	transport;

	struct rdma_event_channel	*event_channel;

	/* Pool that data buffers for keyed (non in-capsule) transfers come from */
	struct spdk_mempool		*data_buf_pool;

	pthread_mutex_t			lock;

	uint16_t			max_queue_depth;
	uint32_t			max_io_size;
	/* Maximum number of bytes placed in a single data buffer / iov element */
	uint32_t			io_unit_size;
	uint32_t			in_capsule_data_size;

	/* fields used to poll RDMA/IB events */
	nfds_t			npoll_fds;
	struct pollfd		*poll_fds;

	TAILQ_HEAD(, spdk_nvmf_rdma_device)	devices;
	TAILQ_HEAD(, spdk_nvmf_rdma_port)	ports;
};

struct spdk_nvmf_rdma_mgmt_channel {
	/* Requests that are waiting to obtain a data buffer */
	TAILQ_HEAD(, spdk_nvmf_rdma_request)	pending_data_buf_queue;
};
283 struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf; 284 285 TAILQ_INIT(&ch->pending_data_buf_queue); 286 return 0; 287 } 288 289 static void 290 spdk_nvmf_rdma_mgmt_channel_destroy(void *io_device, void *ctx_buf) 291 { 292 struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf; 293 294 if (!TAILQ_EMPTY(&ch->pending_data_buf_queue)) { 295 SPDK_ERRLOG("Pending I/O list wasn't empty on channel destruction\n"); 296 } 297 } 298 299 static void 300 spdk_nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair) 301 { 302 if (rqpair->poller) { 303 TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link); 304 } 305 306 if (rqpair->cmds_mr) { 307 ibv_dereg_mr(rqpair->cmds_mr); 308 } 309 310 if (rqpair->cpls_mr) { 311 ibv_dereg_mr(rqpair->cpls_mr); 312 } 313 314 if (rqpair->bufs_mr) { 315 ibv_dereg_mr(rqpair->bufs_mr); 316 } 317 318 if (rqpair->cm_id) { 319 rdma_destroy_qp(rqpair->cm_id); 320 rdma_destroy_id(rqpair->cm_id); 321 } 322 323 if (rqpair->mgmt_channel) { 324 spdk_put_io_channel(rqpair->mgmt_channel); 325 } 326 327 /* Free all memory */ 328 spdk_dma_free(rqpair->cmds); 329 spdk_dma_free(rqpair->cpls); 330 spdk_dma_free(rqpair->bufs); 331 free(rqpair->reqs); 332 free(rqpair->recvs); 333 free(rqpair); 334 } 335 336 static int 337 spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair) 338 { 339 struct spdk_nvmf_rdma_transport *rtransport; 340 struct spdk_nvmf_rdma_qpair *rqpair; 341 int rc, i; 342 struct ibv_qp_init_attr attr; 343 struct spdk_nvmf_rdma_recv *rdma_recv; 344 struct spdk_nvmf_rdma_request *rdma_req; 345 346 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 347 rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 348 349 memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); 350 attr.qp_type = IBV_QPT_RC; 351 attr.send_cq = rqpair->poller->cq; 352 attr.recv_cq = rqpair->poller->cq; 353 attr.cap.max_send_wr = rqpair->max_queue_depth * 2; /* SEND, READ, and WRITE operations */ 354 attr.cap.max_recv_wr 
= rqpair->max_queue_depth; /* RECV operations */ 355 attr.cap.max_send_sge = SPDK_NVMF_MAX_SGL_ENTRIES; 356 attr.cap.max_recv_sge = NVMF_DEFAULT_RX_SGE; 357 358 rc = rdma_create_qp(rqpair->cm_id, NULL, &attr); 359 if (rc) { 360 SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno)); 361 rdma_destroy_id(rqpair->cm_id); 362 rqpair->cm_id = NULL; 363 spdk_nvmf_rdma_qpair_destroy(rqpair); 364 return -1; 365 } 366 367 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair); 368 369 rqpair->reqs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->reqs)); 370 rqpair->recvs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->recvs)); 371 rqpair->cmds = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cmds), 372 0x1000, NULL); 373 rqpair->cpls = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cpls), 374 0x1000, NULL); 375 rqpair->bufs = spdk_dma_zmalloc(rqpair->max_queue_depth * rtransport->in_capsule_data_size, 376 0x1000, NULL); 377 if (!rqpair->reqs || !rqpair->recvs || !rqpair->cmds || 378 !rqpair->cpls || !rqpair->bufs) { 379 SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n"); 380 spdk_nvmf_rdma_qpair_destroy(rqpair); 381 return -1; 382 } 383 384 rqpair->cmds_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cmds, 385 rqpair->max_queue_depth * sizeof(*rqpair->cmds), 386 IBV_ACCESS_LOCAL_WRITE); 387 rqpair->cpls_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cpls, 388 rqpair->max_queue_depth * sizeof(*rqpair->cpls), 389 0); 390 rqpair->bufs_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->bufs, 391 rqpair->max_queue_depth * rtransport->in_capsule_data_size, 392 IBV_ACCESS_LOCAL_WRITE | 393 IBV_ACCESS_REMOTE_WRITE); 394 if (!rqpair->cmds_mr || !rqpair->cpls_mr || !rqpair->bufs_mr) { 395 SPDK_ERRLOG("Unable to register required memory for RDMA queue.\n"); 396 spdk_nvmf_rdma_qpair_destroy(rqpair); 397 return -1; 398 } 399 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n", 400 rqpair->cmds, 
rqpair->max_queue_depth * sizeof(*rqpair->cmds), rqpair->cmds_mr->lkey); 401 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n", 402 rqpair->cpls, rqpair->max_queue_depth * sizeof(*rqpair->cpls), rqpair->cpls_mr->lkey); 403 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n", 404 rqpair->bufs, rqpair->max_queue_depth * rtransport->in_capsule_data_size, rqpair->bufs_mr->lkey); 405 406 for (i = 0; i < rqpair->max_queue_depth; i++) { 407 struct ibv_recv_wr *bad_wr = NULL; 408 409 rdma_recv = &rqpair->recvs[i]; 410 rdma_recv->qpair = rqpair; 411 412 /* Set up memory to receive commands */ 413 rdma_recv->buf = (void *)((uintptr_t)rqpair->bufs + (i * rtransport->in_capsule_data_size)); 414 415 rdma_recv->sgl[0].addr = (uintptr_t)&rqpair->cmds[i]; 416 rdma_recv->sgl[0].length = sizeof(rqpair->cmds[i]); 417 rdma_recv->sgl[0].lkey = rqpair->cmds_mr->lkey; 418 419 rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf; 420 rdma_recv->sgl[1].length = rtransport->in_capsule_data_size; 421 rdma_recv->sgl[1].lkey = rqpair->bufs_mr->lkey; 422 423 rdma_recv->wr.wr_id = (uintptr_t)rdma_recv; 424 rdma_recv->wr.sg_list = rdma_recv->sgl; 425 rdma_recv->wr.num_sge = SPDK_COUNTOF(rdma_recv->sgl); 426 427 rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_recv->wr, &bad_wr); 428 if (rc) { 429 SPDK_ERRLOG("Unable to post capsule for RDMA RECV\n"); 430 spdk_nvmf_rdma_qpair_destroy(rqpair); 431 return -1; 432 } 433 } 434 435 for (i = 0; i < rqpair->max_queue_depth; i++) { 436 rdma_req = &rqpair->reqs[i]; 437 438 rdma_req->req.qpair = &rqpair->qpair; 439 rdma_req->req.cmd = NULL; 440 441 /* Set up memory to send responses */ 442 rdma_req->req.rsp = &rqpair->cpls[i]; 443 444 rdma_req->rsp.sgl[0].addr = (uintptr_t)&rqpair->cpls[i]; 445 rdma_req->rsp.sgl[0].length = sizeof(rqpair->cpls[i]); 446 rdma_req->rsp.sgl[0].lkey = rqpair->cpls_mr->lkey; 447 448 rdma_req->rsp.wr.wr_id = (uintptr_t)rdma_req; 449 rdma_req->rsp.wr.next = NULL; 450 
rdma_req->rsp.wr.opcode = IBV_WR_SEND; 451 rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED; 452 rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl; 453 rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl); 454 455 /* Set up memory for data buffers */ 456 rdma_req->data.wr.wr_id = (uint64_t)rdma_req; 457 rdma_req->data.wr.next = NULL; 458 rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED; 459 rdma_req->data.wr.sg_list = rdma_req->data.sgl; 460 rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl); 461 462 TAILQ_INSERT_TAIL(&rqpair->free_queue, rdma_req, link); 463 } 464 465 return 0; 466 } 467 468 static int 469 request_transfer_in(struct spdk_nvmf_request *req) 470 { 471 int rc; 472 struct spdk_nvmf_rdma_request *rdma_req; 473 struct spdk_nvmf_qpair *qpair; 474 struct spdk_nvmf_rdma_qpair *rqpair; 475 struct ibv_send_wr *bad_wr = NULL; 476 477 qpair = req->qpair; 478 rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 479 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 480 481 assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER); 482 483 rqpair->cur_rdma_rw_depth++; 484 485 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair); 486 spdk_trace_record(TRACE_RDMA_READ_START, 0, 0, (uintptr_t)req, 0); 487 488 rdma_req->data.wr.opcode = IBV_WR_RDMA_READ; 489 rdma_req->data.wr.next = NULL; 490 rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr); 491 if (rc) { 492 SPDK_ERRLOG("Unable to transfer data from host to target\n"); 493 494 /* Decrement r/w counter back since data transfer 495 * has not started. 
496 */ 497 rqpair->cur_rdma_rw_depth--; 498 return -1; 499 } 500 501 return 0; 502 } 503 504 static int 505 request_transfer_out(struct spdk_nvmf_request *req) 506 { 507 int rc; 508 struct spdk_nvmf_rdma_request *rdma_req; 509 struct spdk_nvmf_qpair *qpair; 510 struct spdk_nvmf_rdma_qpair *rqpair; 511 struct spdk_nvme_cpl *rsp; 512 struct ibv_recv_wr *bad_recv_wr = NULL; 513 struct ibv_send_wr *send_wr, *bad_send_wr = NULL; 514 515 qpair = req->qpair; 516 rsp = &req->rsp->nvme_cpl; 517 rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 518 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 519 520 /* Advance our sq_head pointer */ 521 if (qpair->sq_head == qpair->sq_head_max) { 522 qpair->sq_head = 0; 523 } else { 524 qpair->sq_head++; 525 } 526 rsp->sqhd = qpair->sq_head; 527 528 /* Post the capsule to the recv buffer */ 529 assert(rdma_req->recv != NULL); 530 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA RECV POSTED. Recv: %p Connection: %p\n", rdma_req->recv, 531 rqpair); 532 rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_req->recv->wr, &bad_recv_wr); 533 if (rc) { 534 SPDK_ERRLOG("Unable to re-post rx descriptor\n"); 535 return rc; 536 } 537 rdma_req->recv = NULL; 538 539 /* Build the response which consists of an optional 540 * RDMA WRITE to transfer data, plus an RDMA SEND 541 * containing the response. 542 */ 543 send_wr = &rdma_req->rsp.wr; 544 545 if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && 546 req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 547 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair); 548 spdk_trace_record(TRACE_RDMA_WRITE_START, 0, 0, (uintptr_t)req, 0); 549 550 rqpair->cur_rdma_rw_depth++; 551 rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE; 552 553 rdma_req->data.wr.next = send_wr; 554 send_wr = &rdma_req->data.wr; 555 } 556 557 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. 
Request: %p Connection: %p\n", req, qpair); 558 spdk_trace_record(TRACE_NVMF_IO_COMPLETE, 0, 0, (uintptr_t)req, 0); 559 560 /* Send the completion */ 561 rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr); 562 if (rc) { 563 SPDK_ERRLOG("Unable to send response capsule\n"); 564 565 if (rdma_req->data.wr.opcode == IBV_WR_RDMA_WRITE) { 566 /* Decrement r/w counter back since data transfer 567 * has not started. 568 */ 569 rqpair->cur_rdma_rw_depth--; 570 } 571 } 572 573 return rc; 574 } 575 576 static int 577 spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair) 578 { 579 struct spdk_nvmf_rdma_accept_private_data accept_data; 580 struct rdma_conn_param ctrlr_event_data = {}; 581 int rc; 582 583 accept_data.recfmt = 0; 584 accept_data.crqsize = rqpair->max_queue_depth; 585 586 ctrlr_event_data.private_data = &accept_data; 587 ctrlr_event_data.private_data_len = sizeof(accept_data); 588 if (id->ps == RDMA_PS_TCP) { 589 ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */ 590 ctrlr_event_data.initiator_depth = rqpair->max_rw_depth; 591 } 592 593 rc = rdma_accept(id, &ctrlr_event_data); 594 if (rc) { 595 SPDK_ERRLOG("Error %d on rdma_accept\n", errno); 596 } else { 597 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n"); 598 } 599 600 return rc; 601 } 602 603 static void 604 spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error) 605 { 606 struct spdk_nvmf_rdma_reject_private_data rej_data; 607 608 rej_data.recfmt = 0; 609 rej_data.sts = error; 610 611 rdma_reject(id, &rej_data, sizeof(rej_data)); 612 } 613 614 static int 615 nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event, 616 new_qpair_fn cb_fn) 617 { 618 struct spdk_nvmf_rdma_transport *rtransport; 619 struct spdk_nvmf_rdma_qpair *rqpair = NULL; 620 struct spdk_nvmf_rdma_port *port; 621 struct rdma_conn_param *rdma_param = NULL; 622 const struct 
spdk_nvmf_rdma_request_private_data *private_data = NULL; 623 uint16_t max_queue_depth; 624 uint16_t max_rw_depth; 625 626 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 627 628 assert(event->id != NULL); /* Impossible. Can't even reject the connection. */ 629 assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */ 630 631 rdma_param = &event->param.conn; 632 if (rdma_param->private_data == NULL || 633 rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) { 634 SPDK_ERRLOG("connect request: no private data provided\n"); 635 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH); 636 return -1; 637 } 638 639 private_data = rdma_param->private_data; 640 if (private_data->recfmt != 0) { 641 SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n"); 642 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT); 643 return -1; 644 } 645 646 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n", 647 event->id->verbs->device->name, event->id->verbs->device->dev_name); 648 649 port = event->listen_id->context; 650 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n", 651 event->listen_id, event->listen_id->verbs, port); 652 653 /* Figure out the supported queue depth. 
This is a multi-step process 654 * that takes into account hardware maximums, host provided values, 655 * and our target's internal memory limits */ 656 657 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n"); 658 659 /* Start with the maximum queue depth allowed by the target */ 660 max_queue_depth = rtransport->max_queue_depth; 661 max_rw_depth = rtransport->max_queue_depth; 662 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n", rtransport->max_queue_depth); 663 664 /* Next check the local NIC's hardware limitations */ 665 SPDK_DEBUGLOG(SPDK_LOG_RDMA, 666 "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n", 667 port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom); 668 max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr); 669 max_rw_depth = spdk_min(max_rw_depth, port->device->attr.max_qp_rd_atom); 670 671 /* Next check the remote NIC's hardware limitations */ 672 SPDK_DEBUGLOG(SPDK_LOG_RDMA, 673 "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n", 674 rdma_param->initiator_depth, rdma_param->responder_resources); 675 if (rdma_param->initiator_depth > 0) { 676 max_rw_depth = spdk_min(max_rw_depth, rdma_param->initiator_depth); 677 } 678 679 /* Finally check for the host software requested values, which are 680 * optional. 
*/ 681 if (rdma_param->private_data != NULL && 682 rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) { 683 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize); 684 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize); 685 max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize); 686 max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1); 687 } 688 689 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n", 690 max_queue_depth, max_rw_depth); 691 692 rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair)); 693 if (rqpair == NULL) { 694 SPDK_ERRLOG("Could not allocate new connection.\n"); 695 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 696 return -1; 697 } 698 699 rqpair->port = port; 700 rqpair->max_queue_depth = max_queue_depth; 701 rqpair->max_rw_depth = max_rw_depth; 702 rqpair->cm_id = event->id; 703 rqpair->qpair.transport = transport; 704 TAILQ_INIT(&rqpair->incoming_queue); 705 TAILQ_INIT(&rqpair->free_queue); 706 TAILQ_INIT(&rqpair->pending_rdma_rw_queue); 707 708 event->id->context = &rqpair->qpair; 709 710 cb_fn(&rqpair->qpair); 711 712 return 0; 713 } 714 715 static int 716 nvmf_rdma_disconnect(struct rdma_cm_event *evt) 717 { 718 struct spdk_nvmf_qpair *qpair; 719 720 if (evt->id == NULL) { 721 SPDK_ERRLOG("disconnect request: missing cm_id\n"); 722 return -1; 723 } 724 725 qpair = evt->id->context; 726 if (qpair == NULL) { 727 SPDK_ERRLOG("disconnect request: no active connection\n"); 728 return -1; 729 } 730 /* ack the disconnect event before rdma_destroy_id */ 731 rdma_ack_cm_event(evt); 732 733 spdk_nvmf_ctrlr_disconnect(qpair); 734 735 return 0; 736 } 737 738 #ifdef DEBUG 739 static const char *CM_EVENT_STR[] = { 740 "RDMA_CM_EVENT_ADDR_RESOLVED", 741 "RDMA_CM_EVENT_ADDR_ERROR", 742 "RDMA_CM_EVENT_ROUTE_RESOLVED", 743 "RDMA_CM_EVENT_ROUTE_ERROR", 744 
"RDMA_CM_EVENT_CONNECT_REQUEST", 745 "RDMA_CM_EVENT_CONNECT_RESPONSE", 746 "RDMA_CM_EVENT_CONNECT_ERROR", 747 "RDMA_CM_EVENT_UNREACHABLE", 748 "RDMA_CM_EVENT_REJECTED", 749 "RDMA_CM_EVENT_ESTABLISHED", 750 "RDMA_CM_EVENT_DISCONNECTED", 751 "RDMA_CM_EVENT_DEVICE_REMOVAL", 752 "RDMA_CM_EVENT_MULTICAST_JOIN", 753 "RDMA_CM_EVENT_MULTICAST_ERROR", 754 "RDMA_CM_EVENT_ADDR_CHANGE", 755 "RDMA_CM_EVENT_TIMEWAIT_EXIT" 756 }; 757 #endif /* DEBUG */ 758 759 static int 760 spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map, 761 enum spdk_mem_map_notify_action action, 762 void *vaddr, size_t size) 763 { 764 struct spdk_nvmf_rdma_device *device = cb_ctx; 765 struct ibv_pd *pd = device->pd; 766 struct ibv_mr *mr; 767 768 switch (action) { 769 case SPDK_MEM_MAP_NOTIFY_REGISTER: 770 mr = ibv_reg_mr(pd, vaddr, size, 771 IBV_ACCESS_LOCAL_WRITE | 772 IBV_ACCESS_REMOTE_READ | 773 IBV_ACCESS_REMOTE_WRITE); 774 if (mr == NULL) { 775 SPDK_ERRLOG("ibv_reg_mr() failed\n"); 776 return -1; 777 } else { 778 spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); 779 } 780 break; 781 case SPDK_MEM_MAP_NOTIFY_UNREGISTER: 782 mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, size); 783 spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); 784 if (mr) { 785 ibv_dereg_mr(mr); 786 } 787 break; 788 } 789 790 return 0; 791 } 792 793 typedef enum spdk_nvme_data_transfer spdk_nvme_data_transfer_t; 794 795 static spdk_nvme_data_transfer_t 796 spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req) 797 { 798 enum spdk_nvme_data_transfer xfer; 799 struct spdk_nvme_cmd *cmd = &rdma_req->req.cmd->nvme_cmd; 800 struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; 801 802 /* Figure out data transfer direction */ 803 if (cmd->opc == SPDK_NVME_OPC_FABRIC) { 804 xfer = spdk_nvme_opc_get_data_transfer(rdma_req->req.cmd->nvmf_cmd.fctype); 805 } else { 806 xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 807 808 /* Some admin commands are 
special cases */ 809 if ((rdma_req->req.qpair->qid == 0) && 810 ((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) || 811 (cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) { 812 switch (cmd->cdw10 & 0xff) { 813 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 814 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 815 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 816 break; 817 default: 818 xfer = SPDK_NVME_DATA_NONE; 819 } 820 } 821 } 822 823 if (xfer == SPDK_NVME_DATA_NONE) { 824 return xfer; 825 } 826 827 /* Even for commands that may transfer data, they could have specified 0 length. 828 * We want those to show up with xfer SPDK_NVME_DATA_NONE. 829 */ 830 switch (sgl->generic.type) { 831 case SPDK_NVME_SGL_TYPE_DATA_BLOCK: 832 case SPDK_NVME_SGL_TYPE_BIT_BUCKET: 833 case SPDK_NVME_SGL_TYPE_SEGMENT: 834 case SPDK_NVME_SGL_TYPE_LAST_SEGMENT: 835 case SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK: 836 if (sgl->unkeyed.length == 0) { 837 xfer = SPDK_NVME_DATA_NONE; 838 } 839 break; 840 case SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK: 841 if (sgl->keyed.length == 0) { 842 xfer = SPDK_NVME_DATA_NONE; 843 } 844 break; 845 } 846 847 return xfer; 848 } 849 850 static int 851 spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport, 852 struct spdk_nvmf_rdma_device *device, 853 struct spdk_nvmf_rdma_request *rdma_req) 854 { 855 void *buf = NULL; 856 uint32_t length = rdma_req->req.length; 857 uint32_t i = 0; 858 859 rdma_req->req.iovcnt = 0; 860 while (length) { 861 buf = spdk_mempool_get(rtransport->data_buf_pool); 862 if (!buf) { 863 goto nomem; 864 } 865 866 rdma_req->req.iov[i].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) & 867 ~NVMF_DATA_BUFFER_MASK); 868 rdma_req->req.iov[i].iov_len = spdk_min(length, rtransport->io_unit_size); 869 rdma_req->req.iovcnt++; 870 rdma_req->data.buffers[i] = buf; 871 rdma_req->data.wr.sg_list[i].addr = (uintptr_t)(rdma_req->req.iov[i].iov_base); 872 rdma_req->data.wr.sg_list[i].length = rdma_req->req.iov[i].iov_len; 873 
rdma_req->data.wr.sg_list[i].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map, 874 (uint64_t)buf, rdma_req->req.iov[i].iov_len))->lkey; 875 876 length -= rdma_req->req.iov[i].iov_len; 877 i++; 878 } 879 880 rdma_req->data_from_pool = true; 881 882 return 0; 883 884 nomem: 885 while (i) { 886 i--; 887 spdk_mempool_put(rtransport->data_buf_pool, rdma_req->req.iov[i].iov_base); 888 rdma_req->req.iov[i].iov_base = NULL; 889 rdma_req->req.iov[i].iov_len = 0; 890 891 rdma_req->data.wr.sg_list[i].addr = 0; 892 rdma_req->data.wr.sg_list[i].length = 0; 893 rdma_req->data.wr.sg_list[i].lkey = 0; 894 } 895 rdma_req->req.iovcnt = 0; 896 return -ENOMEM; 897 } 898 899 static int 900 spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport, 901 struct spdk_nvmf_rdma_device *device, 902 struct spdk_nvmf_rdma_request *rdma_req) 903 { 904 struct spdk_nvme_cmd *cmd; 905 struct spdk_nvme_cpl *rsp; 906 struct spdk_nvme_sgl_descriptor *sgl; 907 908 cmd = &rdma_req->req.cmd->nvme_cmd; 909 rsp = &rdma_req->req.rsp->nvme_cpl; 910 sgl = &cmd->dptr.sgl1; 911 912 if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK && 913 (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS || 914 sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) { 915 if (sgl->keyed.length > rtransport->max_io_size) { 916 SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", 917 sgl->keyed.length, rtransport->max_io_size); 918 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 919 return -1; 920 } 921 922 /* fill request length and populate iovs */ 923 rdma_req->req.length = sgl->keyed.length; 924 925 if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) { 926 /* No available buffers. Queue this request up. */ 927 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. 
Queueing request %p\n", rdma_req); 928 return 0; 929 } 930 931 /* backward compatible */ 932 rdma_req->req.data = rdma_req->req.iov[0].iov_base; 933 934 /* rdma wr specifics */ 935 rdma_req->data.wr.num_sge = rdma_req->req.iovcnt; 936 rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key; 937 rdma_req->data.wr.wr.rdma.remote_addr = sgl->address; 938 939 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, 940 rdma_req->req.iovcnt); 941 942 return 0; 943 } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && 944 sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { 945 uint64_t offset = sgl->address; 946 uint32_t max_len = rtransport->in_capsule_data_size; 947 948 SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", 949 offset, sgl->unkeyed.length); 950 951 if (offset > max_len) { 952 SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", 953 offset, max_len); 954 rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; 955 return -1; 956 } 957 max_len -= (uint32_t)offset; 958 959 if (sgl->unkeyed.length > max_len) { 960 SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", 961 sgl->unkeyed.length, max_len); 962 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 963 return -1; 964 } 965 966 rdma_req->req.data = rdma_req->recv->buf + offset; 967 rdma_req->data_from_pool = false; 968 rdma_req->req.length = sgl->unkeyed.length; 969 970 rdma_req->req.iov[0].iov_base = rdma_req->req.data; 971 rdma_req->req.iov[0].iov_len = rdma_req->req.length; 972 rdma_req->req.iovcnt = 1; 973 974 return 0; 975 } 976 977 SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", 978 sgl->generic.type, sgl->generic.subtype); 979 rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; 980 return -1; 981 } 982 983 static bool 984 spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, 985 struct spdk_nvmf_rdma_request *rdma_req) 986 { 987 
struct spdk_nvmf_rdma_qpair *rqpair; 988 struct spdk_nvmf_rdma_device *device; 989 struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; 990 int rc; 991 struct spdk_nvmf_rdma_recv *rdma_recv; 992 enum spdk_nvmf_rdma_request_state prev_state; 993 bool progress = false; 994 995 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 996 device = rqpair->port->device; 997 998 assert(rdma_req->state != RDMA_REQUEST_STATE_FREE); 999 1000 /* The loop here is to allow for several back-to-back state changes. */ 1001 do { 1002 prev_state = rdma_req->state; 1003 1004 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state); 1005 1006 switch (rdma_req->state) { 1007 case RDMA_REQUEST_STATE_FREE: 1008 /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW 1009 * to escape this state. */ 1010 break; 1011 case RDMA_REQUEST_STATE_NEW: 1012 rqpair->cur_queue_depth++; 1013 rdma_recv = rdma_req->recv; 1014 1015 /* The first element of the SGL is the NVMe command */ 1016 rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr; 1017 memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp)); 1018 1019 TAILQ_REMOVE(&rqpair->incoming_queue, rdma_recv, link); 1020 TAILQ_REMOVE(&rqpair->free_queue, rdma_req, link); 1021 1022 /* The next state transition depends on the data transfer needs of this request. */ 1023 rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req); 1024 1025 /* If no data to transfer, ready to execute. 
*/ 1026 if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { 1027 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 1028 break; 1029 } 1030 1031 rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER; 1032 TAILQ_INSERT_TAIL(&rqpair->ch->pending_data_buf_queue, rdma_req, link); 1033 break; 1034 case RDMA_REQUEST_STATE_NEED_BUFFER: 1035 assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE); 1036 1037 if (rdma_req != TAILQ_FIRST(&rqpair->ch->pending_data_buf_queue)) { 1038 /* This request needs to wait in line to obtain a buffer */ 1039 break; 1040 } 1041 1042 /* Try to get a data buffer */ 1043 rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req); 1044 if (rc < 0) { 1045 TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); 1046 rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1047 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1048 break; 1049 } 1050 1051 if (!rdma_req->req.data) { 1052 /* No buffers available. */ 1053 break; 1054 } 1055 1056 TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); 1057 1058 /* If data is transferring from host to controller and the data didn't 1059 * arrive using in capsule data, we need to do a transfer from the host. 
1060 */ 1061 if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool) { 1062 rdma_req->state = RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER; 1063 TAILQ_INSERT_TAIL(&rqpair->pending_rdma_rw_queue, rdma_req, link); 1064 break; 1065 } 1066 1067 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 1068 break; 1069 case RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER: 1070 if (rdma_req != TAILQ_FIRST(&rqpair->pending_rdma_rw_queue)) { 1071 /* This request needs to wait in line to perform RDMA */ 1072 break; 1073 } 1074 1075 if (rqpair->cur_rdma_rw_depth < rqpair->max_rw_depth) { 1076 TAILQ_REMOVE(&rqpair->pending_rdma_rw_queue, rdma_req, link); 1077 rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER; 1078 rc = request_transfer_in(&rdma_req->req); 1079 if (rc) { 1080 rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1081 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1082 } 1083 } 1084 break; 1085 case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 1086 /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE 1087 * to escape this state. */ 1088 break; 1089 case RDMA_REQUEST_STATE_READY_TO_EXECUTE: 1090 rdma_req->state = RDMA_REQUEST_STATE_EXECUTING; 1091 spdk_nvmf_request_exec(&rdma_req->req); 1092 break; 1093 case RDMA_REQUEST_STATE_EXECUTING: 1094 /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED 1095 * to escape this state. 
*/ 1096 break; 1097 case RDMA_REQUEST_STATE_EXECUTED: 1098 if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 1099 rdma_req->state = RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST; 1100 TAILQ_INSERT_TAIL(&rqpair->pending_rdma_rw_queue, rdma_req, link); 1101 } else { 1102 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1103 } 1104 break; 1105 case RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST: 1106 if (rdma_req != TAILQ_FIRST(&rqpair->pending_rdma_rw_queue)) { 1107 /* This request needs to wait in line to perform RDMA */ 1108 break; 1109 } 1110 1111 if (rqpair->cur_rdma_rw_depth < rqpair->max_rw_depth) { 1112 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1113 TAILQ_REMOVE(&rqpair->pending_rdma_rw_queue, rdma_req, link); 1114 } 1115 break; 1116 case RDMA_REQUEST_STATE_READY_TO_COMPLETE: 1117 rdma_req->state = RDMA_REQUEST_STATE_COMPLETING; 1118 1119 rc = request_transfer_out(&rdma_req->req); 1120 assert(rc == 0); /* No good way to handle this currently */ 1121 break; 1122 case RDMA_REQUEST_STATE_COMPLETING: 1123 /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 1124 * to escape this state. 
*/ 1125 break; 1126 case RDMA_REQUEST_STATE_COMPLETED: 1127 assert(rqpair->cur_queue_depth > 0); 1128 rqpair->cur_queue_depth--; 1129 1130 if (rdma_req->data_from_pool) { 1131 /* Put the buffer/s back in the pool */ 1132 for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) { 1133 spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]); 1134 rdma_req->req.iov[i].iov_base = NULL; 1135 rdma_req->data.buffers[i] = NULL; 1136 } 1137 rdma_req->data_from_pool = false; 1138 } 1139 rdma_req->req.length = 0; 1140 rdma_req->req.iovcnt = 0; 1141 rdma_req->req.data = NULL; 1142 rdma_req->state = RDMA_REQUEST_STATE_FREE; 1143 TAILQ_INSERT_TAIL(&rqpair->free_queue, rdma_req, link); 1144 break; 1145 } 1146 1147 if (rdma_req->state != prev_state) { 1148 progress = true; 1149 } 1150 } while (rdma_req->state != prev_state); 1151 1152 return progress; 1153 } 1154 1155 /* Public API callbacks begin here */ 1156 1157 static struct spdk_nvmf_transport * 1158 spdk_nvmf_rdma_create(struct spdk_nvmf_tgt *tgt) 1159 { 1160 int rc; 1161 struct spdk_nvmf_rdma_transport *rtransport; 1162 struct spdk_nvmf_rdma_device *device, *tmp; 1163 struct ibv_context **contexts; 1164 uint32_t i; 1165 int flag; 1166 uint32_t sge_count; 1167 1168 rtransport = calloc(1, sizeof(*rtransport)); 1169 if (!rtransport) { 1170 return NULL; 1171 } 1172 1173 pthread_mutex_init(&rtransport->lock, NULL); 1174 TAILQ_INIT(&rtransport->devices); 1175 TAILQ_INIT(&rtransport->ports); 1176 1177 rtransport->transport.tgt = tgt; 1178 rtransport->transport.ops = &spdk_nvmf_transport_rdma; 1179 1180 SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n"); 1181 1182 rtransport->max_queue_depth = tgt->opts.max_queue_depth; 1183 rtransport->max_io_size = tgt->opts.max_io_size; 1184 rtransport->io_unit_size = tgt->opts.io_unit_size; 1185 rtransport->in_capsule_data_size = tgt->opts.in_capsule_data_size; 1186 1187 /* I/O unit size cannot be larger than max I/O size */ 1188 if (rtransport->io_unit_size > 
rtransport->max_io_size) { 1189 rtransport->io_unit_size = rtransport->max_io_size; 1190 } 1191 1192 sge_count = rtransport->max_io_size / rtransport->io_unit_size; 1193 if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) { 1194 SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", rtransport->io_unit_size); 1195 free(rtransport); 1196 return NULL; 1197 } 1198 1199 rtransport->event_channel = rdma_create_event_channel(); 1200 if (rtransport->event_channel == NULL) { 1201 SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno)); 1202 free(rtransport); 1203 return NULL; 1204 } 1205 1206 flag = fcntl(rtransport->event_channel->fd, F_GETFL); 1207 if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { 1208 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", 1209 rtransport->event_channel->fd, spdk_strerror(errno)); 1210 free(rtransport); 1211 return NULL; 1212 } 1213 1214 rtransport->data_buf_pool = spdk_mempool_create("spdk_nvmf_rdma", 1215 rtransport->max_queue_depth * 4, /* The 4 is arbitrarily chosen. Needs to be configurable. 
*/ 1216 rtransport->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT, 1217 SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, 1218 SPDK_ENV_SOCKET_ID_ANY); 1219 if (!rtransport->data_buf_pool) { 1220 SPDK_ERRLOG("Unable to allocate buffer pool for poll group\n"); 1221 free(rtransport); 1222 return NULL; 1223 } 1224 1225 spdk_io_device_register(rtransport, spdk_nvmf_rdma_mgmt_channel_create, 1226 spdk_nvmf_rdma_mgmt_channel_destroy, 1227 sizeof(struct spdk_nvmf_rdma_mgmt_channel)); 1228 1229 contexts = rdma_get_devices(NULL); 1230 i = 0; 1231 rc = 0; 1232 while (contexts[i] != NULL) { 1233 device = calloc(1, sizeof(*device)); 1234 if (!device) { 1235 SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); 1236 rc = -ENOMEM; 1237 break; 1238 } 1239 device->context = contexts[i]; 1240 rc = ibv_query_device(device->context, &device->attr); 1241 if (rc < 0) { 1242 SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); 1243 free(device); 1244 break; 1245 1246 } 1247 /* set up device context async ev fd as NON_BLOCKING */ 1248 flag = fcntl(device->context->async_fd, F_GETFL); 1249 rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); 1250 if (rc < 0) { 1251 SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); 1252 free(device); 1253 break; 1254 } 1255 1256 device->pd = NULL; 1257 device->map = NULL; 1258 1259 TAILQ_INSERT_TAIL(&rtransport->devices, device, link); 1260 i++; 1261 } 1262 1263 if (rc < 0) { 1264 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 1265 TAILQ_REMOVE(&rtransport->devices, device, link); 1266 free(device); 1267 } 1268 spdk_mempool_free(rtransport->data_buf_pool); 1269 rdma_destroy_event_channel(rtransport->event_channel); 1270 free(rtransport); 1271 rdma_free_devices(contexts); 1272 return NULL; 1273 } else { 1274 /* Set up poll descriptor array to monitor events from RDMA and IB 1275 * in a single poll syscall 1276 */ 1277 rtransport->npoll_fds = i + 1; 1278 i = 0; 1279 rtransport->poll_fds = calloc(rtransport->npoll_fds, 
sizeof(struct pollfd)); 1280 rtransport->poll_fds[i].fd = rtransport->event_channel->fd; 1281 rtransport->poll_fds[i++].events = POLLIN; 1282 1283 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 1284 rtransport->poll_fds[i].fd = device->context->async_fd; 1285 rtransport->poll_fds[i++].events = POLLIN; 1286 } 1287 } 1288 1289 rdma_free_devices(contexts); 1290 1291 return &rtransport->transport; 1292 } 1293 1294 static int 1295 spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) 1296 { 1297 struct spdk_nvmf_rdma_transport *rtransport; 1298 struct spdk_nvmf_rdma_port *port, *port_tmp; 1299 struct spdk_nvmf_rdma_device *device, *device_tmp; 1300 1301 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1302 1303 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { 1304 TAILQ_REMOVE(&rtransport->ports, port, link); 1305 rdma_destroy_id(port->id); 1306 free(port); 1307 } 1308 1309 if (rtransport->poll_fds != NULL) { 1310 free(rtransport->poll_fds); 1311 } 1312 1313 if (rtransport->event_channel != NULL) { 1314 rdma_destroy_event_channel(rtransport->event_channel); 1315 } 1316 1317 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { 1318 TAILQ_REMOVE(&rtransport->devices, device, link); 1319 if (device->map) { 1320 spdk_mem_map_free(&device->map); 1321 } 1322 free(device); 1323 } 1324 1325 if (spdk_mempool_count(rtransport->data_buf_pool) != (rtransport->max_queue_depth * 4)) { 1326 SPDK_ERRLOG("transport buffer pool count is %zu but should be %u\n", 1327 spdk_mempool_count(rtransport->data_buf_pool), 1328 rtransport->max_queue_depth * 4); 1329 } 1330 1331 spdk_mempool_free(rtransport->data_buf_pool); 1332 spdk_io_device_unregister(rtransport, NULL); 1333 free(rtransport); 1334 1335 return 0; 1336 } 1337 1338 static int 1339 spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport, 1340 const struct spdk_nvme_transport_id *trid) 1341 { 1342 struct spdk_nvmf_rdma_transport *rtransport; 
struct spdk_nvmf_rdma_device *device;
	struct spdk_nvmf_rdma_port *port_tmp, *port;
	struct addrinfo *res;
	struct addrinfo hints;
	int family;
	int rc;

	/*
	 * Start listening on the address in trid. If a port with the same transport id
	 * already exists, only its reference count is bumped. Returns 0 on success and a
	 * negative value on failure. Every failure path after the lock is taken leaves
	 * through the labels at the bottom, so the lock is released and the CM id and
	 * port are not leaked.
	 */
	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	port = calloc(1, sizeof(*port));
	if (!port) {
		return -ENOMEM;
	}

	/* Selectively copy the trid. Things like NQN don't matter here - that
	 * mapping is enforced elsewhere.
	 */
	port->trid.trtype = SPDK_NVME_TRANSPORT_RDMA;
	port->trid.adrfam = trid->adrfam;
	snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr);
	snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid);

	pthread_mutex_lock(&rtransport->lock);
	assert(rtransport->event_channel != NULL);
	TAILQ_FOREACH(port_tmp, &rtransport->ports, link) {
		if (spdk_nvme_transport_id_compare(&port_tmp->trid, &port->trid) == 0) {
			port_tmp->ref++;
			free(port);
			/* Already listening at this address */
			pthread_mutex_unlock(&rtransport->lock);
			return 0;
		}
	}

	rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP);
	if (rc < 0) {
		SPDK_ERRLOG("rdma_create_id() failed\n");
		goto err_free_port;
	}

	switch (port->trid.adrfam) {
	case SPDK_NVMF_ADRFAM_IPV4:
		family = AF_INET;
		break;
	case SPDK_NVMF_ADRFAM_IPV6:
		family = AF_INET6;
		break;
	default:
		SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam);
		rc = -EINVAL;
		goto err_destroy_id;
	}

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = family;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_protocol = 0;

	rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res);
	if (rc) {
		SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc);
		rc = -EINVAL;
		goto err_destroy_id;
	}

	rc = rdma_bind_addr(port->id, res->ai_addr);
	freeaddrinfo(res);

	if (rc < 0) {
		SPDK_ERRLOG("rdma_bind_addr() failed\n");
		goto err_destroy_id;
	}

	rc = rdma_listen(port->id, 10); /* 10 = backlog */
	if (rc < 0) {
		SPDK_ERRLOG("rdma_listen() failed\n");
		goto err_destroy_id;
	}

	/* Find the device entry whose verbs context the CM id was bound to. */
	TAILQ_FOREACH(device, &rtransport->devices, link) {
		if (device->context == port->id->verbs) {
			port->device = device;
			break;
		}
	}
	if (!port->device) {
		SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n",
			    port->id->verbs);
		rc = -EINVAL;
		goto err_destroy_id;
	}

	device = port->device;
	if (!device->map) {
		/* First listener on this device: adopt the id's protection domain and
		 * build the memory map. The pd is assigned before the map is allocated,
		 * as in the original ordering.
		 */
		device->pd = port->id->pd;
		device->map = spdk_mem_map_alloc(0, spdk_nvmf_rdma_mem_notify, device);
		if (!device->map) {
			SPDK_ERRLOG("Unable to allocate memory map for new poll group\n");
			/* The id that owns this pd is about to be destroyed. */
			device->pd = NULL;
			rc = -1;
			goto err_destroy_id;
		}
	} else {
		assert(device->pd == port->id->pd);
	}

	SPDK_INFOLOG(SPDK_LOG_RDMA, "*** NVMf Target Listening on %s port %d ***\n",
		     port->trid.traddr, ntohs(rdma_get_src_port(port->id)));

	port->ref = 1;

	TAILQ_INSERT_TAIL(&rtransport->ports, port, link);
	pthread_mutex_unlock(&rtransport->lock);

	return 0;

err_destroy_id:
	rdma_destroy_id(port->id);
err_free_port:
	free(port);
	pthread_mutex_unlock(&rtransport->lock);
	return rc;
}

static int
spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport,
			   const struct spdk_nvme_transport_id *_trid)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_port *port, *tmp;
	struct spdk_nvme_transport_id trid = {};

rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1478 1479 /* Selectively copy the trid. Things like NQN don't matter here - that 1480 * mapping is enforced elsewhere. 1481 */ 1482 trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 1483 trid.adrfam = _trid->adrfam; 1484 snprintf(trid.traddr, sizeof(port->trid.traddr), "%s", _trid->traddr); 1485 snprintf(trid.trsvcid, sizeof(port->trid.trsvcid), "%s", _trid->trsvcid); 1486 1487 pthread_mutex_lock(&rtransport->lock); 1488 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { 1489 if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) { 1490 assert(port->ref > 0); 1491 port->ref--; 1492 if (port->ref == 0) { 1493 TAILQ_REMOVE(&rtransport->ports, port, link); 1494 rdma_destroy_id(port->id); 1495 free(port); 1496 } 1497 break; 1498 } 1499 } 1500 1501 pthread_mutex_unlock(&rtransport->lock); 1502 return 0; 1503 } 1504 1505 static void 1506 spdk_nvmf_process_cm_event(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 1507 { 1508 struct spdk_nvmf_rdma_transport *rtransport; 1509 struct rdma_cm_event *event; 1510 int rc; 1511 1512 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1513 1514 if (rtransport->event_channel == NULL) { 1515 return; 1516 } 1517 1518 while (1) { 1519 rc = rdma_get_cm_event(rtransport->event_channel, &event); 1520 if (rc == 0) { 1521 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); 1522 1523 switch (event->event) { 1524 case RDMA_CM_EVENT_ADDR_RESOLVED: 1525 case RDMA_CM_EVENT_ADDR_ERROR: 1526 case RDMA_CM_EVENT_ROUTE_RESOLVED: 1527 case RDMA_CM_EVENT_ROUTE_ERROR: 1528 /* No action required. The target never attempts to resolve routes. */ 1529 break; 1530 case RDMA_CM_EVENT_CONNECT_REQUEST: 1531 rc = nvmf_rdma_connect(transport, event, cb_fn); 1532 if (rc < 0) { 1533 SPDK_ERRLOG("Unable to process connect event. 
rc: %d\n", rc); 1534 break; 1535 } 1536 break; 1537 case RDMA_CM_EVENT_CONNECT_RESPONSE: 1538 /* The target never initiates a new connection. So this will not occur. */ 1539 break; 1540 case RDMA_CM_EVENT_CONNECT_ERROR: 1541 /* Can this happen? The docs say it can, but not sure what causes it. */ 1542 break; 1543 case RDMA_CM_EVENT_UNREACHABLE: 1544 case RDMA_CM_EVENT_REJECTED: 1545 /* These only occur on the client side. */ 1546 break; 1547 case RDMA_CM_EVENT_ESTABLISHED: 1548 /* TODO: Should we be waiting for this event anywhere? */ 1549 break; 1550 case RDMA_CM_EVENT_DISCONNECTED: 1551 case RDMA_CM_EVENT_DEVICE_REMOVAL: 1552 rc = nvmf_rdma_disconnect(event); 1553 if (rc < 0) { 1554 SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); 1555 break; 1556 } 1557 continue; 1558 case RDMA_CM_EVENT_MULTICAST_JOIN: 1559 case RDMA_CM_EVENT_MULTICAST_ERROR: 1560 /* Multicast is not used */ 1561 break; 1562 case RDMA_CM_EVENT_ADDR_CHANGE: 1563 /* Not utilizing this event */ 1564 break; 1565 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 1566 /* For now, do nothing. The target never re-uses queue pairs. 
*/ 1567 break; 1568 default: 1569 SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); 1570 break; 1571 } 1572 1573 rdma_ack_cm_event(event); 1574 } else { 1575 if (errno != EAGAIN && errno != EWOULDBLOCK) { 1576 SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); 1577 } 1578 break; 1579 } 1580 } 1581 } 1582 1583 static void 1584 spdk_nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) 1585 { 1586 int rc; 1587 struct ibv_async_event event; 1588 1589 rc = ibv_get_async_event(device->context, &event); 1590 1591 if (rc) { 1592 SPDK_ERRLOG("Failed to get async_event (%d): %s\n", 1593 errno, spdk_strerror(errno)); 1594 return; 1595 } 1596 1597 SPDK_NOTICELOG("Async event: %s\n", 1598 ibv_event_type_str(event.event_type)); 1599 ibv_ack_async_event(&event); 1600 } 1601 1602 static void 1603 spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 1604 { 1605 int nfds, i = 0; 1606 struct spdk_nvmf_rdma_transport *rtransport; 1607 struct spdk_nvmf_rdma_device *device, *tmp; 1608 1609 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1610 nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); 1611 1612 if (nfds <= 0) { 1613 return; 1614 } 1615 1616 /* The first poll descriptor is RDMA CM event */ 1617 if (rtransport->poll_fds[i++].revents & POLLIN) { 1618 spdk_nvmf_process_cm_event(transport, cb_fn); 1619 nfds--; 1620 } 1621 1622 if (nfds == 0) { 1623 return; 1624 } 1625 1626 /* Second and subsequent poll descriptors are IB async events */ 1627 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 1628 if (rtransport->poll_fds[i++].revents & POLLIN) { 1629 spdk_nvmf_process_ib_event(device); 1630 nfds--; 1631 } 1632 } 1633 /* check all flagged fd's have been served */ 1634 assert(nfds == 0); 1635 } 1636 1637 static void 1638 spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport, 1639 struct spdk_nvme_transport_id *trid, 1640 struct 
spdk_nvmf_discovery_log_page_entry *entry) 1641 { 1642 entry->trtype = SPDK_NVMF_TRTYPE_RDMA; 1643 entry->adrfam = trid->adrfam; 1644 entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED; 1645 1646 spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); 1647 spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); 1648 1649 entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; 1650 entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; 1651 entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; 1652 } 1653 1654 static struct spdk_nvmf_transport_poll_group * 1655 spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) 1656 { 1657 struct spdk_nvmf_rdma_transport *rtransport; 1658 struct spdk_nvmf_rdma_poll_group *rgroup; 1659 struct spdk_nvmf_rdma_poller *poller; 1660 struct spdk_nvmf_rdma_device *device; 1661 1662 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1663 1664 rgroup = calloc(1, sizeof(*rgroup)); 1665 if (!rgroup) { 1666 return NULL; 1667 } 1668 1669 TAILQ_INIT(&rgroup->pollers); 1670 1671 pthread_mutex_lock(&rtransport->lock); 1672 TAILQ_FOREACH(device, &rtransport->devices, link) { 1673 if (device->map == NULL) { 1674 /* 1675 * The device is not in use (no listeners), 1676 * so no protection domain has been constructed. 1677 * Skip it. 
1678 */ 1679 SPDK_NOTICELOG("Skipping unused RDMA device when creating poll group.\n"); 1680 continue; 1681 } 1682 1683 poller = calloc(1, sizeof(*poller)); 1684 if (!poller) { 1685 SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); 1686 free(rgroup); 1687 pthread_mutex_unlock(&rtransport->lock); 1688 return NULL; 1689 } 1690 1691 poller->device = device; 1692 poller->group = rgroup; 1693 1694 TAILQ_INIT(&poller->qpairs); 1695 1696 poller->cq = ibv_create_cq(device->context, NVMF_RDMA_CQ_SIZE, poller, NULL, 0); 1697 if (!poller->cq) { 1698 SPDK_ERRLOG("Unable to create completion queue\n"); 1699 free(poller); 1700 free(rgroup); 1701 pthread_mutex_unlock(&rtransport->lock); 1702 return NULL; 1703 } 1704 1705 TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); 1706 } 1707 1708 pthread_mutex_unlock(&rtransport->lock); 1709 return &rgroup->group; 1710 } 1711 1712 static void 1713 spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 1714 { 1715 struct spdk_nvmf_rdma_poll_group *rgroup; 1716 struct spdk_nvmf_rdma_poller *poller, *tmp; 1717 1718 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 1719 1720 if (!rgroup) { 1721 return; 1722 } 1723 1724 TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { 1725 TAILQ_REMOVE(&rgroup->pollers, poller, link); 1726 1727 if (poller->cq) { 1728 ibv_destroy_cq(poller->cq); 1729 } 1730 1731 free(poller); 1732 } 1733 1734 free(rgroup); 1735 } 1736 1737 static int 1738 spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 1739 struct spdk_nvmf_qpair *qpair) 1740 { 1741 struct spdk_nvmf_rdma_transport *rtransport; 1742 struct spdk_nvmf_rdma_poll_group *rgroup; 1743 struct spdk_nvmf_rdma_qpair *rqpair; 1744 struct spdk_nvmf_rdma_device *device; 1745 struct spdk_nvmf_rdma_poller *poller; 1746 int rc; 1747 1748 rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 1749 rgroup = SPDK_CONTAINEROF(group, struct 
spdk_nvmf_rdma_poll_group, group); 1750 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 1751 1752 device = rqpair->port->device; 1753 1754 if (device->pd != rqpair->cm_id->pd) { 1755 SPDK_ERRLOG("Mismatched protection domains\n"); 1756 return -1; 1757 } 1758 1759 TAILQ_FOREACH(poller, &rgroup->pollers, link) { 1760 if (poller->device == device) { 1761 break; 1762 } 1763 } 1764 1765 if (!poller) { 1766 SPDK_ERRLOG("No poller found for device.\n"); 1767 return -1; 1768 } 1769 1770 TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); 1771 rqpair->poller = poller; 1772 1773 rc = spdk_nvmf_rdma_qpair_initialize(qpair); 1774 if (rc < 0) { 1775 SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); 1776 return -1; 1777 } 1778 1779 rqpair->mgmt_channel = spdk_get_io_channel(rtransport); 1780 if (!rqpair->mgmt_channel) { 1781 spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 1782 spdk_nvmf_rdma_qpair_destroy(rqpair); 1783 return -1; 1784 } 1785 1786 rqpair->ch = spdk_io_channel_get_ctx(rqpair->mgmt_channel); 1787 assert(rqpair->ch != NULL); 1788 1789 rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair); 1790 if (rc) { 1791 /* Try to reject, but we probably can't */ 1792 spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 1793 spdk_nvmf_rdma_qpair_destroy(rqpair); 1794 return -1; 1795 } 1796 1797 return 0; 1798 } 1799 1800 static int 1801 spdk_nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 1802 struct spdk_nvmf_qpair *qpair) 1803 { 1804 struct spdk_nvmf_rdma_poll_group *rgroup; 1805 struct spdk_nvmf_rdma_qpair *rqpair; 1806 struct spdk_nvmf_rdma_device *device; 1807 struct spdk_nvmf_rdma_poller *poller; 1808 struct spdk_nvmf_rdma_qpair *rq, *trq; 1809 1810 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 1811 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 1812 1813 device = rqpair->port->device; 1814 
1815 TAILQ_FOREACH(poller, &rgroup->pollers, link) { 1816 if (poller->device == device) { 1817 break; 1818 } 1819 } 1820 1821 if (!poller) { 1822 SPDK_ERRLOG("No poller found for device.\n"); 1823 return -1; 1824 } 1825 1826 TAILQ_FOREACH_SAFE(rq, &poller->qpairs, link, trq) { 1827 if (rq == rqpair) { 1828 TAILQ_REMOVE(&poller->qpairs, rqpair, link); 1829 break; 1830 } 1831 } 1832 1833 if (rq == NULL) { 1834 SPDK_ERRLOG("RDMA qpair cannot be removed from group (not in group).\n"); 1835 return -1; 1836 } 1837 1838 return 0; 1839 } 1840 1841 static int 1842 spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req) 1843 { 1844 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 1845 struct spdk_nvmf_rdma_transport, transport); 1846 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 1847 1848 rdma_req->state = RDMA_REQUEST_STATE_EXECUTED; 1849 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 1850 1851 return 0; 1852 } 1853 1854 static void 1855 spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) 1856 { 1857 spdk_nvmf_rdma_qpair_destroy(SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair)); 1858 } 1859 1860 static void 1861 spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, 1862 struct spdk_nvmf_rdma_qpair *rqpair) 1863 { 1864 struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp; 1865 struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; 1866 1867 /* We process I/O in the pending_rdma_rw queue at the highest priority. */ 1868 TAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_rw_queue, link, req_tmp) { 1869 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 1870 break; 1871 } 1872 } 1873 1874 /* The second highest priority is I/O waiting on memory buffers. 
*/ 1875 TAILQ_FOREACH_SAFE(rdma_req, &rqpair->ch->pending_data_buf_queue, link, req_tmp) { 1876 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 1877 break; 1878 } 1879 } 1880 1881 /* The lowest priority is processing newly received commands */ 1882 TAILQ_FOREACH_SAFE(rdma_recv, &rqpair->incoming_queue, link, recv_tmp) { 1883 rdma_req = TAILQ_FIRST(&rqpair->free_queue); 1884 if (rdma_req == NULL) { 1885 /* Need to wait for more SEND completions */ 1886 break; 1887 } 1888 1889 rdma_req->recv = rdma_recv; 1890 rdma_req->state = RDMA_REQUEST_STATE_NEW; 1891 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 1892 break; 1893 } 1894 } 1895 } 1896 1897 static struct spdk_nvmf_rdma_request * 1898 get_rdma_req_from_wc(struct ibv_wc *wc) 1899 { 1900 struct spdk_nvmf_rdma_request *rdma_req; 1901 1902 rdma_req = (struct spdk_nvmf_rdma_request *)wc->wr_id; 1903 assert(rdma_req != NULL); 1904 1905 #ifdef DEBUG 1906 struct spdk_nvmf_rdma_qpair *rqpair; 1907 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 1908 1909 assert(rdma_req - rqpair->reqs >= 0); 1910 assert(rdma_req - rqpair->reqs < (ptrdiff_t)rqpair->max_queue_depth); 1911 #endif 1912 1913 return rdma_req; 1914 } 1915 1916 static struct spdk_nvmf_rdma_recv * 1917 get_rdma_recv_from_wc(struct ibv_wc *wc) 1918 { 1919 struct spdk_nvmf_rdma_recv *rdma_recv; 1920 1921 assert(wc->byte_len >= sizeof(struct spdk_nvmf_capsule_cmd)); 1922 1923 rdma_recv = (struct spdk_nvmf_rdma_recv *)wc->wr_id; 1924 assert(rdma_recv != NULL); 1925 1926 #ifdef DEBUG 1927 struct spdk_nvmf_rdma_qpair *rqpair = rdma_recv->qpair; 1928 1929 assert(rdma_recv - rqpair->recvs >= 0); 1930 assert(rdma_recv - rqpair->recvs < (ptrdiff_t)rqpair->max_queue_depth); 1931 #endif 1932 1933 return rdma_recv; 1934 } 1935 1936 static int 1937 spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, 1938 struct spdk_nvmf_rdma_poller *rpoller) 1939 { 1940 struct ibv_wc wc[32]; 1941 
struct spdk_nvmf_rdma_request *rdma_req; 1942 struct spdk_nvmf_rdma_recv *rdma_recv; 1943 struct spdk_nvmf_rdma_qpair *rqpair; 1944 int reaped, i; 1945 int count = 0; 1946 bool error = false; 1947 1948 /* Poll for completing operations. */ 1949 reaped = ibv_poll_cq(rpoller->cq, 32, wc); 1950 if (reaped < 0) { 1951 SPDK_ERRLOG("Error polling CQ! (%d): %s\n", 1952 errno, spdk_strerror(errno)); 1953 return -1; 1954 } 1955 1956 for (i = 0; i < reaped; i++) { 1957 if (wc[i].status) { 1958 SPDK_ERRLOG("CQ error on CQ %p, Request 0x%lu (%d): %s\n", 1959 rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); 1960 error = true; 1961 continue; 1962 } 1963 1964 switch (wc[i].opcode) { 1965 case IBV_WC_SEND: 1966 rdma_req = get_rdma_req_from_wc(&wc[i]); 1967 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 1968 1969 assert(rdma_req->state == RDMA_REQUEST_STATE_COMPLETING); 1970 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 1971 1972 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 1973 1974 count++; 1975 1976 /* Try to process other queued requests */ 1977 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 1978 break; 1979 1980 case IBV_WC_RDMA_WRITE: 1981 rdma_req = get_rdma_req_from_wc(&wc[i]); 1982 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 1983 1984 rqpair->cur_rdma_rw_depth--; 1985 1986 /* Try to process other queued requests */ 1987 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 1988 break; 1989 1990 case IBV_WC_RDMA_READ: 1991 rdma_req = get_rdma_req_from_wc(&wc[i]); 1992 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 1993 1994 assert(rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); 1995 rqpair->cur_rdma_rw_depth--; 1996 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 1997 1998 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 1999 2000 /* Try to process other queued requests */ 
2001 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 2002 break; 2003 2004 case IBV_WC_RECV: 2005 rdma_recv = get_rdma_recv_from_wc(&wc[i]); 2006 rqpair = rdma_recv->qpair; 2007 2008 TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link); 2009 2010 /* Try to process other queued requests */ 2011 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 2012 break; 2013 2014 default: 2015 SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); 2016 continue; 2017 } 2018 } 2019 2020 if (error == true) { 2021 return -1; 2022 } 2023 2024 return count; 2025 } 2026 2027 static int 2028 spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 2029 { 2030 struct spdk_nvmf_rdma_transport *rtransport; 2031 struct spdk_nvmf_rdma_poll_group *rgroup; 2032 struct spdk_nvmf_rdma_poller *rpoller; 2033 int count, rc; 2034 2035 rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); 2036 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 2037 2038 count = 0; 2039 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 2040 rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller); 2041 if (rc < 0) { 2042 return rc; 2043 } 2044 count += rc; 2045 } 2046 2047 return count; 2048 } 2049 2050 static bool 2051 spdk_nvmf_rdma_qpair_is_idle(struct spdk_nvmf_qpair *qpair) 2052 { 2053 struct spdk_nvmf_rdma_qpair *rqpair; 2054 2055 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2056 2057 if (rqpair->cur_queue_depth == 0 && rqpair->cur_rdma_rw_depth == 0) { 2058 return true; 2059 } 2060 return false; 2061 } 2062 2063 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { 2064 .type = SPDK_NVME_TRANSPORT_RDMA, 2065 .create = spdk_nvmf_rdma_create, 2066 .destroy = spdk_nvmf_rdma_destroy, 2067 2068 .listen = spdk_nvmf_rdma_listen, 2069 .stop_listen = spdk_nvmf_rdma_stop_listen, 2070 .accept = spdk_nvmf_rdma_accept, 2071 2072 .listener_discover = 
spdk_nvmf_rdma_discover, 2073 2074 .poll_group_create = spdk_nvmf_rdma_poll_group_create, 2075 .poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy, 2076 .poll_group_add = spdk_nvmf_rdma_poll_group_add, 2077 .poll_group_remove = spdk_nvmf_rdma_poll_group_remove, 2078 .poll_group_poll = spdk_nvmf_rdma_poll_group_poll, 2079 2080 .req_complete = spdk_nvmf_rdma_request_complete, 2081 2082 .qpair_fini = spdk_nvmf_rdma_close_qpair, 2083 .qpair_is_idle = spdk_nvmf_rdma_qpair_is_idle, 2084 2085 }; 2086 2087 SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA) 2088