/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

#include "nvmf_internal.h"
#include "transport.h"

#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf.h"
#include "spdk/nvmf_spec.h"
#include "spdk/string.h"
#include "spdk/trace.h"
#include "spdk/util.h"

#include "spdk_internal/log.h"

/*
 * RDMA Connection Resource Defaults
 */
#define NVMF_DEFAULT_TX_SGE	1
#define NVMF_DEFAULT_RX_SGE	2
#define NVMF_DEFAULT_DATA_SGE	16

/* The RDMA completion queue size */
#define NVMF_RDMA_CQ_SIZE	4096

/* The AIO backend requires block size aligned data buffers;
 * an extra 4KiB-aligned data buffer should work for most devices.
 */
#define SHIFT_4KB			12
#define NVMF_DATA_BUFFER_ALIGNMENT	(1 << SHIFT_4KB)
#define NVMF_DATA_BUFFER_MASK		(NVMF_DATA_BUFFER_ALIGNMENT - 1)

enum spdk_nvmf_rdma_request_state {
	/* The request is not currently in use */
	RDMA_REQUEST_STATE_FREE = 0,

	/* Initial state when request first received */
	RDMA_REQUEST_STATE_NEW,

	/* The request is queued until a data buffer is available. */
	RDMA_REQUEST_STATE_NEED_BUFFER,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the host to the controller.
	 */
	RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER,

	/* The request is currently transferring data from the host to the controller. */
	RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,

	/* The request is ready to execute at the block device */
	RDMA_REQUEST_STATE_READY_TO_EXECUTE,

	/* The request is currently executing at the block device */
	RDMA_REQUEST_STATE_EXECUTING,

	/* The request finished executing at the block device */
	RDMA_REQUEST_STATE_EXECUTED,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the controller to the host.
	 */
	RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST,

	/* The request is ready to send a completion */
	RDMA_REQUEST_STATE_READY_TO_COMPLETE,

	/* The request currently has a completion outstanding */
	RDMA_REQUEST_STATE_COMPLETING,

	/* The request completed and can be marked free. */
	RDMA_REQUEST_STATE_COMPLETED,
};
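
/*
 * Summary of the state machine above, as walked by
 * spdk_nvmf_rdma_request_process() later in this file. A typical
 * controller-to-host (read) request moves roughly:
 *
 *   FREE -> NEW -> NEED_BUFFER -> READY_TO_EXECUTE -> EXECUTING -> EXECUTED
 *        -> TRANSFER_PENDING_CONTROLLER_TO_HOST -> READY_TO_COMPLETE
 *        -> COMPLETING -> COMPLETED -> FREE
 *
 * while a host-to-controller (write) request that did not use in-capsule
 * data performs its RDMA READ before execution via
 * TRANSFER_PENDING/TRANSFERRING_HOST_TO_CONTROLLER instead.
 */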

#define OBJECT_NVMF_RDMA_IO	0x40

#define TRACE_GROUP_NVMF_RDMA	0x4
#define TRACE_RDMA_REQUEST_STATE_NEW					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0)
#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1)
#define TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE			SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4)
#define TRACE_RDMA_REQUEST_STATE_EXECUTING				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5)
#define TRACE_RDMA_REQUEST_STATE_EXECUTED				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6)
#define TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE			SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8)
#define TRACE_RDMA_REQUEST_STATE_COMPLETING				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9)
#define TRACE_RDMA_REQUEST_STATE_COMPLETED				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA)

SPDK_TRACE_REGISTER_FN(nvmf_trace)
{
	spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r');
	spdk_trace_register_description("RDMA_REQ_NEW", "",
					TRACE_RDMA_REQUEST_STATE_NEW,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 0, 0, "");
	spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", "",
					TRACE_RDMA_REQUEST_STATE_NEED_BUFFER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, "");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_H_TO_C", "",
					TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, "");
	spdk_trace_register_description("RDMA_REQ_TX_H_TO_C", "",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, "");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE", "",
					TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, "");
	spdk_trace_register_description("RDMA_REQ_EXECUTING", "",
					TRACE_RDMA_REQUEST_STATE_EXECUTING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, "");
	spdk_trace_register_description("RDMA_REQ_EXECUTED", "",
					TRACE_RDMA_REQUEST_STATE_EXECUTED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, "");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_C_TO_H", "",
					TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, "");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPLETE", "",
					TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, "");
	spdk_trace_register_description("RDMA_REQ_COMPLETING", "",
					TRACE_RDMA_REQUEST_STATE_COMPLETING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, "");
	spdk_trace_register_description("RDMA_REQ_COMPLETED", "",
					TRACE_RDMA_REQUEST_STATE_COMPLETED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, "");
}

/* This structure holds commands as they are received off the wire.
 * It must be dynamically paired with a full request object
 * (spdk_nvmf_rdma_request) to service a request. It is separate
 * from the request because RDMA does not appear to order
 * completions, so occasionally we'll get a new incoming
 * command when there aren't any free request objects.
 */
struct spdk_nvmf_rdma_recv {
	struct ibv_recv_wr wr;
	struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE];

	struct spdk_nvmf_rdma_qpair *qpair;

	/* In-capsule data buffer */
	uint8_t *buf;

	TAILQ_ENTRY(spdk_nvmf_rdma_recv) link;
};

struct spdk_nvmf_rdma_request {
	struct spdk_nvmf_request req;
	bool data_from_pool;

	enum spdk_nvmf_rdma_request_state state;

	struct spdk_nvmf_rdma_recv *recv;

	struct {
		struct ibv_send_wr wr;
		struct ibv_sge sgl[NVMF_DEFAULT_TX_SGE];
	} rsp;

	struct {
		struct ibv_send_wr wr;
		struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
		void *buffers[SPDK_NVMF_MAX_SGL_ENTRIES];
	} data;

	TAILQ_ENTRY(spdk_nvmf_rdma_request) link;
};

struct spdk_nvmf_rdma_qpair {
	struct spdk_nvmf_qpair qpair;

	struct spdk_nvmf_rdma_port *port;
	struct spdk_nvmf_rdma_poller *poller;

	struct rdma_cm_id *cm_id;

	/* The maximum number of I/O outstanding on this connection at one time */
	uint16_t max_queue_depth;

	/* The maximum number of active RDMA READ and WRITE operations at one time */
	uint16_t max_rw_depth;

	/* The current number of I/O outstanding on this connection. This number
	 * includes all I/O from the time the capsule is first received until it is
	 * completed.
	 */
	uint16_t cur_queue_depth;

	/* The number of RDMA READ and WRITE requests that are outstanding */
	uint16_t cur_rdma_rw_depth;

	/* Receives that are waiting for a request object */
	TAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue;

	/* Requests that are not in use */
	TAILQ_HEAD(, spdk_nvmf_rdma_request) free_queue;

	/* Requests that are waiting to perform an RDMA READ or WRITE */
	TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_rw_queue;

	/* Array of size "max_queue_depth" containing RDMA requests. */
	struct spdk_nvmf_rdma_request *reqs;

	/* Array of size "max_queue_depth" containing RDMA recvs. */
	struct spdk_nvmf_rdma_recv *recvs;

	/* Array of size "max_queue_depth" containing 64 byte capsules
	 * used for receive.
	 */
	union nvmf_h2c_msg *cmds;
	struct ibv_mr *cmds_mr;

	/* Array of size "max_queue_depth" containing 16 byte completions
	 * to be sent back to the user.
	 */
	union nvmf_c2h_msg *cpls;
	struct ibv_mr *cpls_mr;

	/* Array of size "max_queue_depth * InCapsuleDataSize" containing
	 * buffers to be used for in capsule data.
	 */
	void *bufs;
	struct ibv_mr *bufs_mr;

	TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link;
	TAILQ_ENTRY(spdk_nvmf_rdma_qpair) pending_link;

	/* Mgmt channel */
	struct spdk_io_channel *mgmt_channel;
	struct spdk_nvmf_rdma_mgmt_channel *ch;
};

struct spdk_nvmf_rdma_poller {
	struct spdk_nvmf_rdma_device *device;
	struct spdk_nvmf_rdma_poll_group *group;

	struct ibv_cq *cq;

	TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs;

	TAILQ_ENTRY(spdk_nvmf_rdma_poller) link;
};

struct spdk_nvmf_rdma_poll_group {
	struct spdk_nvmf_transport_poll_group group;

	TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers;
};

/* Assuming rdma_cm uses just one protection domain per ibv_context. */
struct spdk_nvmf_rdma_device {
	struct ibv_device_attr attr;
	struct ibv_context *context;

	struct spdk_mem_map *map;
	struct ibv_pd *pd;

	TAILQ_ENTRY(spdk_nvmf_rdma_device) link;
};

struct spdk_nvmf_rdma_port {
	struct spdk_nvme_transport_id trid;
	struct rdma_cm_id *id;
	struct spdk_nvmf_rdma_device *device;
	uint32_t ref;
	TAILQ_ENTRY(spdk_nvmf_rdma_port) link;
};

struct spdk_nvmf_rdma_transport {
	struct spdk_nvmf_transport transport;

	struct rdma_event_channel *event_channel;

	struct spdk_mempool *data_buf_pool;

	pthread_mutex_t lock;

	uint16_t max_queue_depth;
	uint32_t max_io_size;
	uint32_t io_unit_size;
	uint32_t in_capsule_data_size;

	/* fields used to poll RDMA/IB events */
	nfds_t npoll_fds;
	struct pollfd *poll_fds;

	TAILQ_HEAD(, spdk_nvmf_rdma_device) devices;
	TAILQ_HEAD(, spdk_nvmf_rdma_port) ports;
};

struct spdk_nvmf_rdma_mgmt_channel {
	/* Requests that are waiting to obtain a data buffer */
	TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_data_buf_queue;
};

static int
spdk_nvmf_rdma_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf;

	TAILQ_INIT(&ch->pending_data_buf_queue);
	return 0;
}

static void
spdk_nvmf_rdma_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf;

	if (!TAILQ_EMPTY(&ch->pending_data_buf_queue)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on channel destruction\n");
	}
}

static void
spdk_nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
{
	if (rqpair->poller) {
		TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link);
	}

	if (rqpair->cmds_mr) {
		ibv_dereg_mr(rqpair->cmds_mr);
	}

	if (rqpair->cpls_mr) {
		ibv_dereg_mr(rqpair->cpls_mr);
	}

	if (rqpair->bufs_mr) {
		ibv_dereg_mr(rqpair->bufs_mr);
	}

	if (rqpair->cm_id) {
		rdma_destroy_qp(rqpair->cm_id);
		rdma_destroy_id(rqpair->cm_id);
	}

	if (rqpair->mgmt_channel) {
		spdk_put_io_channel(rqpair->mgmt_channel);
	}

	/* Free all memory */
	spdk_dma_free(rqpair->cmds);
	spdk_dma_free(rqpair->cpls);
	spdk_dma_free(rqpair->bufs);
	free(rqpair->reqs);
	free(rqpair->recvs);
	free(rqpair);
}

static int
spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_qpair *rqpair;
	int rc, i;
	struct ibv_qp_init_attr attr;
	struct spdk_nvmf_rdma_recv *rdma_recv;
	struct spdk_nvmf_rdma_request *rdma_req;

	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
	rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);

	memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
	attr.qp_type = IBV_QPT_RC;
	attr.send_cq = rqpair->poller->cq;
	attr.recv_cq = rqpair->poller->cq;
	attr.cap.max_send_wr = rqpair->max_queue_depth * 2; /* SEND, READ, and WRITE operations */
	attr.cap.max_recv_wr = rqpair->max_queue_depth; /* RECV operations */
	attr.cap.max_send_sge = SPDK_NVMF_MAX_SGL_ENTRIES;
	attr.cap.max_recv_sge = NVMF_DEFAULT_RX_SGE;

	rc = rdma_create_qp(rqpair->cm_id, NULL, &attr);
	if (rc) {
		SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno));
		rdma_destroy_id(rqpair->cm_id);
		rqpair->cm_id = NULL;
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair);

	rqpair->reqs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->reqs));
	rqpair->recvs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->recvs));
	rqpair->cmds = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cmds),
					0x1000, NULL);
	rqpair->cpls = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cpls),
					0x1000, NULL);
	rqpair->bufs = spdk_dma_zmalloc(rqpair->max_queue_depth * rtransport->in_capsule_data_size,
					0x1000, NULL);
	if (!rqpair->reqs || !rqpair->recvs || !rqpair->cmds ||
	    !rqpair->cpls || !rqpair->bufs) {
		SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n");
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}

	rqpair->cmds_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cmds,
				     rqpair->max_queue_depth * sizeof(*rqpair->cmds),
				     IBV_ACCESS_LOCAL_WRITE);
	rqpair->cpls_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cpls,
				     rqpair->max_queue_depth * sizeof(*rqpair->cpls),
				     0);
	rqpair->bufs_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->bufs,
				     rqpair->max_queue_depth * rtransport->in_capsule_data_size,
				     IBV_ACCESS_LOCAL_WRITE |
				     IBV_ACCESS_REMOTE_WRITE);
	if (!rqpair->cmds_mr || !rqpair->cpls_mr || !rqpair->bufs_mr) {
		SPDK_ERRLOG("Unable to register required memory for RDMA queue.\n");
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n",
		      rqpair->cmds, rqpair->max_queue_depth * sizeof(*rqpair->cmds), rqpair->cmds_mr->lkey);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n",
		      rqpair->cpls, rqpair->max_queue_depth * sizeof(*rqpair->cpls), rqpair->cpls_mr->lkey);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n",
		      rqpair->bufs, rqpair->max_queue_depth * rtransport->in_capsule_data_size, rqpair->bufs_mr->lkey);

	for (i = 0; i < rqpair->max_queue_depth; i++) {
		struct ibv_recv_wr *bad_wr = NULL;

		rdma_recv = &rqpair->recvs[i];
		rdma_recv->qpair = rqpair;

		/* Set up memory to receive commands */
		rdma_recv->buf = (void *)((uintptr_t)rqpair->bufs + (i * rtransport->in_capsule_data_size));

		rdma_recv->sgl[0].addr = (uintptr_t)&rqpair->cmds[i];
		rdma_recv->sgl[0].length = sizeof(rqpair->cmds[i]);
		rdma_recv->sgl[0].lkey = rqpair->cmds_mr->lkey;

		rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf;
		rdma_recv->sgl[1].length = rtransport->in_capsule_data_size;
		rdma_recv->sgl[1].lkey = rqpair->bufs_mr->lkey;

		rdma_recv->wr.wr_id = (uintptr_t)rdma_recv;
		rdma_recv->wr.sg_list = rdma_recv->sgl;
		rdma_recv->wr.num_sge = SPDK_COUNTOF(rdma_recv->sgl);

		rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_recv->wr, &bad_wr);
		if (rc) {
			SPDK_ERRLOG("Unable to post capsule for RDMA RECV\n");
			spdk_nvmf_rdma_qpair_destroy(rqpair);
			return -1;
		}
	}

	for (i = 0; i < rqpair->max_queue_depth; i++) {
		rdma_req = &rqpair->reqs[i];

		rdma_req->req.qpair = &rqpair->qpair;
		rdma_req->req.cmd = NULL;

		/* Set up memory to send responses */
		rdma_req->req.rsp = &rqpair->cpls[i];

		rdma_req->rsp.sgl[0].addr = (uintptr_t)&rqpair->cpls[i];
		rdma_req->rsp.sgl[0].length = sizeof(rqpair->cpls[i]);
		rdma_req->rsp.sgl[0].lkey = rqpair->cpls_mr->lkey;

		rdma_req->rsp.wr.wr_id = (uintptr_t)rdma_req;
		rdma_req->rsp.wr.next = NULL;
		rdma_req->rsp.wr.opcode = IBV_WR_SEND;
		rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl;
		rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl);

		/* Set up memory for data buffers */
		rdma_req->data.wr.wr_id = (uint64_t)rdma_req;
		rdma_req->data.wr.next = NULL;
		rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->data.wr.sg_list = rdma_req->data.sgl;
		rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl);

		TAILQ_INSERT_TAIL(&rqpair->free_queue, rdma_req, link);
	}

	return 0;
}

static int
request_transfer_in(struct spdk_nvmf_request *req)
{
	int rc;
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_qpair *qpair;
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct ibv_send_wr *bad_wr = NULL;

	qpair = req->qpair;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);

	rqpair->cur_rdma_rw_depth++;

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair);

	rdma_req->data.wr.opcode = IBV_WR_RDMA_READ;
	rdma_req->data.wr.next = NULL;
	rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to transfer data from host to target\n");

		/* Decrement r/w counter back since data transfer
		 * has not started.
		 */
		rqpair->cur_rdma_rw_depth--;
		return -1;
	}

	return 0;
}

static int
request_transfer_out(struct spdk_nvmf_request *req)
{
	int rc;
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_qpair *qpair;
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct spdk_nvme_cpl *rsp;
	struct ibv_recv_wr *bad_recv_wr = NULL;
	struct ibv_send_wr *send_wr, *bad_send_wr = NULL;

	qpair = req->qpair;
	rsp = &req->rsp->nvme_cpl;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	/* Advance our sq_head pointer */
	if (qpair->sq_head == qpair->sq_head_max) {
		qpair->sq_head = 0;
	} else {
		qpair->sq_head++;
	}
	rsp->sqhd = qpair->sq_head;

	/* Post the capsule to the recv buffer */
	assert(rdma_req->recv != NULL);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA RECV POSTED. Recv: %p Connection: %p\n", rdma_req->recv,
		      rqpair);
	rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_req->recv->wr, &bad_recv_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to re-post rx descriptor\n");
		return rc;
	}
	rdma_req->recv = NULL;

	/* Build the response which consists of an optional
	 * RDMA WRITE to transfer data, plus an RDMA SEND
	 * containing the response.
	 */
	send_wr = &rdma_req->rsp.wr;

	if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
	    req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair);

		rqpair->cur_rdma_rw_depth++;
		rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE;

		rdma_req->data.wr.next = send_wr;
		send_wr = &rdma_req->data.wr;
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. Request: %p Connection: %p\n", req, qpair);

	/* Send the completion */
	rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to send response capsule\n");

		if (rdma_req->data.wr.opcode == IBV_WR_RDMA_WRITE) {
			/* Decrement r/w counter back since data transfer
			 * has not started.
			 */
			rqpair->cur_rdma_rw_depth--;
		}
	}

	return rc;
}

static int
spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair)
{
	struct spdk_nvmf_rdma_accept_private_data accept_data;
	struct rdma_conn_param ctrlr_event_data = {};
	int rc;

	accept_data.recfmt = 0;
	accept_data.crqsize = rqpair->max_queue_depth;

	ctrlr_event_data.private_data = &accept_data;
	ctrlr_event_data.private_data_len = sizeof(accept_data);
	if (id->ps == RDMA_PS_TCP) {
		ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */
		ctrlr_event_data.initiator_depth = rqpair->max_rw_depth;
	}

	rc = rdma_accept(id, &ctrlr_event_data);
	if (rc) {
		SPDK_ERRLOG("Error %d on rdma_accept\n", errno);
	} else {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n");
	}

	return rc;
}

static void
spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error)
{
	struct spdk_nvmf_rdma_reject_private_data rej_data;

	rej_data.recfmt = 0;
	rej_data.sts = error;

	rdma_reject(id, &rej_data, sizeof(rej_data));
}

static int
nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event,
		  new_qpair_fn cb_fn)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_qpair *rqpair = NULL;
	struct spdk_nvmf_rdma_port *port;
	struct rdma_conn_param *rdma_param = NULL;
	const struct spdk_nvmf_rdma_request_private_data *private_data = NULL;
	uint16_t max_queue_depth;
	uint16_t max_rw_depth;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	assert(event->id != NULL); /* Impossible. Can't even reject the connection. */
	assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */

	rdma_param = &event->param.conn;
	if (rdma_param->private_data == NULL ||
	    rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_ERRLOG("connect request: no private data provided\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH);
		return -1;
	}

	private_data = rdma_param->private_data;
	if (private_data->recfmt != 0) {
		SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT);
		return -1;
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n",
		      event->id->verbs->device->name, event->id->verbs->device->dev_name);

	port = event->listen_id->context;
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n",
		      event->listen_id, event->listen_id->verbs, port);

	/* Figure out the supported queue depth. This is a multi-step process
	 * that takes into account hardware maximums, host provided values,
	 * and our target's internal memory limits */

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n");

	/* Start with the maximum queue depth allowed by the target */
	max_queue_depth = rtransport->max_queue_depth;
	max_rw_depth = rtransport->max_queue_depth;
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n", rtransport->max_queue_depth);

	/* Next check the local NIC's hardware limitations */
	SPDK_DEBUGLOG(SPDK_LOG_RDMA,
		      "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
		      port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom);
	max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr);
	max_rw_depth = spdk_min(max_rw_depth, port->device->attr.max_qp_rd_atom);

	/* Next check the remote NIC's hardware limitations */
	SPDK_DEBUGLOG(SPDK_LOG_RDMA,
		      "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
		      rdma_param->initiator_depth, rdma_param->responder_resources);
	if (rdma_param->initiator_depth > 0) {
		max_rw_depth = spdk_min(max_rw_depth, rdma_param->initiator_depth);
	}

	/* Finally check for the host software requested values, which are
	 * optional.
	 */
	if (rdma_param->private_data != NULL &&
	    rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize);
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1);
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
		      max_queue_depth, max_rw_depth);
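
	/* Worked example of the negotiation above (illustrative numbers only,
	 * not taken from any particular NIC): target max_queue_depth 128,
	 * local device max_qp_wr 4096 and max_qp_rd_atom 16, host private
	 * data hrqsize 128 / hsqsize 127, initiator_depth 16. Result:
	 * max_queue_depth = min(128, 4096, 128, 127 + 1) = 128 and
	 * max_rw_depth = min(128, 16, 16) = 16.
	 */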

	rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair));
	if (rqpair == NULL) {
		SPDK_ERRLOG("Could not allocate new connection.\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
		return -1;
	}

	rqpair->port = port;
	rqpair->max_queue_depth = max_queue_depth;
	rqpair->max_rw_depth = max_rw_depth;
	rqpair->cm_id = event->id;
	rqpair->qpair.transport = transport;
	TAILQ_INIT(&rqpair->incoming_queue);
	TAILQ_INIT(&rqpair->free_queue);
	TAILQ_INIT(&rqpair->pending_rdma_rw_queue);

	event->id->context = &rqpair->qpair;

	cb_fn(&rqpair->qpair);

	return 0;
}

static int
nvmf_rdma_disconnect(struct rdma_cm_event *evt)
{
	struct spdk_nvmf_qpair *qpair;

	if (evt->id == NULL) {
		SPDK_ERRLOG("disconnect request: missing cm_id\n");
		return -1;
	}

	qpair = evt->id->context;
	if (qpair == NULL) {
		SPDK_ERRLOG("disconnect request: no active connection\n");
		return -1;
	}
	/* ack the disconnect event before rdma_destroy_id */
	rdma_ack_cm_event(evt);

	spdk_nvmf_qpair_disconnect(qpair);

	return 0;
}

#ifdef DEBUG
static const char *CM_EVENT_STR[] = {
	"RDMA_CM_EVENT_ADDR_RESOLVED",
	"RDMA_CM_EVENT_ADDR_ERROR",
	"RDMA_CM_EVENT_ROUTE_RESOLVED",
	"RDMA_CM_EVENT_ROUTE_ERROR",
	"RDMA_CM_EVENT_CONNECT_REQUEST",
	"RDMA_CM_EVENT_CONNECT_RESPONSE",
	"RDMA_CM_EVENT_CONNECT_ERROR",
	"RDMA_CM_EVENT_UNREACHABLE",
	"RDMA_CM_EVENT_REJECTED",
	"RDMA_CM_EVENT_ESTABLISHED",
	"RDMA_CM_EVENT_DISCONNECTED",
	"RDMA_CM_EVENT_DEVICE_REMOVAL",
	"RDMA_CM_EVENT_MULTICAST_JOIN",
	"RDMA_CM_EVENT_MULTICAST_ERROR",
	"RDMA_CM_EVENT_ADDR_CHANGE",
	"RDMA_CM_EVENT_TIMEWAIT_EXIT"
};
#endif /* DEBUG */

static int
spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
			  enum spdk_mem_map_notify_action action,
			  void *vaddr, size_t size)
{
	struct spdk_nvmf_rdma_device *device = cb_ctx;
	struct ibv_pd *pd = device->pd;
	struct ibv_mr *mr;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		mr = ibv_reg_mr(pd, vaddr, size,
				IBV_ACCESS_LOCAL_WRITE |
				IBV_ACCESS_REMOTE_READ |
				IBV_ACCESS_REMOTE_WRITE);
		if (mr == NULL) {
			SPDK_ERRLOG("ibv_reg_mr() failed\n");
			return -1;
		} else {
			spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, size);
		spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		if (mr) {
			ibv_dereg_mr(mr);
		}
		break;
	}

	return 0;
}
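
/* Note on the callback above: each registered region's ibv_mr pointer is
 * stored as the spdk_mem_map translation value, so later code (see
 * spdk_nvmf_rdma_request_fill_iovs() below) can recover the lkey for any
 * data buffer with a single spdk_mem_map_translate() lookup.
 */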

typedef enum spdk_nvme_data_transfer spdk_nvme_data_transfer_t;

static spdk_nvme_data_transfer_t
spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req)
{
	enum spdk_nvme_data_transfer xfer;
	struct spdk_nvme_cmd *cmd = &rdma_req->req.cmd->nvme_cmd;
	struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1;

	/* Figure out data transfer direction */
	if (cmd->opc == SPDK_NVME_OPC_FABRIC) {
		xfer = spdk_nvme_opc_get_data_transfer(rdma_req->req.cmd->nvmf_cmd.fctype);
	} else {
		xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);

		/* Some admin commands are special cases */
		if ((rdma_req->req.qpair->qid == 0) &&
		    ((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) ||
		     (cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) {
			switch (cmd->cdw10 & 0xff) {
			case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
			case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
			case SPDK_NVME_FEAT_HOST_IDENTIFIER:
				break;
			default:
				xfer = SPDK_NVME_DATA_NONE;
			}
		}
	}

	if (xfer == SPDK_NVME_DATA_NONE) {
		return xfer;
	}

	/* Even for commands that may transfer data, they could have specified 0 length.
	 * We want those to show up with xfer SPDK_NVME_DATA_NONE.
	 */
	switch (sgl->generic.type) {
	case SPDK_NVME_SGL_TYPE_DATA_BLOCK:
	case SPDK_NVME_SGL_TYPE_BIT_BUCKET:
	case SPDK_NVME_SGL_TYPE_SEGMENT:
	case SPDK_NVME_SGL_TYPE_LAST_SEGMENT:
	case SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK:
		if (sgl->unkeyed.length == 0) {
			xfer = SPDK_NVME_DATA_NONE;
		}
		break;
	case SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK:
		if (sgl->keyed.length == 0) {
			xfer = SPDK_NVME_DATA_NONE;
		}
		break;
	}

	return xfer;
}

static int
spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
				 struct spdk_nvmf_rdma_device *device,
				 struct spdk_nvmf_rdma_request *rdma_req)
{
	void *buf = NULL;
	uint32_t length = rdma_req->req.length;
	uint32_t i = 0;

	rdma_req->req.iovcnt = 0;
	while (length) {
		buf = spdk_mempool_get(rtransport->data_buf_pool);
		if (!buf) {
			goto nomem;
		}

		rdma_req->req.iov[i].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) &
						~NVMF_DATA_BUFFER_MASK);
		rdma_req->req.iov[i].iov_len = spdk_min(length, rtransport->io_unit_size);
		rdma_req->req.iovcnt++;
		rdma_req->data.buffers[i] = buf;
		rdma_req->data.wr.sg_list[i].addr = (uintptr_t)(rdma_req->req.iov[i].iov_base);
		rdma_req->data.wr.sg_list[i].length = rdma_req->req.iov[i].iov_len;
		rdma_req->data.wr.sg_list[i].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
						     (uint64_t)buf, rdma_req->req.iov[i].iov_len))->lkey;

		length -= rdma_req->req.iov[i].iov_len;
		i++;
	}

	rdma_req->data_from_pool = true;

	return 0;

nomem:
	while (i) {
		i--;
		/* Return the original pool element, not the aligned iov_base pointer */
		spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]);
		rdma_req->req.iov[i].iov_base = NULL;
		rdma_req->req.iov[i].iov_len = 0;

		rdma_req->data.wr.sg_list[i].addr = 0;
		rdma_req->data.wr.sg_list[i].length = 0;
		rdma_req->data.wr.sg_list[i].lkey = 0;
	}
	rdma_req->req.iovcnt = 0;
	return -ENOMEM;
}
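
/* The iov_base math above rounds each pool buffer up to the next 4 KiB
 * boundary: with NVMF_DATA_BUFFER_MASK == 0xFFF, a buffer at (for example)
 * 0x7f0000011010 becomes (0x7f0000011010 + 0xFFF) & ~0xFFF = 0x7f0000012000.
 * Pool elements are allocated io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT bytes,
 * so an aligned io_unit_size chunk always fits within the element.
 */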

static int
spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
				 struct spdk_nvmf_rdma_device *device,
				 struct spdk_nvmf_rdma_request *rdma_req)
{
	struct spdk_nvme_cmd *cmd;
	struct spdk_nvme_cpl *rsp;
	struct spdk_nvme_sgl_descriptor *sgl;

	cmd = &rdma_req->req.cmd->nvme_cmd;
	rsp = &rdma_req->req.rsp->nvme_cpl;
	sgl = &cmd->dptr.sgl1;

	if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
	    (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
	     sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
		if (sgl->keyed.length > rtransport->max_io_size) {
			SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
				    sgl->keyed.length, rtransport->max_io_size);
			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
			return -1;
		}

		/* fill request length and populate iovs */
		rdma_req->req.length = sgl->keyed.length;

		if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) {
			/* No available buffers. Queue this request up. */
			SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
			return 0;
		}

		/* backward compatible */
		rdma_req->req.data = rdma_req->req.iov[0].iov_base;

		/* rdma wr specifics */
		rdma_req->data.wr.num_sge = rdma_req->req.iovcnt;
		rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key;
		rdma_req->data.wr.wr.rdma.remote_addr = sgl->address;

		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req,
			      rdma_req->req.iovcnt);

		return 0;
	} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
		   sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
		uint64_t offset = sgl->address;
		uint32_t max_len = rtransport->in_capsule_data_size;

		SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
			      offset, sgl->unkeyed.length);

		if (offset > max_len) {
			SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n",
				    offset, max_len);
			rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET;
			return -1;
		}
		max_len -= (uint32_t)offset;

		if (sgl->unkeyed.length > max_len) {
			SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
				    sgl->unkeyed.length, max_len);
			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
			return -1;
		}

		rdma_req->req.data = rdma_req->recv->buf + offset;
		rdma_req->data_from_pool = false;
		rdma_req->req.length = sgl->unkeyed.length;

		rdma_req->req.iov[0].iov_base = rdma_req->req.data;
		rdma_req->req.iov[0].iov_len = rdma_req->req.length;
		rdma_req->req.iovcnt = 1;

		return 0;
	}

	SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n",
		    sgl->generic.type, sgl->generic.subtype);
	rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
	return -1;
}

static bool
spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
			       struct spdk_nvmf_rdma_request *rdma_req)
{
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct spdk_nvmf_rdma_device *device;
	struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl;
	int rc;
	struct spdk_nvmf_rdma_recv *rdma_recv;
	enum spdk_nvmf_rdma_request_state prev_state;
	bool progress = false;

	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
	device = rqpair->port->device;

	assert(rdma_req->state != RDMA_REQUEST_STATE_FREE);

	/* The loop here is to allow for several back-to-back state changes. */
	do {
		prev_state = rdma_req->state;

		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state);

		switch (rdma_req->state) {
		case RDMA_REQUEST_STATE_FREE:
			/* Some external code must kick a request into RDMA_REQUEST_STATE_NEW
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_NEW:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0, (uintptr_t)rdma_req, 0);

			rqpair->cur_queue_depth++;
			rdma_recv = rdma_req->recv;

			/* The first element of the SGL is the NVMe command */
			rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr;
			memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp));

			TAILQ_REMOVE(&rqpair->incoming_queue, rdma_recv, link);
			TAILQ_REMOVE(&rqpair->free_queue, rdma_req, link);

			/* The next state transition depends on the data transfer needs of this request. */
			rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req);

			/* If no data to transfer, ready to execute. */
			if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) {
				rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
				break;
			}

			rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER;
			TAILQ_INSERT_TAIL(&rqpair->ch->pending_data_buf_queue, rdma_req, link);
			break;
		case RDMA_REQUEST_STATE_NEED_BUFFER:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0, (uintptr_t)rdma_req, 0);

			assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE);

			if (rdma_req != TAILQ_FIRST(&rqpair->ch->pending_data_buf_queue)) {
				/* This request needs to wait in line to obtain a buffer */
				break;
			}

			/* Try to get a data buffer */
			rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req);
			if (rc < 0) {
				TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link);
				rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
				rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
				break;
			}

			if (!rdma_req->req.data) {
				/* No buffers available. */
				break;
			}

			TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link);

			/* If data is transferring from host to controller and the data didn't
			 * arrive using in capsule data, we need to do a transfer from the host.
			 */
			if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool) {
				rdma_req->state = RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER;
				TAILQ_INSERT_TAIL(&rqpair->pending_rdma_rw_queue, rdma_req, link);
				break;
			}

			rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
			break;
		case RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER, 0, 0,
					  (uintptr_t)rdma_req, 0);

			if (rdma_req != TAILQ_FIRST(&rqpair->pending_rdma_rw_queue)) {
				/* This request needs to wait in line to perform RDMA */
				break;
			}

			if (rqpair->cur_rdma_rw_depth < rqpair->max_rw_depth) {
				TAILQ_REMOVE(&rqpair->pending_rdma_rw_queue, rdma_req, link);
				rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER;
				rc = request_transfer_in(&rdma_req->req);
				if (rc) {
					rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
					rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
				}
			}
			break;
		case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0,
					  (uintptr_t)rdma_req, 0);
			/* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_READY_TO_EXECUTE:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, (uintptr_t)rdma_req, 0);
			rdma_req->state = RDMA_REQUEST_STATE_EXECUTING;
			spdk_nvmf_request_exec(&rdma_req->req);
			break;
		case RDMA_REQUEST_STATE_EXECUTING:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0, (uintptr_t)rdma_req, 0);
			/* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_EXECUTED:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0, (uintptr_t)rdma_req, 0);
			if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
				rdma_req->state = RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST;
				TAILQ_INSERT_TAIL(&rqpair->pending_rdma_rw_queue, rdma_req, link);
			} else {
				rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
			}
			break;
		case RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST, 0, 0,
					  (uintptr_t)rdma_req, 0);
			if (rdma_req != TAILQ_FIRST(&rqpair->pending_rdma_rw_queue)) {
				/* This request needs to wait in line to perform RDMA */
				break;
			}

			if (rqpair->cur_rdma_rw_depth < rqpair->max_rw_depth) {
				rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
				TAILQ_REMOVE(&rqpair->pending_rdma_rw_queue, rdma_req, link);
			}
			break;
		case RDMA_REQUEST_STATE_READY_TO_COMPLETE:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, (uintptr_t)rdma_req, 0);
			rdma_req->state = RDMA_REQUEST_STATE_COMPLETING;

			rc = request_transfer_out(&rdma_req->req);
			assert(rc == 0); /* No good way to handle this currently */
			break;
		case RDMA_REQUEST_STATE_COMPLETING:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, (uintptr_t)rdma_req, 0);
			/* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED
			 * to escape this state.
			 */
			break;
		case RDMA_REQUEST_STATE_COMPLETED:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, (uintptr_t)rdma_req, 0);
			assert(rqpair->cur_queue_depth > 0);
			rqpair->cur_queue_depth--;

			if (rdma_req->data_from_pool) {
				/* Put the buffer/s back in the pool */
				for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) {
					spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]);
					rdma_req->req.iov[i].iov_base = NULL;
					rdma_req->data.buffers[i] = NULL;
				}
				rdma_req->data_from_pool = false;
			}
			rdma_req->req.length = 0;
			rdma_req->req.iovcnt = 0;
			rdma_req->req.data = NULL;
			rdma_req->state = RDMA_REQUEST_STATE_FREE;
			TAILQ_INSERT_TAIL(&rqpair->free_queue, rdma_req, link);
			break;
		}

		if (rdma_req->state != prev_state) {
			progress = true;
		}
	} while (rdma_req->state != prev_state);

	return progress;
}

/* Public API callbacks begin here */

static struct spdk_nvmf_transport *
spdk_nvmf_rdma_create(struct spdk_nvmf_tgt *tgt)
{
	int rc;
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_device *device, *tmp;
	struct ibv_context **contexts;
	uint32_t i;
	int flag;
	uint32_t sge_count;

	rtransport = calloc(1, sizeof(*rtransport));
	if (!rtransport) {
		return NULL;
	}

	pthread_mutex_init(&rtransport->lock, NULL);
	TAILQ_INIT(&rtransport->devices);
	TAILQ_INIT(&rtransport->ports);

	rtransport->transport.tgt = tgt;
	rtransport->transport.ops = &spdk_nvmf_transport_rdma;

	SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n");

	rtransport->max_queue_depth = tgt->opts.max_queue_depth;
	rtransport->max_io_size = tgt->opts.max_io_size;
	rtransport->io_unit_size = tgt->opts.io_unit_size;
	rtransport->in_capsule_data_size = tgt->opts.in_capsule_data_size;

	/* I/O unit size cannot be larger than max I/O size */
	if (rtransport->io_unit_size > rtransport->max_io_size) {
		rtransport->io_unit_size = rtransport->max_io_size;
	}

	sge_count = rtransport->max_io_size / rtransport->io_unit_size;
	if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) {
		SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", rtransport->io_unit_size);
		free(rtransport);
		return NULL;
	}

	rtransport->event_channel = rdma_create_event_channel();
	if (rtransport->event_channel == NULL) {
		SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno));
		free(rtransport);
		return NULL;
	}

	flag = fcntl(rtransport->event_channel->fd, F_GETFL);
	if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
		SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n",
			    rtransport->event_channel->fd, spdk_strerror(errno));
		free(rtransport);
		return NULL;
	}

	rtransport->data_buf_pool = spdk_mempool_create("spdk_nvmf_rdma",
				    rtransport->max_queue_depth * 4, /* The 4 is arbitrarily chosen. Needs to be configurable. */
				    rtransport->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT,
				    SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!rtransport->data_buf_pool) {
		SPDK_ERRLOG("Unable to allocate buffer pool for poll group\n");
		free(rtransport);
		return NULL;
	}

	spdk_io_device_register(rtransport, spdk_nvmf_rdma_mgmt_channel_create,
				spdk_nvmf_rdma_mgmt_channel_destroy,
				sizeof(struct spdk_nvmf_rdma_mgmt_channel));

	contexts = rdma_get_devices(NULL);
	i = 0;
	rc = 0;
	while (contexts[i] != NULL) {
		device = calloc(1, sizeof(*device));
		if (!device) {
			SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n");
			rc = -ENOMEM;
			break;
		}
		device->context = contexts[i];
		rc = ibv_query_device(device->context, &device->attr);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
			free(device);
			break;
		}
		/* set up device context async ev fd as NON_BLOCKING */
		flag = fcntl(device->context->async_fd, F_GETFL);
		rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n");
			free(device);
			break;
		}

		device->pd = NULL;
		device->map = NULL;

		TAILQ_INSERT_TAIL(&rtransport->devices, device, link);
		i++;
	}

	if (rc < 0) {
		TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
			TAILQ_REMOVE(&rtransport->devices, device, link);
			free(device);
		}
		spdk_mempool_free(rtransport->data_buf_pool);
		rdma_destroy_event_channel(rtransport->event_channel);
		free(rtransport);
		rdma_free_devices(contexts);
		return NULL;
	} else {
		/* Set up poll descriptor array to monitor events from RDMA and IB
		 * in a single poll syscall
		 */
		rtransport->npoll_fds = i + 1;
		i = 0;
		rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd));
		rtransport->poll_fds[i].fd = rtransport->event_channel->fd;
		rtransport->poll_fds[i++].events = POLLIN;

		TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
			rtransport->poll_fds[i].fd = device->context->async_fd;
			rtransport->poll_fds[i++].events = POLLIN;
		}
	}

	rdma_free_devices(contexts);

	return &rtransport->transport;
}

static int
spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_port *port, *port_tmp;
	struct spdk_nvmf_rdma_device *device, *device_tmp;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) {
		TAILQ_REMOVE(&rtransport->ports, port, link);
		rdma_destroy_id(port->id);
		free(port);
	}

	if (rtransport->poll_fds != NULL) {
		free(rtransport->poll_fds);
	}

	if (rtransport->event_channel != NULL) {
		rdma_destroy_event_channel(rtransport->event_channel);
	}

	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) {
		TAILQ_REMOVE(&rtransport->devices, device, link);
		if (device->map) {
			spdk_mem_map_free(&device->map);
		}
		free(device);
	}

	if (spdk_mempool_count(rtransport->data_buf_pool) != (rtransport->max_queue_depth * 4)) {
		SPDK_ERRLOG("transport buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(rtransport->data_buf_pool),
			    rtransport->max_queue_depth * 4);
	}

	spdk_mempool_free(rtransport->data_buf_pool);
	spdk_io_device_unregister(rtransport, NULL);
	free(rtransport);

	return 0;
}

static int
spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport,
		      const struct spdk_nvme_transport_id *trid)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_device *device;
	struct spdk_nvmf_rdma_port *port_tmp, *port;
	struct addrinfo *res;
	struct addrinfo hints;
	int family;
	int rc;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	port = calloc(1, sizeof(*port));
	if (!port) {
		return -ENOMEM;
	}

	/* Selectively copy the trid. Things like NQN don't matter here - that
	 * mapping is enforced elsewhere.
	 */
	port->trid.trtype = SPDK_NVME_TRANSPORT_RDMA;
	port->trid.adrfam = trid->adrfam;
	snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr);
	snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid);

	pthread_mutex_lock(&rtransport->lock);
	assert(rtransport->event_channel != NULL);
	TAILQ_FOREACH(port_tmp, &rtransport->ports, link) {
		if (spdk_nvme_transport_id_compare(&port_tmp->trid, &port->trid) == 0) {
			port_tmp->ref++;
			free(port);
			/* Already listening at this address */
			pthread_mutex_unlock(&rtransport->lock);
			return 0;
		}
	}

	rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP);
	if (rc < 0) {
		SPDK_ERRLOG("rdma_create_id() failed\n");
		free(port);
		pthread_mutex_unlock(&rtransport->lock);
		return rc;
	}

	switch (port->trid.adrfam) {
	case SPDK_NVMF_ADRFAM_IPV4:
		family = AF_INET;
		break;
	case SPDK_NVMF_ADRFAM_IPV6:
		family = AF_INET6;
		break;
	default:
		SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam);
		free(port);
		pthread_mutex_unlock(&rtransport->lock);
		return -EINVAL;
	}

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = family;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_protocol = 0;

	rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res);
	if (rc) {
		SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc);
		free(port);
		pthread_mutex_unlock(&rtransport->lock);
		return -EINVAL;
	}

	rc = rdma_bind_addr(port->id, res->ai_addr);
	freeaddrinfo(res);

	if (rc < 0) {
		SPDK_ERRLOG("rdma_bind_addr() failed\n");
		rdma_destroy_id(port->id);
		free(port);
		pthread_mutex_unlock(&rtransport->lock);
		return rc;
	}

	rc = rdma_listen(port->id, 10); /* 10 = backlog */
	if (rc < 0) {
		SPDK_ERRLOG("rdma_listen() failed\n");
		rdma_destroy_id(port->id);
		free(port);
		pthread_mutex_unlock(&rtransport->lock);
		return rc;
	}

	TAILQ_FOREACH(device, &rtransport->devices, link) {
		if (device->context == port->id->verbs) {
			port->device = device;
			break;
		}
	}
	if (!port->device) {
		SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n",
			    port->id->verbs);
		rdma_destroy_id(port->id);
		free(port);
		pthread_mutex_unlock(&rtransport->lock);
		return -EINVAL;
	}

	if (!device->map) {
		device->pd = port->id->pd;
		device->map = spdk_mem_map_alloc(0, spdk_nvmf_rdma_mem_notify, device);
		if (!device->map) {
			SPDK_ERRLOG("Unable to allocate memory map for new poll group\n");
			return -1;
		}
	} else {
		assert(device->pd == port->id->pd);
	}

	SPDK_INFOLOG(SPDK_LOG_RDMA, "*** NVMf Target Listening on %s port %d ***\n",
		     port->trid.traddr, ntohs(rdma_get_src_port(port->id)));

	port->ref = 1;

	TAILQ_INSERT_TAIL(&rtransport->ports, port, link);
	pthread_mutex_unlock(&rtransport->lock);

	return 0;
}

static int
spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport,
			   const struct spdk_nvme_transport_id *_trid)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_port *port, *tmp;
	struct spdk_nvme_transport_id trid = {};

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	/* Selectively copy the trid. Things like NQN don't matter here - that
	 * mapping is enforced elsewhere.
	 */
	trid.trtype = SPDK_NVME_TRANSPORT_RDMA;
	trid.adrfam = _trid->adrfam;
	snprintf(trid.traddr, sizeof(port->trid.traddr), "%s", _trid->traddr);
	snprintf(trid.trsvcid, sizeof(port->trid.trsvcid), "%s", _trid->trsvcid);

	pthread_mutex_lock(&rtransport->lock);
	TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) {
		if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) {
			assert(port->ref > 0);
			port->ref--;
			if (port->ref == 0) {
				TAILQ_REMOVE(&rtransport->ports, port, link);
				rdma_destroy_id(port->id);
				free(port);
			}
			break;
		}
	}

	pthread_mutex_unlock(&rtransport->lock);
	return 0;
}

static void
spdk_nvmf_process_cm_event(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct rdma_cm_event *event;
	int rc;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	if (rtransport->event_channel == NULL) {
		return;
	}

	while (1) {
		rc = rdma_get_cm_event(rtransport->event_channel, &event);
		if (rc == 0) {
			SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]);

			switch (event->event) {
			case RDMA_CM_EVENT_ADDR_RESOLVED:
			case RDMA_CM_EVENT_ADDR_ERROR:
			case RDMA_CM_EVENT_ROUTE_RESOLVED:
			case RDMA_CM_EVENT_ROUTE_ERROR:
				/* No action required. The target never attempts to resolve routes. */
				break;
			case RDMA_CM_EVENT_CONNECT_REQUEST:
				rc = nvmf_rdma_connect(transport, event, cb_fn);
				if (rc < 0) {
					SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc);
					break;
				}
				break;
			case RDMA_CM_EVENT_CONNECT_RESPONSE:
				/* The target never initiates a new connection. So this will not occur. */
				break;
			case RDMA_CM_EVENT_CONNECT_ERROR:
				/* Can this happen? The docs say it can, but not sure what causes it. */
				break;
			case RDMA_CM_EVENT_UNREACHABLE:
			case RDMA_CM_EVENT_REJECTED:
				/* These only occur on the client side. */
				break;
			case RDMA_CM_EVENT_ESTABLISHED:
				/* TODO: Should we be waiting for this event anywhere? */
				break;
			case RDMA_CM_EVENT_DISCONNECTED:
			case RDMA_CM_EVENT_DEVICE_REMOVAL:
				rc = nvmf_rdma_disconnect(event);
				if (rc < 0) {
					SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
					break;
				}
				continue;
			case RDMA_CM_EVENT_MULTICAST_JOIN:
			case RDMA_CM_EVENT_MULTICAST_ERROR:
				/* Multicast is not used */
				break;
			case RDMA_CM_EVENT_ADDR_CHANGE:
				/* Not utilizing this event */
				break;
			case RDMA_CM_EVENT_TIMEWAIT_EXIT:
				/* For now, do nothing. The target never re-uses queue pairs. */
				break;
			default:
				SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event);
				break;
			}

			rdma_ack_cm_event(event);
		} else {
			if (errno != EAGAIN && errno != EWOULDBLOCK) {
				SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno));
			}
			break;
		}
	}
}

static void
spdk_nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device)
{
	int rc;
	struct ibv_async_event event;

	rc = ibv_get_async_event(device->context, &event);

	if (rc) {
		SPDK_ERRLOG("Failed to get async_event (%d): %s\n",
			    errno, spdk_strerror(errno));
		return;
	}

	SPDK_NOTICELOG("Async event: %s\n",
		       ibv_event_type_str(event.event_type));
	ibv_ack_async_event(&event);
}

static void
spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn)
{
	int nfds, i = 0;
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_device *device, *tmp;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
	nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0);

	if (nfds <= 0) {
		return;
	}

	/* The first poll descriptor is RDMA CM event */
	if (rtransport->poll_fds[i++].revents & POLLIN) {
		spdk_nvmf_process_cm_event(transport, cb_fn);
		nfds--;
	}

	if (nfds == 0) {
		return;
	}

	/* Second and subsequent poll descriptors are IB async events */
	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
		if (rtransport->poll_fds[i++].revents & POLLIN) {
			spdk_nvmf_process_ib_event(device);
			nfds--;
		}
	}
	/* check all flagged fd's have been served */
	assert(nfds == 0);
}

static void
spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport,
			struct spdk_nvme_transport_id *trid,
			struct spdk_nvmf_discovery_log_page_entry *entry)
{
	entry->trtype = SPDK_NVMF_TRTYPE_RDMA;
	entry->adrfam = trid->adrfam;
	entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED;

	spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' ');
	spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' ');

	entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED;
	entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE;
	entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM;
}

static struct spdk_nvmf_transport_poll_group *
spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_poll_group *rgroup;
	struct spdk_nvmf_rdma_poller *poller;
	struct spdk_nvmf_rdma_device *device;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	rgroup = calloc(1, sizeof(*rgroup));
	if (!rgroup) {
		return NULL;
	}

	TAILQ_INIT(&rgroup->pollers);

	pthread_mutex_lock(&rtransport->lock);
static struct spdk_nvmf_transport_poll_group *
spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_poll_group *rgroup;
	struct spdk_nvmf_rdma_poller *poller, *tmp;
	struct spdk_nvmf_rdma_device *device;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	rgroup = calloc(1, sizeof(*rgroup));
	if (!rgroup) {
		return NULL;
	}

	TAILQ_INIT(&rgroup->pollers);

	pthread_mutex_lock(&rtransport->lock);
	TAILQ_FOREACH(device, &rtransport->devices, link) {
		if (device->map == NULL) {
			/*
			 * The device is not in use (no listeners),
			 * so no protection domain has been constructed.
			 * Skip it.
			 */
			SPDK_NOTICELOG("Skipping unused RDMA device when creating poll group.\n");
			continue;
		}

		poller = calloc(1, sizeof(*poller));
		if (!poller) {
			SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n");
			goto err;
		}

		poller->device = device;
		poller->group = rgroup;

		TAILQ_INIT(&poller->qpairs);

		poller->cq = ibv_create_cq(device->context, NVMF_RDMA_CQ_SIZE, poller, NULL, 0);
		if (!poller->cq) {
			SPDK_ERRLOG("Unable to create completion queue\n");
			free(poller);
			goto err;
		}

		TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link);
	}

	pthread_mutex_unlock(&rtransport->lock);
	return &rgroup->group;

err:
	/* Tear down any pollers that were created before the failure. */
	TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
		TAILQ_REMOVE(&rgroup->pollers, poller, link);
		if (poller->cq) {
			ibv_destroy_cq(poller->cq);
		}
		free(poller);
	}
	free(rgroup);
	pthread_mutex_unlock(&rtransport->lock);
	return NULL;
}

static void
spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
{
	struct spdk_nvmf_rdma_poll_group *rgroup;
	struct spdk_nvmf_rdma_poller *poller, *tmp;

	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);

	if (!rgroup) {
		return;
	}

	TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
		TAILQ_REMOVE(&rgroup->pollers, poller, link);

		if (poller->cq) {
			ibv_destroy_cq(poller->cq);
		}

		free(poller);
	}

	free(rgroup);
}
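
/*
 * Attach a newly connected qpair to this poll group. The protection domain of
 * the qpair's cm_id must match the one recorded for its device; the qpair is
 * then linked to that device's poller, its resources are initialized, and the
 * RDMA CM connection is accepted (or rejected on failure).
 */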
static int
spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
			      struct spdk_nvmf_qpair *qpair)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_poll_group *rgroup;
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct spdk_nvmf_rdma_device *device;
	struct spdk_nvmf_rdma_poller *poller;
	int rc;

	rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	device = rqpair->port->device;

	if (device->pd != rqpair->cm_id->pd) {
		SPDK_ERRLOG("Mismatched protection domains\n");
		return -1;
	}

	TAILQ_FOREACH(poller, &rgroup->pollers, link) {
		if (poller->device == device) {
			break;
		}
	}

	if (!poller) {
		SPDK_ERRLOG("No poller found for device.\n");
		return -1;
	}

	TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link);
	rqpair->poller = poller;

	rc = spdk_nvmf_rdma_qpair_initialize(qpair);
	if (rc < 0) {
		SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair);
		return -1;
	}

	rqpair->mgmt_channel = spdk_get_io_channel(rtransport);
	if (!rqpair->mgmt_channel) {
		spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}

	rqpair->ch = spdk_io_channel_get_ctx(rqpair->mgmt_channel);
	assert(rqpair->ch != NULL);

	rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair);
	if (rc) {
		/* Try to reject, but we probably can't */
		spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}

	return 0;
}

static int
spdk_nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
				 struct spdk_nvmf_qpair *qpair)
{
	struct spdk_nvmf_rdma_poll_group *rgroup;
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct spdk_nvmf_rdma_device *device;
	struct spdk_nvmf_rdma_poller *poller;
	struct spdk_nvmf_rdma_qpair *rq, *trq;

	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	device = rqpair->port->device;

	TAILQ_FOREACH(poller, &rgroup->pollers, link) {
		if (poller->device == device) {
			break;
		}
	}

	if (!poller) {
		SPDK_ERRLOG("No poller found for device.\n");
		return -1;
	}

	TAILQ_FOREACH_SAFE(rq, &poller->qpairs, link, trq) {
		if (rq == rqpair) {
			TAILQ_REMOVE(&poller->qpairs, rqpair, link);
			rqpair->poller = NULL;
			break;
		}
	}

	if (rq == NULL) {
		SPDK_ERRLOG("RDMA qpair cannot be removed from group (not in group).\n");
		return -1;
	}

	return 0;
}

static int
spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport,
			struct spdk_nvmf_rdma_transport, transport);
	struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);

	rdma_req->state = RDMA_REQUEST_STATE_EXECUTED;
	spdk_nvmf_rdma_request_process(rtransport, rdma_req);

	return 0;
}

static void
spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair)
{
	spdk_nvmf_rdma_qpair_destroy(SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair));
}
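
/*
 * Advance queued work on a qpair in priority order: requests waiting for RDMA
 * read/write queue depth first, then requests waiting for a data buffer, and
 * finally newly received commands that still need a free request object.
 */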
static void
spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport,
				     struct spdk_nvmf_rdma_qpair *rqpair)
{
	struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp;
	struct spdk_nvmf_rdma_request *rdma_req, *req_tmp;

	/* We process I/O in the pending_rdma_rw queue at the highest priority. */
	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_rw_queue, link, req_tmp) {
		if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) {
			break;
		}
	}

	/* The second highest priority is I/O waiting on memory buffers. */
	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->ch->pending_data_buf_queue, link, req_tmp) {
		if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) {
			break;
		}
	}

	/* The lowest priority is processing newly received commands. */
	TAILQ_FOREACH_SAFE(rdma_recv, &rqpair->incoming_queue, link, recv_tmp) {
		rdma_req = TAILQ_FIRST(&rqpair->free_queue);
		if (rdma_req == NULL) {
			/* Need to wait for more SEND completions */
			break;
		}

		rdma_req->recv = rdma_recv;
		rdma_req->state = RDMA_REQUEST_STATE_NEW;
		if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) {
			break;
		}
	}
}

static struct spdk_nvmf_rdma_request *
get_rdma_req_from_wc(struct ibv_wc *wc)
{
	struct spdk_nvmf_rdma_request *rdma_req;

	rdma_req = (struct spdk_nvmf_rdma_request *)wc->wr_id;
	assert(rdma_req != NULL);

#ifdef DEBUG
	struct spdk_nvmf_rdma_qpair *rqpair;
	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);

	assert(rdma_req - rqpair->reqs >= 0);
	assert(rdma_req - rqpair->reqs < (ptrdiff_t)rqpair->max_queue_depth);
#endif

	return rdma_req;
}

static struct spdk_nvmf_rdma_recv *
get_rdma_recv_from_wc(struct ibv_wc *wc)
{
	struct spdk_nvmf_rdma_recv *rdma_recv;

	assert(wc->byte_len >= sizeof(struct spdk_nvmf_capsule_cmd));

	rdma_recv = (struct spdk_nvmf_rdma_recv *)wc->wr_id;
	assert(rdma_recv != NULL);

#ifdef DEBUG
	struct spdk_nvmf_rdma_qpair *rqpair = rdma_recv->qpair;

	assert(rdma_recv - rqpair->recvs >= 0);
	assert(rdma_recv - rqpair->recvs < (ptrdiff_t)rqpair->max_queue_depth);
#endif

	return rdma_recv;
}
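
/*
 * Reap up to 32 completions from this poller's CQ, advance the request state
 * machine for each one, and then retry any work queued on the affected qpair.
 * Returns the number of requests whose completions finished sending, or -1 if
 * the CQ reported an error.
 */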
static int
spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
			   struct spdk_nvmf_rdma_poller *rpoller)
{
	struct ibv_wc wc[32];
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_rdma_recv *rdma_recv;
	struct spdk_nvmf_rdma_qpair *rqpair;
	int reaped, i;
	int count = 0;
	bool error = false;

	/* Poll for completed operations. */
	reaped = ibv_poll_cq(rpoller->cq, 32, wc);
	if (reaped < 0) {
		SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
			    errno, spdk_strerror(errno));
		return -1;
	}

	for (i = 0; i < reaped; i++) {
		if (wc[i].status) {
			SPDK_ERRLOG("CQ error on CQ %p, Request 0x%lx (%d): %s\n",
				    rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status));
			error = true;
			continue;
		}

		switch (wc[i].opcode) {
		case IBV_WC_SEND:
			rdma_req = get_rdma_req_from_wc(&wc[i]);
			rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);

			assert(rdma_req->state == RDMA_REQUEST_STATE_COMPLETING);
			rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;

			spdk_nvmf_rdma_request_process(rtransport, rdma_req);

			count++;

			/* Try to process other queued requests */
			spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair);
			break;

		case IBV_WC_RDMA_WRITE:
			rdma_req = get_rdma_req_from_wc(&wc[i]);
			rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);

			rqpair->cur_rdma_rw_depth--;

			/* Try to process other queued requests */
			spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair);
			break;

		case IBV_WC_RDMA_READ:
			rdma_req = get_rdma_req_from_wc(&wc[i]);
			rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);

			assert(rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER);
			rqpair->cur_rdma_rw_depth--;
			rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;

			spdk_nvmf_rdma_request_process(rtransport, rdma_req);

			/* Try to process other queued requests */
			spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair);
			break;

		case IBV_WC_RECV:
			rdma_recv = get_rdma_recv_from_wc(&wc[i]);
			rqpair = rdma_recv->qpair;

			TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link);

			/* Try to process other queued requests */
			spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair);
			break;

		default:
			SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode);
			continue;
		}
	}

	if (error == true) {
		return -1;
	}

	return count;
}

static int
spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_poll_group *rgroup;
	struct spdk_nvmf_rdma_poller *rpoller;
	int count, rc;

	rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport);
	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);

	count = 0;
	TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
		rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller);
		if (rc < 0) {
			return rc;
		}
		count += rc;
	}

	return count;
}

static bool
spdk_nvmf_rdma_qpair_is_idle(struct spdk_nvmf_qpair *qpair)
{
	struct spdk_nvmf_rdma_qpair *rqpair;

	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	if (rqpair->cur_queue_depth == 0 && rqpair->cur_rdma_rw_depth == 0) {
		return true;
	}
	return false;
}
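
/*
 * Entry points through which the generic NVMe-oF target layer drives the
 * RDMA transport.
 */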
const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = {
	.type = SPDK_NVME_TRANSPORT_RDMA,
	.create = spdk_nvmf_rdma_create,
	.destroy = spdk_nvmf_rdma_destroy,

	.listen = spdk_nvmf_rdma_listen,
	.stop_listen = spdk_nvmf_rdma_stop_listen,
	.accept = spdk_nvmf_rdma_accept,

	.listener_discover = spdk_nvmf_rdma_discover,

	.poll_group_create = spdk_nvmf_rdma_poll_group_create,
	.poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy,
	.poll_group_add = spdk_nvmf_rdma_poll_group_add,
	.poll_group_remove = spdk_nvmf_rdma_poll_group_remove,
	.poll_group_poll = spdk_nvmf_rdma_poll_group_poll,

	.req_complete = spdk_nvmf_rdma_request_complete,

	.qpair_fini = spdk_nvmf_rdma_close_qpair,
	.qpair_is_idle = spdk_nvmf_rdma_qpair_is_idle,
};

SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA)