1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include <infiniband/verbs.h> 37 #include <rdma/rdma_cma.h> 38 #include <rdma/rdma_verbs.h> 39 40 #include "nvmf_internal.h" 41 #include "transport.h" 42 43 #include "spdk/assert.h" 44 #include "spdk/thread.h" 45 #include "spdk/nvmf.h" 46 #include "spdk/nvmf_spec.h" 47 #include "spdk/string.h" 48 #include "spdk/trace.h" 49 #include "spdk/util.h" 50 51 #include "spdk_internal/log.h" 52 53 /* 54 RDMA Connection Resouce Defaults 55 */ 56 #define NVMF_DEFAULT_TX_SGE 1 57 #define NVMF_DEFAULT_RX_SGE 2 58 #define NVMF_DEFAULT_DATA_SGE 16 59 60 /* The RDMA completion queue size */ 61 #define NVMF_RDMA_CQ_SIZE 4096 62 63 /* AIO backend requires block size aligned data buffers, 64 * extra 4KiB aligned data buffer should work for most devices. 65 */ 66 #define SHIFT_4KB 12 67 #define NVMF_DATA_BUFFER_ALIGNMENT (1 << SHIFT_4KB) 68 #define NVMF_DATA_BUFFER_MASK (NVMF_DATA_BUFFER_ALIGNMENT - 1) 69 70 enum spdk_nvmf_rdma_request_state { 71 /* The request is not currently in use */ 72 RDMA_REQUEST_STATE_FREE = 0, 73 74 /* Initial state when request first received */ 75 RDMA_REQUEST_STATE_NEW, 76 77 /* The request is queued until a data buffer is available. */ 78 RDMA_REQUEST_STATE_NEED_BUFFER, 79 80 /* The request is waiting on RDMA queue depth availability 81 * to transfer data from the host to the controller. 82 */ 83 RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER, 84 85 /* The request is currently transferring data from the host to the controller. */ 86 RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 87 88 /* The request is ready to execute at the block device */ 89 RDMA_REQUEST_STATE_READY_TO_EXECUTE, 90 91 /* The request is currently executing at the block device */ 92 RDMA_REQUEST_STATE_EXECUTING, 93 94 /* The request finished executing at the block device */ 95 RDMA_REQUEST_STATE_EXECUTED, 96 97 /* The request is waiting on RDMA queue depth availability 98 * to transfer data from the controller to the host. 99 */ 100 RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST, 101 102 /* The request is ready to send a completion */ 103 RDMA_REQUEST_STATE_READY_TO_COMPLETE, 104 105 /* The request currently has a completion outstanding */ 106 RDMA_REQUEST_STATE_COMPLETING, 107 108 /* The request completed and can be marked free. */ 109 RDMA_REQUEST_STATE_COMPLETED, 110 }; 111 112 #define OBJECT_NVMF_RDMA_IO 0x40 113 114 #define TRACE_GROUP_NVMF_RDMA 0x4 115 #define TRACE_RDMA_REQUEST_STATE_NEW SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0) 116 #define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1) 117 #define TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2) 118 #define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3) 119 #define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4) 120 #define TRACE_RDMA_REQUEST_STATE_EXECUTING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5) 121 #define TRACE_RDMA_REQUEST_STATE_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6) 122 #define TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7) 123 #define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8) 124 #define TRACE_RDMA_REQUEST_STATE_COMPLETING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9) 125 #define TRACE_RDMA_REQUEST_STATE_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA) 126 127 SPDK_TRACE_REGISTER_FN(nvmf_trace) 128 { 129 spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r'); 130 spdk_trace_register_description("RDMA_REQ_NEW", "", 131 TRACE_RDMA_REQUEST_STATE_NEW, 132 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 0, 0, ""); 133 spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", "", 134 TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 135 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, ""); 136 spdk_trace_register_description("RDMA_REQ_TX_PENDING_H_TO_C", "", 137 TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER, 138 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, ""); 139 spdk_trace_register_description("RDMA_REQ_TX_H_TO_C", "", 140 TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 141 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, ""); 142 spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE", "", 143 TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 144 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, ""); 145 spdk_trace_register_description("RDMA_REQ_EXECUTING", "", 146 TRACE_RDMA_REQUEST_STATE_EXECUTING, 147 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, ""); 148 spdk_trace_register_description("RDMA_REQ_EXECUTED", "", 149 TRACE_RDMA_REQUEST_STATE_EXECUTED, 150 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, ""); 151 spdk_trace_register_description("RDMA_REQ_TX_PENDING_C_TO_H", "", 152 TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST, 153 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, ""); 154 spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPLETE", "", 155 TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 156 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, ""); 157 spdk_trace_register_description("RDMA_REQ_COMPLETING", "", 158 TRACE_RDMA_REQUEST_STATE_COMPLETING, 159 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, ""); 160 spdk_trace_register_description("RDMA_REQ_COMPLETED", "", 161 TRACE_RDMA_REQUEST_STATE_COMPLETED, 162 OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 0, 0, ""); 163 } 164 165 /* This structure holds commands as they are received off the wire. 166 * It must be dynamically paired with a full request object 167 * (spdk_nvmf_rdma_request) to service a request. It is separate 168 * from the request because RDMA does not appear to order 169 * completions, so occasionally we'll get a new incoming 170 * command when there aren't any free request objects. 171 */ 172 struct spdk_nvmf_rdma_recv { 173 struct ibv_recv_wr wr; 174 struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE]; 175 176 struct spdk_nvmf_rdma_qpair *qpair; 177 178 /* In-capsule data buffer */ 179 uint8_t *buf; 180 181 TAILQ_ENTRY(spdk_nvmf_rdma_recv) link; 182 }; 183 184 struct spdk_nvmf_rdma_request { 185 struct spdk_nvmf_request req; 186 bool data_from_pool; 187 188 enum spdk_nvmf_rdma_request_state state; 189 190 struct spdk_nvmf_rdma_recv *recv; 191 192 struct { 193 struct ibv_send_wr wr; 194 struct ibv_sge sgl[NVMF_DEFAULT_TX_SGE]; 195 } rsp; 196 197 struct { 198 struct ibv_send_wr wr; 199 struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES]; 200 void *buffers[SPDK_NVMF_MAX_SGL_ENTRIES]; 201 } data; 202 203 TAILQ_ENTRY(spdk_nvmf_rdma_request) link; 204 }; 205 206 struct spdk_nvmf_rdma_qpair { 207 struct spdk_nvmf_qpair qpair; 208 209 struct spdk_nvmf_rdma_port *port; 210 struct spdk_nvmf_rdma_poller *poller; 211 212 struct rdma_cm_id *cm_id; 213 214 /* The maximum number of I/O outstanding on this connection at one time */ 215 uint16_t max_queue_depth; 216 217 /* The maximum number of active RDMA READ and WRITE operations at one time */ 218 uint16_t max_rw_depth; 219 220 /* The current number of I/O outstanding on this connection. This number 221 * includes all I/O from the time the capsule is first received until it is 222 * completed. 223 */ 224 uint16_t cur_queue_depth; 225 226 /* The number of RDMA READ and WRITE requests that are outstanding */ 227 uint16_t cur_rdma_rw_depth; 228 229 /* Receives that are waiting for a request object */ 230 TAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue; 231 232 /* Requests that are not in use */ 233 TAILQ_HEAD(, spdk_nvmf_rdma_request) free_queue; 234 235 /* Requests that are waiting to perform an RDMA READ or WRITE */ 236 TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_rw_queue; 237 238 /* Array of size "max_queue_depth" containing RDMA requests. */ 239 struct spdk_nvmf_rdma_request *reqs; 240 241 /* Array of size "max_queue_depth" containing RDMA recvs. */ 242 struct spdk_nvmf_rdma_recv *recvs; 243 244 /* Array of size "max_queue_depth" containing 64 byte capsules 245 * used for receive. 246 */ 247 union nvmf_h2c_msg *cmds; 248 struct ibv_mr *cmds_mr; 249 250 /* Array of size "max_queue_depth" containing 16 byte completions 251 * to be sent back to the user. 252 */ 253 union nvmf_c2h_msg *cpls; 254 struct ibv_mr *cpls_mr; 255 256 /* Array of size "max_queue_depth * InCapsuleDataSize" containing 257 * buffers to be used for in capsule data. 258 */ 259 void *bufs; 260 struct ibv_mr *bufs_mr; 261 262 TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link; 263 TAILQ_ENTRY(spdk_nvmf_rdma_qpair) pending_link; 264 265 /* Mgmt channel */ 266 struct spdk_io_channel *mgmt_channel; 267 struct spdk_nvmf_rdma_mgmt_channel *ch; 268 }; 269 270 struct spdk_nvmf_rdma_poller { 271 struct spdk_nvmf_rdma_device *device; 272 struct spdk_nvmf_rdma_poll_group *group; 273 274 struct ibv_cq *cq; 275 276 TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs; 277 278 TAILQ_ENTRY(spdk_nvmf_rdma_poller) link; 279 }; 280 281 struct spdk_nvmf_rdma_poll_group { 282 struct spdk_nvmf_transport_poll_group group; 283 284 TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers; 285 }; 286 287 /* Assuming rdma_cm uses just one protection domain per ibv_context. */ 288 struct spdk_nvmf_rdma_device { 289 struct ibv_device_attr attr; 290 struct ibv_context *context; 291 292 struct spdk_mem_map *map; 293 struct ibv_pd *pd; 294 295 TAILQ_ENTRY(spdk_nvmf_rdma_device) link; 296 }; 297 298 struct spdk_nvmf_rdma_port { 299 struct spdk_nvme_transport_id trid; 300 struct rdma_cm_id *id; 301 struct spdk_nvmf_rdma_device *device; 302 uint32_t ref; 303 TAILQ_ENTRY(spdk_nvmf_rdma_port) link; 304 }; 305 306 struct spdk_nvmf_rdma_transport { 307 struct spdk_nvmf_transport transport; 308 309 struct rdma_event_channel *event_channel; 310 311 struct spdk_mempool *data_buf_pool; 312 313 pthread_mutex_t lock; 314 315 uint16_t max_queue_depth; 316 uint32_t max_io_size; 317 uint32_t io_unit_size; 318 uint32_t in_capsule_data_size; 319 320 /* fields used to poll RDMA/IB events */ 321 nfds_t npoll_fds; 322 struct pollfd *poll_fds; 323 324 TAILQ_HEAD(, spdk_nvmf_rdma_device) devices; 325 TAILQ_HEAD(, spdk_nvmf_rdma_port) ports; 326 }; 327 328 struct spdk_nvmf_rdma_mgmt_channel { 329 /* Requests that are waiting to obtain a data buffer */ 330 TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_data_buf_queue; 331 }; 332 333 static int 334 spdk_nvmf_rdma_mgmt_channel_create(void *io_device, void *ctx_buf) 335 { 336 struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf; 337 338 TAILQ_INIT(&ch->pending_data_buf_queue); 339 return 0; 340 } 341 342 static void 343 spdk_nvmf_rdma_mgmt_channel_destroy(void *io_device, void *ctx_buf) 344 { 345 struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf; 346 347 if (!TAILQ_EMPTY(&ch->pending_data_buf_queue)) { 348 SPDK_ERRLOG("Pending I/O list wasn't empty on channel destruction\n"); 349 } 350 } 351 352 static void 353 spdk_nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair) 354 { 355 if (rqpair->poller) { 356 TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link); 357 } 358 359 if (rqpair->cmds_mr) { 360 ibv_dereg_mr(rqpair->cmds_mr); 361 } 362 363 if (rqpair->cpls_mr) { 364 ibv_dereg_mr(rqpair->cpls_mr); 365 } 366 367 if (rqpair->bufs_mr) { 368 ibv_dereg_mr(rqpair->bufs_mr); 369 } 370 371 if (rqpair->cm_id) { 372 rdma_destroy_qp(rqpair->cm_id); 373 rdma_destroy_id(rqpair->cm_id); 374 } 375 376 if (rqpair->mgmt_channel) { 377 spdk_put_io_channel(rqpair->mgmt_channel); 378 } 379 380 /* Free all memory */ 381 spdk_dma_free(rqpair->cmds); 382 spdk_dma_free(rqpair->cpls); 383 spdk_dma_free(rqpair->bufs); 384 free(rqpair->reqs); 385 free(rqpair->recvs); 386 free(rqpair); 387 } 388 389 static int 390 spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair) 391 { 392 struct spdk_nvmf_rdma_transport *rtransport; 393 struct spdk_nvmf_rdma_qpair *rqpair; 394 int rc, i; 395 struct ibv_qp_init_attr attr; 396 struct spdk_nvmf_rdma_recv *rdma_recv; 397 struct spdk_nvmf_rdma_request *rdma_req; 398 399 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 400 rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 401 402 memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); 403 attr.qp_type = IBV_QPT_RC; 404 attr.send_cq = rqpair->poller->cq; 405 attr.recv_cq = rqpair->poller->cq; 406 attr.cap.max_send_wr = rqpair->max_queue_depth * 2; /* SEND, READ, and WRITE operations */ 407 attr.cap.max_recv_wr = rqpair->max_queue_depth; /* RECV operations */ 408 attr.cap.max_send_sge = SPDK_NVMF_MAX_SGL_ENTRIES; 409 attr.cap.max_recv_sge = NVMF_DEFAULT_RX_SGE; 410 411 rc = rdma_create_qp(rqpair->cm_id, NULL, &attr); 412 if (rc) { 413 SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno)); 414 rdma_destroy_id(rqpair->cm_id); 415 rqpair->cm_id = NULL; 416 spdk_nvmf_rdma_qpair_destroy(rqpair); 417 return -1; 418 } 419 420 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair); 421 422 rqpair->reqs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->reqs)); 423 rqpair->recvs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->recvs)); 424 rqpair->cmds = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cmds), 425 0x1000, NULL); 426 rqpair->cpls = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cpls), 427 0x1000, NULL); 428 429 if (rtransport->in_capsule_data_size) { 430 rqpair->bufs = spdk_dma_zmalloc(rqpair->max_queue_depth * rtransport->in_capsule_data_size, 431 0x1000, NULL); 432 } 433 434 if (!rqpair->reqs || !rqpair->recvs || !rqpair->cmds || 435 !rqpair->cpls || (rtransport->in_capsule_data_size && !rqpair->bufs)) { 436 SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n"); 437 spdk_nvmf_rdma_qpair_destroy(rqpair); 438 return -1; 439 } 440 441 rqpair->cmds_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cmds, 442 rqpair->max_queue_depth * sizeof(*rqpair->cmds), 443 IBV_ACCESS_LOCAL_WRITE); 444 rqpair->cpls_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cpls, 445 rqpair->max_queue_depth * sizeof(*rqpair->cpls), 446 0); 447 448 if (rtransport->in_capsule_data_size) { 449 rqpair->bufs_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->bufs, 450 rqpair->max_queue_depth * rtransport->in_capsule_data_size, 451 IBV_ACCESS_LOCAL_WRITE | 452 IBV_ACCESS_REMOTE_WRITE); 453 } 454 455 if (!rqpair->cmds_mr || !rqpair->cpls_mr || (rtransport->in_capsule_data_size && 456 !rqpair->bufs_mr)) { 457 SPDK_ERRLOG("Unable to register required memory for RDMA queue.\n"); 458 spdk_nvmf_rdma_qpair_destroy(rqpair); 459 return -1; 460 } 461 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n", 462 rqpair->cmds, rqpair->max_queue_depth * sizeof(*rqpair->cmds), rqpair->cmds_mr->lkey); 463 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n", 464 rqpair->cpls, rqpair->max_queue_depth * sizeof(*rqpair->cpls), rqpair->cpls_mr->lkey); 465 if (rqpair->bufs && rqpair->bufs_mr) { 466 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n", 467 rqpair->bufs, rqpair->max_queue_depth * rtransport->in_capsule_data_size, rqpair->bufs_mr->lkey); 468 } 469 470 for (i = 0; i < rqpair->max_queue_depth; i++) { 471 struct ibv_recv_wr *bad_wr = NULL; 472 473 rdma_recv = &rqpair->recvs[i]; 474 rdma_recv->qpair = rqpair; 475 476 /* Set up memory to receive commands */ 477 if (rqpair->bufs) { 478 rdma_recv->buf = (void *)((uintptr_t)rqpair->bufs + (i * rtransport->in_capsule_data_size)); 479 } 480 481 rdma_recv->sgl[0].addr = (uintptr_t)&rqpair->cmds[i]; 482 rdma_recv->sgl[0].length = sizeof(rqpair->cmds[i]); 483 rdma_recv->sgl[0].lkey = rqpair->cmds_mr->lkey; 484 rdma_recv->wr.num_sge = 1; 485 486 if (rdma_recv->buf && rqpair->bufs_mr) { 487 rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf; 488 rdma_recv->sgl[1].length = rtransport->in_capsule_data_size; 489 rdma_recv->sgl[1].lkey = rqpair->bufs_mr->lkey; 490 rdma_recv->wr.num_sge++; 491 } 492 493 rdma_recv->wr.wr_id = (uintptr_t)rdma_recv; 494 rdma_recv->wr.sg_list = rdma_recv->sgl; 495 496 rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_recv->wr, &bad_wr); 497 if (rc) { 498 SPDK_ERRLOG("Unable to post capsule for RDMA RECV\n"); 499 spdk_nvmf_rdma_qpair_destroy(rqpair); 500 return -1; 501 } 502 } 503 504 for (i = 0; i < rqpair->max_queue_depth; i++) { 505 rdma_req = &rqpair->reqs[i]; 506 507 rdma_req->req.qpair = &rqpair->qpair; 508 rdma_req->req.cmd = NULL; 509 510 /* Set up memory to send responses */ 511 rdma_req->req.rsp = &rqpair->cpls[i]; 512 513 rdma_req->rsp.sgl[0].addr = (uintptr_t)&rqpair->cpls[i]; 514 rdma_req->rsp.sgl[0].length = sizeof(rqpair->cpls[i]); 515 rdma_req->rsp.sgl[0].lkey = rqpair->cpls_mr->lkey; 516 517 rdma_req->rsp.wr.wr_id = (uintptr_t)rdma_req; 518 rdma_req->rsp.wr.next = NULL; 519 rdma_req->rsp.wr.opcode = IBV_WR_SEND; 520 rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED; 521 rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl; 522 rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl); 523 524 /* Set up memory for data buffers */ 525 rdma_req->data.wr.wr_id = (uint64_t)rdma_req; 526 rdma_req->data.wr.next = NULL; 527 rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED; 528 rdma_req->data.wr.sg_list = rdma_req->data.sgl; 529 rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl); 530 531 TAILQ_INSERT_TAIL(&rqpair->free_queue, rdma_req, link); 532 } 533 534 return 0; 535 } 536 537 static int 538 request_transfer_in(struct spdk_nvmf_request *req) 539 { 540 int rc; 541 struct spdk_nvmf_rdma_request *rdma_req; 542 struct spdk_nvmf_qpair *qpair; 543 struct spdk_nvmf_rdma_qpair *rqpair; 544 struct ibv_send_wr *bad_wr = NULL; 545 546 qpair = req->qpair; 547 rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 548 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 549 550 assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER); 551 552 rqpair->cur_rdma_rw_depth++; 553 554 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair); 555 556 rdma_req->data.wr.opcode = IBV_WR_RDMA_READ; 557 rdma_req->data.wr.next = NULL; 558 rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr); 559 if (rc) { 560 SPDK_ERRLOG("Unable to transfer data from host to target\n"); 561 562 /* Decrement r/w counter back since data transfer 563 * has not started. 564 */ 565 rqpair->cur_rdma_rw_depth--; 566 return -1; 567 } 568 569 return 0; 570 } 571 572 static int 573 request_transfer_out(struct spdk_nvmf_request *req) 574 { 575 int rc; 576 struct spdk_nvmf_rdma_request *rdma_req; 577 struct spdk_nvmf_qpair *qpair; 578 struct spdk_nvmf_rdma_qpair *rqpair; 579 struct spdk_nvme_cpl *rsp; 580 struct ibv_recv_wr *bad_recv_wr = NULL; 581 struct ibv_send_wr *send_wr, *bad_send_wr = NULL; 582 583 qpair = req->qpair; 584 rsp = &req->rsp->nvme_cpl; 585 rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 586 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 587 588 /* Advance our sq_head pointer */ 589 if (qpair->sq_head == qpair->sq_head_max) { 590 qpair->sq_head = 0; 591 } else { 592 qpair->sq_head++; 593 } 594 rsp->sqhd = qpair->sq_head; 595 596 /* Post the capsule to the recv buffer */ 597 assert(rdma_req->recv != NULL); 598 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA RECV POSTED. Recv: %p Connection: %p\n", rdma_req->recv, 599 rqpair); 600 rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_req->recv->wr, &bad_recv_wr); 601 if (rc) { 602 SPDK_ERRLOG("Unable to re-post rx descriptor\n"); 603 return rc; 604 } 605 rdma_req->recv = NULL; 606 607 /* Build the response which consists of an optional 608 * RDMA WRITE to transfer data, plus an RDMA SEND 609 * containing the response. 610 */ 611 send_wr = &rdma_req->rsp.wr; 612 613 if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && 614 req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 615 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair); 616 617 rqpair->cur_rdma_rw_depth++; 618 rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE; 619 620 rdma_req->data.wr.next = send_wr; 621 send_wr = &rdma_req->data.wr; 622 } 623 624 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. Request: %p Connection: %p\n", req, qpair); 625 626 /* Send the completion */ 627 rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr); 628 if (rc) { 629 SPDK_ERRLOG("Unable to send response capsule\n"); 630 631 if (rdma_req->data.wr.opcode == IBV_WR_RDMA_WRITE) { 632 /* Decrement r/w counter back since data transfer 633 * has not started. 634 */ 635 rqpair->cur_rdma_rw_depth--; 636 } 637 } 638 639 return rc; 640 } 641 642 static int 643 spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair) 644 { 645 struct spdk_nvmf_rdma_accept_private_data accept_data; 646 struct rdma_conn_param ctrlr_event_data = {}; 647 int rc; 648 649 accept_data.recfmt = 0; 650 accept_data.crqsize = rqpair->max_queue_depth; 651 652 ctrlr_event_data.private_data = &accept_data; 653 ctrlr_event_data.private_data_len = sizeof(accept_data); 654 if (id->ps == RDMA_PS_TCP) { 655 ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */ 656 ctrlr_event_data.initiator_depth = rqpair->max_rw_depth; 657 } 658 659 rc = rdma_accept(id, &ctrlr_event_data); 660 if (rc) { 661 SPDK_ERRLOG("Error %d on rdma_accept\n", errno); 662 } else { 663 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n"); 664 } 665 666 return rc; 667 } 668 669 static void 670 spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error) 671 { 672 struct spdk_nvmf_rdma_reject_private_data rej_data; 673 674 rej_data.recfmt = 0; 675 rej_data.sts = error; 676 677 rdma_reject(id, &rej_data, sizeof(rej_data)); 678 } 679 680 static int 681 nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event, 682 new_qpair_fn cb_fn) 683 { 684 struct spdk_nvmf_rdma_transport *rtransport; 685 struct spdk_nvmf_rdma_qpair *rqpair = NULL; 686 struct spdk_nvmf_rdma_port *port; 687 struct rdma_conn_param *rdma_param = NULL; 688 const struct spdk_nvmf_rdma_request_private_data *private_data = NULL; 689 uint16_t max_queue_depth; 690 uint16_t max_rw_depth; 691 692 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 693 694 assert(event->id != NULL); /* Impossible. Can't even reject the connection. */ 695 assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */ 696 697 rdma_param = &event->param.conn; 698 if (rdma_param->private_data == NULL || 699 rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) { 700 SPDK_ERRLOG("connect request: no private data provided\n"); 701 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH); 702 return -1; 703 } 704 705 private_data = rdma_param->private_data; 706 if (private_data->recfmt != 0) { 707 SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n"); 708 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT); 709 return -1; 710 } 711 712 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n", 713 event->id->verbs->device->name, event->id->verbs->device->dev_name); 714 715 port = event->listen_id->context; 716 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n", 717 event->listen_id, event->listen_id->verbs, port); 718 719 /* Figure out the supported queue depth. This is a multi-step process 720 * that takes into account hardware maximums, host provided values, 721 * and our target's internal memory limits */ 722 723 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n"); 724 725 /* Start with the maximum queue depth allowed by the target */ 726 max_queue_depth = rtransport->max_queue_depth; 727 max_rw_depth = rtransport->max_queue_depth; 728 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n", rtransport->max_queue_depth); 729 730 /* Next check the local NIC's hardware limitations */ 731 SPDK_DEBUGLOG(SPDK_LOG_RDMA, 732 "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n", 733 port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom); 734 max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr); 735 max_rw_depth = spdk_min(max_rw_depth, port->device->attr.max_qp_rd_atom); 736 737 /* Next check the remote NIC's hardware limitations */ 738 SPDK_DEBUGLOG(SPDK_LOG_RDMA, 739 "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n", 740 rdma_param->initiator_depth, rdma_param->responder_resources); 741 if (rdma_param->initiator_depth > 0) { 742 max_rw_depth = spdk_min(max_rw_depth, rdma_param->initiator_depth); 743 } 744 745 /* Finally check for the host software requested values, which are 746 * optional. */ 747 if (rdma_param->private_data != NULL && 748 rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) { 749 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize); 750 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize); 751 max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize); 752 max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1); 753 } 754 755 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n", 756 max_queue_depth, max_rw_depth); 757 758 rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair)); 759 if (rqpair == NULL) { 760 SPDK_ERRLOG("Could not allocate new connection.\n"); 761 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 762 return -1; 763 } 764 765 rqpair->port = port; 766 rqpair->max_queue_depth = max_queue_depth; 767 rqpair->max_rw_depth = max_rw_depth; 768 rqpair->cm_id = event->id; 769 rqpair->qpair.transport = transport; 770 TAILQ_INIT(&rqpair->incoming_queue); 771 TAILQ_INIT(&rqpair->free_queue); 772 TAILQ_INIT(&rqpair->pending_rdma_rw_queue); 773 774 event->id->context = &rqpair->qpair; 775 776 cb_fn(&rqpair->qpair); 777 778 return 0; 779 } 780 781 static int 782 nvmf_rdma_disconnect(struct rdma_cm_event *evt) 783 { 784 struct spdk_nvmf_qpair *qpair; 785 786 if (evt->id == NULL) { 787 SPDK_ERRLOG("disconnect request: missing cm_id\n"); 788 return -1; 789 } 790 791 qpair = evt->id->context; 792 if (qpair == NULL) { 793 SPDK_ERRLOG("disconnect request: no active connection\n"); 794 return -1; 795 } 796 /* ack the disconnect event before rdma_destroy_id */ 797 rdma_ack_cm_event(evt); 798 799 spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); 800 801 return 0; 802 } 803 804 #ifdef DEBUG 805 static const char *CM_EVENT_STR[] = { 806 "RDMA_CM_EVENT_ADDR_RESOLVED", 807 "RDMA_CM_EVENT_ADDR_ERROR", 808 "RDMA_CM_EVENT_ROUTE_RESOLVED", 809 "RDMA_CM_EVENT_ROUTE_ERROR", 810 "RDMA_CM_EVENT_CONNECT_REQUEST", 811 "RDMA_CM_EVENT_CONNECT_RESPONSE", 812 "RDMA_CM_EVENT_CONNECT_ERROR", 813 "RDMA_CM_EVENT_UNREACHABLE", 814 "RDMA_CM_EVENT_REJECTED", 815 "RDMA_CM_EVENT_ESTABLISHED", 816 "RDMA_CM_EVENT_DISCONNECTED", 817 "RDMA_CM_EVENT_DEVICE_REMOVAL", 818 "RDMA_CM_EVENT_MULTICAST_JOIN", 819 "RDMA_CM_EVENT_MULTICAST_ERROR", 820 "RDMA_CM_EVENT_ADDR_CHANGE", 821 "RDMA_CM_EVENT_TIMEWAIT_EXIT" 822 }; 823 #endif /* DEBUG */ 824 825 static int 826 spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map, 827 enum spdk_mem_map_notify_action action, 828 void *vaddr, size_t size) 829 { 830 struct spdk_nvmf_rdma_device *device = cb_ctx; 831 struct ibv_pd *pd = device->pd; 832 struct ibv_mr *mr; 833 834 switch (action) { 835 case SPDK_MEM_MAP_NOTIFY_REGISTER: 836 mr = ibv_reg_mr(pd, vaddr, size, 837 IBV_ACCESS_LOCAL_WRITE | 838 IBV_ACCESS_REMOTE_READ | 839 IBV_ACCESS_REMOTE_WRITE); 840 if (mr == NULL) { 841 SPDK_ERRLOG("ibv_reg_mr() failed\n"); 842 return -1; 843 } else { 844 spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); 845 } 846 break; 847 case SPDK_MEM_MAP_NOTIFY_UNREGISTER: 848 mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, size); 849 spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); 850 if (mr) { 851 ibv_dereg_mr(mr); 852 } 853 break; 854 } 855 856 return 0; 857 } 858 859 typedef enum spdk_nvme_data_transfer spdk_nvme_data_transfer_t; 860 861 static spdk_nvme_data_transfer_t 862 spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req) 863 { 864 enum spdk_nvme_data_transfer xfer; 865 struct spdk_nvme_cmd *cmd = &rdma_req->req.cmd->nvme_cmd; 866 struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; 867 868 /* Figure out data transfer direction */ 869 if (cmd->opc == SPDK_NVME_OPC_FABRIC) { 870 xfer = spdk_nvme_opc_get_data_transfer(rdma_req->req.cmd->nvmf_cmd.fctype); 871 } else { 872 xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 873 874 /* Some admin commands are special cases */ 875 if ((rdma_req->req.qpair->qid == 0) && 876 ((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) || 877 (cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) { 878 switch (cmd->cdw10 & 0xff) { 879 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 880 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 881 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 882 break; 883 default: 884 xfer = SPDK_NVME_DATA_NONE; 885 } 886 } 887 } 888 889 if (xfer == SPDK_NVME_DATA_NONE) { 890 return xfer; 891 } 892 893 /* Even for commands that may transfer data, they could have specified 0 length. 894 * We want those to show up with xfer SPDK_NVME_DATA_NONE. 895 */ 896 switch (sgl->generic.type) { 897 case SPDK_NVME_SGL_TYPE_DATA_BLOCK: 898 case SPDK_NVME_SGL_TYPE_BIT_BUCKET: 899 case SPDK_NVME_SGL_TYPE_SEGMENT: 900 case SPDK_NVME_SGL_TYPE_LAST_SEGMENT: 901 case SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK: 902 if (sgl->unkeyed.length == 0) { 903 xfer = SPDK_NVME_DATA_NONE; 904 } 905 break; 906 case SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK: 907 if (sgl->keyed.length == 0) { 908 xfer = SPDK_NVME_DATA_NONE; 909 } 910 break; 911 } 912 913 return xfer; 914 } 915 916 static int 917 spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport, 918 struct spdk_nvmf_rdma_device *device, 919 struct spdk_nvmf_rdma_request *rdma_req) 920 { 921 void *buf = NULL; 922 uint32_t length = rdma_req->req.length; 923 uint32_t i = 0; 924 925 rdma_req->req.iovcnt = 0; 926 while (length) { 927 buf = spdk_mempool_get(rtransport->data_buf_pool); 928 if (!buf) { 929 goto nomem; 930 } 931 932 rdma_req->req.iov[i].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) & 933 ~NVMF_DATA_BUFFER_MASK); 934 rdma_req->req.iov[i].iov_len = spdk_min(length, rtransport->io_unit_size); 935 rdma_req->req.iovcnt++; 936 rdma_req->data.buffers[i] = buf; 937 rdma_req->data.wr.sg_list[i].addr = (uintptr_t)(rdma_req->req.iov[i].iov_base); 938 rdma_req->data.wr.sg_list[i].length = rdma_req->req.iov[i].iov_len; 939 rdma_req->data.wr.sg_list[i].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map, 940 (uint64_t)buf, rdma_req->req.iov[i].iov_len))->lkey; 941 942 length -= rdma_req->req.iov[i].iov_len; 943 i++; 944 } 945 946 rdma_req->data_from_pool = true; 947 948 return 0; 949 950 nomem: 951 while (i) { 952 i--; 953 spdk_mempool_put(rtransport->data_buf_pool, rdma_req->req.iov[i].iov_base); 954 rdma_req->req.iov[i].iov_base = NULL; 955 rdma_req->req.iov[i].iov_len = 0; 956 957 rdma_req->data.wr.sg_list[i].addr = 0; 958 rdma_req->data.wr.sg_list[i].length = 0; 959 rdma_req->data.wr.sg_list[i].lkey = 0; 960 } 961 rdma_req->req.iovcnt = 0; 962 return -ENOMEM; 963 } 964 965 static int 966 spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport, 967 struct spdk_nvmf_rdma_device *device, 968 struct spdk_nvmf_rdma_request *rdma_req) 969 { 970 struct spdk_nvme_cmd *cmd; 971 struct spdk_nvme_cpl *rsp; 972 struct spdk_nvme_sgl_descriptor *sgl; 973 974 cmd = &rdma_req->req.cmd->nvme_cmd; 975 rsp = &rdma_req->req.rsp->nvme_cpl; 976 sgl = &cmd->dptr.sgl1; 977 978 if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK && 979 (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS || 980 sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) { 981 if (sgl->keyed.length > rtransport->max_io_size) { 982 SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", 983 sgl->keyed.length, rtransport->max_io_size); 984 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 985 return -1; 986 } 987 988 /* fill request length and populate iovs */ 989 rdma_req->req.length = sgl->keyed.length; 990 991 if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) { 992 /* No available buffers. Queue this request up. */ 993 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req); 994 return 0; 995 } 996 997 /* backward compatible */ 998 rdma_req->req.data = rdma_req->req.iov[0].iov_base; 999 1000 /* rdma wr specifics */ 1001 rdma_req->data.wr.num_sge = rdma_req->req.iovcnt; 1002 rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key; 1003 rdma_req->data.wr.wr.rdma.remote_addr = sgl->address; 1004 1005 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, 1006 rdma_req->req.iovcnt); 1007 1008 return 0; 1009 } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && 1010 sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { 1011 uint64_t offset = sgl->address; 1012 uint32_t max_len = rtransport->in_capsule_data_size; 1013 1014 SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", 1015 offset, sgl->unkeyed.length); 1016 1017 if (offset > max_len) { 1018 SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", 1019 offset, max_len); 1020 rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; 1021 return -1; 1022 } 1023 max_len -= (uint32_t)offset; 1024 1025 if (sgl->unkeyed.length > max_len) { 1026 SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", 1027 sgl->unkeyed.length, max_len); 1028 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 1029 return -1; 1030 } 1031 1032 rdma_req->req.data = rdma_req->recv->buf + offset; 1033 rdma_req->data_from_pool = false; 1034 rdma_req->req.length = sgl->unkeyed.length; 1035 1036 rdma_req->req.iov[0].iov_base = rdma_req->req.data; 1037 rdma_req->req.iov[0].iov_len = rdma_req->req.length; 1038 rdma_req->req.iovcnt = 1; 1039 1040 return 0; 1041 } 1042 1043 SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", 1044 sgl->generic.type, sgl->generic.subtype); 1045 rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; 1046 return -1; 1047 } 1048 1049 static bool 1050 spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, 1051 struct spdk_nvmf_rdma_request *rdma_req) 1052 { 1053 struct spdk_nvmf_rdma_qpair *rqpair; 1054 struct spdk_nvmf_rdma_device *device; 1055 struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; 1056 int rc; 1057 struct spdk_nvmf_rdma_recv *rdma_recv; 1058 enum spdk_nvmf_rdma_request_state prev_state; 1059 bool progress = false; 1060 1061 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 1062 device = rqpair->port->device; 1063 1064 assert(rdma_req->state != RDMA_REQUEST_STATE_FREE); 1065 1066 /* The loop here is to allow for several back-to-back state changes. */ 1067 do { 1068 prev_state = rdma_req->state; 1069 1070 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state); 1071 1072 switch (rdma_req->state) { 1073 case RDMA_REQUEST_STATE_FREE: 1074 /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW 1075 * to escape this state. */ 1076 break; 1077 case RDMA_REQUEST_STATE_NEW: 1078 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0, (uintptr_t)rdma_req, 0); 1079 1080 rqpair->cur_queue_depth++; 1081 rdma_recv = rdma_req->recv; 1082 1083 /* The first element of the SGL is the NVMe command */ 1084 rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr; 1085 memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp)); 1086 1087 TAILQ_REMOVE(&rqpair->incoming_queue, rdma_recv, link); 1088 TAILQ_REMOVE(&rqpair->free_queue, rdma_req, link); 1089 1090 /* The next state transition depends on the data transfer needs of this request. */ 1091 rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req); 1092 1093 /* If no data to transfer, ready to execute. */ 1094 if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { 1095 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 1096 break; 1097 } 1098 1099 rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER; 1100 TAILQ_INSERT_TAIL(&rqpair->ch->pending_data_buf_queue, rdma_req, link); 1101 break; 1102 case RDMA_REQUEST_STATE_NEED_BUFFER: 1103 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0, (uintptr_t)rdma_req, 0); 1104 1105 assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE); 1106 1107 if (rdma_req != TAILQ_FIRST(&rqpair->ch->pending_data_buf_queue)) { 1108 /* This request needs to wait in line to obtain a buffer */ 1109 break; 1110 } 1111 1112 /* Try to get a data buffer */ 1113 rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req); 1114 if (rc < 0) { 1115 TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); 1116 rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1117 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1118 break; 1119 } 1120 1121 if (!rdma_req->req.data) { 1122 /* No buffers available. */ 1123 break; 1124 } 1125 1126 TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); 1127 1128 /* If data is transferring from host to controller and the data didn't 1129 * arrive using in capsule data, we need to do a transfer from the host. 1130 */ 1131 if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool) { 1132 rdma_req->state = RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER; 1133 TAILQ_INSERT_TAIL(&rqpair->pending_rdma_rw_queue, rdma_req, link); 1134 break; 1135 } 1136 1137 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 1138 break; 1139 case RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER: 1140 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER, 0, 0, 1141 (uintptr_t)rdma_req, 0); 1142 1143 if (rdma_req != TAILQ_FIRST(&rqpair->pending_rdma_rw_queue)) { 1144 /* This request needs to wait in line to perform RDMA */ 1145 break; 1146 } 1147 1148 if (rqpair->cur_rdma_rw_depth < rqpair->max_rw_depth) { 1149 TAILQ_REMOVE(&rqpair->pending_rdma_rw_queue, rdma_req, link); 1150 rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER; 1151 rc = request_transfer_in(&rdma_req->req); 1152 if (rc) { 1153 rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1154 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1155 } 1156 } 1157 break; 1158 case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 1159 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, 1160 (uintptr_t)rdma_req, 0); 1161 /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE 1162 * to escape this state. */ 1163 break; 1164 case RDMA_REQUEST_STATE_READY_TO_EXECUTE: 1165 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, (uintptr_t)rdma_req, 0); 1166 rdma_req->state = RDMA_REQUEST_STATE_EXECUTING; 1167 spdk_nvmf_request_exec(&rdma_req->req); 1168 break; 1169 case RDMA_REQUEST_STATE_EXECUTING: 1170 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0, (uintptr_t)rdma_req, 0); 1171 /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED 1172 * to escape this state. */ 1173 break; 1174 case RDMA_REQUEST_STATE_EXECUTED: 1175 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0, (uintptr_t)rdma_req, 0); 1176 if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 1177 rdma_req->state = RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST; 1178 TAILQ_INSERT_TAIL(&rqpair->pending_rdma_rw_queue, rdma_req, link); 1179 } else { 1180 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1181 } 1182 break; 1183 case RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST: 1184 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST, 0, 0, 1185 (uintptr_t)rdma_req, 0); 1186 if (rdma_req != TAILQ_FIRST(&rqpair->pending_rdma_rw_queue)) { 1187 /* This request needs to wait in line to perform RDMA */ 1188 break; 1189 } 1190 1191 if (rqpair->cur_rdma_rw_depth < rqpair->max_rw_depth) { 1192 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1193 TAILQ_REMOVE(&rqpair->pending_rdma_rw_queue, rdma_req, link); 1194 } 1195 break; 1196 case RDMA_REQUEST_STATE_READY_TO_COMPLETE: 1197 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, (uintptr_t)rdma_req, 0); 1198 rdma_req->state = RDMA_REQUEST_STATE_COMPLETING; 1199 1200 rc = request_transfer_out(&rdma_req->req); 1201 assert(rc == 0); /* No good way to handle this currently */ 1202 break; 1203 case RDMA_REQUEST_STATE_COMPLETING: 1204 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, (uintptr_t)rdma_req, 0); 1205 /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 1206 * to escape this state. */ 1207 break; 1208 case RDMA_REQUEST_STATE_COMPLETED: 1209 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, (uintptr_t)rdma_req, 0); 1210 assert(rqpair->cur_queue_depth > 0); 1211 rqpair->cur_queue_depth--; 1212 1213 if (rdma_req->data_from_pool) { 1214 /* Put the buffer/s back in the pool */ 1215 for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) { 1216 spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]); 1217 rdma_req->req.iov[i].iov_base = NULL; 1218 rdma_req->data.buffers[i] = NULL; 1219 } 1220 rdma_req->data_from_pool = false; 1221 } 1222 rdma_req->req.length = 0; 1223 rdma_req->req.iovcnt = 0; 1224 rdma_req->req.data = NULL; 1225 rdma_req->state = RDMA_REQUEST_STATE_FREE; 1226 TAILQ_INSERT_TAIL(&rqpair->free_queue, rdma_req, link); 1227 break; 1228 } 1229 1230 if (rdma_req->state != prev_state) { 1231 progress = true; 1232 } 1233 } while (rdma_req->state != prev_state); 1234 1235 return progress; 1236 } 1237 1238 /* Public API callbacks begin here */ 1239 1240 static struct spdk_nvmf_transport * 1241 spdk_nvmf_rdma_create(struct spdk_nvmf_tgt *tgt) 1242 { 1243 int rc; 1244 struct spdk_nvmf_rdma_transport *rtransport; 1245 struct spdk_nvmf_rdma_device *device, *tmp; 1246 struct ibv_context **contexts; 1247 uint32_t i; 1248 int flag; 1249 uint32_t sge_count; 1250 1251 rtransport = calloc(1, sizeof(*rtransport)); 1252 if (!rtransport) { 1253 return NULL; 1254 } 1255 1256 pthread_mutex_init(&rtransport->lock, NULL); 1257 TAILQ_INIT(&rtransport->devices); 1258 TAILQ_INIT(&rtransport->ports); 1259 1260 rtransport->transport.tgt = tgt; 1261 rtransport->transport.ops = &spdk_nvmf_transport_rdma; 1262 1263 SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n"); 1264 1265 rtransport->max_queue_depth = tgt->opts.max_queue_depth; 1266 rtransport->max_io_size = tgt->opts.max_io_size; 1267 rtransport->io_unit_size = tgt->opts.io_unit_size; 1268 rtransport->in_capsule_data_size = tgt->opts.in_capsule_data_size; 1269 1270 /* I/O unit size cannot be larger than max I/O size */ 1271 if (rtransport->io_unit_size > rtransport->max_io_size) { 1272 rtransport->io_unit_size = rtransport->max_io_size; 1273 } 1274 1275 sge_count = rtransport->max_io_size / rtransport->io_unit_size; 1276 if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) { 1277 SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", rtransport->io_unit_size); 1278 free(rtransport); 1279 return NULL; 1280 } 1281 1282 rtransport->event_channel = rdma_create_event_channel(); 1283 if (rtransport->event_channel == NULL) { 1284 SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno)); 1285 free(rtransport); 1286 return NULL; 1287 } 1288 1289 flag = fcntl(rtransport->event_channel->fd, F_GETFL); 1290 if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { 1291 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", 1292 rtransport->event_channel->fd, spdk_strerror(errno)); 1293 free(rtransport); 1294 return NULL; 1295 } 1296 1297 rtransport->data_buf_pool = spdk_mempool_create("spdk_nvmf_rdma", 1298 rtransport->max_queue_depth * 4, /* The 4 is arbitrarily chosen. Needs to be configurable. */ 1299 rtransport->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT, 1300 SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, 1301 SPDK_ENV_SOCKET_ID_ANY); 1302 if (!rtransport->data_buf_pool) { 1303 SPDK_ERRLOG("Unable to allocate buffer pool for poll group\n"); 1304 free(rtransport); 1305 return NULL; 1306 } 1307 1308 spdk_io_device_register(rtransport, spdk_nvmf_rdma_mgmt_channel_create, 1309 spdk_nvmf_rdma_mgmt_channel_destroy, 1310 sizeof(struct spdk_nvmf_rdma_mgmt_channel)); 1311 1312 contexts = rdma_get_devices(NULL); 1313 i = 0; 1314 rc = 0; 1315 while (contexts[i] != NULL) { 1316 device = calloc(1, sizeof(*device)); 1317 if (!device) { 1318 SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); 1319 rc = -ENOMEM; 1320 break; 1321 } 1322 device->context = contexts[i]; 1323 rc = ibv_query_device(device->context, &device->attr); 1324 if (rc < 0) { 1325 SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); 1326 free(device); 1327 break; 1328 1329 } 1330 /* set up device context async ev fd as NON_BLOCKING */ 1331 flag = fcntl(device->context->async_fd, F_GETFL); 1332 rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); 1333 if (rc < 0) { 1334 SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); 1335 free(device); 1336 break; 1337 } 1338 1339 device->pd = NULL; 1340 device->map = NULL; 1341 1342 TAILQ_INSERT_TAIL(&rtransport->devices, device, link); 1343 i++; 1344 } 1345 1346 if (rc < 0) { 1347 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 1348 TAILQ_REMOVE(&rtransport->devices, device, link); 1349 free(device); 1350 } 1351 spdk_mempool_free(rtransport->data_buf_pool); 1352 rdma_destroy_event_channel(rtransport->event_channel); 1353 free(rtransport); 1354 rdma_free_devices(contexts); 1355 return NULL; 1356 } else { 1357 /* Set up poll descriptor array to monitor events from RDMA and IB 1358 * in a single poll syscall 1359 */ 1360 rtransport->npoll_fds = i + 1; 1361 i = 0; 1362 rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); 1363 rtransport->poll_fds[i].fd = rtransport->event_channel->fd; 1364 rtransport->poll_fds[i++].events = POLLIN; 1365 1366 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 1367 rtransport->poll_fds[i].fd = device->context->async_fd; 1368 rtransport->poll_fds[i++].events = POLLIN; 1369 } 1370 } 1371 1372 rdma_free_devices(contexts); 1373 1374 return &rtransport->transport; 1375 } 1376 1377 static int 1378 spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) 1379 { 1380 struct spdk_nvmf_rdma_transport *rtransport; 1381 struct spdk_nvmf_rdma_port *port, *port_tmp; 1382 struct spdk_nvmf_rdma_device *device, *device_tmp; 1383 1384 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1385 1386 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { 1387 TAILQ_REMOVE(&rtransport->ports, port, link); 1388 rdma_destroy_id(port->id); 1389 free(port); 1390 } 1391 1392 if (rtransport->poll_fds != NULL) { 1393 free(rtransport->poll_fds); 1394 } 1395 1396 if (rtransport->event_channel != NULL) { 1397 rdma_destroy_event_channel(rtransport->event_channel); 1398 } 1399 1400 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { 1401 TAILQ_REMOVE(&rtransport->devices, device, link); 1402 if (device->map) { 1403 spdk_mem_map_free(&device->map); 1404 } 1405 free(device); 1406 } 1407 1408 if (spdk_mempool_count(rtransport->data_buf_pool) != (rtransport->max_queue_depth * 4)) { 1409 SPDK_ERRLOG("transport buffer pool count is %zu but should be %u\n", 1410 spdk_mempool_count(rtransport->data_buf_pool), 1411 rtransport->max_queue_depth * 4); 1412 } 1413 1414 spdk_mempool_free(rtransport->data_buf_pool); 1415 spdk_io_device_unregister(rtransport, NULL); 1416 free(rtransport); 1417 1418 return 0; 1419 } 1420 1421 static int 1422 spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport, 1423 const struct spdk_nvme_transport_id *trid) 1424 { 1425 struct spdk_nvmf_rdma_transport *rtransport; 1426 struct spdk_nvmf_rdma_device *device; 1427 struct spdk_nvmf_rdma_port *port_tmp, *port; 1428 struct addrinfo *res; 1429 struct addrinfo hints; 1430 int family; 1431 int rc; 1432 1433 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1434 1435 port = calloc(1, sizeof(*port)); 1436 if (!port) { 1437 return -ENOMEM; 1438 } 1439 1440 /* Selectively copy the trid. Things like NQN don't matter here - that 1441 * mapping is enforced elsewhere. 1442 */ 1443 port->trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 1444 port->trid.adrfam = trid->adrfam; 1445 snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr); 1446 snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid); 1447 1448 pthread_mutex_lock(&rtransport->lock); 1449 assert(rtransport->event_channel != NULL); 1450 TAILQ_FOREACH(port_tmp, &rtransport->ports, link) { 1451 if (spdk_nvme_transport_id_compare(&port_tmp->trid, &port->trid) == 0) { 1452 port_tmp->ref++; 1453 free(port); 1454 /* Already listening at this address */ 1455 pthread_mutex_unlock(&rtransport->lock); 1456 return 0; 1457 } 1458 } 1459 1460 rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); 1461 if (rc < 0) { 1462 SPDK_ERRLOG("rdma_create_id() failed\n"); 1463 free(port); 1464 pthread_mutex_unlock(&rtransport->lock); 1465 return rc; 1466 } 1467 1468 switch (port->trid.adrfam) { 1469 case SPDK_NVMF_ADRFAM_IPV4: 1470 family = AF_INET; 1471 break; 1472 case SPDK_NVMF_ADRFAM_IPV6: 1473 family = AF_INET6; 1474 break; 1475 default: 1476 SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam); 1477 free(port); 1478 pthread_mutex_unlock(&rtransport->lock); 1479 return -EINVAL; 1480 } 1481 1482 memset(&hints, 0, sizeof(hints)); 1483 hints.ai_family = family; 1484 hints.ai_socktype = SOCK_STREAM; 1485 hints.ai_protocol = 0; 1486 1487 rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res); 1488 if (rc) { 1489 SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); 1490 free(port); 1491 pthread_mutex_unlock(&rtransport->lock); 1492 return -EINVAL; 1493 } 1494 1495 rc = rdma_bind_addr(port->id, res->ai_addr); 1496 freeaddrinfo(res); 1497 1498 if (rc < 0) { 1499 SPDK_ERRLOG("rdma_bind_addr() failed\n"); 1500 rdma_destroy_id(port->id); 1501 free(port); 1502 pthread_mutex_unlock(&rtransport->lock); 1503 return rc; 1504 } 1505 1506 if (!port->id->verbs) { 1507 SPDK_ERRLOG("ibv_context is null\n"); 1508 rdma_destroy_id(port->id); 1509 free(port); 1510 pthread_mutex_unlock(&rtransport->lock); 1511 return -1; 1512 } 1513 1514 rc = rdma_listen(port->id, 10); /* 10 = backlog */ 1515 if (rc < 0) { 1516 SPDK_ERRLOG("rdma_listen() failed\n"); 1517 rdma_destroy_id(port->id); 1518 free(port); 1519 pthread_mutex_unlock(&rtransport->lock); 1520 return rc; 1521 } 1522 1523 TAILQ_FOREACH(device, &rtransport->devices, link) { 1524 if (device->context == port->id->verbs) { 1525 port->device = device; 1526 break; 1527 } 1528 } 1529 if (!port->device) { 1530 SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", 1531 port->id->verbs); 1532 rdma_destroy_id(port->id); 1533 free(port); 1534 pthread_mutex_unlock(&rtransport->lock); 1535 return -EINVAL; 1536 } 1537 1538 if (!device->map) { 1539 device->pd = port->id->pd; 1540 device->map = spdk_mem_map_alloc(0, spdk_nvmf_rdma_mem_notify, device); 1541 if (!device->map) { 1542 SPDK_ERRLOG("Unable to allocate memory map for new poll group\n"); 1543 return -1; 1544 } 1545 } else { 1546 assert(device->pd == port->id->pd); 1547 } 1548 1549 SPDK_INFOLOG(SPDK_LOG_RDMA, "*** NVMf Target Listening on %s port %d ***\n", 1550 port->trid.traddr, ntohs(rdma_get_src_port(port->id))); 1551 1552 port->ref = 1; 1553 1554 TAILQ_INSERT_TAIL(&rtransport->ports, port, link); 1555 pthread_mutex_unlock(&rtransport->lock); 1556 1557 return 0; 1558 } 1559 1560 static int 1561 spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, 1562 const struct spdk_nvme_transport_id *_trid) 1563 { 1564 struct spdk_nvmf_rdma_transport *rtransport; 1565 struct spdk_nvmf_rdma_port *port, *tmp; 1566 struct spdk_nvme_transport_id trid = {}; 1567 1568 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1569 1570 /* Selectively copy the trid. Things like NQN don't matter here - that 1571 * mapping is enforced elsewhere. 1572 */ 1573 trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 1574 trid.adrfam = _trid->adrfam; 1575 snprintf(trid.traddr, sizeof(port->trid.traddr), "%s", _trid->traddr); 1576 snprintf(trid.trsvcid, sizeof(port->trid.trsvcid), "%s", _trid->trsvcid); 1577 1578 pthread_mutex_lock(&rtransport->lock); 1579 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { 1580 if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) { 1581 assert(port->ref > 0); 1582 port->ref--; 1583 if (port->ref == 0) { 1584 TAILQ_REMOVE(&rtransport->ports, port, link); 1585 rdma_destroy_id(port->id); 1586 free(port); 1587 } 1588 break; 1589 } 1590 } 1591 1592 pthread_mutex_unlock(&rtransport->lock); 1593 return 0; 1594 } 1595 1596 static void 1597 spdk_nvmf_process_cm_event(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 1598 { 1599 struct spdk_nvmf_rdma_transport *rtransport; 1600 struct rdma_cm_event *event; 1601 int rc; 1602 1603 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1604 1605 if (rtransport->event_channel == NULL) { 1606 return; 1607 } 1608 1609 while (1) { 1610 rc = rdma_get_cm_event(rtransport->event_channel, &event); 1611 if (rc == 0) { 1612 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); 1613 1614 switch (event->event) { 1615 case RDMA_CM_EVENT_ADDR_RESOLVED: 1616 case RDMA_CM_EVENT_ADDR_ERROR: 1617 case RDMA_CM_EVENT_ROUTE_RESOLVED: 1618 case RDMA_CM_EVENT_ROUTE_ERROR: 1619 /* No action required. The target never attempts to resolve routes. */ 1620 break; 1621 case RDMA_CM_EVENT_CONNECT_REQUEST: 1622 rc = nvmf_rdma_connect(transport, event, cb_fn); 1623 if (rc < 0) { 1624 SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); 1625 break; 1626 } 1627 break; 1628 case RDMA_CM_EVENT_CONNECT_RESPONSE: 1629 /* The target never initiates a new connection. So this will not occur. */ 1630 break; 1631 case RDMA_CM_EVENT_CONNECT_ERROR: 1632 /* Can this happen? The docs say it can, but not sure what causes it. */ 1633 break; 1634 case RDMA_CM_EVENT_UNREACHABLE: 1635 case RDMA_CM_EVENT_REJECTED: 1636 /* These only occur on the client side. */ 1637 break; 1638 case RDMA_CM_EVENT_ESTABLISHED: 1639 /* TODO: Should we be waiting for this event anywhere? */ 1640 break; 1641 case RDMA_CM_EVENT_DISCONNECTED: 1642 case RDMA_CM_EVENT_DEVICE_REMOVAL: 1643 rc = nvmf_rdma_disconnect(event); 1644 if (rc < 0) { 1645 SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); 1646 break; 1647 } 1648 continue; 1649 case RDMA_CM_EVENT_MULTICAST_JOIN: 1650 case RDMA_CM_EVENT_MULTICAST_ERROR: 1651 /* Multicast is not used */ 1652 break; 1653 case RDMA_CM_EVENT_ADDR_CHANGE: 1654 /* Not utilizing this event */ 1655 break; 1656 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 1657 /* For now, do nothing. The target never re-uses queue pairs. */ 1658 break; 1659 default: 1660 SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); 1661 break; 1662 } 1663 1664 rdma_ack_cm_event(event); 1665 } else { 1666 if (errno != EAGAIN && errno != EWOULDBLOCK) { 1667 SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); 1668 } 1669 break; 1670 } 1671 } 1672 } 1673 1674 static void 1675 spdk_nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) 1676 { 1677 int rc; 1678 struct ibv_async_event event; 1679 1680 rc = ibv_get_async_event(device->context, &event); 1681 1682 if (rc) { 1683 SPDK_ERRLOG("Failed to get async_event (%d): %s\n", 1684 errno, spdk_strerror(errno)); 1685 return; 1686 } 1687 1688 SPDK_NOTICELOG("Async event: %s\n", 1689 ibv_event_type_str(event.event_type)); 1690 ibv_ack_async_event(&event); 1691 } 1692 1693 static void 1694 spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 1695 { 1696 int nfds, i = 0; 1697 struct spdk_nvmf_rdma_transport *rtransport; 1698 struct spdk_nvmf_rdma_device *device, *tmp; 1699 1700 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1701 nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); 1702 1703 if (nfds <= 0) { 1704 return; 1705 } 1706 1707 /* The first poll descriptor is RDMA CM event */ 1708 if (rtransport->poll_fds[i++].revents & POLLIN) { 1709 spdk_nvmf_process_cm_event(transport, cb_fn); 1710 nfds--; 1711 } 1712 1713 if (nfds == 0) { 1714 return; 1715 } 1716 1717 /* Second and subsequent poll descriptors are IB async events */ 1718 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 1719 if (rtransport->poll_fds[i++].revents & POLLIN) { 1720 spdk_nvmf_process_ib_event(device); 1721 nfds--; 1722 } 1723 } 1724 /* check all flagged fd's have been served */ 1725 assert(nfds == 0); 1726 } 1727 1728 static void 1729 spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport, 1730 struct spdk_nvme_transport_id *trid, 1731 struct spdk_nvmf_discovery_log_page_entry *entry) 1732 { 1733 entry->trtype = SPDK_NVMF_TRTYPE_RDMA; 1734 entry->adrfam = trid->adrfam; 1735 entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED; 1736 1737 spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); 1738 spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); 1739 1740 entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; 1741 entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; 1742 entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; 1743 } 1744 1745 static struct spdk_nvmf_transport_poll_group * 1746 spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) 1747 { 1748 struct spdk_nvmf_rdma_transport *rtransport; 1749 struct spdk_nvmf_rdma_poll_group *rgroup; 1750 struct spdk_nvmf_rdma_poller *poller; 1751 struct spdk_nvmf_rdma_device *device; 1752 1753 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1754 1755 rgroup = calloc(1, sizeof(*rgroup)); 1756 if (!rgroup) { 1757 return NULL; 1758 } 1759 1760 TAILQ_INIT(&rgroup->pollers); 1761 1762 pthread_mutex_lock(&rtransport->lock); 1763 TAILQ_FOREACH(device, &rtransport->devices, link) { 1764 if (device->map == NULL) { 1765 /* 1766 * The device is not in use (no listeners), 1767 * so no protection domain has been constructed. 1768 * Skip it. 1769 */ 1770 SPDK_NOTICELOG("Skipping unused RDMA device when creating poll group.\n"); 1771 continue; 1772 } 1773 1774 poller = calloc(1, sizeof(*poller)); 1775 if (!poller) { 1776 SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); 1777 free(rgroup); 1778 pthread_mutex_unlock(&rtransport->lock); 1779 return NULL; 1780 } 1781 1782 poller->device = device; 1783 poller->group = rgroup; 1784 1785 TAILQ_INIT(&poller->qpairs); 1786 1787 poller->cq = ibv_create_cq(device->context, NVMF_RDMA_CQ_SIZE, poller, NULL, 0); 1788 if (!poller->cq) { 1789 SPDK_ERRLOG("Unable to create completion queue\n"); 1790 free(poller); 1791 free(rgroup); 1792 pthread_mutex_unlock(&rtransport->lock); 1793 return NULL; 1794 } 1795 1796 TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); 1797 } 1798 1799 pthread_mutex_unlock(&rtransport->lock); 1800 return &rgroup->group; 1801 } 1802 1803 static void 1804 spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 1805 { 1806 struct spdk_nvmf_rdma_poll_group *rgroup; 1807 struct spdk_nvmf_rdma_poller *poller, *tmp; 1808 1809 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 1810 1811 if (!rgroup) { 1812 return; 1813 } 1814 1815 TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { 1816 TAILQ_REMOVE(&rgroup->pollers, poller, link); 1817 1818 if (poller->cq) { 1819 ibv_destroy_cq(poller->cq); 1820 } 1821 1822 free(poller); 1823 } 1824 1825 free(rgroup); 1826 } 1827 1828 static int 1829 spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 1830 struct spdk_nvmf_qpair *qpair) 1831 { 1832 struct spdk_nvmf_rdma_transport *rtransport; 1833 struct spdk_nvmf_rdma_poll_group *rgroup; 1834 struct spdk_nvmf_rdma_qpair *rqpair; 1835 struct spdk_nvmf_rdma_device *device; 1836 struct spdk_nvmf_rdma_poller *poller; 1837 int rc; 1838 1839 rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 1840 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 1841 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 1842 1843 device = rqpair->port->device; 1844 1845 if (device->pd != rqpair->cm_id->pd) { 1846 SPDK_ERRLOG("Mismatched protection domains\n"); 1847 return -1; 1848 } 1849 1850 TAILQ_FOREACH(poller, &rgroup->pollers, link) { 1851 if (poller->device == device) { 1852 break; 1853 } 1854 } 1855 1856 if (!poller) { 1857 SPDK_ERRLOG("No poller found for device.\n"); 1858 return -1; 1859 } 1860 1861 TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); 1862 rqpair->poller = poller; 1863 1864 rc = spdk_nvmf_rdma_qpair_initialize(qpair); 1865 if (rc < 0) { 1866 SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); 1867 return -1; 1868 } 1869 1870 rqpair->mgmt_channel = spdk_get_io_channel(rtransport); 1871 if (!rqpair->mgmt_channel) { 1872 spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 1873 spdk_nvmf_rdma_qpair_destroy(rqpair); 1874 return -1; 1875 } 1876 1877 rqpair->ch = spdk_io_channel_get_ctx(rqpair->mgmt_channel); 1878 assert(rqpair->ch != NULL); 1879 1880 rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair); 1881 if (rc) { 1882 /* Try to reject, but we probably can't */ 1883 spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 1884 spdk_nvmf_rdma_qpair_destroy(rqpair); 1885 return -1; 1886 } 1887 1888 return 0; 1889 } 1890 1891 static int 1892 spdk_nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 1893 struct spdk_nvmf_qpair *qpair) 1894 { 1895 struct spdk_nvmf_rdma_poll_group *rgroup; 1896 struct spdk_nvmf_rdma_qpair *rqpair; 1897 struct spdk_nvmf_rdma_device *device; 1898 struct spdk_nvmf_rdma_poller *poller; 1899 struct spdk_nvmf_rdma_qpair *rq, *trq; 1900 1901 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 1902 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 1903 1904 device = rqpair->port->device; 1905 1906 TAILQ_FOREACH(poller, &rgroup->pollers, link) { 1907 if (poller->device == device) { 1908 break; 1909 } 1910 } 1911 1912 if (!poller) { 1913 SPDK_ERRLOG("No poller found for device.\n"); 1914 return -1; 1915 } 1916 1917 TAILQ_FOREACH_SAFE(rq, &poller->qpairs, link, trq) { 1918 if (rq == rqpair) { 1919 TAILQ_REMOVE(&poller->qpairs, rqpair, link); 1920 rqpair->poller = NULL; 1921 break; 1922 } 1923 } 1924 1925 if (rq == NULL) { 1926 SPDK_ERRLOG("RDMA qpair cannot be removed from group (not in group).\n"); 1927 return -1; 1928 } 1929 1930 return 0; 1931 } 1932 1933 static int 1934 spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req) 1935 { 1936 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 1937 struct spdk_nvmf_rdma_transport, transport); 1938 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 1939 1940 rdma_req->state = RDMA_REQUEST_STATE_EXECUTED; 1941 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 1942 1943 return 0; 1944 } 1945 1946 static void 1947 spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) 1948 { 1949 spdk_nvmf_rdma_qpair_destroy(SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair)); 1950 } 1951 1952 static void 1953 spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, 1954 struct spdk_nvmf_rdma_qpair *rqpair) 1955 { 1956 struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp; 1957 struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; 1958 1959 /* We process I/O in the pending_rdma_rw queue at the highest priority. */ 1960 TAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_rw_queue, link, req_tmp) { 1961 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 1962 break; 1963 } 1964 } 1965 1966 /* The second highest priority is I/O waiting on memory buffers. */ 1967 TAILQ_FOREACH_SAFE(rdma_req, &rqpair->ch->pending_data_buf_queue, link, req_tmp) { 1968 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 1969 break; 1970 } 1971 } 1972 1973 /* The lowest priority is processing newly received commands */ 1974 TAILQ_FOREACH_SAFE(rdma_recv, &rqpair->incoming_queue, link, recv_tmp) { 1975 rdma_req = TAILQ_FIRST(&rqpair->free_queue); 1976 if (rdma_req == NULL) { 1977 /* Need to wait for more SEND completions */ 1978 break; 1979 } 1980 1981 rdma_req->recv = rdma_recv; 1982 rdma_req->state = RDMA_REQUEST_STATE_NEW; 1983 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 1984 break; 1985 } 1986 } 1987 } 1988 1989 static struct spdk_nvmf_rdma_request * 1990 get_rdma_req_from_wc(struct ibv_wc *wc) 1991 { 1992 struct spdk_nvmf_rdma_request *rdma_req; 1993 1994 rdma_req = (struct spdk_nvmf_rdma_request *)wc->wr_id; 1995 assert(rdma_req != NULL); 1996 1997 #ifdef DEBUG 1998 struct spdk_nvmf_rdma_qpair *rqpair; 1999 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2000 2001 assert(rdma_req - rqpair->reqs >= 0); 2002 assert(rdma_req - rqpair->reqs < (ptrdiff_t)rqpair->max_queue_depth); 2003 #endif 2004 2005 return rdma_req; 2006 } 2007 2008 static struct spdk_nvmf_rdma_recv * 2009 get_rdma_recv_from_wc(struct ibv_wc *wc) 2010 { 2011 struct spdk_nvmf_rdma_recv *rdma_recv; 2012 2013 assert(wc->byte_len >= sizeof(struct spdk_nvmf_capsule_cmd)); 2014 2015 rdma_recv = (struct spdk_nvmf_rdma_recv *)wc->wr_id; 2016 assert(rdma_recv != NULL); 2017 2018 #ifdef DEBUG 2019 struct spdk_nvmf_rdma_qpair *rqpair = rdma_recv->qpair; 2020 2021 assert(rdma_recv - rqpair->recvs >= 0); 2022 assert(rdma_recv - rqpair->recvs < (ptrdiff_t)rqpair->max_queue_depth); 2023 #endif 2024 2025 return rdma_recv; 2026 } 2027 2028 static int 2029 spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, 2030 struct spdk_nvmf_rdma_poller *rpoller) 2031 { 2032 struct ibv_wc wc[32]; 2033 struct spdk_nvmf_rdma_request *rdma_req; 2034 struct spdk_nvmf_rdma_recv *rdma_recv; 2035 struct spdk_nvmf_rdma_qpair *rqpair; 2036 int reaped, i; 2037 int count = 0; 2038 bool error = false; 2039 2040 /* Poll for completing operations. */ 2041 reaped = ibv_poll_cq(rpoller->cq, 32, wc); 2042 if (reaped < 0) { 2043 SPDK_ERRLOG("Error polling CQ! (%d): %s\n", 2044 errno, spdk_strerror(errno)); 2045 return -1; 2046 } 2047 2048 for (i = 0; i < reaped; i++) { 2049 if (wc[i].status) { 2050 SPDK_ERRLOG("CQ error on CQ %p, Request 0x%lu (%d): %s\n", 2051 rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); 2052 error = true; 2053 continue; 2054 } 2055 2056 switch (wc[i].opcode) { 2057 case IBV_WC_SEND: 2058 rdma_req = get_rdma_req_from_wc(&wc[i]); 2059 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2060 2061 assert(rdma_req->state == RDMA_REQUEST_STATE_COMPLETING); 2062 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2063 2064 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2065 2066 count++; 2067 2068 /* Try to process other queued requests */ 2069 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 2070 break; 2071 2072 case IBV_WC_RDMA_WRITE: 2073 rdma_req = get_rdma_req_from_wc(&wc[i]); 2074 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2075 2076 rqpair->cur_rdma_rw_depth--; 2077 2078 /* Try to process other queued requests */ 2079 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 2080 break; 2081 2082 case IBV_WC_RDMA_READ: 2083 rdma_req = get_rdma_req_from_wc(&wc[i]); 2084 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2085 2086 assert(rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); 2087 rqpair->cur_rdma_rw_depth--; 2088 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 2089 2090 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2091 2092 /* Try to process other queued requests */ 2093 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 2094 break; 2095 2096 case IBV_WC_RECV: 2097 rdma_recv = get_rdma_recv_from_wc(&wc[i]); 2098 rqpair = rdma_recv->qpair; 2099 2100 TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link); 2101 2102 /* Try to process other queued requests */ 2103 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 2104 break; 2105 2106 default: 2107 SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); 2108 continue; 2109 } 2110 } 2111 2112 if (error == true) { 2113 return -1; 2114 } 2115 2116 return count; 2117 } 2118 2119 static int 2120 spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 2121 { 2122 struct spdk_nvmf_rdma_transport *rtransport; 2123 struct spdk_nvmf_rdma_poll_group *rgroup; 2124 struct spdk_nvmf_rdma_poller *rpoller; 2125 int count, rc; 2126 2127 rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); 2128 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 2129 2130 count = 0; 2131 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 2132 rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller); 2133 if (rc < 0) { 2134 return rc; 2135 } 2136 count += rc; 2137 } 2138 2139 return count; 2140 } 2141 2142 static bool 2143 spdk_nvmf_rdma_qpair_is_idle(struct spdk_nvmf_qpair *qpair) 2144 { 2145 struct spdk_nvmf_rdma_qpair *rqpair; 2146 2147 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2148 2149 if (rqpair->cur_queue_depth == 0 && rqpair->cur_rdma_rw_depth == 0) { 2150 return true; 2151 } 2152 return false; 2153 } 2154 2155 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { 2156 .type = SPDK_NVME_TRANSPORT_RDMA, 2157 .create = spdk_nvmf_rdma_create, 2158 .destroy = spdk_nvmf_rdma_destroy, 2159 2160 .listen = spdk_nvmf_rdma_listen, 2161 .stop_listen = spdk_nvmf_rdma_stop_listen, 2162 .accept = spdk_nvmf_rdma_accept, 2163 2164 .listener_discover = spdk_nvmf_rdma_discover, 2165 2166 .poll_group_create = spdk_nvmf_rdma_poll_group_create, 2167 .poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy, 2168 .poll_group_add = spdk_nvmf_rdma_poll_group_add, 2169 .poll_group_remove = spdk_nvmf_rdma_poll_group_remove, 2170 .poll_group_poll = spdk_nvmf_rdma_poll_group_poll, 2171 2172 .req_complete = spdk_nvmf_rdma_request_complete, 2173 2174 .qpair_fini = spdk_nvmf_rdma_close_qpair, 2175 .qpair_is_idle = spdk_nvmf_rdma_qpair_is_idle, 2176 2177 }; 2178 2179 SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA) 2180