/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2018 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

#include "nvmf_internal.h"
#include "transport.h"

#include "spdk/config.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf.h"
#include "spdk/nvmf_spec.h"
#include "spdk/string.h"
#include "spdk/trace.h"
#include "spdk/util.h"

#include "spdk_internal/log.h"

struct spdk_nvme_rdma_hooks g_nvmf_hooks = {};

/*
 RDMA Connection Resource Defaults
 */
#define NVMF_DEFAULT_TX_SGE		SPDK_NVMF_MAX_SGL_ENTRIES
#define NVMF_DEFAULT_RSP_SGE		1
#define NVMF_DEFAULT_RX_SGE		2

/* The RDMA completion queue size */
#define DEFAULT_NVMF_RDMA_CQ_SIZE	4096
#define MAX_WR_PER_QP(queue_depth)	(queue_depth * 3 + 2)

/* Timeout for destroying defunct rqpairs */
#define NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US	4000000

enum spdk_nvmf_rdma_request_state {
	/* The request is not currently in use */
	RDMA_REQUEST_STATE_FREE = 0,

	/* Initial state when request first received */
	RDMA_REQUEST_STATE_NEW,

	/* The request is queued until a data buffer is available. */
	RDMA_REQUEST_STATE_NEED_BUFFER,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the host to the controller.
	 */
	RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,

	/* The request is currently transferring data from the host to the controller.
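	 * This is implemented as one or more RDMA READ operations issued by the target.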
	 */
	RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,

	/* The request is ready to execute at the block device */
	RDMA_REQUEST_STATE_READY_TO_EXECUTE,

	/* The request is currently executing at the block device */
	RDMA_REQUEST_STATE_EXECUTING,

	/* The request finished executing at the block device */
	RDMA_REQUEST_STATE_EXECUTED,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the controller to the host.
	 */
	RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,

	/* The request is ready to send a completion */
	RDMA_REQUEST_STATE_READY_TO_COMPLETE,

	/* The request is currently transferring data from the controller to the host. */
	RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,

	/* The request currently has an outstanding completion without an
	 * associated data transfer.
	 */
	RDMA_REQUEST_STATE_COMPLETING,

	/* The request completed and can be marked free. */
	RDMA_REQUEST_STATE_COMPLETED,

	/* Terminator */
	RDMA_REQUEST_NUM_STATES,
};

#define OBJECT_NVMF_RDMA_IO	0x40

#define TRACE_GROUP_NVMF_RDMA	0x4
#define TRACE_RDMA_REQUEST_STATE_NEW					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0)
#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1)
#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE			SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4)
#define TRACE_RDMA_REQUEST_STATE_EXECUTING				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5)
#define TRACE_RDMA_REQUEST_STATE_EXECUTED				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6)
#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING		SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE			SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9)
#define TRACE_RDMA_REQUEST_STATE_COMPLETING				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA)
#define TRACE_RDMA_REQUEST_STATE_COMPLETED				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xB)
#define TRACE_RDMA_QP_CREATE						SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xC)
#define TRACE_RDMA_IBV_ASYNC_EVENT					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xD)
#define TRACE_RDMA_CM_ASYNC_EVENT					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xE)
#define TRACE_RDMA_QP_STATE_CHANGE					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xF)
#define TRACE_RDMA_QP_DISCONNECT					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x10)
#define TRACE_RDMA_QP_DESTROY						SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x11)

SPDK_TRACE_REGISTER_FN(nvmf_trace, "nvmf_rdma", TRACE_GROUP_NVMF_RDMA)
{
	spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r');
	spdk_trace_register_description("RDMA_REQ_NEW", "",
					TRACE_RDMA_REQUEST_STATE_NEW,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", "",
					TRACE_RDMA_REQUEST_STATE_NEED_BUFFER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_C_TO_H", "",
					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_H_TO_C", "",
					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_H_TO_C", "",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE", "",
					TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_EXECUTING", "",
					TRACE_RDMA_REQUEST_STATE_EXECUTING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_EXECUTED", "",
					TRACE_RDMA_REQUEST_STATE_EXECUTED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPLETE", "",
					TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETING_CONTROLLER_TO_HOST", "",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETING_INCAPSULE", "",
					TRACE_RDMA_REQUEST_STATE_COMPLETING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETED", "",
					TRACE_RDMA_REQUEST_STATE_COMPLETED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");

	spdk_trace_register_description("RDMA_QP_CREATE", "", TRACE_RDMA_QP_CREATE,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
	spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", "", TRACE_RDMA_IBV_ASYNC_EVENT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
	spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", "", TRACE_RDMA_CM_ASYNC_EVENT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
	spdk_trace_register_description("RDMA_QP_STATE_CHANGE", "", TRACE_RDMA_QP_STATE_CHANGE,
					OWNER_NONE, OBJECT_NONE, 0, 1, "state: ");
	spdk_trace_register_description("RDMA_QP_DISCONNECT", "", TRACE_RDMA_QP_DISCONNECT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
	spdk_trace_register_description("RDMA_QP_DESTROY", "", TRACE_RDMA_QP_DESTROY,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
}

enum spdk_nvmf_rdma_wr_type {
	RDMA_WR_TYPE_RECV,
	RDMA_WR_TYPE_SEND,
	RDMA_WR_TYPE_DATA,
};

struct spdk_nvmf_rdma_wr {
	enum spdk_nvmf_rdma_wr_type	type;
};

/* This structure holds commands as they are received off the wire.
 * It must be dynamically paired with a full request object
 * (spdk_nvmf_rdma_request) to service a request. It is separate
 * from the request because RDMA does not appear to order
 * completions, so occasionally we'll get a new incoming
 * command when there aren't any free request objects.
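 * Such receives are parked on resources->incoming_queue until a free request object is available.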
 */
struct spdk_nvmf_rdma_recv {
	struct ibv_recv_wr			wr;
	struct ibv_sge				sgl[NVMF_DEFAULT_RX_SGE];

	struct spdk_nvmf_rdma_qpair		*qpair;

	/* In-capsule data buffer */
	uint8_t					*buf;

	struct spdk_nvmf_rdma_wr		rdma_wr;

	STAILQ_ENTRY(spdk_nvmf_rdma_recv)	link;
};

struct spdk_nvmf_rdma_request_data {
	struct spdk_nvmf_rdma_wr	rdma_wr;
	struct ibv_send_wr		wr;
	struct ibv_sge			sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
	void				*buffers[SPDK_NVMF_MAX_SGL_ENTRIES];
};

struct spdk_nvmf_rdma_request {
	struct spdk_nvmf_request		req;
	bool					data_from_pool;

	enum spdk_nvmf_rdma_request_state	state;

	struct spdk_nvmf_rdma_recv		*recv;

	struct {
		struct spdk_nvmf_rdma_wr	rdma_wr;
		struct ibv_send_wr		wr;
		struct ibv_sge			sgl[NVMF_DEFAULT_RSP_SGE];
	} rsp;

	struct spdk_nvmf_rdma_request_data	data;

	uint32_t				num_outstanding_data_wr;

	TAILQ_ENTRY(spdk_nvmf_rdma_request)	link;
	STAILQ_ENTRY(spdk_nvmf_rdma_request)	state_link;
};

enum spdk_nvmf_rdma_qpair_disconnect_flags {
	RDMA_QP_DISCONNECTING	= 1,
	RDMA_QP_RECV_DRAINED	= 1 << 1,
	RDMA_QP_SEND_DRAINED	= 1 << 2
};

struct spdk_nvmf_rdma_resource_opts {
	struct spdk_nvmf_rdma_qpair	*qpair;
	/* qp points either to an ibv_qp object or an ibv_srq object depending on the value of shared. */
	void				*qp;
	struct ibv_pd			*pd;
	uint32_t			max_queue_depth;
	uint32_t			in_capsule_data_size;
	bool				shared;
};

struct spdk_nvmf_rdma_resources {
	/* Array of size "max_queue_depth" containing RDMA requests. */
	struct spdk_nvmf_rdma_request	*reqs;

	/* Array of size "max_queue_depth" containing RDMA recvs. */
	struct spdk_nvmf_rdma_recv	*recvs;

	/* Array of size "max_queue_depth" containing 64 byte capsules
	 * used for receive.
	 */
	union nvmf_h2c_msg		*cmds;
	struct ibv_mr			*cmds_mr;

	/* Array of size "max_queue_depth" containing 16 byte completions
	 * to be sent back to the user.
	 */
	union nvmf_c2h_msg		*cpls;
	struct ibv_mr			*cpls_mr;

	/* Array of size "max_queue_depth * InCapsuleDataSize" containing
	 * buffers to be used for in capsule data.
	 */
	void				*bufs;
	struct ibv_mr			*bufs_mr;

	/* Receives that are waiting for a request object */
	STAILQ_HEAD(, spdk_nvmf_rdma_recv)	incoming_queue;

	/* Queue to track free requests */
	STAILQ_HEAD(, spdk_nvmf_rdma_request)	free_queue;
};

struct spdk_nvmf_rdma_qpair {
	struct spdk_nvmf_qpair			qpair;

	struct spdk_nvmf_rdma_port		*port;
	struct spdk_nvmf_rdma_poller		*poller;

	struct rdma_cm_id			*cm_id;
	struct ibv_srq				*srq;
	struct rdma_cm_id			*listen_id;

	/* The maximum number of I/O outstanding on this connection at one time */
	uint16_t				max_queue_depth;

	/* The maximum number of active RDMA READ and ATOMIC operations at one time */
	uint16_t				max_read_depth;

	/* The maximum number of RDMA SEND operations at one time */
	uint32_t				max_send_depth;

	/* The current number of outstanding WRs from this qpair's
	 * recv queue. Should not exceed device->attr.max_queue_depth.
	 */
	uint16_t				current_recv_depth;

	/* The current number of posted WRs from this qpair's
	 * send queue. Should not exceed max_send_depth.
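	 * Incremented as data and response WRs are posted (see request_transfer_in()/request_transfer_out()).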
	 */
	uint32_t				current_send_depth;

	/* The current number of active RDMA READ operations */
	uint16_t				current_read_depth;

	/* The maximum number of SGEs per WR on the send queue */
	uint32_t				max_send_sge;

	/* The maximum number of SGEs per WR on the recv queue */
	uint32_t				max_recv_sge;

	struct spdk_nvmf_rdma_resources		*resources;

	STAILQ_HEAD(, spdk_nvmf_rdma_request)	pending_rdma_read_queue;

	STAILQ_HEAD(, spdk_nvmf_rdma_request)	pending_rdma_write_queue;

	/* Number of requests not in the free state */
	uint32_t				qd;

	TAILQ_ENTRY(spdk_nvmf_rdma_qpair)	link;

	/* IBV queue pair attributes: they are used to manage
	 * qp state and recover from errors.
	 */
	struct ibv_qp_attr			ibv_attr;

	uint32_t				disconnect_flags;

	/* Poller registered in case the qpair doesn't properly
	 * complete the qpair destruct process and becomes defunct.
	 */

	struct spdk_poller			*destruct_poller;

	/* There are several ways a disconnect can start on a qpair
	 * and they are not all mutually exclusive. It is important
	 * that we only initiate one of these paths.
	 */
	bool					disconnect_started;
	/* Lets us know that we have received the last_wqe event. */
	bool					last_wqe_reached;
};

struct spdk_nvmf_rdma_poller {
	struct spdk_nvmf_rdma_device		*device;
	struct spdk_nvmf_rdma_poll_group	*group;

	int					num_cqe;
	int					required_num_wr;
	struct ibv_cq				*cq;

	/* The maximum number of I/O outstanding on the shared receive queue at one time */
	uint16_t				max_srq_depth;

	/* Shared receive queue */
	struct ibv_srq				*srq;

	struct spdk_nvmf_rdma_resources		*resources;

	TAILQ_HEAD(, spdk_nvmf_rdma_qpair)	qpairs;

	TAILQ_ENTRY(spdk_nvmf_rdma_poller)	link;
};

struct spdk_nvmf_rdma_poll_group {
	struct spdk_nvmf_transport_poll_group	group;

	/* Requests that are waiting to obtain a data buffer */
	TAILQ_HEAD(, spdk_nvmf_rdma_request)	pending_data_buf_queue;

	TAILQ_HEAD(, spdk_nvmf_rdma_poller)	pollers;
};

/* Assuming rdma_cm uses just one protection domain per ibv_context.
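 * A single pd and memory map can therefore be cached per device in the structure below.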
 */
struct spdk_nvmf_rdma_device {
	struct ibv_device_attr			attr;
	struct ibv_context			*context;

	struct spdk_mem_map			*map;
	struct ibv_pd				*pd;

	TAILQ_ENTRY(spdk_nvmf_rdma_device)	link;
};

struct spdk_nvmf_rdma_port {
	struct spdk_nvme_transport_id		trid;
	struct rdma_cm_id			*id;
	struct spdk_nvmf_rdma_device		*device;
	uint32_t				ref;
	TAILQ_ENTRY(spdk_nvmf_rdma_port)	link;
};

struct spdk_nvmf_rdma_transport {
	struct spdk_nvmf_transport	transport;

	struct rdma_event_channel	*event_channel;

	struct spdk_mempool		*data_wr_pool;

	pthread_mutex_t			lock;

	/* fields used to poll RDMA/IB events */
	nfds_t				npoll_fds;
	struct pollfd			*poll_fds;

	TAILQ_HEAD(, spdk_nvmf_rdma_device)	devices;
	TAILQ_HEAD(, spdk_nvmf_rdma_port)	ports;
};

static inline int
spdk_nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
{
	switch (state) {
	case IBV_QPS_RESET:
	case IBV_QPS_INIT:
	case IBV_QPS_RTR:
	case IBV_QPS_RTS:
	case IBV_QPS_SQD:
	case IBV_QPS_SQE:
	case IBV_QPS_ERR:
		return 0;
	default:
		return -1;
	}
}

static enum ibv_qp_state
spdk_nvmf_rdma_update_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair) {
	enum ibv_qp_state old_state, new_state;
	struct ibv_qp_init_attr init_attr;
	int rc;

	/* All the attributes needed for recovery */
	static int spdk_nvmf_ibv_attr_mask =
		IBV_QP_STATE |
		IBV_QP_PKEY_INDEX |
		IBV_QP_PORT |
		IBV_QP_ACCESS_FLAGS |
		IBV_QP_AV |
		IBV_QP_PATH_MTU |
		IBV_QP_DEST_QPN |
		IBV_QP_RQ_PSN |
		IBV_QP_MAX_DEST_RD_ATOMIC |
		IBV_QP_MIN_RNR_TIMER |
		IBV_QP_SQ_PSN |
		IBV_QP_TIMEOUT |
		IBV_QP_RETRY_CNT |
		IBV_QP_RNR_RETRY |
		IBV_QP_MAX_QP_RD_ATOMIC;

	old_state = rqpair->ibv_attr.qp_state;
	rc = ibv_query_qp(rqpair->cm_id->qp, &rqpair->ibv_attr,
			  spdk_nvmf_ibv_attr_mask, &init_attr);

	if (rc) {
		SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n");
		assert(false);
	}

	new_state = rqpair->ibv_attr.qp_state;

	rc = spdk_nvmf_rdma_check_ibv_state(new_state);
	if (rc) {
		SPDK_ERRLOG("QP#%d: bad state updated: %u, maybe hardware issue\n", rqpair->qpair.qid, new_state);
		/*
		 * IBV_QPS_UNKNOWN undefined if lib version smaller than libibverbs-1.1.8
		 * IBV_QPS_UNKNOWN is the enum element after IBV_QPS_ERR
		 */
		return IBV_QPS_ERR + 1;
	}

	if (old_state != new_state) {
		spdk_trace_record(TRACE_RDMA_QP_STATE_CHANGE, 0, 0,
				  (uintptr_t)rqpair->cm_id, new_state);
	}
	return new_state;
}

static const char *str_ibv_qp_state[] = {
	"IBV_QPS_RESET",
	"IBV_QPS_INIT",
	"IBV_QPS_RTR",
	"IBV_QPS_RTS",
	"IBV_QPS_SQD",
	"IBV_QPS_SQE",
	"IBV_QPS_ERR",
	"IBV_QPS_UNKNOWN"
};

static int
spdk_nvmf_rdma_set_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair,
			     enum ibv_qp_state new_state)
{
	int rc;
	enum ibv_qp_state state;
	static int attr_mask_rc[] = {
		[IBV_QPS_RESET] = IBV_QP_STATE,
		[IBV_QPS_INIT] = (IBV_QP_STATE |
				  IBV_QP_PKEY_INDEX |
				  IBV_QP_PORT |
				  IBV_QP_ACCESS_FLAGS),
		[IBV_QPS_RTR] = (IBV_QP_STATE |
				 IBV_QP_AV |
				 IBV_QP_PATH_MTU |
				 IBV_QP_DEST_QPN |
				 IBV_QP_RQ_PSN |
				 IBV_QP_MAX_DEST_RD_ATOMIC |
				 IBV_QP_MIN_RNR_TIMER),
		[IBV_QPS_RTS] = (IBV_QP_STATE |
				 IBV_QP_SQ_PSN |
				 IBV_QP_TIMEOUT |
				 IBV_QP_RETRY_CNT |
				 IBV_QP_RNR_RETRY |
				 IBV_QP_MAX_QP_RD_ATOMIC),
[IBV_QPS_SQD] = IBV_QP_STATE, 550 [IBV_QPS_SQE] = IBV_QP_STATE, 551 [IBV_QPS_ERR] = IBV_QP_STATE, 552 }; 553 554 rc = spdk_nvmf_rdma_check_ibv_state(new_state); 555 if (rc) { 556 SPDK_ERRLOG("QP#%d: bad state requested: %u\n", 557 rqpair->qpair.qid, new_state); 558 return rc; 559 } 560 561 rqpair->ibv_attr.cur_qp_state = rqpair->ibv_attr.qp_state; 562 rqpair->ibv_attr.qp_state = new_state; 563 rqpair->ibv_attr.ah_attr.port_num = rqpair->ibv_attr.port_num; 564 565 rc = ibv_modify_qp(rqpair->cm_id->qp, &rqpair->ibv_attr, 566 attr_mask_rc[new_state]); 567 568 if (rc) { 569 SPDK_ERRLOG("QP#%d: failed to set state to: %s, %d (%s)\n", 570 rqpair->qpair.qid, str_ibv_qp_state[new_state], errno, strerror(errno)); 571 return rc; 572 } 573 574 state = spdk_nvmf_rdma_update_ibv_state(rqpair); 575 576 if (state != new_state) { 577 SPDK_ERRLOG("QP#%d: expected state: %s, actual state: %s\n", 578 rqpair->qpair.qid, str_ibv_qp_state[new_state], 579 str_ibv_qp_state[state]); 580 return -1; 581 } 582 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "IBV QP#%u changed to: %s\n", rqpair->qpair.qid, 583 str_ibv_qp_state[state]); 584 return 0; 585 } 586 587 static void 588 nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req) 589 { 590 SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", req->data_from_pool); 591 if (req->req.cmd) { 592 SPDK_ERRLOG("\t\tRequest opcode: %d\n", req->req.cmd->nvmf_cmd.opcode); 593 } 594 if (req->recv) { 595 SPDK_ERRLOG("\t\tRequest recv wr_id%lu\n", req->recv->wr.wr_id); 596 } 597 } 598 599 static void 600 nvmf_rdma_dump_qpair_contents(struct spdk_nvmf_rdma_qpair *rqpair) 601 { 602 int i; 603 604 SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", rqpair->qpair.qid); 605 for (i = 0; i < rqpair->max_queue_depth; i++) { 606 if (rqpair->resources->reqs[i].state != RDMA_REQUEST_STATE_FREE) { 607 nvmf_rdma_dump_request(&rqpair->resources->reqs[i]); 608 } 609 } 610 } 611 612 static void 613 nvmf_rdma_resources_destroy(struct spdk_nvmf_rdma_resources *resources) 614 { 615 if (resources->cmds_mr) { 616 ibv_dereg_mr(resources->cmds_mr); 617 } 618 619 if (resources->cpls_mr) { 620 ibv_dereg_mr(resources->cpls_mr); 621 } 622 623 if (resources->bufs_mr) { 624 ibv_dereg_mr(resources->bufs_mr); 625 } 626 627 spdk_dma_free(resources->cmds); 628 spdk_dma_free(resources->cpls); 629 spdk_dma_free(resources->bufs); 630 free(resources->reqs); 631 free(resources->recvs); 632 free(resources); 633 } 634 635 636 static struct spdk_nvmf_rdma_resources * 637 nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts) 638 { 639 struct spdk_nvmf_rdma_resources *resources; 640 struct spdk_nvmf_rdma_request *rdma_req; 641 struct spdk_nvmf_rdma_recv *rdma_recv; 642 struct ibv_qp *qp; 643 struct ibv_srq *srq; 644 uint32_t i; 645 int rc; 646 647 resources = calloc(1, sizeof(struct spdk_nvmf_rdma_resources)); 648 if (!resources) { 649 SPDK_ERRLOG("Unable to allocate resources for receive queue.\n"); 650 return NULL; 651 } 652 653 resources->reqs = calloc(opts->max_queue_depth, sizeof(*resources->reqs)); 654 resources->recvs = calloc(opts->max_queue_depth, sizeof(*resources->recvs)); 655 resources->cmds = spdk_dma_zmalloc(opts->max_queue_depth * sizeof(*resources->cmds), 656 0x1000, NULL); 657 resources->cpls = spdk_dma_zmalloc(opts->max_queue_depth * sizeof(*resources->cpls), 658 0x1000, NULL); 659 660 if (opts->in_capsule_data_size > 0) { 661 resources->bufs = spdk_dma_zmalloc(opts->max_queue_depth * 662 opts->in_capsule_data_size, 663 0x1000, NULL); 664 } 665 666 if (!resources->reqs || !resources->recvs || 
!resources->cmds || 667 !resources->cpls || (opts->in_capsule_data_size && !resources->bufs)) { 668 SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n"); 669 goto cleanup; 670 } 671 672 resources->cmds_mr = ibv_reg_mr(opts->pd, resources->cmds, 673 opts->max_queue_depth * sizeof(*resources->cmds), 674 IBV_ACCESS_LOCAL_WRITE); 675 resources->cpls_mr = ibv_reg_mr(opts->pd, resources->cpls, 676 opts->max_queue_depth * sizeof(*resources->cpls), 677 0); 678 679 if (opts->in_capsule_data_size) { 680 resources->bufs_mr = ibv_reg_mr(opts->pd, resources->bufs, 681 opts->max_queue_depth * 682 opts->in_capsule_data_size, 683 IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); 684 } 685 686 if (!resources->cmds_mr || !resources->cpls_mr || 687 (opts->in_capsule_data_size && 688 !resources->bufs_mr)) { 689 goto cleanup; 690 } 691 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n", 692 resources->cmds, opts->max_queue_depth * sizeof(*resources->cmds), 693 resources->cmds_mr->lkey); 694 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n", 695 resources->cpls, opts->max_queue_depth * sizeof(*resources->cpls), 696 resources->cpls_mr->lkey); 697 if (resources->bufs && resources->bufs_mr) { 698 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n", 699 resources->bufs, opts->max_queue_depth * 700 opts->in_capsule_data_size, resources->bufs_mr->lkey); 701 } 702 703 /* Initialize queues */ 704 STAILQ_INIT(&resources->incoming_queue); 705 STAILQ_INIT(&resources->free_queue); 706 707 for (i = 0; i < opts->max_queue_depth; i++) { 708 struct ibv_recv_wr *bad_wr = NULL; 709 710 rdma_recv = &resources->recvs[i]; 711 rdma_recv->qpair = opts->qpair; 712 713 /* Set up memory to receive commands */ 714 if (resources->bufs) { 715 rdma_recv->buf = (void *)((uintptr_t)resources->bufs + (i * 716 opts->in_capsule_data_size)); 717 } 718 719 rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV; 720 721 rdma_recv->sgl[0].addr = (uintptr_t)&resources->cmds[i]; 722 rdma_recv->sgl[0].length = sizeof(resources->cmds[i]); 723 rdma_recv->sgl[0].lkey = resources->cmds_mr->lkey; 724 rdma_recv->wr.num_sge = 1; 725 726 if (rdma_recv->buf && resources->bufs_mr) { 727 rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf; 728 rdma_recv->sgl[1].length = opts->in_capsule_data_size; 729 rdma_recv->sgl[1].lkey = resources->bufs_mr->lkey; 730 rdma_recv->wr.num_sge++; 731 } 732 733 rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr; 734 rdma_recv->wr.sg_list = rdma_recv->sgl; 735 if (opts->shared) { 736 srq = (struct ibv_srq *)opts->qp; 737 rc = ibv_post_srq_recv(srq, &rdma_recv->wr, &bad_wr); 738 } else { 739 qp = (struct ibv_qp *)opts->qp; 740 rc = ibv_post_recv(qp, &rdma_recv->wr, &bad_wr); 741 } 742 if (rc) { 743 goto cleanup; 744 } 745 } 746 747 for (i = 0; i < opts->max_queue_depth; i++) { 748 rdma_req = &resources->reqs[i]; 749 750 if (opts->qpair != NULL) { 751 rdma_req->req.qpair = &opts->qpair->qpair; 752 } else { 753 rdma_req->req.qpair = NULL; 754 } 755 rdma_req->req.cmd = NULL; 756 757 /* Set up memory to send responses */ 758 rdma_req->req.rsp = &resources->cpls[i]; 759 760 rdma_req->rsp.sgl[0].addr = (uintptr_t)&resources->cpls[i]; 761 rdma_req->rsp.sgl[0].length = sizeof(resources->cpls[i]); 762 rdma_req->rsp.sgl[0].lkey = resources->cpls_mr->lkey; 763 764 rdma_req->rsp.rdma_wr.type = RDMA_WR_TYPE_SEND; 765 rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp.rdma_wr; 766 rdma_req->rsp.wr.next = NULL; 767 rdma_req->rsp.wr.opcode = IBV_WR_SEND; 768 
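		/* The response SEND is posted signaled so that a completion is generated for it. */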
rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED; 769 rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl; 770 rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl); 771 772 /* Set up memory for data buffers */ 773 rdma_req->data.rdma_wr.type = RDMA_WR_TYPE_DATA; 774 rdma_req->data.wr.wr_id = (uintptr_t)&rdma_req->data.rdma_wr; 775 rdma_req->data.wr.next = NULL; 776 rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED; 777 rdma_req->data.wr.sg_list = rdma_req->data.sgl; 778 rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl); 779 780 /* Initialize request state to FREE */ 781 rdma_req->state = RDMA_REQUEST_STATE_FREE; 782 STAILQ_INSERT_TAIL(&resources->free_queue, rdma_req, state_link); 783 } 784 785 return resources; 786 787 cleanup: 788 nvmf_rdma_resources_destroy(resources); 789 return NULL; 790 } 791 792 static void 793 spdk_nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair) 794 { 795 struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp; 796 struct ibv_recv_wr *bad_recv_wr = NULL; 797 int rc; 798 799 spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair->cm_id, 0); 800 801 spdk_poller_unregister(&rqpair->destruct_poller); 802 803 if (rqpair->qd != 0) { 804 if (rqpair->srq == NULL) { 805 nvmf_rdma_dump_qpair_contents(rqpair); 806 } 807 SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", rqpair->qd); 808 } 809 810 if (rqpair->poller) { 811 TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link); 812 813 if (rqpair->srq != NULL) { 814 /* Drop all received but unprocessed commands for this queue and return them to SRQ */ 815 STAILQ_FOREACH_SAFE(rdma_recv, &rqpair->resources->incoming_queue, link, recv_tmp) { 816 if (rqpair == rdma_recv->qpair) { 817 STAILQ_REMOVE_HEAD(&rqpair->resources->incoming_queue, link); 818 rc = ibv_post_srq_recv(rqpair->srq, &rdma_recv->wr, &bad_recv_wr); 819 if (rc) { 820 SPDK_ERRLOG("Unable to re-post rx descriptor\n"); 821 } 822 } 823 } 824 } 825 } 826 827 if (rqpair->cm_id) { 828 rdma_destroy_qp(rqpair->cm_id); 829 rdma_destroy_id(rqpair->cm_id); 830 831 if (rqpair->poller != NULL && rqpair->srq == NULL) { 832 rqpair->poller->required_num_wr -= MAX_WR_PER_QP(rqpair->max_queue_depth); 833 } 834 } 835 836 if (rqpair->srq == NULL) { 837 nvmf_rdma_resources_destroy(rqpair->resources); 838 } 839 840 free(rqpair); 841 } 842 843 static int 844 nvmf_rdma_resize_cq(struct spdk_nvmf_rdma_qpair *rqpair, struct spdk_nvmf_rdma_device *device) 845 { 846 struct spdk_nvmf_rdma_poller *rpoller; 847 int rc, num_cqe, required_num_wr; 848 849 /* Enlarge CQ size dynamically */ 850 rpoller = rqpair->poller; 851 required_num_wr = rpoller->required_num_wr + MAX_WR_PER_QP(rqpair->max_queue_depth); 852 num_cqe = rpoller->num_cqe; 853 if (num_cqe < required_num_wr) { 854 num_cqe = spdk_max(num_cqe * 2, required_num_wr); 855 num_cqe = spdk_min(num_cqe, device->attr.max_cqe); 856 } 857 858 if (rpoller->num_cqe != num_cqe) { 859 if (required_num_wr > device->attr.max_cqe) { 860 SPDK_ERRLOG("RDMA CQE requirement (%d) exceeds device max_cqe limitation (%d)\n", 861 required_num_wr, device->attr.max_cqe); 862 return -1; 863 } 864 865 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Resize RDMA CQ from %d to %d\n", rpoller->num_cqe, num_cqe); 866 rc = ibv_resize_cq(rpoller->cq, num_cqe); 867 if (rc) { 868 SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno)); 869 return -1; 870 } 871 872 rpoller->num_cqe = num_cqe; 873 } 874 875 rpoller->required_num_wr = required_num_wr; 876 return 0; 877 } 878 879 static int 880 spdk_nvmf_rdma_qpair_initialize(struct 
spdk_nvmf_qpair *qpair) 881 { 882 struct spdk_nvmf_rdma_qpair *rqpair; 883 int rc; 884 struct spdk_nvmf_rdma_transport *rtransport; 885 struct spdk_nvmf_transport *transport; 886 struct spdk_nvmf_rdma_resource_opts opts; 887 struct spdk_nvmf_rdma_device *device; 888 struct ibv_qp_init_attr ibv_init_attr; 889 890 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 891 device = rqpair->port->device; 892 893 memset(&ibv_init_attr, 0, sizeof(struct ibv_qp_init_attr)); 894 ibv_init_attr.qp_context = rqpair; 895 ibv_init_attr.qp_type = IBV_QPT_RC; 896 ibv_init_attr.send_cq = rqpair->poller->cq; 897 ibv_init_attr.recv_cq = rqpair->poller->cq; 898 899 if (rqpair->srq) { 900 ibv_init_attr.srq = rqpair->srq; 901 } else { 902 ibv_init_attr.cap.max_recv_wr = rqpair->max_queue_depth + 903 1; /* RECV operations + dummy drain WR */ 904 } 905 906 ibv_init_attr.cap.max_send_wr = rqpair->max_queue_depth * 907 2 + 1; /* SEND, READ, and WRITE operations + dummy drain WR */ 908 ibv_init_attr.cap.max_send_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_TX_SGE); 909 ibv_init_attr.cap.max_recv_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE); 910 911 if (rqpair->srq == NULL && nvmf_rdma_resize_cq(rqpair, device) < 0) { 912 SPDK_ERRLOG("Failed to resize the completion queue. Cannot initialize qpair.\n"); 913 goto error; 914 } 915 916 rc = rdma_create_qp(rqpair->cm_id, rqpair->port->device->pd, &ibv_init_attr); 917 if (rc) { 918 SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno)); 919 goto error; 920 } 921 922 rqpair->max_send_depth = spdk_min((uint32_t)(rqpair->max_queue_depth * 2 + 1), 923 ibv_init_attr.cap.max_send_wr); 924 rqpair->max_send_sge = spdk_min(NVMF_DEFAULT_TX_SGE, ibv_init_attr.cap.max_send_sge); 925 rqpair->max_recv_sge = spdk_min(NVMF_DEFAULT_RX_SGE, ibv_init_attr.cap.max_recv_sge); 926 spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0); 927 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair); 928 929 if (rqpair->poller->srq == NULL) { 930 rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 931 transport = &rtransport->transport; 932 933 opts.qp = rqpair->cm_id->qp; 934 opts.pd = rqpair->cm_id->pd; 935 opts.qpair = rqpair; 936 opts.shared = false; 937 opts.max_queue_depth = rqpair->max_queue_depth; 938 opts.in_capsule_data_size = transport->opts.in_capsule_data_size; 939 940 rqpair->resources = nvmf_rdma_resources_create(&opts); 941 942 if (!rqpair->resources) { 943 SPDK_ERRLOG("Unable to allocate resources for receive queue.\n"); 944 goto error; 945 } 946 } else { 947 rqpair->resources = rqpair->poller->resources; 948 } 949 950 rqpair->current_recv_depth = 0; 951 STAILQ_INIT(&rqpair->pending_rdma_read_queue); 952 STAILQ_INIT(&rqpair->pending_rdma_write_queue); 953 954 return 0; 955 956 error: 957 rdma_destroy_id(rqpair->cm_id); 958 rqpair->cm_id = NULL; 959 spdk_nvmf_rdma_qpair_destroy(rqpair); 960 return -1; 961 } 962 963 static int 964 request_transfer_in(struct spdk_nvmf_request *req) 965 { 966 int rc; 967 struct spdk_nvmf_rdma_request *rdma_req; 968 struct spdk_nvmf_qpair *qpair; 969 struct spdk_nvmf_rdma_qpair *rqpair; 970 struct ibv_send_wr *bad_wr = NULL; 971 972 qpair = req->qpair; 973 rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 974 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 975 976 assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER); 977 assert(rdma_req != NULL); 978 979 SPDK_DEBUGLOG(SPDK_LOG_RDMA, 
"RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair); 980 981 rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr); 982 if (rc) { 983 SPDK_ERRLOG("Unable to transfer data from host to target\n"); 984 return -1; 985 } 986 rqpair->current_read_depth += rdma_req->num_outstanding_data_wr; 987 rqpair->current_send_depth += rdma_req->num_outstanding_data_wr; 988 return 0; 989 } 990 991 static int 992 request_transfer_out(struct spdk_nvmf_request *req, int *data_posted) 993 { 994 int rc; 995 struct spdk_nvmf_rdma_request *rdma_req; 996 struct spdk_nvmf_qpair *qpair; 997 struct spdk_nvmf_rdma_qpair *rqpair; 998 struct spdk_nvme_cpl *rsp; 999 struct ibv_recv_wr *bad_recv_wr = NULL; 1000 struct ibv_send_wr *send_wr, *bad_send_wr = NULL; 1001 1002 *data_posted = 0; 1003 qpair = req->qpair; 1004 rsp = &req->rsp->nvme_cpl; 1005 rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 1006 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 1007 1008 /* Advance our sq_head pointer */ 1009 if (qpair->sq_head == qpair->sq_head_max) { 1010 qpair->sq_head = 0; 1011 } else { 1012 qpair->sq_head++; 1013 } 1014 rsp->sqhd = qpair->sq_head; 1015 1016 /* Post the capsule to the recv buffer */ 1017 assert(rdma_req->recv != NULL); 1018 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA RECV POSTED. Recv: %p Connection: %p\n", rdma_req->recv, 1019 rqpair); 1020 if (rqpair->srq == NULL) { 1021 rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_req->recv->wr, &bad_recv_wr); 1022 } else { 1023 rdma_req->recv->qpair = NULL; 1024 rc = ibv_post_srq_recv(rqpair->srq, &rdma_req->recv->wr, &bad_recv_wr); 1025 } 1026 1027 if (rc) { 1028 SPDK_ERRLOG("Unable to re-post rx descriptor\n"); 1029 return rc; 1030 } 1031 rdma_req->recv = NULL; 1032 assert(rqpair->current_recv_depth > 0); 1033 rqpair->current_recv_depth--; 1034 1035 /* Build the response which consists of an optional 1036 * RDMA WRITE to transfer data, plus an RDMA SEND 1037 * containing the response. 1038 */ 1039 send_wr = &rdma_req->rsp.wr; 1040 1041 if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && 1042 req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 1043 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair); 1044 send_wr = &rdma_req->data.wr; 1045 *data_posted = 1; 1046 } 1047 1048 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. 
Request: %p Connection: %p\n", req, qpair); 1049 1050 /* Send the completion */ 1051 rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr); 1052 if (rc) { 1053 SPDK_ERRLOG("Unable to send response capsule\n"); 1054 return rc; 1055 } 1056 /* +1 for the rsp wr */ 1057 rqpair->current_send_depth += rdma_req->num_outstanding_data_wr + 1; 1058 1059 return 0; 1060 } 1061 1062 static int 1063 spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair) 1064 { 1065 struct spdk_nvmf_rdma_accept_private_data accept_data; 1066 struct rdma_conn_param ctrlr_event_data = {}; 1067 int rc; 1068 1069 accept_data.recfmt = 0; 1070 accept_data.crqsize = rqpair->max_queue_depth; 1071 1072 ctrlr_event_data.private_data = &accept_data; 1073 ctrlr_event_data.private_data_len = sizeof(accept_data); 1074 if (id->ps == RDMA_PS_TCP) { 1075 ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */ 1076 ctrlr_event_data.initiator_depth = rqpair->max_read_depth; 1077 } 1078 1079 rc = rdma_accept(id, &ctrlr_event_data); 1080 if (rc) { 1081 SPDK_ERRLOG("Error %d on rdma_accept\n", errno); 1082 } else { 1083 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n"); 1084 } 1085 1086 return rc; 1087 } 1088 1089 static void 1090 spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error) 1091 { 1092 struct spdk_nvmf_rdma_reject_private_data rej_data; 1093 1094 rej_data.recfmt = 0; 1095 rej_data.sts = error; 1096 1097 rdma_reject(id, &rej_data, sizeof(rej_data)); 1098 } 1099 1100 static int 1101 nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event, 1102 new_qpair_fn cb_fn) 1103 { 1104 struct spdk_nvmf_rdma_transport *rtransport; 1105 struct spdk_nvmf_rdma_qpair *rqpair = NULL; 1106 struct spdk_nvmf_rdma_port *port; 1107 struct rdma_conn_param *rdma_param = NULL; 1108 const struct spdk_nvmf_rdma_request_private_data *private_data = NULL; 1109 uint16_t max_queue_depth; 1110 uint16_t max_read_depth; 1111 1112 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1113 1114 assert(event->id != NULL); /* Impossible. Can't even reject the connection. */ 1115 assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */ 1116 1117 rdma_param = &event->param.conn; 1118 if (rdma_param->private_data == NULL || 1119 rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) { 1120 SPDK_ERRLOG("connect request: no private data provided\n"); 1121 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH); 1122 return -1; 1123 } 1124 1125 private_data = rdma_param->private_data; 1126 if (private_data->recfmt != 0) { 1127 SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n"); 1128 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT); 1129 return -1; 1130 } 1131 1132 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n", 1133 event->id->verbs->device->name, event->id->verbs->device->dev_name); 1134 1135 port = event->listen_id->context; 1136 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n", 1137 event->listen_id, event->listen_id->verbs, port); 1138 1139 /* Figure out the supported queue depth. 
This is a multi-step process 1140 * that takes into account hardware maximums, host provided values, 1141 * and our target's internal memory limits */ 1142 1143 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n"); 1144 1145 /* Start with the maximum queue depth allowed by the target */ 1146 max_queue_depth = rtransport->transport.opts.max_queue_depth; 1147 max_read_depth = rtransport->transport.opts.max_queue_depth; 1148 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n", 1149 rtransport->transport.opts.max_queue_depth); 1150 1151 /* Next check the local NIC's hardware limitations */ 1152 SPDK_DEBUGLOG(SPDK_LOG_RDMA, 1153 "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n", 1154 port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom); 1155 max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr); 1156 max_read_depth = spdk_min(max_read_depth, port->device->attr.max_qp_init_rd_atom); 1157 1158 /* Next check the remote NIC's hardware limitations */ 1159 SPDK_DEBUGLOG(SPDK_LOG_RDMA, 1160 "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n", 1161 rdma_param->initiator_depth, rdma_param->responder_resources); 1162 if (rdma_param->initiator_depth > 0) { 1163 max_read_depth = spdk_min(max_read_depth, rdma_param->initiator_depth); 1164 } 1165 1166 /* Finally check for the host software requested values, which are 1167 * optional. */ 1168 if (rdma_param->private_data != NULL && 1169 rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) { 1170 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize); 1171 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize); 1172 max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize); 1173 max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1); 1174 } 1175 1176 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n", 1177 max_queue_depth, max_read_depth); 1178 1179 rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair)); 1180 if (rqpair == NULL) { 1181 SPDK_ERRLOG("Could not allocate new connection.\n"); 1182 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 1183 return -1; 1184 } 1185 1186 rqpair->port = port; 1187 rqpair->max_queue_depth = max_queue_depth; 1188 rqpair->max_read_depth = max_read_depth; 1189 rqpair->cm_id = event->id; 1190 rqpair->listen_id = event->listen_id; 1191 rqpair->qpair.transport = transport; 1192 1193 event->id->context = &rqpair->qpair; 1194 1195 cb_fn(&rqpair->qpair); 1196 1197 return 0; 1198 } 1199 1200 static int 1201 spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map, 1202 enum spdk_mem_map_notify_action action, 1203 void *vaddr, size_t size) 1204 { 1205 struct ibv_pd *pd = cb_ctx; 1206 struct ibv_mr *mr; 1207 1208 switch (action) { 1209 case SPDK_MEM_MAP_NOTIFY_REGISTER: 1210 if (!g_nvmf_hooks.get_rkey) { 1211 mr = ibv_reg_mr(pd, vaddr, size, 1212 IBV_ACCESS_LOCAL_WRITE | 1213 IBV_ACCESS_REMOTE_READ | 1214 IBV_ACCESS_REMOTE_WRITE); 1215 if (mr == NULL) { 1216 SPDK_ERRLOG("ibv_reg_mr() failed\n"); 1217 return -1; 1218 } else { 1219 spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); 1220 } 1221 } else { 1222 spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, 1223 g_nvmf_hooks.get_rkey(pd, vaddr, size)); 1224 } 1225 break; 1226 case SPDK_MEM_MAP_NOTIFY_UNREGISTER: 1227 if (!g_nvmf_hooks.get_rkey) { 1228 mr = (struct ibv_mr 
*)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL); 1229 spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); 1230 if (mr) { 1231 ibv_dereg_mr(mr); 1232 } 1233 } 1234 break; 1235 } 1236 1237 return 0; 1238 } 1239 1240 static int 1241 spdk_nvmf_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2) 1242 { 1243 /* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */ 1244 return addr_1 == addr_2; 1245 } 1246 1247 static void 1248 spdk_nvmf_rdma_request_free_buffers(struct spdk_nvmf_rdma_request *rdma_req, 1249 struct spdk_nvmf_transport_poll_group *group, struct spdk_nvmf_transport *transport) 1250 { 1251 for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) { 1252 if (group->buf_cache_count < group->buf_cache_size) { 1253 STAILQ_INSERT_HEAD(&group->buf_cache, 1254 (struct spdk_nvmf_transport_pg_cache_buf *)rdma_req->data.buffers[i], link); 1255 group->buf_cache_count++; 1256 } else { 1257 spdk_mempool_put(transport->data_buf_pool, rdma_req->data.buffers[i]); 1258 } 1259 rdma_req->req.iov[i].iov_base = NULL; 1260 rdma_req->data.buffers[i] = NULL; 1261 rdma_req->req.iov[i].iov_len = 0; 1262 1263 } 1264 rdma_req->data_from_pool = false; 1265 } 1266 1267 typedef enum spdk_nvme_data_transfer spdk_nvme_data_transfer_t; 1268 1269 static spdk_nvme_data_transfer_t 1270 spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req) 1271 { 1272 enum spdk_nvme_data_transfer xfer; 1273 struct spdk_nvme_cmd *cmd = &rdma_req->req.cmd->nvme_cmd; 1274 struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; 1275 1276 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 1277 rdma_req->rsp.wr.opcode = IBV_WR_SEND; 1278 rdma_req->rsp.wr.imm_data = 0; 1279 #endif 1280 1281 /* Figure out data transfer direction */ 1282 if (cmd->opc == SPDK_NVME_OPC_FABRIC) { 1283 xfer = spdk_nvme_opc_get_data_transfer(rdma_req->req.cmd->nvmf_cmd.fctype); 1284 } else { 1285 xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 1286 1287 /* Some admin commands are special cases */ 1288 if ((rdma_req->req.qpair->qid == 0) && 1289 ((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) || 1290 (cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) { 1291 switch (cmd->cdw10 & 0xff) { 1292 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 1293 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1294 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 1295 break; 1296 default: 1297 xfer = SPDK_NVME_DATA_NONE; 1298 } 1299 } 1300 } 1301 1302 if (xfer == SPDK_NVME_DATA_NONE) { 1303 return xfer; 1304 } 1305 1306 /* Even for commands that may transfer data, they could have specified 0 length. 1307 * We want those to show up with xfer SPDK_NVME_DATA_NONE. 
1308 */ 1309 switch (sgl->generic.type) { 1310 case SPDK_NVME_SGL_TYPE_DATA_BLOCK: 1311 case SPDK_NVME_SGL_TYPE_BIT_BUCKET: 1312 case SPDK_NVME_SGL_TYPE_SEGMENT: 1313 case SPDK_NVME_SGL_TYPE_LAST_SEGMENT: 1314 case SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK: 1315 if (sgl->unkeyed.length == 0) { 1316 xfer = SPDK_NVME_DATA_NONE; 1317 } 1318 break; 1319 case SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK: 1320 if (sgl->keyed.length == 0) { 1321 xfer = SPDK_NVME_DATA_NONE; 1322 } 1323 break; 1324 } 1325 1326 return xfer; 1327 } 1328 1329 static int 1330 nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport, 1331 struct spdk_nvmf_rdma_poll_group *rgroup, 1332 struct spdk_nvmf_rdma_device *device, 1333 struct spdk_nvmf_rdma_request *rdma_req, 1334 struct ibv_send_wr *wr, 1335 uint32_t length) 1336 { 1337 void *buf = NULL; 1338 uint64_t translation_len; 1339 uint32_t remaining_length = length; 1340 uint32_t iovcnt; 1341 uint32_t i = 0; 1342 1343 1344 while (remaining_length) { 1345 if (!(STAILQ_EMPTY(&rgroup->group.buf_cache))) { 1346 rgroup->group.buf_cache_count--; 1347 buf = STAILQ_FIRST(&rgroup->group.buf_cache); 1348 STAILQ_REMOVE_HEAD(&rgroup->group.buf_cache, link); 1349 assert(buf != NULL); 1350 } else { 1351 buf = spdk_mempool_get(rtransport->transport.data_buf_pool); 1352 if (!buf) { 1353 return -ENOMEM; 1354 } 1355 } 1356 1357 iovcnt = rdma_req->req.iovcnt; 1358 rdma_req->req.iov[iovcnt].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) & 1359 ~NVMF_DATA_BUFFER_MASK); 1360 rdma_req->req.iov[iovcnt].iov_len = spdk_min(remaining_length, 1361 rtransport->transport.opts.io_unit_size); 1362 rdma_req->req.iovcnt++; 1363 rdma_req->data.buffers[iovcnt] = buf; 1364 wr->sg_list[i].addr = (uintptr_t)(rdma_req->req.iov[iovcnt].iov_base); 1365 wr->sg_list[i].length = rdma_req->req.iov[iovcnt].iov_len; 1366 translation_len = rdma_req->req.iov[iovcnt].iov_len; 1367 1368 if (!g_nvmf_hooks.get_rkey) { 1369 wr->sg_list[i].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map, 1370 (uint64_t)buf, &translation_len))->lkey; 1371 } else { 1372 wr->sg_list[i].lkey = spdk_mem_map_translate(device->map, 1373 (uint64_t)buf, &translation_len); 1374 } 1375 1376 remaining_length -= rdma_req->req.iov[iovcnt].iov_len; 1377 1378 if (translation_len < rdma_req->req.iov[iovcnt].iov_len) { 1379 SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n"); 1380 return -EINVAL; 1381 } 1382 i++; 1383 } 1384 1385 return 0; 1386 } 1387 1388 static int 1389 spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport, 1390 struct spdk_nvmf_rdma_device *device, 1391 struct spdk_nvmf_rdma_request *rdma_req) 1392 { 1393 struct spdk_nvmf_rdma_qpair *rqpair; 1394 struct spdk_nvmf_rdma_poll_group *rgroup; 1395 uint32_t i = 0; 1396 int rc = 0; 1397 1398 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 1399 rgroup = rqpair->poller->group; 1400 rdma_req->req.iovcnt = 0; 1401 1402 rc = nvmf_rdma_fill_buffers(rtransport, rgroup, device, rdma_req, &rdma_req->data.wr, 1403 rdma_req->req.length); 1404 if (rc != 0) { 1405 goto err_exit; 1406 } 1407 1408 assert(rdma_req->req.iovcnt <= rqpair->max_send_sge); 1409 1410 rdma_req->data_from_pool = true; 1411 1412 return rc; 1413 1414 err_exit: 1415 spdk_nvmf_rdma_request_free_buffers(rdma_req, &rgroup->group, &rtransport->transport); 1416 while (i) { 1417 i--; 1418 rdma_req->data.wr.sg_list[i].addr = 0; 1419 rdma_req->data.wr.sg_list[i].length = 0; 1420 rdma_req->data.wr.sg_list[i].lkey = 0; 1421 } 1422 
rdma_req->req.iovcnt = 0; 1423 return rc; 1424 } 1425 1426 static int 1427 spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport, 1428 struct spdk_nvmf_rdma_device *device, 1429 struct spdk_nvmf_rdma_request *rdma_req) 1430 { 1431 struct spdk_nvme_cmd *cmd; 1432 struct spdk_nvme_cpl *rsp; 1433 struct spdk_nvme_sgl_descriptor *sgl; 1434 1435 cmd = &rdma_req->req.cmd->nvme_cmd; 1436 rsp = &rdma_req->req.rsp->nvme_cpl; 1437 sgl = &cmd->dptr.sgl1; 1438 1439 if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK && 1440 (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS || 1441 sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) { 1442 if (sgl->keyed.length > rtransport->transport.opts.max_io_size) { 1443 SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", 1444 sgl->keyed.length, rtransport->transport.opts.max_io_size); 1445 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 1446 return -1; 1447 } 1448 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 1449 if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { 1450 if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { 1451 rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; 1452 rdma_req->rsp.wr.imm_data = sgl->keyed.key; 1453 } 1454 } 1455 #endif 1456 1457 /* fill request length and populate iovs */ 1458 rdma_req->req.length = sgl->keyed.length; 1459 1460 if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) { 1461 /* No available buffers. Queue this request up. */ 1462 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req); 1463 return 0; 1464 } 1465 1466 /* backward compatible */ 1467 rdma_req->req.data = rdma_req->req.iov[0].iov_base; 1468 1469 /* rdma wr specifics */ 1470 rdma_req->data.wr.num_sge = rdma_req->req.iovcnt; 1471 rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key; 1472 rdma_req->data.wr.wr.rdma.remote_addr = sgl->address; 1473 if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 1474 rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE; 1475 rdma_req->data.wr.next = &rdma_req->rsp.wr; 1476 } else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { 1477 rdma_req->data.wr.opcode = IBV_WR_RDMA_READ; 1478 rdma_req->data.wr.next = NULL; 1479 } 1480 1481 /* set the number of outstanding data WRs for this request. 
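		 * The keyed SGL path builds a single RDMA READ/WRITE WR here, so exactly one data WR is outstanding.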
*/ 1482 rdma_req->num_outstanding_data_wr = 1; 1483 1484 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, 1485 rdma_req->req.iovcnt); 1486 1487 return 0; 1488 } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && 1489 sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { 1490 uint64_t offset = sgl->address; 1491 uint32_t max_len = rtransport->transport.opts.in_capsule_data_size; 1492 1493 SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", 1494 offset, sgl->unkeyed.length); 1495 1496 if (offset > max_len) { 1497 SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", 1498 offset, max_len); 1499 rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; 1500 return -1; 1501 } 1502 max_len -= (uint32_t)offset; 1503 1504 if (sgl->unkeyed.length > max_len) { 1505 SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", 1506 sgl->unkeyed.length, max_len); 1507 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 1508 return -1; 1509 } 1510 1511 rdma_req->num_outstanding_data_wr = 0; 1512 rdma_req->req.data = rdma_req->recv->buf + offset; 1513 rdma_req->data_from_pool = false; 1514 rdma_req->req.length = sgl->unkeyed.length; 1515 1516 rdma_req->req.iov[0].iov_base = rdma_req->req.data; 1517 rdma_req->req.iov[0].iov_len = rdma_req->req.length; 1518 rdma_req->req.iovcnt = 1; 1519 1520 return 0; 1521 } 1522 1523 SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", 1524 sgl->generic.type, sgl->generic.subtype); 1525 rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; 1526 return -1; 1527 } 1528 1529 static void 1530 nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req, 1531 struct spdk_nvmf_rdma_transport *rtransport) 1532 { 1533 struct spdk_nvmf_rdma_qpair *rqpair; 1534 struct spdk_nvmf_rdma_poll_group *rgroup; 1535 1536 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 1537 if (rdma_req->data_from_pool) { 1538 rgroup = rqpair->poller->group; 1539 1540 spdk_nvmf_rdma_request_free_buffers(rdma_req, &rgroup->group, &rtransport->transport); 1541 } 1542 rdma_req->num_outstanding_data_wr = 0; 1543 rdma_req->req.length = 0; 1544 rdma_req->req.iovcnt = 0; 1545 rdma_req->req.data = NULL; 1546 rqpair->qd--; 1547 1548 STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link); 1549 rdma_req->state = RDMA_REQUEST_STATE_FREE; 1550 } 1551 1552 static bool 1553 spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, 1554 struct spdk_nvmf_rdma_request *rdma_req) 1555 { 1556 struct spdk_nvmf_rdma_qpair *rqpair; 1557 struct spdk_nvmf_rdma_device *device; 1558 struct spdk_nvmf_rdma_poll_group *rgroup; 1559 struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; 1560 int rc; 1561 struct spdk_nvmf_rdma_recv *rdma_recv; 1562 enum spdk_nvmf_rdma_request_state prev_state; 1563 bool progress = false; 1564 int data_posted; 1565 1566 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 1567 device = rqpair->port->device; 1568 rgroup = rqpair->poller->group; 1569 1570 assert(rdma_req->state != RDMA_REQUEST_STATE_FREE); 1571 1572 /* If the queue pair is in an error state, force the request to the completed state 1573 * to release resources. 
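	 * Requests sitting on the pending data-buffer or RDMA queues are removed from those queues first.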
*/ 1574 if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { 1575 if (rdma_req->state == RDMA_REQUEST_STATE_NEED_BUFFER) { 1576 TAILQ_REMOVE(&rgroup->pending_data_buf_queue, rdma_req, link); 1577 } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING) { 1578 STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 1579 } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING) { 1580 STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 1581 } 1582 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 1583 } 1584 1585 /* The loop here is to allow for several back-to-back state changes. */ 1586 do { 1587 prev_state = rdma_req->state; 1588 1589 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state); 1590 1591 switch (rdma_req->state) { 1592 case RDMA_REQUEST_STATE_FREE: 1593 /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW 1594 * to escape this state. */ 1595 break; 1596 case RDMA_REQUEST_STATE_NEW: 1597 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0, 1598 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1599 rdma_recv = rdma_req->recv; 1600 1601 /* The first element of the SGL is the NVMe command */ 1602 rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr; 1603 memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp)); 1604 1605 if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { 1606 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 1607 break; 1608 } 1609 1610 /* The next state transition depends on the data transfer needs of this request. */ 1611 rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req); 1612 1613 /* If no data to transfer, ready to execute. */ 1614 if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { 1615 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 1616 break; 1617 } 1618 1619 rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER; 1620 TAILQ_INSERT_TAIL(&rgroup->pending_data_buf_queue, rdma_req, link); 1621 break; 1622 case RDMA_REQUEST_STATE_NEED_BUFFER: 1623 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0, 1624 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1625 1626 assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE); 1627 1628 if (rdma_req != TAILQ_FIRST(&rgroup->pending_data_buf_queue)) { 1629 /* This request needs to wait in line to obtain a buffer */ 1630 break; 1631 } 1632 1633 /* Try to get a data buffer */ 1634 rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req); 1635 if (rc < 0) { 1636 TAILQ_REMOVE(&rgroup->pending_data_buf_queue, rdma_req, link); 1637 rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1638 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1639 break; 1640 } 1641 1642 if (!rdma_req->req.data) { 1643 /* No buffers available. */ 1644 break; 1645 } 1646 1647 TAILQ_REMOVE(&rgroup->pending_data_buf_queue, rdma_req, link); 1648 1649 /* If data is transferring from host to controller and the data didn't 1650 * arrive using in capsule data, we need to do a transfer from the host. 
1651 */ 1652 if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool) { 1653 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_read_queue, rdma_req, state_link); 1654 rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING; 1655 break; 1656 } 1657 1658 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 1659 break; 1660 case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: 1661 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 0, 0, 1662 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1663 1664 if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_read_queue)) { 1665 /* This request needs to wait in line to perform RDMA */ 1666 break; 1667 } 1668 if (rqpair->current_send_depth + rdma_req->num_outstanding_data_wr > rqpair->max_send_depth 1669 || rqpair->current_read_depth + rdma_req->num_outstanding_data_wr > rqpair->max_read_depth) { 1670 /* We can only have so many WRs outstanding. we have to wait until some finish. */ 1671 break; 1672 } 1673 1674 /* We have already verified that this request is the head of the queue. */ 1675 STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link); 1676 1677 rc = request_transfer_in(&rdma_req->req); 1678 if (!rc) { 1679 rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER; 1680 } else { 1681 rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1682 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1683 } 1684 break; 1685 case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 1686 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, 1687 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1688 /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE 1689 * to escape this state. */ 1690 break; 1691 case RDMA_REQUEST_STATE_READY_TO_EXECUTE: 1692 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, 1693 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1694 rdma_req->state = RDMA_REQUEST_STATE_EXECUTING; 1695 spdk_nvmf_request_exec(&rdma_req->req); 1696 break; 1697 case RDMA_REQUEST_STATE_EXECUTING: 1698 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0, 1699 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1700 /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED 1701 * to escape this state. */ 1702 break; 1703 case RDMA_REQUEST_STATE_EXECUTED: 1704 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0, 1705 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1706 if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 1707 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_write_queue, rdma_req, state_link); 1708 rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING; 1709 } else { 1710 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1711 } 1712 break; 1713 case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: 1714 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0, 1715 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1716 1717 if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_write_queue)) { 1718 /* This request needs to wait in line to perform RDMA */ 1719 break; 1720 } 1721 if ((rqpair->current_send_depth + rdma_req->num_outstanding_data_wr + 1) > 1722 rqpair->max_send_depth) { 1723 /* We can only have so many WRs outstanding. we have to wait until some finish. 1724 * +1 since each request has an additional wr in the resp. 
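 * Illustrative example: with max_send_depth = 16, current_send_depth = 12 and
 * num_outstanding_data_wr = 4, the check below is 12 + 4 + 1 > 16, so the request
 * waits until enough sends complete to fit the 4 RDMA WRITE WRs plus the completion SEND.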
*/ 1725 break; 1726 } 1727 1728 /* We have already verified that this request is the head of the queue. */ 1729 STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_write_queue, state_link); 1730 1731 /* The data transfer will be kicked off from 1732 * RDMA_REQUEST_STATE_READY_TO_COMPLETE state. 1733 */ 1734 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 1735 break; 1736 case RDMA_REQUEST_STATE_READY_TO_COMPLETE: 1737 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, 1738 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1739 rc = request_transfer_out(&rdma_req->req, &data_posted); 1740 assert(rc == 0); /* No good way to handle this currently */ 1741 if (rc) { 1742 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 1743 } else { 1744 rdma_req->state = data_posted ? RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST : 1745 RDMA_REQUEST_STATE_COMPLETING; 1746 } 1747 break; 1748 case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: 1749 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, 1750 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1751 /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 1752 * to escape this state. */ 1753 break; 1754 case RDMA_REQUEST_STATE_COMPLETING: 1755 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, 1756 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1757 /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 1758 * to escape this state. */ 1759 break; 1760 case RDMA_REQUEST_STATE_COMPLETED: 1761 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, 1762 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1763 1764 nvmf_rdma_request_free(rdma_req, rtransport); 1765 break; 1766 case RDMA_REQUEST_NUM_STATES: 1767 default: 1768 assert(0); 1769 break; 1770 } 1771 1772 if (rdma_req->state != prev_state) { 1773 progress = true; 1774 } 1775 } while (rdma_req->state != prev_state); 1776 1777 return progress; 1778 } 1779 1780 /* Public API callbacks begin here */ 1781 1782 #define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128 1783 #define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128 1784 #define SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH 4096 1785 #define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 64 1786 #define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 1787 #define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072 1788 #define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES) 1789 #define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4096 1790 #define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32 1791 1792 static void 1793 spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) 1794 { 1795 opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH; 1796 opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR; 1797 opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE; 1798 opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE; 1799 opts->io_unit_size = SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE; 1800 opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH; 1801 opts->num_shared_buffers = SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS; 1802 opts->buf_cache_size = SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE; 1803 opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH; 1804 } 1805 1806 const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = { 1807 .notify_cb = spdk_nvmf_rdma_mem_notify, 1808 .are_contiguous = spdk_nvmf_rdma_check_contiguous_entries 1809 }; 1810 1811 static int spdk_nvmf_rdma_destroy(struct 
spdk_nvmf_transport *transport); 1812 1813 static struct spdk_nvmf_transport * 1814 spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) 1815 { 1816 int rc; 1817 struct spdk_nvmf_rdma_transport *rtransport; 1818 struct spdk_nvmf_rdma_device *device, *tmp; 1819 struct ibv_pd *pd; 1820 struct ibv_context **contexts; 1821 uint32_t i; 1822 int flag; 1823 uint32_t sge_count; 1824 uint32_t min_shared_buffers; 1825 int max_device_sge = SPDK_NVMF_MAX_SGL_ENTRIES; 1826 1827 rtransport = calloc(1, sizeof(*rtransport)); 1828 if (!rtransport) { 1829 return NULL; 1830 } 1831 1832 if (pthread_mutex_init(&rtransport->lock, NULL)) { 1833 SPDK_ERRLOG("pthread_mutex_init() failed\n"); 1834 free(rtransport); 1835 return NULL; 1836 } 1837 1838 TAILQ_INIT(&rtransport->devices); 1839 TAILQ_INIT(&rtransport->ports); 1840 1841 rtransport->transport.ops = &spdk_nvmf_transport_rdma; 1842 1843 SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n" 1844 " Transport opts: max_queue_depth=%d, max_io_size=%d,\n" 1845 " max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" 1846 " in_capsule_data_size=%d, max_aq_depth=%d,\n" 1847 " num_shared_buffers=%d, max_srq_depth=%d\n", 1848 opts->max_queue_depth, 1849 opts->max_io_size, 1850 opts->max_qpairs_per_ctrlr, 1851 opts->io_unit_size, 1852 opts->in_capsule_data_size, 1853 opts->max_aq_depth, 1854 opts->num_shared_buffers, 1855 opts->max_srq_depth); 1856 1857 /* I/O unit size cannot be larger than max I/O size */ 1858 if (opts->io_unit_size > opts->max_io_size) { 1859 opts->io_unit_size = opts->max_io_size; 1860 } 1861 1862 if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) { 1863 SPDK_ERRLOG("The number of shared data buffers (%d) is less than " 1864 "the minimum number required to guarantee that forward progress can be made (%d)\n", 1865 opts->num_shared_buffers, (SPDK_NVMF_MAX_SGL_ENTRIES * 2)); 1866 spdk_nvmf_rdma_destroy(&rtransport->transport); 1867 return NULL; 1868 } 1869 1870 min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size; 1871 if (min_shared_buffers > opts->num_shared_buffers) { 1872 SPDK_ERRLOG("There are not enough buffers to satisfy " 1873 "per-poll group caches for each thread. (%" PRIu32 ") " 1874 "supplied.
(%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers); 1875 SPDK_ERRLOG("Please specify a larger number of shared buffers\n"); 1876 spdk_nvmf_rdma_destroy(&rtransport->transport); 1877 return NULL; 1878 } 1879 1880 sge_count = opts->max_io_size / opts->io_unit_size; 1881 if (sge_count > NVMF_DEFAULT_TX_SGE) { 1882 SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); 1883 spdk_nvmf_rdma_destroy(&rtransport->transport); 1884 return NULL; 1885 } 1886 1887 rtransport->event_channel = rdma_create_event_channel(); 1888 if (rtransport->event_channel == NULL) { 1889 SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno)); 1890 spdk_nvmf_rdma_destroy(&rtransport->transport); 1891 return NULL; 1892 } 1893 1894 flag = fcntl(rtransport->event_channel->fd, F_GETFL); 1895 if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { 1896 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", 1897 rtransport->event_channel->fd, spdk_strerror(errno)); 1898 spdk_nvmf_rdma_destroy(&rtransport->transport); 1899 return NULL; 1900 } 1901 1902 rtransport->data_wr_pool = spdk_mempool_create("spdk_nvmf_rdma_wr_data", 1903 opts->max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES, 1904 sizeof(struct spdk_nvmf_rdma_request_data), 1905 SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, 1906 SPDK_ENV_SOCKET_ID_ANY); 1907 if (!rtransport->data_wr_pool) { 1908 SPDK_ERRLOG("Unable to allocate work request pool for poll group\n"); 1909 spdk_nvmf_rdma_destroy(&rtransport->transport); 1910 return NULL; 1911 } 1912 1913 contexts = rdma_get_devices(NULL); 1914 if (contexts == NULL) { 1915 SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); 1916 spdk_nvmf_rdma_destroy(&rtransport->transport); 1917 return NULL; 1918 } 1919 1920 i = 0; 1921 rc = 0; 1922 while (contexts[i] != NULL) { 1923 device = calloc(1, sizeof(*device)); 1924 if (!device) { 1925 SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); 1926 rc = -ENOMEM; 1927 break; 1928 } 1929 device->context = contexts[i]; 1930 rc = ibv_query_device(device->context, &device->attr); 1931 if (rc < 0) { 1932 SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); 1933 free(device); 1934 break; 1935 1936 } 1937 1938 max_device_sge = spdk_min(max_device_sge, device->attr.max_sge); 1939 1940 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 1941 if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) { 1942 SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,"); 1943 SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id); 1944 } 1945 1946 /** 1947 * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE. 1948 * The Soft-RoCE RXE driver does not currently support send with invalidate, 1949 * but incorrectly reports that it does. There are changes making their way 1950 * through the kernel now that will enable this feature. When they are merged, 1951 * we can conditionally enable this feature. 1952 * 1953 * TODO: enable this for versions of the kernel rxe driver that support it. 
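 * Note: clearing IBV_DEVICE_MEM_MGT_EXTENSIONS below only masks that single capability bit,
 * so later checks of device_cap_flags in this file will treat the device as lacking
 * send-with-invalidate support.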
1954 */ 1955 if (device->attr.vendor_id == 0) { 1956 device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS); 1957 } 1958 #endif 1959 1960 /* set up device context async ev fd as NON_BLOCKING */ 1961 flag = fcntl(device->context->async_fd, F_GETFL); 1962 rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); 1963 if (rc < 0) { 1964 SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); 1965 free(device); 1966 break; 1967 } 1968 1969 TAILQ_INSERT_TAIL(&rtransport->devices, device, link); 1970 i++; 1971 1972 pd = NULL; 1973 if (g_nvmf_hooks.get_ibv_pd) { 1974 pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context); 1975 } 1976 1977 if (!g_nvmf_hooks.get_ibv_pd) { 1978 device->pd = ibv_alloc_pd(device->context); 1979 if (!device->pd) { 1980 SPDK_ERRLOG("Unable to allocate protection domain.\n"); 1981 spdk_nvmf_rdma_destroy(&rtransport->transport); 1982 return NULL; 1983 } 1984 } else { 1985 device->pd = pd; 1986 } 1987 1988 assert(device->map == NULL); 1989 1990 device->map = spdk_mem_map_alloc(0, &g_nvmf_rdma_map_ops, device->pd); 1991 if (!device->map) { 1992 SPDK_ERRLOG("Unable to allocate memory map for listen address\n"); 1993 spdk_nvmf_rdma_destroy(&rtransport->transport); 1994 return NULL; 1995 } 1996 1997 assert(device->map != NULL); 1998 assert(device->pd != NULL); 1999 } 2000 rdma_free_devices(contexts); 2001 2002 if (opts->io_unit_size * max_device_sge < opts->max_io_size) { 2003 /* divide and round up. */ 2004 opts->io_unit_size = (opts->max_io_size + max_device_sge - 1) / max_device_sge; 2005 2006 /* round up to the nearest 4k. */ 2007 opts->io_unit_size = (opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT - 1) & ~NVMF_DATA_BUFFER_MASK; 2008 2009 opts->io_unit_size = spdk_max(opts->io_unit_size, SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE); 2010 SPDK_NOTICELOG("Adjusting the io unit size to fit the device's maximum I/O size. 
New I/O unit size %u\n", 2011 opts->io_unit_size); 2012 } 2013 2014 if (rc < 0) { 2015 spdk_nvmf_rdma_destroy(&rtransport->transport); 2016 return NULL; 2017 } 2018 2019 /* Set up poll descriptor array to monitor events from RDMA and IB 2020 * in a single poll syscall 2021 */ 2022 rtransport->npoll_fds = i + 1; 2023 i = 0; 2024 rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); 2025 if (rtransport->poll_fds == NULL) { 2026 SPDK_ERRLOG("poll_fds allocation failed\n"); 2027 spdk_nvmf_rdma_destroy(&rtransport->transport); 2028 return NULL; 2029 } 2030 2031 rtransport->poll_fds[i].fd = rtransport->event_channel->fd; 2032 rtransport->poll_fds[i++].events = POLLIN; 2033 2034 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 2035 rtransport->poll_fds[i].fd = device->context->async_fd; 2036 rtransport->poll_fds[i++].events = POLLIN; 2037 } 2038 2039 return &rtransport->transport; 2040 } 2041 2042 static int 2043 spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) 2044 { 2045 struct spdk_nvmf_rdma_transport *rtransport; 2046 struct spdk_nvmf_rdma_port *port, *port_tmp; 2047 struct spdk_nvmf_rdma_device *device, *device_tmp; 2048 2049 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2050 2051 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { 2052 TAILQ_REMOVE(&rtransport->ports, port, link); 2053 rdma_destroy_id(port->id); 2054 free(port); 2055 } 2056 2057 if (rtransport->poll_fds != NULL) { 2058 free(rtransport->poll_fds); 2059 } 2060 2061 if (rtransport->event_channel != NULL) { 2062 rdma_destroy_event_channel(rtransport->event_channel); 2063 } 2064 2065 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { 2066 TAILQ_REMOVE(&rtransport->devices, device, link); 2067 if (device->map) { 2068 spdk_mem_map_free(&device->map); 2069 } 2070 if (device->pd) { 2071 if (!g_nvmf_hooks.get_ibv_pd) { 2072 ibv_dealloc_pd(device->pd); 2073 } 2074 } 2075 free(device); 2076 } 2077 2078 if (rtransport->data_wr_pool != NULL) { 2079 if (spdk_mempool_count(rtransport->data_wr_pool) != 2080 (transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES)) { 2081 SPDK_ERRLOG("transport wr pool count is %zu but should be %u\n", 2082 spdk_mempool_count(rtransport->data_wr_pool), 2083 transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES); 2084 } 2085 } 2086 2087 spdk_mempool_free(rtransport->data_wr_pool); 2088 pthread_mutex_destroy(&rtransport->lock); 2089 free(rtransport); 2090 2091 return 0; 2092 } 2093 2094 static int 2095 spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 2096 struct spdk_nvme_transport_id *trid, 2097 bool peer); 2098 2099 static int 2100 spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport, 2101 const struct spdk_nvme_transport_id *trid) 2102 { 2103 struct spdk_nvmf_rdma_transport *rtransport; 2104 struct spdk_nvmf_rdma_device *device; 2105 struct spdk_nvmf_rdma_port *port_tmp, *port; 2106 struct addrinfo *res; 2107 struct addrinfo hints; 2108 int family; 2109 int rc; 2110 2111 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2112 2113 port = calloc(1, sizeof(*port)); 2114 if (!port) { 2115 return -ENOMEM; 2116 } 2117 2118 /* Selectively copy the trid. Things like NQN don't matter here - that 2119 * mapping is enforced elsewhere. 
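 * Only trtype, adrfam, traddr and trsvcid are filled in below, so the
 * spdk_nvme_transport_id_compare() check against existing listeners effectively
 * matches on the transport address alone.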
2120 */ 2121 port->trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 2122 port->trid.adrfam = trid->adrfam; 2123 snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr); 2124 snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid); 2125 2126 pthread_mutex_lock(&rtransport->lock); 2127 assert(rtransport->event_channel != NULL); 2128 TAILQ_FOREACH(port_tmp, &rtransport->ports, link) { 2129 if (spdk_nvme_transport_id_compare(&port_tmp->trid, &port->trid) == 0) { 2130 port_tmp->ref++; 2131 free(port); 2132 /* Already listening at this address */ 2133 pthread_mutex_unlock(&rtransport->lock); 2134 return 0; 2135 } 2136 } 2137 2138 rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); 2139 if (rc < 0) { 2140 SPDK_ERRLOG("rdma_create_id() failed\n"); 2141 free(port); 2142 pthread_mutex_unlock(&rtransport->lock); 2143 return rc; 2144 } 2145 2146 switch (port->trid.adrfam) { 2147 case SPDK_NVMF_ADRFAM_IPV4: 2148 family = AF_INET; 2149 break; 2150 case SPDK_NVMF_ADRFAM_IPV6: 2151 family = AF_INET6; 2152 break; 2153 default: 2154 SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam); 2155 free(port); 2156 pthread_mutex_unlock(&rtransport->lock); 2157 return -EINVAL; 2158 } 2159 2160 memset(&hints, 0, sizeof(hints)); 2161 hints.ai_family = family; 2162 hints.ai_flags = AI_NUMERICSERV; 2163 hints.ai_socktype = SOCK_STREAM; 2164 hints.ai_protocol = 0; 2165 2166 rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res); 2167 if (rc) { 2168 SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); 2169 free(port); 2170 pthread_mutex_unlock(&rtransport->lock); 2171 return -EINVAL; 2172 } 2173 2174 rc = rdma_bind_addr(port->id, res->ai_addr); 2175 freeaddrinfo(res); 2176 2177 if (rc < 0) { 2178 SPDK_ERRLOG("rdma_bind_addr() failed\n"); 2179 rdma_destroy_id(port->id); 2180 free(port); 2181 pthread_mutex_unlock(&rtransport->lock); 2182 return rc; 2183 } 2184 2185 if (!port->id->verbs) { 2186 SPDK_ERRLOG("ibv_context is null\n"); 2187 rdma_destroy_id(port->id); 2188 free(port); 2189 pthread_mutex_unlock(&rtransport->lock); 2190 return -1; 2191 } 2192 2193 rc = rdma_listen(port->id, 10); /* 10 = backlog */ 2194 if (rc < 0) { 2195 SPDK_ERRLOG("rdma_listen() failed\n"); 2196 rdma_destroy_id(port->id); 2197 free(port); 2198 pthread_mutex_unlock(&rtransport->lock); 2199 return rc; 2200 } 2201 2202 TAILQ_FOREACH(device, &rtransport->devices, link) { 2203 if (device->context == port->id->verbs) { 2204 port->device = device; 2205 break; 2206 } 2207 } 2208 if (!port->device) { 2209 SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", 2210 port->id->verbs); 2211 rdma_destroy_id(port->id); 2212 free(port); 2213 pthread_mutex_unlock(&rtransport->lock); 2214 return -EINVAL; 2215 } 2216 2217 SPDK_INFOLOG(SPDK_LOG_RDMA, "*** NVMf Target Listening on %s port %d ***\n", 2218 port->trid.traddr, ntohs(rdma_get_src_port(port->id))); 2219 2220 port->ref = 1; 2221 2222 TAILQ_INSERT_TAIL(&rtransport->ports, port, link); 2223 pthread_mutex_unlock(&rtransport->lock); 2224 2225 return 0; 2226 } 2227 2228 static int 2229 spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, 2230 const struct spdk_nvme_transport_id *_trid) 2231 { 2232 struct spdk_nvmf_rdma_transport *rtransport; 2233 struct spdk_nvmf_rdma_port *port, *tmp; 2234 struct spdk_nvme_transport_id trid = {}; 2235 2236 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2237 2238 /* Selectively copy the trid. 
Things like NQN don't matter here - that 2239 * mapping is enforced elsewhere. 2240 */ 2241 trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 2242 trid.adrfam = _trid->adrfam; 2243 snprintf(trid.traddr, sizeof(port->trid.traddr), "%s", _trid->traddr); 2244 snprintf(trid.trsvcid, sizeof(port->trid.trsvcid), "%s", _trid->trsvcid); 2245 2246 pthread_mutex_lock(&rtransport->lock); 2247 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { 2248 if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) { 2249 assert(port->ref > 0); 2250 port->ref--; 2251 if (port->ref == 0) { 2252 TAILQ_REMOVE(&rtransport->ports, port, link); 2253 rdma_destroy_id(port->id); 2254 free(port); 2255 } 2256 break; 2257 } 2258 } 2259 2260 pthread_mutex_unlock(&rtransport->lock); 2261 return 0; 2262 } 2263 2264 static void 2265 spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, 2266 struct spdk_nvmf_rdma_qpair *rqpair, bool drain) 2267 { 2268 struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; 2269 struct spdk_nvmf_rdma_resources *resources; 2270 2271 /* We process I/O in the data transfer pending queue at the highest priority. RDMA reads first */ 2272 STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_read_queue, state_link, req_tmp) { 2273 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 2274 break; 2275 } 2276 } 2277 2278 /* Then RDMA writes since reads have stronger restrictions than writes */ 2279 STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_write_queue, state_link, req_tmp) { 2280 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 2281 break; 2282 } 2283 } 2284 2285 /* The second highest priority is I/O waiting on memory buffers. */ 2286 TAILQ_FOREACH_SAFE(rdma_req, &rqpair->poller->group->pending_data_buf_queue, link, 2287 req_tmp) { 2288 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 2289 break; 2290 } 2291 } 2292 2293 resources = rqpair->resources; 2294 while (!STAILQ_EMPTY(&resources->free_queue) && !STAILQ_EMPTY(&resources->incoming_queue)) { 2295 rdma_req = STAILQ_FIRST(&resources->free_queue); 2296 STAILQ_REMOVE_HEAD(&resources->free_queue, state_link); 2297 rdma_req->recv = STAILQ_FIRST(&resources->incoming_queue); 2298 STAILQ_REMOVE_HEAD(&resources->incoming_queue, link); 2299 2300 if (rqpair->srq != NULL) { 2301 rdma_req->req.qpair = &rdma_req->recv->qpair->qpair; 2302 rdma_req->recv->qpair->qd++; 2303 } else { 2304 rqpair->qd++; 2305 } 2306 2307 rdma_req->state = RDMA_REQUEST_STATE_NEW; 2308 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 2309 break; 2310 } 2311 } 2312 } 2313 2314 static void 2315 _nvmf_rdma_qpair_disconnect(void *ctx) 2316 { 2317 struct spdk_nvmf_qpair *qpair = ctx; 2318 2319 spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); 2320 } 2321 2322 static void 2323 _nvmf_rdma_try_disconnect(void *ctx) 2324 { 2325 struct spdk_nvmf_qpair *qpair = ctx; 2326 struct spdk_nvmf_poll_group *group; 2327 2328 /* Read the group out of the qpair. This is normally set and accessed only from 2329 * the thread that created the group. Here, we're not on that thread necessarily. 2330 * The data member qpair->group begins it's life as NULL and then is assigned to 2331 * a pointer and never changes. So fortunately reading this and checking for 2332 * non-NULL is thread safe in the x86_64 memory model. 
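 * If the group is still NULL, the code below re-posts this same function to the
 * current thread, effectively polling until the qpair has been attached to a poll group.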
*/ 2333 group = qpair->group; 2334 2335 if (group == NULL) { 2336 /* The qpair hasn't been assigned to a group yet, so we can't 2337 * process a disconnect. Send a message to ourself and try again. */ 2338 spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_try_disconnect, qpair); 2339 return; 2340 } 2341 2342 spdk_thread_send_msg(group->thread, _nvmf_rdma_qpair_disconnect, qpair); 2343 } 2344 2345 static inline void 2346 spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair) 2347 { 2348 if (__sync_bool_compare_and_swap(&rqpair->disconnect_started, false, true)) { 2349 _nvmf_rdma_try_disconnect(&rqpair->qpair); 2350 } 2351 } 2352 2353 static void nvmf_rdma_destroy_drained_qpair(void *ctx) 2354 { 2355 struct spdk_nvmf_rdma_qpair *rqpair = ctx; 2356 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, 2357 struct spdk_nvmf_rdma_transport, transport); 2358 2359 /* In non SRQ path, we will reach rqpair->max_queue_depth. In SRQ path, we will get the last_wqe event. */ 2360 if (rqpair->current_send_depth != 0) { 2361 return; 2362 } 2363 2364 if (rqpair->srq == NULL && rqpair->current_recv_depth != rqpair->max_queue_depth) { 2365 return; 2366 } 2367 2368 if (rqpair->srq != NULL && rqpair->last_wqe_reached == false) { 2369 return; 2370 } 2371 2372 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); 2373 spdk_nvmf_rdma_qpair_destroy(rqpair); 2374 } 2375 2376 2377 static int 2378 nvmf_rdma_disconnect(struct rdma_cm_event *evt) 2379 { 2380 struct spdk_nvmf_qpair *qpair; 2381 struct spdk_nvmf_rdma_qpair *rqpair; 2382 2383 if (evt->id == NULL) { 2384 SPDK_ERRLOG("disconnect request: missing cm_id\n"); 2385 return -1; 2386 } 2387 2388 qpair = evt->id->context; 2389 if (qpair == NULL) { 2390 SPDK_ERRLOG("disconnect request: no active connection\n"); 2391 return -1; 2392 } 2393 2394 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2395 2396 spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0); 2397 2398 spdk_nvmf_rdma_update_ibv_state(rqpair); 2399 2400 spdk_nvmf_rdma_start_disconnect(rqpair); 2401 2402 return 0; 2403 } 2404 2405 #ifdef DEBUG 2406 static const char *CM_EVENT_STR[] = { 2407 "RDMA_CM_EVENT_ADDR_RESOLVED", 2408 "RDMA_CM_EVENT_ADDR_ERROR", 2409 "RDMA_CM_EVENT_ROUTE_RESOLVED", 2410 "RDMA_CM_EVENT_ROUTE_ERROR", 2411 "RDMA_CM_EVENT_CONNECT_REQUEST", 2412 "RDMA_CM_EVENT_CONNECT_RESPONSE", 2413 "RDMA_CM_EVENT_CONNECT_ERROR", 2414 "RDMA_CM_EVENT_UNREACHABLE", 2415 "RDMA_CM_EVENT_REJECTED", 2416 "RDMA_CM_EVENT_ESTABLISHED", 2417 "RDMA_CM_EVENT_DISCONNECTED", 2418 "RDMA_CM_EVENT_DEVICE_REMOVAL", 2419 "RDMA_CM_EVENT_MULTICAST_JOIN", 2420 "RDMA_CM_EVENT_MULTICAST_ERROR", 2421 "RDMA_CM_EVENT_ADDR_CHANGE", 2422 "RDMA_CM_EVENT_TIMEWAIT_EXIT" 2423 }; 2424 #endif /* DEBUG */ 2425 2426 static void 2427 spdk_nvmf_process_cm_event(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 2428 { 2429 struct spdk_nvmf_rdma_transport *rtransport; 2430 struct rdma_cm_event *event; 2431 int rc; 2432 2433 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2434 2435 if (rtransport->event_channel == NULL) { 2436 return; 2437 } 2438 2439 while (1) { 2440 rc = rdma_get_cm_event(rtransport->event_channel, &event); 2441 if (rc == 0) { 2442 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); 2443 2444 spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event); 2445 2446 switch (event->event) { 2447 case RDMA_CM_EVENT_ADDR_RESOLVED: 2448 case 
RDMA_CM_EVENT_ADDR_ERROR: 2449 case RDMA_CM_EVENT_ROUTE_RESOLVED: 2450 case RDMA_CM_EVENT_ROUTE_ERROR: 2451 /* No action required. The target never attempts to resolve routes. */ 2452 break; 2453 case RDMA_CM_EVENT_CONNECT_REQUEST: 2454 rc = nvmf_rdma_connect(transport, event, cb_fn); 2455 if (rc < 0) { 2456 SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); 2457 break; 2458 } 2459 break; 2460 case RDMA_CM_EVENT_CONNECT_RESPONSE: 2461 /* The target never initiates a new connection. So this will not occur. */ 2462 break; 2463 case RDMA_CM_EVENT_CONNECT_ERROR: 2464 /* Can this happen? The docs say it can, but not sure what causes it. */ 2465 break; 2466 case RDMA_CM_EVENT_UNREACHABLE: 2467 case RDMA_CM_EVENT_REJECTED: 2468 /* These only occur on the client side. */ 2469 break; 2470 case RDMA_CM_EVENT_ESTABLISHED: 2471 /* TODO: Should we be waiting for this event anywhere? */ 2472 break; 2473 case RDMA_CM_EVENT_DISCONNECTED: 2474 case RDMA_CM_EVENT_DEVICE_REMOVAL: 2475 rc = nvmf_rdma_disconnect(event); 2476 if (rc < 0) { 2477 SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); 2478 break; 2479 } 2480 break; 2481 case RDMA_CM_EVENT_MULTICAST_JOIN: 2482 case RDMA_CM_EVENT_MULTICAST_ERROR: 2483 /* Multicast is not used */ 2484 break; 2485 case RDMA_CM_EVENT_ADDR_CHANGE: 2486 /* Not utilizing this event */ 2487 break; 2488 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 2489 /* For now, do nothing. The target never re-uses queue pairs. */ 2490 break; 2491 default: 2492 SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); 2493 break; 2494 } 2495 2496 rdma_ack_cm_event(event); 2497 } else { 2498 if (errno != EAGAIN && errno != EWOULDBLOCK) { 2499 SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); 2500 } 2501 break; 2502 } 2503 } 2504 } 2505 2506 static void 2507 spdk_nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) 2508 { 2509 int rc; 2510 struct spdk_nvmf_rdma_qpair *rqpair = NULL; 2511 struct ibv_async_event event; 2512 enum ibv_qp_state state; 2513 2514 rc = ibv_get_async_event(device->context, &event); 2515 2516 if (rc) { 2517 SPDK_ERRLOG("Failed to get async_event (%d): %s\n", 2518 errno, spdk_strerror(errno)); 2519 return; 2520 } 2521 2522 SPDK_NOTICELOG("Async event: %s\n", 2523 ibv_event_type_str(event.event_type)); 2524 2525 switch (event.event_type) { 2526 case IBV_EVENT_QP_FATAL: 2527 rqpair = event.element.qp->qp_context; 2528 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 2529 (uintptr_t)rqpair->cm_id, event.event_type); 2530 spdk_nvmf_rdma_update_ibv_state(rqpair); 2531 spdk_nvmf_rdma_start_disconnect(rqpair); 2532 break; 2533 case IBV_EVENT_QP_LAST_WQE_REACHED: 2534 /* This event only occurs for shared receive queues. */ 2535 rqpair = event.element.qp->qp_context; 2536 rqpair->last_wqe_reached = true; 2537 2538 /* This must be handled on the polling thread if it exists. Otherwise the timeout will catch it. */ 2539 if (rqpair->qpair.group) { 2540 spdk_thread_send_msg(rqpair->qpair.group->thread, nvmf_rdma_destroy_drained_qpair, rqpair); 2541 } else { 2542 SPDK_ERRLOG("Unable to destroy the qpair %p since it does not have a poll group.\n", rqpair); 2543 } 2544 2545 break; 2546 case IBV_EVENT_SQ_DRAINED: 2547 /* This event occurs frequently in both error and non-error states. 2548 * Check if the qpair is in an error state before sending a message. 2549 * Note that we're not on the correct thread to access the qpair, but 2550 * the operations that the below calls make all happen to be thread 2551 * safe. 
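 * Only a transition into IBV_QPS_ERR triggers a disconnect below; SQ_DRAINED events
 * observed while the queue pair is healthy are deliberately ignored.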
*/ 2552 rqpair = event.element.qp->qp_context; 2553 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 2554 (uintptr_t)rqpair->cm_id, event.event_type); 2555 state = spdk_nvmf_rdma_update_ibv_state(rqpair); 2556 if (state == IBV_QPS_ERR) { 2557 spdk_nvmf_rdma_start_disconnect(rqpair); 2558 } 2559 break; 2560 case IBV_EVENT_QP_REQ_ERR: 2561 case IBV_EVENT_QP_ACCESS_ERR: 2562 case IBV_EVENT_COMM_EST: 2563 case IBV_EVENT_PATH_MIG: 2564 case IBV_EVENT_PATH_MIG_ERR: 2565 rqpair = event.element.qp->qp_context; 2566 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 2567 (uintptr_t)rqpair->cm_id, event.event_type); 2568 spdk_nvmf_rdma_update_ibv_state(rqpair); 2569 break; 2570 case IBV_EVENT_CQ_ERR: 2571 case IBV_EVENT_DEVICE_FATAL: 2572 case IBV_EVENT_PORT_ACTIVE: 2573 case IBV_EVENT_PORT_ERR: 2574 case IBV_EVENT_LID_CHANGE: 2575 case IBV_EVENT_PKEY_CHANGE: 2576 case IBV_EVENT_SM_CHANGE: 2577 case IBV_EVENT_SRQ_ERR: 2578 case IBV_EVENT_SRQ_LIMIT_REACHED: 2579 case IBV_EVENT_CLIENT_REREGISTER: 2580 case IBV_EVENT_GID_CHANGE: 2581 default: 2582 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type); 2583 break; 2584 } 2585 ibv_ack_async_event(&event); 2586 } 2587 2588 static void 2589 spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 2590 { 2591 int nfds, i = 0; 2592 struct spdk_nvmf_rdma_transport *rtransport; 2593 struct spdk_nvmf_rdma_device *device, *tmp; 2594 2595 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2596 nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); 2597 2598 if (nfds <= 0) { 2599 return; 2600 } 2601 2602 /* The first poll descriptor is RDMA CM event */ 2603 if (rtransport->poll_fds[i++].revents & POLLIN) { 2604 spdk_nvmf_process_cm_event(transport, cb_fn); 2605 nfds--; 2606 } 2607 2608 if (nfds == 0) { 2609 return; 2610 } 2611 2612 /* Second and subsequent poll descriptors are IB async events */ 2613 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 2614 if (rtransport->poll_fds[i++].revents & POLLIN) { 2615 spdk_nvmf_process_ib_event(device); 2616 nfds--; 2617 } 2618 } 2619 /* check all flagged fd's have been served */ 2620 assert(nfds == 0); 2621 } 2622 2623 static void 2624 spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport, 2625 struct spdk_nvme_transport_id *trid, 2626 struct spdk_nvmf_discovery_log_page_entry *entry) 2627 { 2628 entry->trtype = SPDK_NVMF_TRTYPE_RDMA; 2629 entry->adrfam = trid->adrfam; 2630 entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED; 2631 2632 spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); 2633 spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); 2634 2635 entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; 2636 entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; 2637 entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; 2638 } 2639 2640 static void 2641 spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group); 2642 2643 static struct spdk_nvmf_transport_poll_group * 2644 spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) 2645 { 2646 struct spdk_nvmf_rdma_transport *rtransport; 2647 struct spdk_nvmf_rdma_poll_group *rgroup; 2648 struct spdk_nvmf_rdma_poller *poller; 2649 struct spdk_nvmf_rdma_device *device; 2650 struct ibv_srq_init_attr srq_init_attr; 2651 struct spdk_nvmf_rdma_resource_opts opts; 2652 int num_cqe; 2653 2654 rtransport = SPDK_CONTAINEROF(transport, struct 
spdk_nvmf_rdma_transport, transport); 2655 2656 rgroup = calloc(1, sizeof(*rgroup)); 2657 if (!rgroup) { 2658 return NULL; 2659 } 2660 2661 TAILQ_INIT(&rgroup->pollers); 2662 TAILQ_INIT(&rgroup->pending_data_buf_queue); 2663 2664 pthread_mutex_lock(&rtransport->lock); 2665 TAILQ_FOREACH(device, &rtransport->devices, link) { 2666 poller = calloc(1, sizeof(*poller)); 2667 if (!poller) { 2668 SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); 2669 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 2670 pthread_mutex_unlock(&rtransport->lock); 2671 return NULL; 2672 } 2673 2674 poller->device = device; 2675 poller->group = rgroup; 2676 2677 TAILQ_INIT(&poller->qpairs); 2678 2679 TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); 2680 if (device->attr.max_srq != 0) { 2681 poller->max_srq_depth = transport->opts.max_srq_depth; 2682 2683 memset(&srq_init_attr, 0, sizeof(struct ibv_srq_init_attr)); 2684 srq_init_attr.attr.max_wr = poller->max_srq_depth; 2685 srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE); 2686 poller->srq = ibv_create_srq(device->pd, &srq_init_attr); 2687 if (!poller->srq) { 2688 SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno); 2689 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 2690 pthread_mutex_unlock(&rtransport->lock); 2691 return NULL; 2692 } 2693 2694 opts.qp = poller->srq; 2695 opts.pd = device->pd; 2696 opts.qpair = NULL; 2697 opts.shared = true; 2698 opts.max_queue_depth = poller->max_srq_depth; 2699 opts.in_capsule_data_size = transport->opts.in_capsule_data_size; 2700 2701 poller->resources = nvmf_rdma_resources_create(&opts); 2702 if (!poller->resources) { 2703 SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n"); 2704 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 2705 pthread_mutex_unlock(&rtransport->lock); return NULL; 2706 } 2707 } 2708 2709 /* 2710 * When using an srq, we can limit the completion queue size at startup. 2711 * The following formula represents the calculation: 2712 * num_cqe = num_recv + num_data_wr + num_send_wr.
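 * Each of num_recv, num_data_wr and num_send_wr is bounded by poller->max_srq_depth,
 * which is why the SRQ branch below sizes the CQ at 3 * max_srq_depth.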
2713 * where num_recv=num_data_wr=and num_send_wr=poller->max_srq_depth 2714 */ 2715 if (poller->srq) { 2716 num_cqe = poller->max_srq_depth * 3; 2717 } else { 2718 num_cqe = DEFAULT_NVMF_RDMA_CQ_SIZE; 2719 } 2720 2721 poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0); 2722 if (!poller->cq) { 2723 SPDK_ERRLOG("Unable to create completion queue\n"); 2724 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 2725 pthread_mutex_unlock(&rtransport->lock); 2726 return NULL; 2727 } 2728 poller->num_cqe = num_cqe; 2729 } 2730 2731 pthread_mutex_unlock(&rtransport->lock); 2732 return &rgroup->group; 2733 } 2734 2735 static void 2736 spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 2737 { 2738 struct spdk_nvmf_rdma_poll_group *rgroup; 2739 struct spdk_nvmf_rdma_poller *poller, *tmp; 2740 struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; 2741 2742 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 2743 2744 if (!rgroup) { 2745 return; 2746 } 2747 2748 TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { 2749 TAILQ_REMOVE(&rgroup->pollers, poller, link); 2750 2751 TAILQ_FOREACH_SAFE(qpair, &poller->qpairs, link, tmp_qpair) { 2752 spdk_nvmf_rdma_qpair_destroy(qpair); 2753 } 2754 2755 if (poller->srq) { 2756 nvmf_rdma_resources_destroy(poller->resources); 2757 ibv_destroy_srq(poller->srq); 2758 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Destroyed RDMA shared queue %p\n", poller->srq); 2759 } 2760 2761 if (poller->cq) { 2762 ibv_destroy_cq(poller->cq); 2763 } 2764 2765 free(poller); 2766 } 2767 2768 if (!TAILQ_EMPTY(&rgroup->pending_data_buf_queue)) { 2769 SPDK_ERRLOG("Pending I/O list wasn't empty on poll group destruction\n"); 2770 } 2771 2772 free(rgroup); 2773 } 2774 2775 static void 2776 spdk_nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair) 2777 { 2778 spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 2779 spdk_nvmf_rdma_qpair_destroy(rqpair); 2780 } 2781 2782 static int 2783 spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 2784 struct spdk_nvmf_qpair *qpair) 2785 { 2786 struct spdk_nvmf_rdma_poll_group *rgroup; 2787 struct spdk_nvmf_rdma_qpair *rqpair; 2788 struct spdk_nvmf_rdma_device *device; 2789 struct spdk_nvmf_rdma_poller *poller; 2790 int rc; 2791 2792 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 2793 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2794 2795 device = rqpair->port->device; 2796 2797 TAILQ_FOREACH(poller, &rgroup->pollers, link) { 2798 if (poller->device == device) { 2799 break; 2800 } 2801 } 2802 2803 if (!poller) { 2804 SPDK_ERRLOG("No poller found for device.\n"); 2805 return -1; 2806 } 2807 2808 TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); 2809 rqpair->poller = poller; 2810 rqpair->srq = rqpair->poller->srq; 2811 2812 rc = spdk_nvmf_rdma_qpair_initialize(qpair); 2813 if (rc < 0) { 2814 SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); 2815 return -1; 2816 } 2817 2818 rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair); 2819 if (rc) { 2820 /* Try to reject, but we probably can't */ 2821 spdk_nvmf_rdma_qpair_reject_connection(rqpair); 2822 return -1; 2823 } 2824 2825 spdk_nvmf_rdma_update_ibv_state(rqpair); 2826 2827 return 0; 2828 } 2829 2830 static int 2831 spdk_nvmf_rdma_request_free(struct spdk_nvmf_request *req) 2832 { 2833 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 2834 struct 
spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 2835 struct spdk_nvmf_rdma_transport, transport); 2836 2837 nvmf_rdma_request_free(rdma_req, rtransport); 2838 return 0; 2839 } 2840 2841 static int 2842 spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req) 2843 { 2844 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 2845 struct spdk_nvmf_rdma_transport, transport); 2846 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, 2847 struct spdk_nvmf_rdma_request, req); 2848 struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, 2849 struct spdk_nvmf_rdma_qpair, qpair); 2850 2851 if (rqpair->ibv_attr.qp_state != IBV_QPS_ERR) { 2852 /* The connection is alive, so process the request as normal */ 2853 rdma_req->state = RDMA_REQUEST_STATE_EXECUTED; 2854 } else { 2855 /* The connection is dead. Move the request directly to the completed state. */ 2856 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2857 } 2858 2859 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2860 2861 return 0; 2862 } 2863 2864 static int 2865 spdk_nvmf_rdma_destroy_defunct_qpair(void *ctx) 2866 { 2867 struct spdk_nvmf_rdma_qpair *rqpair = ctx; 2868 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, 2869 struct spdk_nvmf_rdma_transport, transport); 2870 2871 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); 2872 spdk_nvmf_rdma_qpair_destroy(rqpair); 2873 2874 return 0; 2875 } 2876 2877 static void 2878 spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) 2879 { 2880 struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2881 2882 if (rqpair->disconnect_flags & RDMA_QP_DISCONNECTING) { 2883 return; 2884 } 2885 2886 rqpair->disconnect_flags |= RDMA_QP_DISCONNECTING; 2887 2888 /* This happens only when the qpair is disconnected before 2889 * it is added to the poll group. Since there is no poll group, 2890 * the RDMA qp has not been initialized yet and the RDMA CM 2891 * event has not yet been acknowledged, so we need to reject it. 
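 * Rejecting here (spdk_nvmf_rdma_qpair_reject_connection()) both answers the outstanding
 * CM connect request and destroys the half-constructed qpair, so this path returns without
 * arming the destruct poller used below.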
2892 */ 2893 if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) { 2894 spdk_nvmf_rdma_qpair_reject_connection(rqpair); 2895 return; 2896 } 2897 2898 if (rqpair->ibv_attr.qp_state != IBV_QPS_ERR) { 2899 spdk_nvmf_rdma_set_ibv_state(rqpair, IBV_QPS_ERR); 2900 } 2901 2902 rqpair->destruct_poller = spdk_poller_register(spdk_nvmf_rdma_destroy_defunct_qpair, (void *)rqpair, 2903 NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US); 2904 } 2905 2906 static struct spdk_nvmf_rdma_qpair * 2907 get_rdma_qpair_from_wc(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_wc *wc) 2908 { 2909 struct spdk_nvmf_rdma_qpair *rqpair; 2910 /* @todo: improve QP search */ 2911 TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) { 2912 if (wc->qp_num == rqpair->cm_id->qp->qp_num) { 2913 return rqpair; 2914 } 2915 } 2916 SPDK_ERRLOG("Didn't find QP with qp_num %u\n", wc->qp_num); 2917 return NULL; 2918 } 2919 2920 #ifdef DEBUG 2921 static int 2922 spdk_nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req) 2923 { 2924 return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST || 2925 rdma_req->state == RDMA_REQUEST_STATE_COMPLETING; 2926 } 2927 #endif 2928 2929 static int 2930 spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, 2931 struct spdk_nvmf_rdma_poller *rpoller) 2932 { 2933 struct ibv_wc wc[32]; 2934 struct spdk_nvmf_rdma_wr *rdma_wr; 2935 struct spdk_nvmf_rdma_request *rdma_req; 2936 struct spdk_nvmf_rdma_recv *rdma_recv; 2937 struct spdk_nvmf_rdma_qpair *rqpair; 2938 int reaped, i; 2939 int count = 0; 2940 bool error = false; 2941 2942 /* Poll for completing operations. */ 2943 reaped = ibv_poll_cq(rpoller->cq, 32, wc); 2944 if (reaped < 0) { 2945 SPDK_ERRLOG("Error polling CQ! (%d): %s\n", 2946 errno, spdk_strerror(errno)); 2947 return -1; 2948 } 2949 2950 for (i = 0; i < reaped; i++) { 2951 2952 rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id; 2953 2954 /* Handle error conditions */ 2955 if (wc[i].status) { 2956 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "CQ error on CQ %p, Request 0x%lu (%d): %s\n", 2957 rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); 2958 2959 error = true; 2960 2961 switch (rdma_wr->type) { 2962 case RDMA_WR_TYPE_SEND: 2963 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); 2964 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2965 2966 SPDK_ERRLOG("data=%p length=%u\n", rdma_req->req.data, rdma_req->req.length); 2967 /* We're going to attempt an error recovery, so force the request into 2968 * the completed state. */ 2969 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2970 rqpair->current_send_depth--; 2971 2972 assert(rdma_req->num_outstanding_data_wr == 0); 2973 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2974 break; 2975 case RDMA_WR_TYPE_RECV: 2976 /* rdma_recv->qpair will be NULL if using an SRQ. In that case we have to get the qpair from the wc. */ 2977 rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 2978 if (rdma_recv->qpair == NULL) { 2979 rdma_recv->qpair = get_rdma_qpair_from_wc(rpoller, &wc[i]); 2980 } 2981 rqpair = rdma_recv->qpair; 2982 2983 assert(rqpair != NULL); 2984 2985 /* Dump this into the incoming queue. This gets cleaned up when 2986 * the queue pair disconnects or recovers. 
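 * Re-queueing the failed RECV (and bumping current_recv_depth) keeps the receive
 * accounting consistent until the drained qpair is destroyed.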
*/ 2987 STAILQ_INSERT_TAIL(&rqpair->resources->incoming_queue, rdma_recv, link); 2988 rqpair->current_recv_depth++; 2989 break; 2990 case RDMA_WR_TYPE_DATA: 2991 /* If the data transfer fails still force the queue into the error state, 2992 * if we were performing an RDMA_READ, we need to force the request into a 2993 * completed state since it wasn't linked to a send. However, in the RDMA_WRITE 2994 * case, we should wait for the SEND to complete. */ 2995 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 2996 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2997 2998 SPDK_ERRLOG("data=%p length=%u\n", rdma_req->req.data, rdma_req->req.length); 2999 assert(rdma_req->num_outstanding_data_wr > 0); 3000 rdma_req->num_outstanding_data_wr--; 3001 if (rdma_req->data.wr.opcode == IBV_WR_RDMA_READ) { 3002 rqpair->current_read_depth--; 3003 if (rdma_req->num_outstanding_data_wr == 0) { 3004 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 3005 } 3006 } 3007 rqpair->current_send_depth--; 3008 break; 3009 default: 3010 SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); 3011 continue; 3012 } 3013 3014 if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { 3015 /* Disconnect the connection. */ 3016 spdk_nvmf_rdma_start_disconnect(rqpair); 3017 } else { 3018 nvmf_rdma_destroy_drained_qpair(rqpair); 3019 } 3020 continue; 3021 } 3022 3023 switch (wc[i].opcode) { 3024 case IBV_WC_SEND: 3025 assert(rdma_wr->type == RDMA_WR_TYPE_SEND); 3026 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); 3027 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 3028 3029 assert(spdk_nvmf_rdma_req_is_completing(rdma_req)); 3030 3031 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 3032 rqpair->current_send_depth--; 3033 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 3034 3035 count++; 3036 3037 assert(rdma_req->num_outstanding_data_wr == 0); 3038 /* Try to process other queued requests */ 3039 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 3040 break; 3041 3042 case IBV_WC_RDMA_WRITE: 3043 assert(rdma_wr->type == RDMA_WR_TYPE_DATA); 3044 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 3045 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 3046 rqpair->current_send_depth--; 3047 rdma_req->num_outstanding_data_wr--; 3048 3049 /* Try to process other queued requests */ 3050 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 3051 break; 3052 3053 case IBV_WC_RDMA_READ: 3054 assert(rdma_wr->type == RDMA_WR_TYPE_DATA); 3055 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 3056 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 3057 rqpair->current_send_depth--; 3058 3059 assert(rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); 3060 /* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. 
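 * A single host-to-controller transfer may have been split into several RDMA READ work
 * requests (tracked in num_outstanding_data_wr); only when the last of them completes
 * does the request advance to READY_TO_EXECUTE.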
*/ 3061 assert(rdma_req->num_outstanding_data_wr > 0); 3062 rqpair->current_read_depth--; 3063 rdma_req->num_outstanding_data_wr--; 3064 if (rdma_req->num_outstanding_data_wr == 0) { 3065 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 3066 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 3067 } 3068 3069 /* Try to process other queued requests */ 3070 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 3071 break; 3072 3073 case IBV_WC_RECV: 3074 assert(rdma_wr->type == RDMA_WR_TYPE_RECV); 3075 /* rdma_recv->qpair will be NULL if using an SRQ. In that case we have to get the qpair from the wc. */ 3076 rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 3077 if (rdma_recv->qpair == NULL) { 3078 rdma_recv->qpair = get_rdma_qpair_from_wc(rpoller, &wc[i]); 3079 } 3080 rqpair = rdma_recv->qpair; 3081 /* The qpair should not send more requests than are allowed per qpair. */ 3082 if (rqpair->current_recv_depth >= rqpair->max_queue_depth) { 3083 spdk_nvmf_rdma_start_disconnect(rqpair); 3084 } else { 3085 rqpair->current_recv_depth++; 3086 } 3087 3088 STAILQ_INSERT_TAIL(&rqpair->resources->incoming_queue, rdma_recv, link); 3089 /* Try to process other queued requests */ 3090 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 3091 break; 3092 3093 default: 3094 SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); 3095 continue; 3096 } 3097 3098 if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { 3099 nvmf_rdma_destroy_drained_qpair(rqpair); 3100 } 3101 } 3102 3103 if (error == true) { 3104 return -1; 3105 } 3106 3107 return count; 3108 } 3109 3110 static int 3111 spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 3112 { 3113 struct spdk_nvmf_rdma_transport *rtransport; 3114 struct spdk_nvmf_rdma_poll_group *rgroup; 3115 struct spdk_nvmf_rdma_poller *rpoller; 3116 int count, rc; 3117 3118 rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); 3119 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 3120 3121 count = 0; 3122 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 3123 rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller); 3124 if (rc < 0) { 3125 return rc; 3126 } 3127 count += rc; 3128 } 3129 3130 return count; 3131 } 3132 3133 static int 3134 spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 3135 struct spdk_nvme_transport_id *trid, 3136 bool peer) 3137 { 3138 struct sockaddr *saddr; 3139 uint16_t port; 3140 3141 trid->trtype = SPDK_NVME_TRANSPORT_RDMA; 3142 3143 if (peer) { 3144 saddr = rdma_get_peer_addr(id); 3145 } else { 3146 saddr = rdma_get_local_addr(id); 3147 } 3148 switch (saddr->sa_family) { 3149 case AF_INET: { 3150 struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr; 3151 3152 trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; 3153 inet_ntop(AF_INET, &saddr_in->sin_addr, 3154 trid->traddr, sizeof(trid->traddr)); 3155 if (peer) { 3156 port = ntohs(rdma_get_dst_port(id)); 3157 } else { 3158 port = ntohs(rdma_get_src_port(id)); 3159 } 3160 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 3161 break; 3162 } 3163 case AF_INET6: { 3164 struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr; 3165 trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; 3166 inet_ntop(AF_INET6, &saddr_in->sin6_addr, 3167 trid->traddr, sizeof(trid->traddr)); 3168 if (peer) { 3169 port = ntohs(rdma_get_dst_port(id)); 3170 } else { 3171 port = ntohs(rdma_get_src_port(id)); 3172 } 3173 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", 
port); 3174 break; 3175 } 3176 default: 3177 return -1; 3178 3179 } 3180 3181 return 0; 3182 } 3183 3184 static int 3185 spdk_nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 3186 struct spdk_nvme_transport_id *trid) 3187 { 3188 struct spdk_nvmf_rdma_qpair *rqpair; 3189 3190 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3191 3192 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true); 3193 } 3194 3195 static int 3196 spdk_nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 3197 struct spdk_nvme_transport_id *trid) 3198 { 3199 struct spdk_nvmf_rdma_qpair *rqpair; 3200 3201 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3202 3203 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false); 3204 } 3205 3206 static int 3207 spdk_nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 3208 struct spdk_nvme_transport_id *trid) 3209 { 3210 struct spdk_nvmf_rdma_qpair *rqpair; 3211 3212 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3213 3214 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false); 3215 } 3216 3217 void 3218 spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) 3219 { 3220 g_nvmf_hooks = *hooks; 3221 } 3222 3223 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { 3224 .type = SPDK_NVME_TRANSPORT_RDMA, 3225 .opts_init = spdk_nvmf_rdma_opts_init, 3226 .create = spdk_nvmf_rdma_create, 3227 .destroy = spdk_nvmf_rdma_destroy, 3228 3229 .listen = spdk_nvmf_rdma_listen, 3230 .stop_listen = spdk_nvmf_rdma_stop_listen, 3231 .accept = spdk_nvmf_rdma_accept, 3232 3233 .listener_discover = spdk_nvmf_rdma_discover, 3234 3235 .poll_group_create = spdk_nvmf_rdma_poll_group_create, 3236 .poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy, 3237 .poll_group_add = spdk_nvmf_rdma_poll_group_add, 3238 .poll_group_poll = spdk_nvmf_rdma_poll_group_poll, 3239 3240 .req_free = spdk_nvmf_rdma_request_free, 3241 .req_complete = spdk_nvmf_rdma_request_complete, 3242 3243 .qpair_fini = spdk_nvmf_rdma_close_qpair, 3244 .qpair_get_peer_trid = spdk_nvmf_rdma_qpair_get_peer_trid, 3245 .qpair_get_local_trid = spdk_nvmf_rdma_qpair_get_local_trid, 3246 .qpair_get_listen_trid = spdk_nvmf_rdma_qpair_get_listen_trid, 3247 3248 }; 3249 3250 SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA) 3251
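/*
 * Usage sketch (illustrative only, not part of the transport): an application can supply
 * its own protection domains through the hook interface consumed above. The callback
 * prototype is inferred from the call site g_nvmf_hooks.get_ibv_pd(NULL, device->context)
 * in spdk_nvmf_rdma_create(); my_get_ibv_pd() and my_lookup_cached_pd() are hypothetical names.
 *
 *   static struct ibv_pd *
 *   my_get_ibv_pd(const struct spdk_nvme_transport_id *trid, struct ibv_context *verbs)
 *   {
 *           // Return a PD the application already created for this ibv_context.
 *           return my_lookup_cached_pd(verbs);
 *   }
 *
 *   static struct spdk_nvme_rdma_hooks g_app_hooks = {
 *           .get_ibv_pd = my_get_ibv_pd,
 *   };
 *
 *   // Must be called before the RDMA transport is created.
 *   spdk_nvmf_rdma_init_hooks(&g_app_hooks);
 */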