/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2018 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

#include "nvmf_internal.h"
#include "transport.h"

#include "spdk/config.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf.h"
#include "spdk/nvmf_spec.h"
#include "spdk/string.h"
#include "spdk/trace.h"
#include "spdk/util.h"

#include "spdk_internal/log.h"

struct spdk_nvme_rdma_hooks g_nvmf_hooks = {};

/*
 RDMA Connection Resource Defaults
 */
#define NVMF_DEFAULT_TX_SGE	SPDK_NVMF_MAX_SGL_ENTRIES
#define NVMF_DEFAULT_RSP_SGE	1
#define NVMF_DEFAULT_RX_SGE	2

/* The RDMA completion queue size */
#define DEFAULT_NVMF_RDMA_CQ_SIZE	4096

/* Worst-case number of WRs a single qpair can have outstanding on the shared CQ:
 * (2 * queue_depth + 1) send WRs plus (queue_depth + 1) recv WRs, matching the
 * capacities requested in spdk_nvmf_rdma_qpair_initialize().
 */
#define MAX_WR_PER_QP(queue_depth)	(queue_depth * 3 + 2)

enum spdk_nvmf_rdma_request_state {
	/* The request is not currently in use */
	RDMA_REQUEST_STATE_FREE = 0,

	/* Initial state when request first received */
	RDMA_REQUEST_STATE_NEW,

	/* The request is queued until a data buffer is available. */
	RDMA_REQUEST_STATE_NEED_BUFFER,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the host to the controller.
	 */
	RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,

	/* The request is currently transferring data from the host to the controller. */
	RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,

	/* The request is ready to execute at the block device */
	RDMA_REQUEST_STATE_READY_TO_EXECUTE,

	/* The request is currently executing at the block device */
	RDMA_REQUEST_STATE_EXECUTING,

	/* The request finished executing at the block device */
	RDMA_REQUEST_STATE_EXECUTED,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the controller to the host.
	 */
	RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,

	/* The request is ready to send a completion */
	RDMA_REQUEST_STATE_READY_TO_COMPLETE,

	/* The request is currently transferring data from the controller to the host. */
	RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,

	/* The request currently has an outstanding completion without an
	 * associated data transfer.
	 */
	RDMA_REQUEST_STATE_COMPLETING,

	/* The request completed and can be marked free. */
	RDMA_REQUEST_STATE_COMPLETED,

	/* Terminator */
	RDMA_REQUEST_NUM_STATES,
};

#define OBJECT_NVMF_RDMA_IO	0x40

#define TRACE_GROUP_NVMF_RDMA	0x4
#define TRACE_RDMA_REQUEST_STATE_NEW	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0)
#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1)
#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4)
#define TRACE_RDMA_REQUEST_STATE_EXECUTING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5)
#define TRACE_RDMA_REQUEST_STATE_EXECUTED	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6)
#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9)
#define TRACE_RDMA_REQUEST_STATE_COMPLETING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA)
#define TRACE_RDMA_REQUEST_STATE_COMPLETED	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xB)
#define TRACE_RDMA_QP_CREATE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xC)
#define TRACE_RDMA_IBV_ASYNC_EVENT	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xD)
#define TRACE_RDMA_CM_ASYNC_EVENT	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xE)
#define TRACE_RDMA_QP_STATE_CHANGE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xF)
#define TRACE_RDMA_QP_DISCONNECT	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x10)
#define TRACE_RDMA_QP_DESTROY	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x11)

SPDK_TRACE_REGISTER_FN(nvmf_trace, "nvmf_rdma", TRACE_GROUP_NVMF_RDMA)
{
	spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r');
	spdk_trace_register_description("RDMA_REQ_NEW", "",
					TRACE_RDMA_REQUEST_STATE_NEW,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", "",
					TRACE_RDMA_REQUEST_STATE_NEED_BUFFER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_C_TO_H", "",
					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_H_TO_C", "",
					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_H_TO_C", "",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE", "",
					TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_EXECUTING", "",
					TRACE_RDMA_REQUEST_STATE_EXECUTING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_EXECUTED", "",
					TRACE_RDMA_REQUEST_STATE_EXECUTED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPLETE", "",
					TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETING_CONTROLLER_TO_HOST", "",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETING_INCAPSULE", "",
					TRACE_RDMA_REQUEST_STATE_COMPLETING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETED", "",
					TRACE_RDMA_REQUEST_STATE_COMPLETED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");

	spdk_trace_register_description("RDMA_QP_CREATE", "", TRACE_RDMA_QP_CREATE,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
	spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", "", TRACE_RDMA_IBV_ASYNC_EVENT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
	spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", "", TRACE_RDMA_CM_ASYNC_EVENT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
	spdk_trace_register_description("RDMA_QP_STATE_CHANGE", "", TRACE_RDMA_QP_STATE_CHANGE,
					OWNER_NONE, OBJECT_NONE, 0, 1, "state: ");
	spdk_trace_register_description("RDMA_QP_DISCONNECT", "", TRACE_RDMA_QP_DISCONNECT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
	spdk_trace_register_description("RDMA_QP_DESTROY", "", TRACE_RDMA_QP_DESTROY,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
}

enum spdk_nvmf_rdma_wr_type {
	RDMA_WR_TYPE_RECV,
	RDMA_WR_TYPE_SEND,
	RDMA_WR_TYPE_DATA,
	RDMA_WR_TYPE_DRAIN_SEND,
	RDMA_WR_TYPE_DRAIN_RECV
};

struct spdk_nvmf_rdma_wr {
	enum spdk_nvmf_rdma_wr_type	type;
};

/* This structure holds commands as they are received off the wire.
 * It must be dynamically paired with a full request object
 * (spdk_nvmf_rdma_request) to service a request. It is separate
 * from the request because RDMA does not appear to order
 * completions, so occasionally we'll get a new incoming
 * command when there aren't any free request objects.
 */
struct spdk_nvmf_rdma_recv {
	struct ibv_recv_wr		wr;
	struct ibv_sge			sgl[NVMF_DEFAULT_RX_SGE];

	struct spdk_nvmf_rdma_qpair	*qpair;

	/* In-capsule data buffer */
	uint8_t				*buf;

	struct spdk_nvmf_rdma_wr	rdma_wr;

	TAILQ_ENTRY(spdk_nvmf_rdma_recv) link;
};

struct spdk_nvmf_rdma_request_data {
	struct spdk_nvmf_rdma_wr	rdma_wr;
	struct ibv_send_wr		wr;
	struct ibv_sge			sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
	void				*buffers[SPDK_NVMF_MAX_SGL_ENTRIES];
};

struct spdk_nvmf_rdma_request {
	struct spdk_nvmf_request	req;
	bool				data_from_pool;

	enum spdk_nvmf_rdma_request_state	state;

	struct spdk_nvmf_rdma_recv	*recv;

	struct {
		struct spdk_nvmf_rdma_wr	rdma_wr;
		struct ibv_send_wr		wr;
		struct ibv_sge			sgl[NVMF_DEFAULT_RSP_SGE];
	} rsp;

	struct spdk_nvmf_rdma_request_data	data;

	uint32_t			num_outstanding_data_wr;

	TAILQ_ENTRY(spdk_nvmf_rdma_request)	link;
	TAILQ_ENTRY(spdk_nvmf_rdma_request)	state_link;
};

enum spdk_nvmf_rdma_qpair_disconnect_flags {
	RDMA_QP_DISCONNECTING	= 1,
	RDMA_QP_RECV_DRAINED	= 1 << 1,
	RDMA_QP_SEND_DRAINED	= 1 << 2
};

struct spdk_nvmf_rdma_qpair {
	struct spdk_nvmf_qpair		qpair;

	struct spdk_nvmf_rdma_port	*port;
	struct spdk_nvmf_rdma_poller	*poller;

	struct rdma_cm_id		*cm_id;
	struct rdma_cm_id		*listen_id;

	/* The maximum number of I/O outstanding on this connection at one time */
	uint16_t			max_queue_depth;

	/* The maximum number of active RDMA READ and ATOMIC operations at one time */
	uint16_t			max_read_depth;

	/* The maximum number of RDMA SEND operations at one time */
	uint32_t			max_send_depth;

	/* The current number of outstanding WRs from this qpair's
	 * recv queue. Should not exceed device->attr.max_queue_depth.
	 */
	uint16_t			current_recv_depth;

	/* The current number of posted WRs from this qpair's
	 * send queue. Should not exceed max_send_depth.
	 */
	uint32_t			current_send_depth;

	/* The current number of active RDMA READ operations */
	uint16_t			current_read_depth;

	/* The maximum number of SGEs per WR on the send queue */
	uint32_t			max_send_sge;

	/* The maximum number of SGEs per WR on the recv queue */
	uint32_t			max_recv_sge;

	/* Receives that are waiting for a request object */
	TAILQ_HEAD(, spdk_nvmf_rdma_recv)	incoming_queue;

	/* Queues to track the requests in all states */
	TAILQ_HEAD(, spdk_nvmf_rdma_request)	state_queue[RDMA_REQUEST_NUM_STATES];

	/* Number of requests in each state */
	uint32_t			state_cntr[RDMA_REQUEST_NUM_STATES];

	/* Array of size "max_queue_depth" containing RDMA requests. */
	struct spdk_nvmf_rdma_request	*reqs;

	/* Array of size "max_queue_depth" containing RDMA recvs. */
	struct spdk_nvmf_rdma_recv	*recvs;

	/* Array of size "max_queue_depth" containing 64 byte capsules
	 * used for receive.
	 */
	union nvmf_h2c_msg		*cmds;
	struct ibv_mr			*cmds_mr;

	/* Array of size "max_queue_depth" containing 16 byte completions
	 * to be sent back to the user.
	 */
	union nvmf_c2h_msg		*cpls;
	struct ibv_mr			*cpls_mr;

	/* Array of size "max_queue_depth * InCapsuleDataSize" containing
	 * buffers to be used for in capsule data.
	 */
	void				*bufs;
	struct ibv_mr			*bufs_mr;

	TAILQ_ENTRY(spdk_nvmf_rdma_qpair)	link;

	/* IBV queue pair attributes: they are used to manage
	 * qp state and recover from errors.
	 */
	struct ibv_qp_attr		ibv_attr;

	uint32_t			disconnect_flags;
	struct spdk_nvmf_rdma_wr	drain_send_wr;
	struct spdk_nvmf_rdma_wr	drain_recv_wr;

	/* There are several ways a disconnect can start on a qpair
	 * and they are not all mutually exclusive. It is important
	 * that we only initialize one of these paths.
	 */
	bool				disconnect_started;
};

struct spdk_nvmf_rdma_poller {
	struct spdk_nvmf_rdma_device	*device;
	struct spdk_nvmf_rdma_poll_group	*group;

	int				num_cqe;
	int				required_num_wr;
	struct ibv_cq			*cq;

	TAILQ_HEAD(, spdk_nvmf_rdma_qpair)	qpairs;

	TAILQ_ENTRY(spdk_nvmf_rdma_poller)	link;
};

struct spdk_nvmf_rdma_poll_group {
	struct spdk_nvmf_transport_poll_group	group;

	/* Requests that are waiting to obtain a data buffer */
	TAILQ_HEAD(, spdk_nvmf_rdma_request)	pending_data_buf_queue;

	TAILQ_HEAD(, spdk_nvmf_rdma_poller)	pollers;
};

/* Assuming rdma_cm uses just one protection domain per ibv_context. */
struct spdk_nvmf_rdma_device {
	struct ibv_device_attr		attr;
	struct ibv_context		*context;

	struct spdk_mem_map		*map;
	struct ibv_pd			*pd;

	TAILQ_ENTRY(spdk_nvmf_rdma_device)	link;
};

struct spdk_nvmf_rdma_port {
	struct spdk_nvme_transport_id	trid;
	struct rdma_cm_id		*id;
	struct spdk_nvmf_rdma_device	*device;
	uint32_t			ref;
	TAILQ_ENTRY(spdk_nvmf_rdma_port)	link;
};

struct spdk_nvmf_rdma_transport {
	struct spdk_nvmf_transport	transport;

	struct rdma_event_channel	*event_channel;

	struct spdk_mempool		*data_wr_pool;

	pthread_mutex_t			lock;

	/* fields used to poll RDMA/IB events */
	nfds_t				npoll_fds;
	struct pollfd			*poll_fds;

	TAILQ_HEAD(, spdk_nvmf_rdma_device)	devices;
	TAILQ_HEAD(, spdk_nvmf_rdma_port)	ports;
};

static inline int
spdk_nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
{
	switch (state) {
	case IBV_QPS_RESET:
	case IBV_QPS_INIT:
	case IBV_QPS_RTR:
	case IBV_QPS_RTS:
	case IBV_QPS_SQD:
	case IBV_QPS_SQE:
	case IBV_QPS_ERR:
		return 0;
	default:
		return -1;
	}
}

static enum ibv_qp_state
spdk_nvmf_rdma_update_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair) {
	enum ibv_qp_state old_state, new_state;
	struct ibv_qp_init_attr init_attr;
	int rc;

	/* All the attributes needed for recovery */
	static int spdk_nvmf_ibv_attr_mask =
		IBV_QP_STATE |
		IBV_QP_PKEY_INDEX |
		IBV_QP_PORT |
		IBV_QP_ACCESS_FLAGS |
		IBV_QP_AV |
		IBV_QP_PATH_MTU |
		IBV_QP_DEST_QPN |
		IBV_QP_RQ_PSN |
		IBV_QP_MAX_DEST_RD_ATOMIC |
		IBV_QP_MIN_RNR_TIMER |
		IBV_QP_SQ_PSN |
		IBV_QP_TIMEOUT |
		IBV_QP_RETRY_CNT |
		IBV_QP_RNR_RETRY |
		IBV_QP_MAX_QP_RD_ATOMIC;

	old_state = rqpair->ibv_attr.qp_state;
	rc = ibv_query_qp(rqpair->cm_id->qp, &rqpair->ibv_attr,
			  spdk_nvmf_ibv_attr_mask, &init_attr);

	if (rc) {
		SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n");
		assert(false);
	}

	new_state = rqpair->ibv_attr.qp_state;

	rc = spdk_nvmf_rdma_check_ibv_state(new_state);
	if (rc) {
		SPDK_ERRLOG("QP#%d: bad state updated: %u, maybe hardware issue\n",
			    rqpair->qpair.qid, new_state);
		/*
		 * IBV_QPS_UNKNOWN undefined if lib version smaller than libibverbs-1.1.8
		 * IBV_QPS_UNKNOWN is the enum element after IBV_QPS_ERR
		 */
		return IBV_QPS_ERR + 1;
	}

	if (old_state != new_state) {
		spdk_trace_record(TRACE_RDMA_QP_STATE_CHANGE, 0, 0,
				  (uintptr_t)rqpair->cm_id, new_state);
	}
	return new_state;
}

static const char *str_ibv_qp_state[] = {
	"IBV_QPS_RESET",
	"IBV_QPS_INIT",
	"IBV_QPS_RTR",
	"IBV_QPS_RTS",
	"IBV_QPS_SQD",
	"IBV_QPS_SQE",
	"IBV_QPS_ERR",
	"IBV_QPS_UNKNOWN"
};

static int
spdk_nvmf_rdma_set_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair,
			     enum ibv_qp_state new_state)
{
	int rc;
	enum ibv_qp_state state;
	static int attr_mask_rc[] = {
		[IBV_QPS_RESET] = IBV_QP_STATE,
		[IBV_QPS_INIT] = (IBV_QP_STATE |
				  IBV_QP_PKEY_INDEX |
				  IBV_QP_PORT |
				  IBV_QP_ACCESS_FLAGS),
		[IBV_QPS_RTR] = (IBV_QP_STATE |
				 IBV_QP_AV |
				 IBV_QP_PATH_MTU |
				 IBV_QP_DEST_QPN |
				 IBV_QP_RQ_PSN |
				 IBV_QP_MAX_DEST_RD_ATOMIC |
				 IBV_QP_MIN_RNR_TIMER),
		[IBV_QPS_RTS] = (IBV_QP_STATE |
				 IBV_QP_SQ_PSN |
				 IBV_QP_TIMEOUT |
				 IBV_QP_RETRY_CNT |
				 IBV_QP_RNR_RETRY |
				 IBV_QP_MAX_QP_RD_ATOMIC),
		[IBV_QPS_SQD] = IBV_QP_STATE,
		[IBV_QPS_SQE] = IBV_QP_STATE,
		[IBV_QPS_ERR] = IBV_QP_STATE,
	};

	rc = spdk_nvmf_rdma_check_ibv_state(new_state);
	if (rc) {
		SPDK_ERRLOG("QP#%d: bad state requested: %u\n",
			    rqpair->qpair.qid, new_state);
		return rc;
	}

	rqpair->ibv_attr.cur_qp_state = rqpair->ibv_attr.qp_state;
	rqpair->ibv_attr.qp_state = new_state;
	rqpair->ibv_attr.ah_attr.port_num = rqpair->ibv_attr.port_num;

	rc = ibv_modify_qp(rqpair->cm_id->qp, &rqpair->ibv_attr,
			   attr_mask_rc[new_state]);

	if (rc) {
		SPDK_ERRLOG("QP#%d: failed to set state to: %s, %d (%s)\n",
			    rqpair->qpair.qid, str_ibv_qp_state[new_state], errno, strerror(errno));
		return rc;
	}

	state = spdk_nvmf_rdma_update_ibv_state(rqpair);

	if (state != new_state) {
		SPDK_ERRLOG("QP#%d: expected state: %s, actual state: %s\n",
			    rqpair->qpair.qid, str_ibv_qp_state[new_state],
			    str_ibv_qp_state[state]);
		return -1;
	}
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "IBV QP#%u changed to: %s\n", rqpair->qpair.qid,
		      str_ibv_qp_state[state]);
	return 0;
}

static void
spdk_nvmf_rdma_request_set_state(struct spdk_nvmf_rdma_request *rdma_req,
				 enum spdk_nvmf_rdma_request_state state)
{
	struct spdk_nvmf_qpair		*qpair;
	struct spdk_nvmf_rdma_qpair	*rqpair;

	qpair = rdma_req->req.qpair;
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	TAILQ_REMOVE(&rqpair->state_queue[rdma_req->state], rdma_req, state_link);
	rqpair->state_cntr[rdma_req->state]--;

	rdma_req->state = state;

	TAILQ_INSERT_TAIL(&rqpair->state_queue[rdma_req->state], rdma_req, state_link);
	rqpair->state_cntr[rdma_req->state]++;
}

static int
spdk_nvmf_rdma_cur_queue_depth(struct spdk_nvmf_rdma_qpair *rqpair)
{
	return rqpair->max_queue_depth -
	       rqpair->state_cntr[RDMA_REQUEST_STATE_FREE];
}

static void
nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req)
{
	SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", req->data_from_pool);
	if (req->req.cmd) {
		SPDK_ERRLOG("\t\tRequest opcode: %d\n", req->req.cmd->nvmf_cmd.opcode);
	}
	if (req->recv) {
		SPDK_ERRLOG("\t\tRequest recv wr_id: %lu\n", req->recv->wr.wr_id);
	}
}
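
/* Debug helper used by spdk_nvmf_rdma_qpair_destroy(): when a qpair is torn down
 * while requests are still outstanding, dump every request that is not in the
 * FREE state so the leak can be diagnosed.
 */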
static void
nvmf_rdma_dump_qpair_contents(struct spdk_nvmf_rdma_qpair *rqpair)
{
	int i;
	struct spdk_nvmf_rdma_request *req;

	SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", rqpair->qpair.qid);
	for (i = 1; i < RDMA_REQUEST_NUM_STATES; i++) {
		SPDK_ERRLOG("\tdumping requests in state %d\n", i);
		TAILQ_FOREACH(req, &rqpair->state_queue[i], state_link) {
			nvmf_rdma_dump_request(req);
		}
	}
}

static void
spdk_nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
{
	int qd;

	spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair->cm_id, 0);

	qd = spdk_nvmf_rdma_cur_queue_depth(rqpair);
	if (qd != 0) {
		nvmf_rdma_dump_qpair_contents(rqpair);
		SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", qd);
	}

	if (rqpair->poller) {
		TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link);
	}

	if (rqpair->cmds_mr) {
		ibv_dereg_mr(rqpair->cmds_mr);
	}

	if (rqpair->cpls_mr) {
		ibv_dereg_mr(rqpair->cpls_mr);
	}

	if (rqpair->bufs_mr) {
		ibv_dereg_mr(rqpair->bufs_mr);
	}

	if (rqpair->cm_id) {
		rdma_destroy_qp(rqpair->cm_id);
		rdma_destroy_id(rqpair->cm_id);

		if (rqpair->poller) {
			rqpair->poller->required_num_wr -= MAX_WR_PER_QP(rqpair->max_queue_depth);
		}
	}

	/* Free all memory */
	spdk_dma_free(rqpair->cmds);
	spdk_dma_free(rqpair->cpls);
	spdk_dma_free(rqpair->bufs);
	free(rqpair->reqs);
	free(rqpair->recvs);
	free(rqpair);
}

static int
spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
{
	struct spdk_nvmf_rdma_transport	*rtransport;
	struct spdk_nvmf_rdma_qpair	*rqpair;
	struct spdk_nvmf_rdma_poller	*rpoller;
	int				rc, i, num_cqe, required_num_wr;
	struct spdk_nvmf_rdma_recv	*rdma_recv;
	struct spdk_nvmf_rdma_request	*rdma_req;
	struct spdk_nvmf_transport	*transport;
	struct spdk_nvmf_rdma_device	*device;
	struct ibv_qp_init_attr		ibv_init_attr;

	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
	rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
	transport = &rtransport->transport;
	device = rqpair->port->device;

	memset(&ibv_init_attr, 0, sizeof(struct ibv_qp_init_attr));
	ibv_init_attr.qp_context = rqpair;
	ibv_init_attr.qp_type = IBV_QPT_RC;
	ibv_init_attr.send_cq = rqpair->poller->cq;
	ibv_init_attr.recv_cq = rqpair->poller->cq;
	ibv_init_attr.cap.max_send_wr = rqpair->max_queue_depth *
					2 + 1; /* SEND, READ, and WRITE operations + dummy drain WR */
	ibv_init_attr.cap.max_recv_wr = rqpair->max_queue_depth +
					1; /* RECV operations + dummy drain WR */
	ibv_init_attr.cap.max_send_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_TX_SGE);
	ibv_init_attr.cap.max_recv_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE);

	/* Enlarge CQ size dynamically */
	rpoller = rqpair->poller;
	required_num_wr = rpoller->required_num_wr + MAX_WR_PER_QP(rqpair->max_queue_depth);
	num_cqe = rpoller->num_cqe;
	if (num_cqe < required_num_wr) {
		num_cqe = spdk_max(num_cqe * 2, required_num_wr);
		num_cqe = spdk_min(num_cqe, device->attr.max_cqe);
	}

	if (rpoller->num_cqe != num_cqe) {
		if (required_num_wr > device->attr.max_cqe) {
			SPDK_ERRLOG("RDMA CQE requirement (%d) exceeds device max_cqe limitation (%d)\n",
				    required_num_wr, device->attr.max_cqe);
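			/* The qp has not been created yet, so only the cm_id needs to be
			 * released here; clearing it first keeps spdk_nvmf_rdma_qpair_destroy()
			 * from touching it again.
			 */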
			rdma_destroy_id(rqpair->cm_id);
			rqpair->cm_id = NULL;
			spdk_nvmf_rdma_qpair_destroy(rqpair);
			return -1;
		}

		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Resize RDMA CQ from %d to %d\n", rpoller->num_cqe, num_cqe);
		rc = ibv_resize_cq(rpoller->cq, num_cqe);
		if (rc) {
			SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno));
			rdma_destroy_id(rqpair->cm_id);
			rqpair->cm_id = NULL;
			spdk_nvmf_rdma_qpair_destroy(rqpair);
			return -1;
		}

		rpoller->num_cqe = num_cqe;
	}

	rc = rdma_create_qp(rqpair->cm_id, rqpair->port->device->pd, &ibv_init_attr);
	if (rc) {
		SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno));
		rdma_destroy_id(rqpair->cm_id);
		rqpair->cm_id = NULL;
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}

	rpoller->required_num_wr = required_num_wr;

	rqpair->max_send_depth = spdk_min((uint32_t)(rqpair->max_queue_depth * 2 + 1),
					  ibv_init_attr.cap.max_send_wr);
	rqpair->max_send_sge = spdk_min(NVMF_DEFAULT_TX_SGE, ibv_init_attr.cap.max_send_sge);
	rqpair->max_recv_sge = spdk_min(NVMF_DEFAULT_RX_SGE, ibv_init_attr.cap.max_recv_sge);
	spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair);

	rqpair->reqs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->reqs));
	rqpair->recvs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->recvs));
	rqpair->cmds = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cmds),
					0x1000, NULL);
	rqpair->cpls = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cpls),
					0x1000, NULL);

	if (transport->opts.in_capsule_data_size > 0) {
		rqpair->bufs = spdk_dma_zmalloc(rqpair->max_queue_depth *
						transport->opts.in_capsule_data_size,
						0x1000, NULL);
	}

	if (!rqpair->reqs || !rqpair->recvs || !rqpair->cmds ||
	    !rqpair->cpls || (transport->opts.in_capsule_data_size && !rqpair->bufs)) {
		SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n");
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}

	rqpair->cmds_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cmds,
				     rqpair->max_queue_depth * sizeof(*rqpair->cmds),
				     IBV_ACCESS_LOCAL_WRITE);
	rqpair->cpls_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cpls,
				     rqpair->max_queue_depth * sizeof(*rqpair->cpls),
				     0);

	if (transport->opts.in_capsule_data_size) {
		rqpair->bufs_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->bufs,
					     rqpair->max_queue_depth *
					     transport->opts.in_capsule_data_size,
					     IBV_ACCESS_LOCAL_WRITE |
					     IBV_ACCESS_REMOTE_WRITE);
	}

	if (!rqpair->cmds_mr || !rqpair->cpls_mr || (transport->opts.in_capsule_data_size &&
			!rqpair->bufs_mr)) {
		SPDK_ERRLOG("Unable to register required memory for RDMA queue.\n");
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n",
		      rqpair->cmds, rqpair->max_queue_depth * sizeof(*rqpair->cmds), rqpair->cmds_mr->lkey);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n",
		      rqpair->cpls, rqpair->max_queue_depth * sizeof(*rqpair->cpls), rqpair->cpls_mr->lkey);
	if (rqpair->bufs && rqpair->bufs_mr) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n",
			      rqpair->bufs, rqpair->max_queue_depth *
			      transport->opts.in_capsule_data_size, rqpair->bufs_mr->lkey);
	}
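
	/* The rest of the setup pre-charges current_recv_depth to max_queue_depth and
	 * then decrements it once per successfully posted RECV, so it must land back
	 * at zero once the receive ring is fully populated (checked by the assert below).
	 */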

	/* Initialise request state queues and counters of the queue pair */
	for (i = RDMA_REQUEST_STATE_FREE; i < RDMA_REQUEST_NUM_STATES; i++) {
		TAILQ_INIT(&rqpair->state_queue[i]);
		rqpair->state_cntr[i] = 0;
	}

	rqpair->current_recv_depth = rqpair->max_queue_depth;
	for (i = 0; i < rqpair->max_queue_depth; i++) {
		struct ibv_recv_wr *bad_wr = NULL;

		rdma_recv = &rqpair->recvs[i];
		rdma_recv->qpair = rqpair;

		/* Set up memory to receive commands */
		if (rqpair->bufs) {
			rdma_recv->buf = (void *)((uintptr_t)rqpair->bufs + (i *
						  transport->opts.in_capsule_data_size));
		}

		rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV;

		rdma_recv->sgl[0].addr = (uintptr_t)&rqpair->cmds[i];
		rdma_recv->sgl[0].length = sizeof(rqpair->cmds[i]);
		rdma_recv->sgl[0].lkey = rqpair->cmds_mr->lkey;
		rdma_recv->wr.num_sge = 1;

		if (rdma_recv->buf && rqpair->bufs_mr) {
			rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf;
			rdma_recv->sgl[1].length = transport->opts.in_capsule_data_size;
			rdma_recv->sgl[1].lkey = rqpair->bufs_mr->lkey;
			rdma_recv->wr.num_sge++;
		}

		rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr;
		rdma_recv->wr.sg_list = rdma_recv->sgl;

		rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_recv->wr, &bad_wr);
		assert(rqpair->current_recv_depth > 0);
		rqpair->current_recv_depth--;
		if (rc) {
			SPDK_ERRLOG("Unable to post capsule for RDMA RECV\n");
			spdk_nvmf_rdma_qpair_destroy(rqpair);
			return -1;
		}
	}
	assert(rqpair->current_recv_depth == 0);

	for (i = 0; i < rqpair->max_queue_depth; i++) {
		rdma_req = &rqpair->reqs[i];

		rdma_req->req.qpair = &rqpair->qpair;
		rdma_req->req.cmd = NULL;

		/* Set up memory to send responses */
		rdma_req->req.rsp = &rqpair->cpls[i];

		rdma_req->rsp.sgl[0].addr = (uintptr_t)&rqpair->cpls[i];
		rdma_req->rsp.sgl[0].length = sizeof(rqpair->cpls[i]);
		rdma_req->rsp.sgl[0].lkey = rqpair->cpls_mr->lkey;

		rdma_req->rsp.rdma_wr.type = RDMA_WR_TYPE_SEND;
		rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp.rdma_wr;
		rdma_req->rsp.wr.next = NULL;
		rdma_req->rsp.wr.opcode = IBV_WR_SEND;
		rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl;
		rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl);

		/* Set up memory for data buffers */
		rdma_req->data.rdma_wr.type = RDMA_WR_TYPE_DATA;
		rdma_req->data.wr.wr_id = (uintptr_t)&rdma_req->data.rdma_wr;
		rdma_req->data.wr.next = NULL;
		rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->data.wr.sg_list = rdma_req->data.sgl;
		rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl);

		/* Initialize request state to FREE */
		rdma_req->state = RDMA_REQUEST_STATE_FREE;
		TAILQ_INSERT_TAIL(&rqpair->state_queue[rdma_req->state], rdma_req, state_link);
		rqpair->state_cntr[rdma_req->state]++;
	}

	return 0;
}

static int
request_transfer_in(struct spdk_nvmf_request *req)
{
	int				rc;
	struct spdk_nvmf_rdma_request	*rdma_req;
	struct spdk_nvmf_qpair		*qpair;
	struct spdk_nvmf_rdma_qpair	*rqpair;
	struct ibv_send_wr		*bad_wr = NULL;

	qpair = req->qpair;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
	assert(rdma_req != NULL);

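	/* Posting the RDMA READ(s) consumes both send-queue slots and the device's
	 * limited outstanding-READ credits; current_send_depth and current_read_depth
	 * are both charged below once the post succeeds.
	 */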
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair);

	rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to transfer data from host to target\n");
		return -1;
	}
	rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
	rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
	return 0;
}

static int
request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
{
	int				rc;
	struct spdk_nvmf_rdma_request	*rdma_req;
	struct spdk_nvmf_qpair		*qpair;
	struct spdk_nvmf_rdma_qpair	*rqpair;
	struct spdk_nvme_cpl		*rsp;
	struct ibv_recv_wr		*bad_recv_wr = NULL;
	struct ibv_send_wr		*send_wr, *bad_send_wr = NULL;

	*data_posted = 0;
	qpair = req->qpair;
	rsp = &req->rsp->nvme_cpl;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	/* Advance our sq_head pointer */
	if (qpair->sq_head == qpair->sq_head_max) {
		qpair->sq_head = 0;
	} else {
		qpair->sq_head++;
	}
	rsp->sqhd = qpair->sq_head;

	/* Post the capsule to the recv buffer */
	assert(rdma_req->recv != NULL);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA RECV POSTED. Recv: %p Connection: %p\n", rdma_req->recv,
		      rqpair);
	rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_req->recv->wr, &bad_recv_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to re-post rx descriptor\n");
		return rc;
	}
	rdma_req->recv = NULL;
	assert(rqpair->current_recv_depth > 0);
	rqpair->current_recv_depth--;

	/* Build the response which consists of an optional
	 * RDMA WRITE to transfer data, plus an RDMA SEND
	 * containing the response.
	 */
	send_wr = &rdma_req->rsp.wr;

	if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
	    req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair);
		send_wr = &rdma_req->data.wr;
		*data_posted = 1;
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. Request: %p Connection: %p\n", req, qpair);

	/* Send the completion */
	rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to send response capsule\n");
		return rc;
	}
	/* +1 for the rsp wr */
	rqpair->current_send_depth += rdma_req->num_outstanding_data_wr + 1;

	return 0;
}

static int
spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair)
{
	struct spdk_nvmf_rdma_accept_private_data	accept_data;
	struct rdma_conn_param				ctrlr_event_data = {};
	int						rc;

	accept_data.recfmt = 0;
	accept_data.crqsize = rqpair->max_queue_depth;

	ctrlr_event_data.private_data = &accept_data;
	ctrlr_event_data.private_data_len = sizeof(accept_data);
	if (id->ps == RDMA_PS_TCP) {
		ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */
		ctrlr_event_data.initiator_depth = rqpair->max_read_depth;
	}

	rc = rdma_accept(id, &ctrlr_event_data);
	if (rc) {
		SPDK_ERRLOG("Error %d on rdma_accept\n", errno);
	} else {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n");
	}

	return rc;
}

static void
spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error)
{
	struct spdk_nvmf_rdma_reject_private_data	rej_data;

	rej_data.recfmt = 0;
	rej_data.sts = error;

	rdma_reject(id, &rej_data, sizeof(rej_data));
}

static int
nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event,
		  new_qpair_fn cb_fn)
{
	struct spdk_nvmf_rdma_transport	*rtransport;
	struct spdk_nvmf_rdma_qpair	*rqpair = NULL;
	struct spdk_nvmf_rdma_port	*port;
	struct rdma_conn_param		*rdma_param = NULL;
	const struct spdk_nvmf_rdma_request_private_data *private_data = NULL;
	uint16_t			max_queue_depth;
	uint16_t			max_read_depth;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	assert(event->id != NULL); /* Impossible. Can't even reject the connection. */
	assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */

	rdma_param = &event->param.conn;
	if (rdma_param->private_data == NULL ||
	    rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_ERRLOG("connect request: no private data provided\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH);
		return -1;
	}

	private_data = rdma_param->private_data;
	if (private_data->recfmt != 0) {
		SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT);
		return -1;
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n",
		      event->id->verbs->device->name, event->id->verbs->device->dev_name);

	port = event->listen_id->context;
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n",
		      event->listen_id, event->listen_id->verbs, port);

	/* Figure out the supported queue depth.
	 * This is a multi-step process that takes into account hardware maximums,
	 * host provided values, and our target's internal memory limits */

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n");

	/* Start with the maximum queue depth allowed by the target */
	max_queue_depth = rtransport->transport.opts.max_queue_depth;
	max_read_depth = rtransport->transport.opts.max_queue_depth;
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n",
		      rtransport->transport.opts.max_queue_depth);

	/* Next check the local NIC's hardware limitations */
	SPDK_DEBUGLOG(SPDK_LOG_RDMA,
		      "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
		      port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom);
	max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr);
	max_read_depth = spdk_min(max_read_depth, port->device->attr.max_qp_init_rd_atom);

	/* Next check the remote NIC's hardware limitations */
	SPDK_DEBUGLOG(SPDK_LOG_RDMA,
		      "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
		      rdma_param->initiator_depth, rdma_param->responder_resources);
	if (rdma_param->initiator_depth > 0) {
		max_read_depth = spdk_min(max_read_depth, rdma_param->initiator_depth);
	}

	/* Finally check for the host software requested values, which are
	 * optional. */
	if (rdma_param->private_data != NULL &&
	    rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize);
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1);
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
		      max_queue_depth, max_read_depth);

	rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair));
	if (rqpair == NULL) {
		SPDK_ERRLOG("Could not allocate new connection.\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
		return -1;
	}

	rqpair->port = port;
	rqpair->max_queue_depth = max_queue_depth;
	rqpair->max_read_depth = max_read_depth;
	rqpair->cm_id = event->id;
	rqpair->listen_id = event->listen_id;
	rqpair->qpair.transport = transport;
	TAILQ_INIT(&rqpair->incoming_queue);
	event->id->context = &rqpair->qpair;

	cb_fn(&rqpair->qpair);

	return 0;
}

static int
spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
			  enum spdk_mem_map_notify_action action,
			  void *vaddr, size_t size)
{
	struct ibv_pd *pd = cb_ctx;
	struct ibv_mr *mr;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (!g_nvmf_hooks.get_rkey) {
			mr = ibv_reg_mr(pd, vaddr, size,
					IBV_ACCESS_LOCAL_WRITE |
					IBV_ACCESS_REMOTE_READ |
					IBV_ACCESS_REMOTE_WRITE);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		} else {
			spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
						     g_nvmf_hooks.get_rkey(pd, vaddr, size));
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (!g_nvmf_hooks.get_rkey) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		break;
	}

	return 0;
}

static int
spdk_nvmf_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

static void
spdk_nvmf_rdma_request_free_buffers(struct spdk_nvmf_rdma_request *rdma_req,
				    struct spdk_nvmf_transport_poll_group *group,
				    struct spdk_nvmf_transport *transport)
{
	for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) {
		if (group->buf_cache_count < group->buf_cache_size) {
			STAILQ_INSERT_HEAD(&group->buf_cache,
					   (struct spdk_nvmf_transport_pg_cache_buf *)rdma_req->data.buffers[i],
					   link);
			group->buf_cache_count++;
		} else {
			spdk_mempool_put(transport->data_buf_pool, rdma_req->data.buffers[i]);
		}
		rdma_req->req.iov[i].iov_base = NULL;
		rdma_req->data.buffers[i] = NULL;
		rdma_req->req.iov[i].iov_len = 0;
	}
	rdma_req->data_from_pool = false;
}

typedef enum spdk_nvme_data_transfer spdk_nvme_data_transfer_t;

static spdk_nvme_data_transfer_t
spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req)
{
	enum spdk_nvme_data_transfer xfer;
	struct spdk_nvme_cmd *cmd = &rdma_req->req.cmd->nvme_cmd;
	struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1;

#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
	rdma_req->rsp.wr.opcode = IBV_WR_SEND;
	rdma_req->rsp.wr.imm_data = 0;
#endif

	/* Figure out data transfer direction */
	if (cmd->opc == SPDK_NVME_OPC_FABRIC) {
		xfer = spdk_nvme_opc_get_data_transfer(rdma_req->req.cmd->nvmf_cmd.fctype);
	} else {
		xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);

		/* Some admin commands are special cases */
		if ((rdma_req->req.qpair->qid == 0) &&
		    ((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) ||
		     (cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) {
			switch (cmd->cdw10 & 0xff) {
			case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
			case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
			case SPDK_NVME_FEAT_HOST_IDENTIFIER:
				break;
			default:
				xfer = SPDK_NVME_DATA_NONE;
			}
		}
	}

	if (xfer == SPDK_NVME_DATA_NONE) {
		return xfer;
	}

	/* Even for commands that may transfer data, they could have specified 0 length.
	 * We want those to show up with xfer SPDK_NVME_DATA_NONE.
	 */
	switch (sgl->generic.type) {
	case SPDK_NVME_SGL_TYPE_DATA_BLOCK:
	case SPDK_NVME_SGL_TYPE_BIT_BUCKET:
	case SPDK_NVME_SGL_TYPE_SEGMENT:
	case SPDK_NVME_SGL_TYPE_LAST_SEGMENT:
	case SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK:
		if (sgl->unkeyed.length == 0) {
			xfer = SPDK_NVME_DATA_NONE;
		}
		break;
	case SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK:
		if (sgl->keyed.length == 0) {
			xfer = SPDK_NVME_DATA_NONE;
		}
		break;
	}

	return xfer;
}

static int
spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
				 struct spdk_nvmf_rdma_device *device,
				 struct spdk_nvmf_rdma_request *rdma_req)
{
	struct spdk_nvmf_rdma_qpair		*rqpair;
	struct spdk_nvmf_rdma_poll_group	*rgroup;
	void					*buf = NULL;
	uint32_t				length = rdma_req->req.length;
	uint64_t				translation_len;
	uint32_t				i = 0;
	int					rc = 0;

	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
	rgroup = rqpair->poller->group;
	rdma_req->req.iovcnt = 0;
	while (length) {
		if (!(STAILQ_EMPTY(&rgroup->group.buf_cache))) {
			rgroup->group.buf_cache_count--;
			buf = STAILQ_FIRST(&rgroup->group.buf_cache);
			STAILQ_REMOVE_HEAD(&rgroup->group.buf_cache, link);
			assert(buf != NULL);
		} else {
			buf = spdk_mempool_get(rtransport->transport.data_buf_pool);
			if (!buf) {
				rc = -ENOMEM;
				goto err_exit;
			}
		}

		rdma_req->req.iov[i].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) &
						~NVMF_DATA_BUFFER_MASK);
		rdma_req->req.iov[i].iov_len = spdk_min(length, rtransport->transport.opts.io_unit_size);
		rdma_req->req.iovcnt++;
		rdma_req->data.buffers[i] = buf;
		rdma_req->data.wr.sg_list[i].addr = (uintptr_t)(rdma_req->req.iov[i].iov_base);
		rdma_req->data.wr.sg_list[i].length = rdma_req->req.iov[i].iov_len;
		translation_len = rdma_req->req.iov[i].iov_len;

		if (!g_nvmf_hooks.get_rkey) {
			rdma_req->data.wr.sg_list[i].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
							     (uint64_t)buf, &translation_len))->lkey;
		} else {
			rdma_req->data.wr.sg_list[i].lkey = *((uint64_t *)spdk_mem_map_translate(device->map,
							      (uint64_t)buf, &translation_len));
		}

		length -= rdma_req->req.iov[i].iov_len;

		if (translation_len < rdma_req->req.iov[i].iov_len) {
			SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n");
			rc = -EINVAL;
			goto err_exit;
		}
		i++;
	}

	assert(rdma_req->req.iovcnt <= rqpair->max_send_sge);

	rdma_req->data_from_pool = true;

	return rc;

err_exit:
	spdk_nvmf_rdma_request_free_buffers(rdma_req, &rgroup->group, &rtransport->transport);
	while (i) {
		i--;
		rdma_req->data.wr.sg_list[i].addr = 0;
		rdma_req->data.wr.sg_list[i].length = 0;
		rdma_req->data.wr.sg_list[i].lkey = 0;
	}
	rdma_req->req.iovcnt = 0;
	return rc;
}

static int
spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
				 struct spdk_nvmf_rdma_device *device,
				 struct spdk_nvmf_rdma_request *rdma_req)
{
	struct spdk_nvme_cmd			*cmd;
	struct spdk_nvme_cpl			*rsp;
	struct spdk_nvme_sgl_descriptor		*sgl;

	cmd = &rdma_req->req.cmd->nvme_cmd;
	rsp = &rdma_req->req.rsp->nvme_cpl;
	sgl = &cmd->dptr.sgl1;

	if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
	    (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
	     sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
		if (sgl->keyed.length > rtransport->transport.opts.max_io_size) {
			SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
				    sgl->keyed.length, rtransport->transport.opts.max_io_size);
			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
			return -1;
		}
#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
		if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) {
			if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) {
				rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV;
				rdma_req->rsp.wr.imm_data = sgl->keyed.key;
			}
		}
#endif

		/* fill request length and populate iovs */
		rdma_req->req.length = sgl->keyed.length;

		if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) {
			/* No available buffers. Queue this request up. */
			SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
			return 0;
		}

		/* backward compatible */
		rdma_req->req.data = rdma_req->req.iov[0].iov_base;

		/* rdma wr specifics */
		rdma_req->data.wr.num_sge = rdma_req->req.iovcnt;
		rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key;
		rdma_req->data.wr.wr.rdma.remote_addr = sgl->address;
		if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
			rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE;
			rdma_req->data.wr.next = &rdma_req->rsp.wr;
		} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
			rdma_req->data.wr.opcode = IBV_WR_RDMA_READ;
			rdma_req->data.wr.next = NULL;
		}

		/* set the number of outstanding data WRs for this request.
		 */
		rdma_req->num_outstanding_data_wr = 1;

		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req,
			      rdma_req->req.iovcnt);

		return 0;
	} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
		   sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
		uint64_t offset = sgl->address;
		uint32_t max_len = rtransport->transport.opts.in_capsule_data_size;

		SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
			      offset, sgl->unkeyed.length);

		if (offset > max_len) {
			SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n",
				    offset, max_len);
			rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET;
			return -1;
		}
		max_len -= (uint32_t)offset;

		if (sgl->unkeyed.length > max_len) {
			SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
				    sgl->unkeyed.length, max_len);
			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
			return -1;
		}

		rdma_req->num_outstanding_data_wr = 0;
		rdma_req->req.data = rdma_req->recv->buf + offset;
		rdma_req->data_from_pool = false;
		rdma_req->req.length = sgl->unkeyed.length;

		rdma_req->req.iov[0].iov_base = rdma_req->req.data;
		rdma_req->req.iov[0].iov_len = rdma_req->req.length;
		rdma_req->req.iovcnt = 1;

		return 0;
	}

	SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n",
		    sgl->generic.type, sgl->generic.subtype);
	rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
	return -1;
}

static void
nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
		       struct spdk_nvmf_rdma_transport *rtransport)
{
	struct spdk_nvmf_rdma_qpair		*rqpair;
	struct spdk_nvmf_rdma_poll_group	*rgroup;

	if (rdma_req->data_from_pool) {
		rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
		rgroup = rqpair->poller->group;

		spdk_nvmf_rdma_request_free_buffers(rdma_req, &rgroup->group, &rtransport->transport);
	}
	rdma_req->num_outstanding_data_wr = 0;
	rdma_req->req.length = 0;
	rdma_req->req.iovcnt = 0;
	rdma_req->req.data = NULL;
	spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_FREE);
}

static bool
spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
			       struct spdk_nvmf_rdma_request *rdma_req)
{
	struct spdk_nvmf_rdma_qpair	*rqpair;
	struct spdk_nvmf_rdma_device	*device;
	struct spdk_nvmf_rdma_poll_group *rgroup;
	struct spdk_nvme_cpl		*rsp = &rdma_req->req.rsp->nvme_cpl;
	int				rc;
	struct spdk_nvmf_rdma_recv	*rdma_recv;
	enum spdk_nvmf_rdma_request_state prev_state;
	bool				progress = false;
	int				data_posted;

	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
	device = rqpair->port->device;
	rgroup = rqpair->poller->group;

	assert(rdma_req->state != RDMA_REQUEST_STATE_FREE);

	/* If the queue pair is in an error state, force the request to the completed state
	 * to release resources.
	 */
	if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) {
		if (rdma_req->state == RDMA_REQUEST_STATE_NEED_BUFFER) {
			TAILQ_REMOVE(&rgroup->pending_data_buf_queue, rdma_req, link);
		}
		spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED);
	}

	/* The loop here is to allow for several back-to-back state changes. */
	do {
		prev_state = rdma_req->state;

		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state);

		switch (rdma_req->state) {
		case RDMA_REQUEST_STATE_FREE:
			/* Some external code must kick a request into RDMA_REQUEST_STATE_NEW
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_NEW:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			rdma_recv = rdma_req->recv;

			/* The first element of the SGL is the NVMe command */
			rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr;
			memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp));

			TAILQ_REMOVE(&rqpair->incoming_queue, rdma_recv, link);

			if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED);
				break;
			}

			/* The next state transition depends on the data transfer needs of this request. */
			rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req);

			/* If no data to transfer, ready to execute. */
			if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE);
				break;
			}

			spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_NEED_BUFFER);
			TAILQ_INSERT_TAIL(&rgroup->pending_data_buf_queue, rdma_req, link);
			break;
		case RDMA_REQUEST_STATE_NEED_BUFFER:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);

			assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE);

			if (rdma_req != TAILQ_FIRST(&rgroup->pending_data_buf_queue)) {
				/* This request needs to wait in line to obtain a buffer */
				break;
			}

			/* Try to get a data buffer */
			rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req);
			if (rc < 0) {
				TAILQ_REMOVE(&rgroup->pending_data_buf_queue, rdma_req, link);
				rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_COMPLETE);
				break;
			}

			if (!rdma_req->req.data) {
				/* No buffers available. */
				break;
			}

			TAILQ_REMOVE(&rgroup->pending_data_buf_queue, rdma_req, link);

			/* If data is transferring from host to controller and the data didn't
			 * arrive using in capsule data, we need to do a transfer from the host.
			 */
			if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool) {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING);
				break;
			}

			spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE);
			break;
		case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);

			if (rdma_req != TAILQ_FIRST(
				    &rqpair->state_queue[RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING])) {
				/* This request needs to wait in line to perform RDMA */
				break;
			}
			if (rqpair->current_send_depth + rdma_req->num_outstanding_data_wr > rqpair->max_send_depth
			    || rqpair->current_read_depth + rdma_req->num_outstanding_data_wr > rqpair->max_read_depth) {
				/* We can only have so many WRs outstanding. We have to wait until some finish. */
				break;
			}
			rc = request_transfer_in(&rdma_req->req);
			if (!rc) {
				spdk_nvmf_rdma_request_set_state(rdma_req,
								 RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER);
			} else {
				rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
				spdk_nvmf_rdma_request_set_state(rdma_req,
								 RDMA_REQUEST_STATE_READY_TO_COMPLETE);
			}
			break;
		case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			/* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_READY_TO_EXECUTE:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_EXECUTING);
			spdk_nvmf_request_exec(&rdma_req->req);
			break;
		case RDMA_REQUEST_STATE_EXECUTING:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			/* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_EXECUTED:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING);
			} else {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_COMPLETE);
			}
			break;
		case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);

			if (rdma_req != TAILQ_FIRST(
				    &rqpair->state_queue[RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING])) {
				/* This request needs to wait in line to perform RDMA */
				break;
			}
			if ((rqpair->current_send_depth + rdma_req->num_outstanding_data_wr + 1) >
			    rqpair->max_send_depth) {
				/* We can only have so many WRs outstanding. We have to wait until some finish.
				 * +1 since each request has an additional wr in the resp. */
				break;
			}
			/* The data transfer will be kicked off from
			 * RDMA_REQUEST_STATE_READY_TO_COMPLETE state.
1599 */ 1600 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_COMPLETE); 1601 break; 1602 case RDMA_REQUEST_STATE_READY_TO_COMPLETE: 1603 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, 1604 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1605 rc = request_transfer_out(&rdma_req->req, &data_posted); 1606 assert(rc == 0); /* No good way to handle this currently */ 1607 if (rc) { 1608 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); 1609 } else { 1610 spdk_nvmf_rdma_request_set_state(rdma_req, 1611 data_posted ? 1612 RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST : 1613 RDMA_REQUEST_STATE_COMPLETING); 1614 } 1615 break; 1616 case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: 1617 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, 1618 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1619 /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 1620 * to escape this state. */ 1621 break; 1622 case RDMA_REQUEST_STATE_COMPLETING: 1623 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, 1624 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1625 /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 1626 * to escape this state. */ 1627 break; 1628 case RDMA_REQUEST_STATE_COMPLETED: 1629 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, 1630 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 1631 1632 nvmf_rdma_request_free(rdma_req, rtransport); 1633 break; 1634 case RDMA_REQUEST_NUM_STATES: 1635 default: 1636 assert(0); 1637 break; 1638 } 1639 1640 if (rdma_req->state != prev_state) { 1641 progress = true; 1642 } 1643 } while (rdma_req->state != prev_state); 1644 1645 return progress; 1646 } 1647 1648 /* Public API callbacks begin here */ 1649 1650 #define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128 1651 #define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128 1652 #define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 64 1653 #define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 1654 #define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072 1655 #define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES) 1656 #define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 512 1657 #define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32 1658 1659 static void 1660 spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) 1661 { 1662 opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH; 1663 opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR; 1664 opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE; 1665 opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE; 1666 opts->io_unit_size = SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE; 1667 opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH; 1668 opts->num_shared_buffers = SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS; 1669 opts->buf_cache_size = SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE; 1670 } 1671 1672 static int spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport); 1673 1674 static struct spdk_nvmf_transport * 1675 spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) 1676 { 1677 int rc; 1678 struct spdk_nvmf_rdma_transport *rtransport; 1679 struct spdk_nvmf_rdma_device *device, *tmp; 1680 struct ibv_context **contexts; 1681 uint32_t i; 1682 int flag; 1683 uint32_t sge_count; 1684 uint32_t min_shared_buffers; 1685 int max_device_sge = SPDK_NVMF_MAX_SGL_ENTRIES; 1686 1687 rtransport = calloc(1, sizeof(*rtransport)); 1688 if 
(!rtransport) {
		return NULL;
	}

	if (pthread_mutex_init(&rtransport->lock, NULL)) {
		SPDK_ERRLOG("pthread_mutex_init() failed\n");
		free(rtransport);
		return NULL;
	}

	TAILQ_INIT(&rtransport->devices);
	TAILQ_INIT(&rtransport->ports);

	rtransport->transport.ops = &spdk_nvmf_transport_rdma;

	SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n"
		     " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n"
		     " max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
		     " in_capsule_data_size=%d, max_aq_depth=%d\n"
		     " num_shared_buffers=%d\n",
		     opts->max_queue_depth,
		     opts->max_io_size,
		     opts->max_qpairs_per_ctrlr,
		     opts->io_unit_size,
		     opts->in_capsule_data_size,
		     opts->max_aq_depth,
		     opts->num_shared_buffers);

	/* I/O unit size cannot be larger than max I/O size */
	if (opts->io_unit_size > opts->max_io_size) {
		opts->io_unit_size = opts->max_io_size;
	}

	if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) {
		SPDK_ERRLOG("The number of shared data buffers (%d) is less than "
			    "the minimum number required to guarantee that forward progress can be made (%d)\n",
			    opts->num_shared_buffers, (SPDK_NVMF_MAX_SGL_ENTRIES * 2));
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size;
	if (min_shared_buffers > opts->num_shared_buffers) {
		SPDK_ERRLOG("There are not enough buffers to satisfy "
			    "per-poll group caches for each thread: (%" PRIu32 ") "
			    "supplied, (%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers);
		SPDK_ERRLOG("Please specify a larger number of shared buffers\n");
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	sge_count = opts->max_io_size / opts->io_unit_size;
	if (sge_count > NVMF_DEFAULT_TX_SGE) {
		SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size);
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	rtransport->event_channel = rdma_create_event_channel();
	if (rtransport->event_channel == NULL) {
		SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno));
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	flag = fcntl(rtransport->event_channel->fd, F_GETFL);
	if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
		SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n",
			    rtransport->event_channel->fd, spdk_strerror(errno));
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	rtransport->data_wr_pool = spdk_mempool_create("spdk_nvmf_rdma_wr_data",
				   opts->max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES,
				   sizeof(struct spdk_nvmf_rdma_request_data),
				   SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
				   SPDK_ENV_SOCKET_ID_ANY);
	if (!rtransport->data_wr_pool) {
		SPDK_ERRLOG("Unable to allocate work request pool for poll group\n");
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	contexts = rdma_get_devices(NULL);
	if (contexts == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	i = 0;
	rc = 0;
	while (contexts[i] != NULL) {
		device = calloc(1, sizeof(*device));
		if (!device) {
			SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n");
			rc = -ENOMEM;
			break;
		}
		device->context = contexts[i];
		rc = ibv_query_device(device->context, &device->attr);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
			free(device);
			break;
		}

		max_device_sge = spdk_min(max_device_sge, device->attr.max_sge);

#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
		if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) {
			SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE, "
				     "but the device with vendor ID %u does not.\n", device->attr.vendor_id);
		}

		/**
		 * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE.
		 * The Soft-RoCE RXE driver does not currently support send with invalidate,
		 * but incorrectly reports that it does. There are changes making their way
		 * through the kernel now that will enable this feature. When they are merged,
		 * we can conditionally enable this feature.
		 *
		 * TODO: enable this for versions of the kernel rxe driver that support it.
		 */
		if (device->attr.vendor_id == 0) {
			device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS);
		}
#endif

		/* set up device context async ev fd as NON_BLOCKING */
		flag = fcntl(device->context->async_fd, F_GETFL);
		rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n");
			free(device);
			break;
		}

		TAILQ_INSERT_TAIL(&rtransport->devices, device, link);
		i++;
	}
	rdma_free_devices(contexts);

	if (opts->io_unit_size * max_device_sge < opts->max_io_size) {
		/* divide and round up. */
		opts->io_unit_size = (opts->max_io_size + max_device_sge - 1) / max_device_sge;

		/* round up to the nearest 4k. */
		opts->io_unit_size = (opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT - 1) & ~NVMF_DATA_BUFFER_MASK;

		opts->io_unit_size = spdk_max(opts->io_unit_size, SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE);
		SPDK_NOTICELOG("Adjusting the io unit size to fit the device's maximum I/O size. 
New I/O unit size %u\n", 1842 opts->io_unit_size); 1843 } 1844 1845 if (rc < 0) { 1846 spdk_nvmf_rdma_destroy(&rtransport->transport); 1847 return NULL; 1848 } 1849 1850 /* Set up poll descriptor array to monitor events from RDMA and IB 1851 * in a single poll syscall 1852 */ 1853 rtransport->npoll_fds = i + 1; 1854 i = 0; 1855 rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); 1856 if (rtransport->poll_fds == NULL) { 1857 SPDK_ERRLOG("poll_fds allocation failed\n"); 1858 spdk_nvmf_rdma_destroy(&rtransport->transport); 1859 return NULL; 1860 } 1861 1862 rtransport->poll_fds[i].fd = rtransport->event_channel->fd; 1863 rtransport->poll_fds[i++].events = POLLIN; 1864 1865 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 1866 rtransport->poll_fds[i].fd = device->context->async_fd; 1867 rtransport->poll_fds[i++].events = POLLIN; 1868 } 1869 1870 return &rtransport->transport; 1871 } 1872 1873 static int 1874 spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) 1875 { 1876 struct spdk_nvmf_rdma_transport *rtransport; 1877 struct spdk_nvmf_rdma_port *port, *port_tmp; 1878 struct spdk_nvmf_rdma_device *device, *device_tmp; 1879 1880 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1881 1882 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { 1883 TAILQ_REMOVE(&rtransport->ports, port, link); 1884 rdma_destroy_id(port->id); 1885 free(port); 1886 } 1887 1888 if (rtransport->poll_fds != NULL) { 1889 free(rtransport->poll_fds); 1890 } 1891 1892 if (rtransport->event_channel != NULL) { 1893 rdma_destroy_event_channel(rtransport->event_channel); 1894 } 1895 1896 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { 1897 TAILQ_REMOVE(&rtransport->devices, device, link); 1898 if (device->map) { 1899 spdk_mem_map_free(&device->map); 1900 } 1901 if (device->pd) { 1902 if (!g_nvmf_hooks.get_ibv_pd) { 1903 ibv_dealloc_pd(device->pd); 1904 } 1905 } 1906 free(device); 1907 } 1908 1909 if (rtransport->data_wr_pool != NULL) { 1910 if (spdk_mempool_count(rtransport->data_wr_pool) != 1911 (transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES)) { 1912 SPDK_ERRLOG("transport wr pool count is %zu but should be %u\n", 1913 spdk_mempool_count(rtransport->data_wr_pool), 1914 transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES); 1915 } 1916 } 1917 1918 spdk_mempool_free(rtransport->data_wr_pool); 1919 pthread_mutex_destroy(&rtransport->lock); 1920 free(rtransport); 1921 1922 return 0; 1923 } 1924 1925 static int 1926 spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 1927 struct spdk_nvme_transport_id *trid, 1928 bool peer); 1929 1930 const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = { 1931 .notify_cb = spdk_nvmf_rdma_mem_notify, 1932 .are_contiguous = spdk_nvmf_rdma_check_contiguous_entries 1933 }; 1934 1935 static int 1936 spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport, 1937 const struct spdk_nvme_transport_id *trid) 1938 { 1939 struct spdk_nvmf_rdma_transport *rtransport; 1940 struct spdk_nvmf_rdma_device *device; 1941 struct spdk_nvmf_rdma_port *port_tmp, *port; 1942 struct ibv_pd *pd; 1943 struct addrinfo *res; 1944 struct addrinfo hints; 1945 int family; 1946 int rc; 1947 1948 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1949 1950 port = calloc(1, sizeof(*port)); 1951 if (!port) { 1952 return -ENOMEM; 1953 } 1954 1955 /* Selectively copy the trid. Things like NQN don't matter here - that 1956 * mapping is enforced elsewhere. 
1957 */ 1958 port->trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 1959 port->trid.adrfam = trid->adrfam; 1960 snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr); 1961 snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid); 1962 1963 pthread_mutex_lock(&rtransport->lock); 1964 assert(rtransport->event_channel != NULL); 1965 TAILQ_FOREACH(port_tmp, &rtransport->ports, link) { 1966 if (spdk_nvme_transport_id_compare(&port_tmp->trid, &port->trid) == 0) { 1967 port_tmp->ref++; 1968 free(port); 1969 /* Already listening at this address */ 1970 pthread_mutex_unlock(&rtransport->lock); 1971 return 0; 1972 } 1973 } 1974 1975 rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); 1976 if (rc < 0) { 1977 SPDK_ERRLOG("rdma_create_id() failed\n"); 1978 free(port); 1979 pthread_mutex_unlock(&rtransport->lock); 1980 return rc; 1981 } 1982 1983 switch (port->trid.adrfam) { 1984 case SPDK_NVMF_ADRFAM_IPV4: 1985 family = AF_INET; 1986 break; 1987 case SPDK_NVMF_ADRFAM_IPV6: 1988 family = AF_INET6; 1989 break; 1990 default: 1991 SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam); 1992 free(port); 1993 pthread_mutex_unlock(&rtransport->lock); 1994 return -EINVAL; 1995 } 1996 1997 memset(&hints, 0, sizeof(hints)); 1998 hints.ai_family = family; 1999 hints.ai_flags = AI_NUMERICSERV; 2000 hints.ai_socktype = SOCK_STREAM; 2001 hints.ai_protocol = 0; 2002 2003 rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res); 2004 if (rc) { 2005 SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); 2006 free(port); 2007 pthread_mutex_unlock(&rtransport->lock); 2008 return -EINVAL; 2009 } 2010 2011 rc = rdma_bind_addr(port->id, res->ai_addr); 2012 freeaddrinfo(res); 2013 2014 if (rc < 0) { 2015 SPDK_ERRLOG("rdma_bind_addr() failed\n"); 2016 rdma_destroy_id(port->id); 2017 free(port); 2018 pthread_mutex_unlock(&rtransport->lock); 2019 return rc; 2020 } 2021 2022 if (!port->id->verbs) { 2023 SPDK_ERRLOG("ibv_context is null\n"); 2024 rdma_destroy_id(port->id); 2025 free(port); 2026 pthread_mutex_unlock(&rtransport->lock); 2027 return -1; 2028 } 2029 2030 rc = rdma_listen(port->id, 10); /* 10 = backlog */ 2031 if (rc < 0) { 2032 SPDK_ERRLOG("rdma_listen() failed\n"); 2033 rdma_destroy_id(port->id); 2034 free(port); 2035 pthread_mutex_unlock(&rtransport->lock); 2036 return rc; 2037 } 2038 2039 TAILQ_FOREACH(device, &rtransport->devices, link) { 2040 if (device->context == port->id->verbs) { 2041 port->device = device; 2042 break; 2043 } 2044 } 2045 if (!port->device) { 2046 SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", 2047 port->id->verbs); 2048 rdma_destroy_id(port->id); 2049 free(port); 2050 pthread_mutex_unlock(&rtransport->lock); 2051 return -EINVAL; 2052 } 2053 2054 pd = NULL; 2055 if (g_nvmf_hooks.get_ibv_pd) { 2056 if (spdk_nvmf_rdma_trid_from_cm_id(port->id, &port->trid, 1) < 0) { 2057 rdma_destroy_id(port->id); 2058 free(port); 2059 pthread_mutex_unlock(&rtransport->lock); 2060 return -EINVAL; 2061 } 2062 2063 pd = g_nvmf_hooks.get_ibv_pd(&port->trid, port->id->verbs); 2064 } 2065 2066 if (device->pd == NULL) { 2067 /* Haven't created a protection domain yet. 
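		 * Either allocate one below with ibv_alloc_pd(), or adopt the protection
		 * domain supplied by the g_nvmf_hooks.get_ibv_pd callback when hooks are in use.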
*/ 2068 2069 if (!g_nvmf_hooks.get_ibv_pd) { 2070 device->pd = ibv_alloc_pd(device->context); 2071 if (!device->pd) { 2072 SPDK_ERRLOG("Unable to allocate protection domain.\n"); 2073 rdma_destroy_id(port->id); 2074 free(port); 2075 pthread_mutex_unlock(&rtransport->lock); 2076 return -ENOMEM; 2077 } 2078 } else { 2079 device->pd = pd; 2080 } 2081 2082 assert(device->map == NULL); 2083 2084 device->map = spdk_mem_map_alloc(0, &g_nvmf_rdma_map_ops, device->pd); 2085 if (!device->map) { 2086 SPDK_ERRLOG("Unable to allocate memory map for listen address\n"); 2087 if (!g_nvmf_hooks.get_ibv_pd) { 2088 ibv_dealloc_pd(device->pd); 2089 } 2090 rdma_destroy_id(port->id); 2091 free(port); 2092 pthread_mutex_unlock(&rtransport->lock); 2093 return -ENOMEM; 2094 } 2095 } else if (g_nvmf_hooks.get_ibv_pd) { 2096 /* A protection domain exists for this device, but the user has 2097 * enabled hooks. Verify that they only supply one pd per device. */ 2098 if (device->pd != pd) { 2099 SPDK_ERRLOG("The NVMe-oF target only supports one protection domain per device.\n"); 2100 rdma_destroy_id(port->id); 2101 free(port); 2102 pthread_mutex_unlock(&rtransport->lock); 2103 return -EINVAL; 2104 } 2105 } 2106 2107 assert(device->map != NULL); 2108 assert(device->pd != NULL); 2109 2110 SPDK_INFOLOG(SPDK_LOG_RDMA, "*** NVMf Target Listening on %s port %d ***\n", 2111 port->trid.traddr, ntohs(rdma_get_src_port(port->id))); 2112 2113 port->ref = 1; 2114 2115 TAILQ_INSERT_TAIL(&rtransport->ports, port, link); 2116 pthread_mutex_unlock(&rtransport->lock); 2117 2118 return 0; 2119 } 2120 2121 static int 2122 spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, 2123 const struct spdk_nvme_transport_id *_trid) 2124 { 2125 struct spdk_nvmf_rdma_transport *rtransport; 2126 struct spdk_nvmf_rdma_port *port, *tmp; 2127 struct spdk_nvme_transport_id trid = {}; 2128 2129 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2130 2131 /* Selectively copy the trid. Things like NQN don't matter here - that 2132 * mapping is enforced elsewhere. 2133 */ 2134 trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 2135 trid.adrfam = _trid->adrfam; 2136 snprintf(trid.traddr, sizeof(port->trid.traddr), "%s", _trid->traddr); 2137 snprintf(trid.trsvcid, sizeof(port->trid.trsvcid), "%s", _trid->trsvcid); 2138 2139 pthread_mutex_lock(&rtransport->lock); 2140 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { 2141 if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) { 2142 assert(port->ref > 0); 2143 port->ref--; 2144 if (port->ref == 0) { 2145 TAILQ_REMOVE(&rtransport->ports, port, link); 2146 rdma_destroy_id(port->id); 2147 free(port); 2148 } 2149 break; 2150 } 2151 } 2152 2153 pthread_mutex_unlock(&rtransport->lock); 2154 return 0; 2155 } 2156 2157 static bool 2158 spdk_nvmf_rdma_qpair_is_idle(struct spdk_nvmf_qpair *qpair) 2159 { 2160 struct spdk_nvmf_rdma_qpair *rqpair; 2161 2162 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2163 2164 if (spdk_nvmf_rdma_cur_queue_depth(rqpair) == 0) { 2165 return true; 2166 } 2167 return false; 2168 } 2169 2170 static void 2171 spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, 2172 struct spdk_nvmf_rdma_qpair *rqpair, bool drain) 2173 { 2174 struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp; 2175 struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; 2176 2177 /* We process I/O in the data transfer pending queue at the highest priority. 
RDMA reads first. */
	TAILQ_FOREACH_SAFE(rdma_req,
			   &rqpair->state_queue[RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING],
			   state_link, req_tmp) {
		if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
			break;
		}
	}

	/* Then RDMA writes, since reads have stronger restrictions than writes */
	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->state_queue[RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING],
			   state_link, req_tmp) {
		if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
			break;
		}
	}

	/* The second highest priority is I/O waiting on memory buffers. */
	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->poller->group->pending_data_buf_queue, link,
			   req_tmp) {
		if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
			break;
		}
	}

	/* The lowest priority is processing newly received commands */
	TAILQ_FOREACH_SAFE(rdma_recv, &rqpair->incoming_queue, link, recv_tmp) {
		if (TAILQ_EMPTY(&rqpair->state_queue[RDMA_REQUEST_STATE_FREE])) {
			break;
		}

		rdma_req = TAILQ_FIRST(&rqpair->state_queue[RDMA_REQUEST_STATE_FREE]);
		rdma_req->recv = rdma_recv;
		spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_NEW);
		if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) {
			break;
		}
	}
}

static void
_nvmf_rdma_qpair_disconnect(void *ctx)
{
	struct spdk_nvmf_qpair *qpair = ctx;

	spdk_nvmf_qpair_disconnect(qpair, NULL, NULL);
}

static void
_nvmf_rdma_try_disconnect(void *ctx)
{
	struct spdk_nvmf_qpair *qpair = ctx;
	struct spdk_nvmf_poll_group *group;

	/* Read the group out of the qpair. This is normally set and accessed only from
	 * the thread that created the group. Here, we're not necessarily on that thread.
	 * The data member qpair->group begins its life as NULL and then is assigned to
	 * a pointer and never changes. So fortunately reading this and checking for
	 * non-NULL is thread safe in the x86_64 memory model. */
	group = qpair->group;

	if (group == NULL) {
		/* The qpair hasn't been assigned to a group yet, so we can't
		 * process a disconnect. Send a message to ourselves and try again. 
*/ 2241 spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_try_disconnect, qpair); 2242 return; 2243 } 2244 2245 spdk_thread_send_msg(group->thread, _nvmf_rdma_qpair_disconnect, qpair); 2246 } 2247 2248 static inline void 2249 spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair) 2250 { 2251 if (__sync_bool_compare_and_swap(&rqpair->disconnect_started, false, true)) { 2252 _nvmf_rdma_try_disconnect(&rqpair->qpair); 2253 } 2254 } 2255 2256 2257 static int 2258 nvmf_rdma_disconnect(struct rdma_cm_event *evt) 2259 { 2260 struct spdk_nvmf_qpair *qpair; 2261 struct spdk_nvmf_rdma_qpair *rqpair; 2262 2263 if (evt->id == NULL) { 2264 SPDK_ERRLOG("disconnect request: missing cm_id\n"); 2265 return -1; 2266 } 2267 2268 qpair = evt->id->context; 2269 if (qpair == NULL) { 2270 SPDK_ERRLOG("disconnect request: no active connection\n"); 2271 return -1; 2272 } 2273 2274 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2275 2276 spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0); 2277 2278 spdk_nvmf_rdma_update_ibv_state(rqpair); 2279 2280 spdk_nvmf_rdma_start_disconnect(rqpair); 2281 2282 return 0; 2283 } 2284 2285 #ifdef DEBUG 2286 static const char *CM_EVENT_STR[] = { 2287 "RDMA_CM_EVENT_ADDR_RESOLVED", 2288 "RDMA_CM_EVENT_ADDR_ERROR", 2289 "RDMA_CM_EVENT_ROUTE_RESOLVED", 2290 "RDMA_CM_EVENT_ROUTE_ERROR", 2291 "RDMA_CM_EVENT_CONNECT_REQUEST", 2292 "RDMA_CM_EVENT_CONNECT_RESPONSE", 2293 "RDMA_CM_EVENT_CONNECT_ERROR", 2294 "RDMA_CM_EVENT_UNREACHABLE", 2295 "RDMA_CM_EVENT_REJECTED", 2296 "RDMA_CM_EVENT_ESTABLISHED", 2297 "RDMA_CM_EVENT_DISCONNECTED", 2298 "RDMA_CM_EVENT_DEVICE_REMOVAL", 2299 "RDMA_CM_EVENT_MULTICAST_JOIN", 2300 "RDMA_CM_EVENT_MULTICAST_ERROR", 2301 "RDMA_CM_EVENT_ADDR_CHANGE", 2302 "RDMA_CM_EVENT_TIMEWAIT_EXIT" 2303 }; 2304 #endif /* DEBUG */ 2305 2306 static void 2307 spdk_nvmf_process_cm_event(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 2308 { 2309 struct spdk_nvmf_rdma_transport *rtransport; 2310 struct rdma_cm_event *event; 2311 int rc; 2312 2313 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2314 2315 if (rtransport->event_channel == NULL) { 2316 return; 2317 } 2318 2319 while (1) { 2320 rc = rdma_get_cm_event(rtransport->event_channel, &event); 2321 if (rc == 0) { 2322 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); 2323 2324 spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event); 2325 2326 switch (event->event) { 2327 case RDMA_CM_EVENT_ADDR_RESOLVED: 2328 case RDMA_CM_EVENT_ADDR_ERROR: 2329 case RDMA_CM_EVENT_ROUTE_RESOLVED: 2330 case RDMA_CM_EVENT_ROUTE_ERROR: 2331 /* No action required. The target never attempts to resolve routes. */ 2332 break; 2333 case RDMA_CM_EVENT_CONNECT_REQUEST: 2334 rc = nvmf_rdma_connect(transport, event, cb_fn); 2335 if (rc < 0) { 2336 SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); 2337 break; 2338 } 2339 break; 2340 case RDMA_CM_EVENT_CONNECT_RESPONSE: 2341 /* The target never initiates a new connection. So this will not occur. */ 2342 break; 2343 case RDMA_CM_EVENT_CONNECT_ERROR: 2344 /* Can this happen? The docs say it can, but not sure what causes it. */ 2345 break; 2346 case RDMA_CM_EVENT_UNREACHABLE: 2347 case RDMA_CM_EVENT_REJECTED: 2348 /* These only occur on the client side. */ 2349 break; 2350 case RDMA_CM_EVENT_ESTABLISHED: 2351 /* TODO: Should we be waiting for this event anywhere? 
*/ 2352 break; 2353 case RDMA_CM_EVENT_DISCONNECTED: 2354 case RDMA_CM_EVENT_DEVICE_REMOVAL: 2355 rc = nvmf_rdma_disconnect(event); 2356 if (rc < 0) { 2357 SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); 2358 break; 2359 } 2360 break; 2361 case RDMA_CM_EVENT_MULTICAST_JOIN: 2362 case RDMA_CM_EVENT_MULTICAST_ERROR: 2363 /* Multicast is not used */ 2364 break; 2365 case RDMA_CM_EVENT_ADDR_CHANGE: 2366 /* Not utilizing this event */ 2367 break; 2368 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 2369 /* For now, do nothing. The target never re-uses queue pairs. */ 2370 break; 2371 default: 2372 SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); 2373 break; 2374 } 2375 2376 rdma_ack_cm_event(event); 2377 } else { 2378 if (errno != EAGAIN && errno != EWOULDBLOCK) { 2379 SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); 2380 } 2381 break; 2382 } 2383 } 2384 } 2385 2386 static void 2387 spdk_nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) 2388 { 2389 int rc; 2390 struct spdk_nvmf_rdma_qpair *rqpair; 2391 struct ibv_async_event event; 2392 enum ibv_qp_state state; 2393 2394 rc = ibv_get_async_event(device->context, &event); 2395 2396 if (rc) { 2397 SPDK_ERRLOG("Failed to get async_event (%d): %s\n", 2398 errno, spdk_strerror(errno)); 2399 return; 2400 } 2401 2402 SPDK_NOTICELOG("Async event: %s\n", 2403 ibv_event_type_str(event.event_type)); 2404 2405 switch (event.event_type) { 2406 case IBV_EVENT_QP_FATAL: 2407 rqpair = event.element.qp->qp_context; 2408 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 2409 (uintptr_t)rqpair->cm_id, event.event_type); 2410 spdk_nvmf_rdma_update_ibv_state(rqpair); 2411 spdk_nvmf_rdma_start_disconnect(rqpair); 2412 break; 2413 case IBV_EVENT_QP_LAST_WQE_REACHED: 2414 /* This event only occurs for shared receive queues, which are not currently supported. */ 2415 break; 2416 case IBV_EVENT_SQ_DRAINED: 2417 /* This event occurs frequently in both error and non-error states. 2418 * Check if the qpair is in an error state before sending a message. 2419 * Note that we're not on the correct thread to access the qpair, but 2420 * the operations that the below calls make all happen to be thread 2421 * safe. 
*/ 2422 rqpair = event.element.qp->qp_context; 2423 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 2424 (uintptr_t)rqpair->cm_id, event.event_type); 2425 state = spdk_nvmf_rdma_update_ibv_state(rqpair); 2426 if (state == IBV_QPS_ERR) { 2427 spdk_nvmf_rdma_start_disconnect(rqpair); 2428 } 2429 break; 2430 case IBV_EVENT_QP_REQ_ERR: 2431 case IBV_EVENT_QP_ACCESS_ERR: 2432 case IBV_EVENT_COMM_EST: 2433 case IBV_EVENT_PATH_MIG: 2434 case IBV_EVENT_PATH_MIG_ERR: 2435 rqpair = event.element.qp->qp_context; 2436 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 2437 (uintptr_t)rqpair->cm_id, event.event_type); 2438 spdk_nvmf_rdma_update_ibv_state(rqpair); 2439 break; 2440 case IBV_EVENT_CQ_ERR: 2441 case IBV_EVENT_DEVICE_FATAL: 2442 case IBV_EVENT_PORT_ACTIVE: 2443 case IBV_EVENT_PORT_ERR: 2444 case IBV_EVENT_LID_CHANGE: 2445 case IBV_EVENT_PKEY_CHANGE: 2446 case IBV_EVENT_SM_CHANGE: 2447 case IBV_EVENT_SRQ_ERR: 2448 case IBV_EVENT_SRQ_LIMIT_REACHED: 2449 case IBV_EVENT_CLIENT_REREGISTER: 2450 case IBV_EVENT_GID_CHANGE: 2451 default: 2452 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type); 2453 break; 2454 } 2455 ibv_ack_async_event(&event); 2456 } 2457 2458 static void 2459 spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 2460 { 2461 int nfds, i = 0; 2462 struct spdk_nvmf_rdma_transport *rtransport; 2463 struct spdk_nvmf_rdma_device *device, *tmp; 2464 2465 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2466 nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); 2467 2468 if (nfds <= 0) { 2469 return; 2470 } 2471 2472 /* The first poll descriptor is RDMA CM event */ 2473 if (rtransport->poll_fds[i++].revents & POLLIN) { 2474 spdk_nvmf_process_cm_event(transport, cb_fn); 2475 nfds--; 2476 } 2477 2478 if (nfds == 0) { 2479 return; 2480 } 2481 2482 /* Second and subsequent poll descriptors are IB async events */ 2483 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 2484 if (rtransport->poll_fds[i++].revents & POLLIN) { 2485 spdk_nvmf_process_ib_event(device); 2486 nfds--; 2487 } 2488 } 2489 /* check all flagged fd's have been served */ 2490 assert(nfds == 0); 2491 } 2492 2493 static void 2494 spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport, 2495 struct spdk_nvme_transport_id *trid, 2496 struct spdk_nvmf_discovery_log_page_entry *entry) 2497 { 2498 entry->trtype = SPDK_NVMF_TRTYPE_RDMA; 2499 entry->adrfam = trid->adrfam; 2500 entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED; 2501 2502 spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); 2503 spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); 2504 2505 entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; 2506 entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; 2507 entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; 2508 } 2509 2510 static struct spdk_nvmf_transport_poll_group * 2511 spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) 2512 { 2513 struct spdk_nvmf_rdma_transport *rtransport; 2514 struct spdk_nvmf_rdma_poll_group *rgroup; 2515 struct spdk_nvmf_rdma_poller *poller, *tpoller; 2516 struct spdk_nvmf_rdma_device *device; 2517 2518 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2519 2520 rgroup = calloc(1, sizeof(*rgroup)); 2521 if (!rgroup) { 2522 return NULL; 2523 } 2524 2525 TAILQ_INIT(&rgroup->pollers); 2526 
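	/* One poller is created below for each RDMA device known to the transport, and
	 * each poller owns its own completion queue of DEFAULT_NVMF_RDMA_CQ_SIZE entries.
	 * Qpairs added to this group are matched to the poller for their device in
	 * spdk_nvmf_rdma_poll_group_add(). */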
TAILQ_INIT(&rgroup->pending_data_buf_queue); 2527 2528 pthread_mutex_lock(&rtransport->lock); 2529 TAILQ_FOREACH(device, &rtransport->devices, link) { 2530 poller = calloc(1, sizeof(*poller)); 2531 if (!poller) { 2532 SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); 2533 goto err_exit; 2534 } 2535 2536 poller->device = device; 2537 poller->group = rgroup; 2538 2539 TAILQ_INIT(&poller->qpairs); 2540 2541 poller->cq = ibv_create_cq(device->context, DEFAULT_NVMF_RDMA_CQ_SIZE, poller, NULL, 0); 2542 if (!poller->cq) { 2543 SPDK_ERRLOG("Unable to create completion queue\n"); 2544 free(poller); 2545 goto err_exit; 2546 } 2547 poller->num_cqe = DEFAULT_NVMF_RDMA_CQ_SIZE; 2548 2549 TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); 2550 } 2551 2552 pthread_mutex_unlock(&rtransport->lock); 2553 return &rgroup->group; 2554 2555 err_exit: 2556 TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tpoller) { 2557 TAILQ_REMOVE(&rgroup->pollers, poller, link); 2558 if (poller->cq) { 2559 ibv_destroy_cq(poller->cq); 2560 } 2561 free(poller); 2562 } 2563 2564 free(rgroup); 2565 pthread_mutex_unlock(&rtransport->lock); 2566 return NULL; 2567 } 2568 2569 static void 2570 spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 2571 { 2572 struct spdk_nvmf_rdma_poll_group *rgroup; 2573 struct spdk_nvmf_rdma_poller *poller, *tmp; 2574 struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; 2575 2576 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 2577 2578 if (!rgroup) { 2579 return; 2580 } 2581 2582 TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { 2583 TAILQ_REMOVE(&rgroup->pollers, poller, link); 2584 2585 if (poller->cq) { 2586 ibv_destroy_cq(poller->cq); 2587 } 2588 TAILQ_FOREACH_SAFE(qpair, &poller->qpairs, link, tmp_qpair) { 2589 spdk_nvmf_rdma_qpair_destroy(qpair); 2590 } 2591 2592 free(poller); 2593 } 2594 2595 if (!TAILQ_EMPTY(&rgroup->pending_data_buf_queue)) { 2596 SPDK_ERRLOG("Pending I/O list wasn't empty on poll group destruction\n"); 2597 } 2598 2599 free(rgroup); 2600 } 2601 2602 static void 2603 spdk_nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair) 2604 { 2605 spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 2606 spdk_nvmf_rdma_qpair_destroy(rqpair); 2607 } 2608 2609 static int 2610 spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 2611 struct spdk_nvmf_qpair *qpair) 2612 { 2613 struct spdk_nvmf_rdma_poll_group *rgroup; 2614 struct spdk_nvmf_rdma_qpair *rqpair; 2615 struct spdk_nvmf_rdma_device *device; 2616 struct spdk_nvmf_rdma_poller *poller; 2617 int rc; 2618 2619 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 2620 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2621 2622 device = rqpair->port->device; 2623 2624 TAILQ_FOREACH(poller, &rgroup->pollers, link) { 2625 if (poller->device == device) { 2626 break; 2627 } 2628 } 2629 2630 if (!poller) { 2631 SPDK_ERRLOG("No poller found for device.\n"); 2632 return -1; 2633 } 2634 2635 TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); 2636 rqpair->poller = poller; 2637 2638 rc = spdk_nvmf_rdma_qpair_initialize(qpair); 2639 if (rc < 0) { 2640 SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); 2641 return -1; 2642 } 2643 2644 rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair); 2645 if (rc) { 2646 /* Try to reject, but we probably can't */ 2647 spdk_nvmf_rdma_qpair_reject_connection(rqpair); 2648 return -1; 2649 } 2650 2651 
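	/* Refresh the cached ibv queue pair state now that the connection has been
	 * accepted; subsequent request processing keys off of rqpair->ibv_attr.qp_state. */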
spdk_nvmf_rdma_update_ibv_state(rqpair); 2652 2653 return 0; 2654 } 2655 2656 static int 2657 spdk_nvmf_rdma_request_free(struct spdk_nvmf_request *req) 2658 { 2659 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 2660 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 2661 struct spdk_nvmf_rdma_transport, transport); 2662 2663 nvmf_rdma_request_free(rdma_req, rtransport); 2664 return 0; 2665 } 2666 2667 static int 2668 spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req) 2669 { 2670 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 2671 struct spdk_nvmf_rdma_transport, transport); 2672 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, 2673 struct spdk_nvmf_rdma_request, req); 2674 struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, 2675 struct spdk_nvmf_rdma_qpair, qpair); 2676 2677 if (rqpair->ibv_attr.qp_state != IBV_QPS_ERR) { 2678 /* The connection is alive, so process the request as normal */ 2679 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_EXECUTED); 2680 } else { 2681 /* The connection is dead. Move the request directly to the completed state. */ 2682 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); 2683 } 2684 2685 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2686 2687 return 0; 2688 } 2689 2690 static void 2691 spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) 2692 { 2693 struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2694 struct ibv_recv_wr recv_wr = {}; 2695 struct ibv_recv_wr *bad_recv_wr; 2696 struct ibv_send_wr send_wr = {}; 2697 struct ibv_send_wr *bad_send_wr; 2698 int rc; 2699 2700 if (rqpair->disconnect_flags & RDMA_QP_DISCONNECTING) { 2701 return; 2702 } 2703 2704 rqpair->disconnect_flags |= RDMA_QP_DISCONNECTING; 2705 2706 /* This happens only when the qpair is disconnected before 2707 * it is added to the poll group. Since there is no poll group, 2708 * the RDMA qp has not been initialized yet and the RDMA CM 2709 * event has not yet been acknowledged, so we need to reject it. 
2710 */ 2711 if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) { 2712 spdk_nvmf_rdma_qpair_reject_connection(rqpair); 2713 return; 2714 } 2715 2716 if (rqpair->ibv_attr.qp_state != IBV_QPS_ERR) { 2717 spdk_nvmf_rdma_set_ibv_state(rqpair, IBV_QPS_ERR); 2718 } 2719 2720 rqpair->drain_recv_wr.type = RDMA_WR_TYPE_DRAIN_RECV; 2721 recv_wr.wr_id = (uintptr_t)&rqpair->drain_recv_wr; 2722 rc = ibv_post_recv(rqpair->cm_id->qp, &recv_wr, &bad_recv_wr); 2723 if (rc) { 2724 SPDK_ERRLOG("Failed to post dummy receive WR, errno %d\n", errno); 2725 assert(false); 2726 return; 2727 } 2728 2729 rqpair->drain_send_wr.type = RDMA_WR_TYPE_DRAIN_SEND; 2730 send_wr.wr_id = (uintptr_t)&rqpair->drain_send_wr; 2731 send_wr.opcode = IBV_WR_SEND; 2732 rc = ibv_post_send(rqpair->cm_id->qp, &send_wr, &bad_send_wr); 2733 if (rc) { 2734 SPDK_ERRLOG("Failed to post dummy send WR, errno %d\n", errno); 2735 assert(false); 2736 return; 2737 } 2738 rqpair->current_send_depth++; 2739 } 2740 2741 #ifdef DEBUG 2742 static int 2743 spdk_nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req) 2744 { 2745 return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST || 2746 rdma_req->state == RDMA_REQUEST_STATE_COMPLETING; 2747 } 2748 #endif 2749 2750 static int 2751 spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, 2752 struct spdk_nvmf_rdma_poller *rpoller) 2753 { 2754 struct ibv_wc wc[32]; 2755 struct spdk_nvmf_rdma_wr *rdma_wr; 2756 struct spdk_nvmf_rdma_request *rdma_req; 2757 struct spdk_nvmf_rdma_recv *rdma_recv; 2758 struct spdk_nvmf_rdma_qpair *rqpair; 2759 int reaped, i; 2760 int count = 0; 2761 bool error = false; 2762 2763 /* Poll for completing operations. */ 2764 reaped = ibv_poll_cq(rpoller->cq, 32, wc); 2765 if (reaped < 0) { 2766 SPDK_ERRLOG("Error polling CQ! (%d): %s\n", 2767 errno, spdk_strerror(errno)); 2768 return -1; 2769 } 2770 2771 for (i = 0; i < reaped; i++) { 2772 2773 rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id; 2774 2775 /* Handle error conditions */ 2776 if (wc[i].status) { 2777 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "CQ error on CQ %p, Request 0x%lu (%d): %s\n", 2778 rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); 2779 2780 error = true; 2781 2782 switch (rdma_wr->type) { 2783 case RDMA_WR_TYPE_SEND: 2784 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); 2785 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2786 2787 SPDK_ERRLOG("data=%p length=%u\n", rdma_req->req.data, rdma_req->req.length); 2788 /* We're going to attempt an error recovery, so force the request into 2789 * the completed state. */ 2790 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); 2791 rqpair->current_send_depth--; 2792 2793 assert(rdma_req->num_outstanding_data_wr == 0); 2794 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2795 break; 2796 case RDMA_WR_TYPE_RECV: 2797 rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 2798 rqpair = rdma_recv->qpair; 2799 2800 /* Dump this into the incoming queue. This gets cleaned up when 2801 * the queue pair disconnects or recovers. 
			 */
			TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link);
			rqpair->current_recv_depth++;

			/* Don't worry about responding to recv overflow, we are disconnecting anyway */
			break;
		case RDMA_WR_TYPE_DATA:
			/* If the data transfer fails, still force the qpair into the error state.
			 * If we were performing an RDMA_READ, we need to force the request into a
			 * completed state since it wasn't linked to a send. However, in the RDMA_WRITE
			 * case, we should wait for the SEND to complete. */
			rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr);
			rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);

			SPDK_ERRLOG("data=%p length=%u\n", rdma_req->req.data, rdma_req->req.length);
			assert(rdma_req->num_outstanding_data_wr > 0);
			rdma_req->num_outstanding_data_wr--;
			if (rdma_req->data.wr.opcode == IBV_WR_RDMA_READ) {
				rqpair->current_read_depth--;
				if (rdma_req->num_outstanding_data_wr == 0) {
					spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED);
				}
			}
			rqpair->current_send_depth--;
			break;
		case RDMA_WR_TYPE_DRAIN_RECV:
			rqpair = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_qpair, drain_recv_wr);
			assert(rqpair->disconnect_flags & RDMA_QP_DISCONNECTING);
			SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Drained QP RECV %u (%p)\n", rqpair->qpair.qid, rqpair);
			rqpair->disconnect_flags |= RDMA_QP_RECV_DRAINED;
			assert(rqpair->current_recv_depth == rqpair->max_queue_depth);
			/* Don't worry about responding to recv overflow, we are disconnecting anyway */
			if (rqpair->disconnect_flags & RDMA_QP_SEND_DRAINED) {
				spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, true);
				spdk_nvmf_rdma_qpair_destroy(rqpair);
			}
			/* Continue so that this does not trigger the disconnect path below. */
			continue;
		case RDMA_WR_TYPE_DRAIN_SEND:
			rqpair = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_qpair, drain_send_wr);
			assert(rqpair->disconnect_flags & RDMA_QP_DISCONNECTING);
			SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Drained QP SEND %u (%p)\n", rqpair->qpair.qid, rqpair);
			rqpair->disconnect_flags |= RDMA_QP_SEND_DRAINED;
			rqpair->current_send_depth--;
			if (rqpair->disconnect_flags & RDMA_QP_RECV_DRAINED) {
				spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, true);
				spdk_nvmf_rdma_qpair_destroy(rqpair);
			}
			/* Continue so that this does not trigger the disconnect path below. */
			continue;
		default:
			SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode);
			continue;
		}

		if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) {
			/* Disconnect the connection. 
*/ 2858 spdk_nvmf_rdma_start_disconnect(rqpair); 2859 } 2860 continue; 2861 } 2862 2863 switch (wc[i].opcode) { 2864 case IBV_WC_SEND: 2865 assert(rdma_wr->type == RDMA_WR_TYPE_SEND); 2866 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); 2867 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2868 2869 assert(spdk_nvmf_rdma_req_is_completing(rdma_req)); 2870 2871 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); 2872 rqpair->current_send_depth--; 2873 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2874 2875 count++; 2876 2877 assert(rdma_req->num_outstanding_data_wr == 0); 2878 /* Try to process other queued requests */ 2879 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 2880 break; 2881 2882 case IBV_WC_RDMA_WRITE: 2883 assert(rdma_wr->type == RDMA_WR_TYPE_DATA); 2884 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 2885 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2886 rqpair->current_send_depth--; 2887 rdma_req->num_outstanding_data_wr--; 2888 2889 /* Try to process other queued requests */ 2890 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 2891 break; 2892 2893 case IBV_WC_RDMA_READ: 2894 assert(rdma_wr->type == RDMA_WR_TYPE_DATA); 2895 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 2896 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2897 rqpair->current_send_depth--; 2898 2899 assert(rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); 2900 /* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */ 2901 assert(rdma_req->num_outstanding_data_wr > 0); 2902 rqpair->current_read_depth--; 2903 rdma_req->num_outstanding_data_wr--; 2904 if (rdma_req->num_outstanding_data_wr == 0) { 2905 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE); 2906 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2907 } 2908 2909 /* Try to process other queued requests */ 2910 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 2911 break; 2912 2913 case IBV_WC_RECV: 2914 assert(rdma_wr->type == RDMA_WR_TYPE_RECV); 2915 rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 2916 rqpair = rdma_recv->qpair; 2917 /* The qpair should not send more requests than are allowed per qpair. 
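		 * If the host exceeds that limit, the qpair is disconnected below rather
		 * than processing the command.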
*/ 2918 if (rqpair->current_recv_depth >= rqpair->max_queue_depth) { 2919 spdk_nvmf_rdma_start_disconnect(rqpair); 2920 } else { 2921 rqpair->current_recv_depth++; 2922 } 2923 TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link); 2924 /* Try to process other queued requests */ 2925 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 2926 break; 2927 2928 default: 2929 SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); 2930 continue; 2931 } 2932 } 2933 2934 if (error == true) { 2935 return -1; 2936 } 2937 2938 return count; 2939 } 2940 2941 static int 2942 spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 2943 { 2944 struct spdk_nvmf_rdma_transport *rtransport; 2945 struct spdk_nvmf_rdma_poll_group *rgroup; 2946 struct spdk_nvmf_rdma_poller *rpoller; 2947 int count, rc; 2948 2949 rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); 2950 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 2951 2952 count = 0; 2953 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 2954 rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller); 2955 if (rc < 0) { 2956 return rc; 2957 } 2958 count += rc; 2959 } 2960 2961 return count; 2962 } 2963 2964 static int 2965 spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 2966 struct spdk_nvme_transport_id *trid, 2967 bool peer) 2968 { 2969 struct sockaddr *saddr; 2970 uint16_t port; 2971 2972 trid->trtype = SPDK_NVME_TRANSPORT_RDMA; 2973 2974 if (peer) { 2975 saddr = rdma_get_peer_addr(id); 2976 } else { 2977 saddr = rdma_get_local_addr(id); 2978 } 2979 switch (saddr->sa_family) { 2980 case AF_INET: { 2981 struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr; 2982 2983 trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; 2984 inet_ntop(AF_INET, &saddr_in->sin_addr, 2985 trid->traddr, sizeof(trid->traddr)); 2986 if (peer) { 2987 port = ntohs(rdma_get_dst_port(id)); 2988 } else { 2989 port = ntohs(rdma_get_src_port(id)); 2990 } 2991 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 2992 break; 2993 } 2994 case AF_INET6: { 2995 struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr; 2996 trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; 2997 inet_ntop(AF_INET6, &saddr_in->sin6_addr, 2998 trid->traddr, sizeof(trid->traddr)); 2999 if (peer) { 3000 port = ntohs(rdma_get_dst_port(id)); 3001 } else { 3002 port = ntohs(rdma_get_src_port(id)); 3003 } 3004 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 3005 break; 3006 } 3007 default: 3008 return -1; 3009 3010 } 3011 3012 return 0; 3013 } 3014 3015 static int 3016 spdk_nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 3017 struct spdk_nvme_transport_id *trid) 3018 { 3019 struct spdk_nvmf_rdma_qpair *rqpair; 3020 3021 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3022 3023 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true); 3024 } 3025 3026 static int 3027 spdk_nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 3028 struct spdk_nvme_transport_id *trid) 3029 { 3030 struct spdk_nvmf_rdma_qpair *rqpair; 3031 3032 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3033 3034 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false); 3035 } 3036 3037 static int 3038 spdk_nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 3039 struct spdk_nvme_transport_id *trid) 3040 { 3041 struct spdk_nvmf_rdma_qpair *rqpair; 3042 3043 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3044 3045 
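	/* The listen trid is taken from the listening cm_id (listen_id) rather than the
	 * per-connection cm_id, so it reflects the address the target was listening on. */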
return spdk_nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false); 3046 } 3047 3048 void 3049 spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) 3050 { 3051 g_nvmf_hooks = *hooks; 3052 } 3053 3054 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { 3055 .type = SPDK_NVME_TRANSPORT_RDMA, 3056 .opts_init = spdk_nvmf_rdma_opts_init, 3057 .create = spdk_nvmf_rdma_create, 3058 .destroy = spdk_nvmf_rdma_destroy, 3059 3060 .listen = spdk_nvmf_rdma_listen, 3061 .stop_listen = spdk_nvmf_rdma_stop_listen, 3062 .accept = spdk_nvmf_rdma_accept, 3063 3064 .listener_discover = spdk_nvmf_rdma_discover, 3065 3066 .poll_group_create = spdk_nvmf_rdma_poll_group_create, 3067 .poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy, 3068 .poll_group_add = spdk_nvmf_rdma_poll_group_add, 3069 .poll_group_poll = spdk_nvmf_rdma_poll_group_poll, 3070 3071 .req_free = spdk_nvmf_rdma_request_free, 3072 .req_complete = spdk_nvmf_rdma_request_complete, 3073 3074 .qpair_fini = spdk_nvmf_rdma_close_qpair, 3075 .qpair_is_idle = spdk_nvmf_rdma_qpair_is_idle, 3076 .qpair_get_peer_trid = spdk_nvmf_rdma_qpair_get_peer_trid, 3077 .qpair_get_local_trid = spdk_nvmf_rdma_qpair_get_local_trid, 3078 .qpair_get_listen_trid = spdk_nvmf_rdma_qpair_get_listen_trid, 3079 3080 }; 3081 3082 SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA) 3083
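/*
 * Usage sketch (illustrative only, not compiled as part of this file): an application
 * that manages its own RDMA protection domains can supply them through the hooks
 * interface before the RDMA transport is created. Only the get_ibv_pd hook is used in
 * this file; the callback and lookup helper names below are hypothetical, and the exact
 * prototype of the hook is defined by struct spdk_nvme_rdma_hooks in the public headers.
 *
 *	static struct ibv_pd *
 *	my_get_ibv_pd(const struct spdk_nvme_transport_id *trid, struct ibv_context *verbs)
 *	{
 *		// Return a protection domain that the application already allocated for
 *		// this device. When this hook is set, the target does not call
 *		// ibv_alloc_pd() or ibv_dealloc_pd() itself.
 *		return my_lookup_pd_for_device(verbs);
 *	}
 *
 *	static struct spdk_nvme_rdma_hooks my_hooks = {
 *		.get_ibv_pd = my_get_ibv_pd,
 *	};
 *
 *	// Call once before the RDMA transport is created.
 *	spdk_nvmf_rdma_init_hooks(&my_hooks);
 */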