/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

#include "spdk/config.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/nvmf_transport.h"
#include "spdk/string.h"
#include "spdk/trace.h"
#include "spdk/util.h"

#include "spdk_internal/assert.h"
#include "spdk_internal/log.h"

struct spdk_nvme_rdma_hooks g_nvmf_hooks = {};
const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma;

/*
 RDMA Connection Resource Defaults
 */
#define NVMF_DEFAULT_TX_SGE		SPDK_NVMF_MAX_SGL_ENTRIES
#define NVMF_DEFAULT_RSP_SGE		1
#define NVMF_DEFAULT_RX_SGE		2

/* The RDMA completion queue size */
#define DEFAULT_NVMF_RDMA_CQ_SIZE	4096
#define MAX_WR_PER_QP(queue_depth)	(queue_depth * 3 + 2)

/* Timeout for destroying defunct rqpairs */
#define NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US	4000000

static int g_spdk_nvmf_ibv_query_mask =
	IBV_QP_STATE |
	IBV_QP_PKEY_INDEX |
	IBV_QP_PORT |
	IBV_QP_ACCESS_FLAGS |
	IBV_QP_AV |
	IBV_QP_PATH_MTU |
	IBV_QP_DEST_QPN |
	IBV_QP_RQ_PSN |
	IBV_QP_MAX_DEST_RD_ATOMIC |
	IBV_QP_MIN_RNR_TIMER |
	IBV_QP_SQ_PSN |
	IBV_QP_TIMEOUT |
	IBV_QP_RETRY_CNT |
	IBV_QP_RNR_RETRY |
	IBV_QP_MAX_QP_RD_ATOMIC;

enum spdk_nvmf_rdma_request_state {
	/* The request is not currently in use */
	RDMA_REQUEST_STATE_FREE = 0,

	/* Initial state when request first received */
	RDMA_REQUEST_STATE_NEW,

	/* The request is queued until a data buffer is available. */
	RDMA_REQUEST_STATE_NEED_BUFFER,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the host to the controller.
	 */
	RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,

	/* The request is currently transferring data from the host to the controller. */
	RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,

	/* The request is ready to execute at the block device */
	RDMA_REQUEST_STATE_READY_TO_EXECUTE,

	/* The request is currently executing at the block device */
	RDMA_REQUEST_STATE_EXECUTING,

	/* The request finished executing at the block device */
	RDMA_REQUEST_STATE_EXECUTED,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the controller to the host.
	 */
	RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,

	/* The request is ready to send a completion */
	RDMA_REQUEST_STATE_READY_TO_COMPLETE,

	/* The request is currently transferring data from the controller to the host. */
	RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,

	/* The request currently has an outstanding completion without an
	 * associated data transfer.
	 */
	RDMA_REQUEST_STATE_COMPLETING,

	/* The request completed and can be marked free. */
	RDMA_REQUEST_STATE_COMPLETED,

	/* Terminator */
	RDMA_REQUEST_NUM_STATES,
};

#define OBJECT_NVMF_RDMA_IO	0x40

#define TRACE_GROUP_NVMF_RDMA	0x4
#define TRACE_RDMA_REQUEST_STATE_NEW	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0)
#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1)
#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4)
#define TRACE_RDMA_REQUEST_STATE_EXECUTING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5)
#define TRACE_RDMA_REQUEST_STATE_EXECUTED	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6)
#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9)
#define TRACE_RDMA_REQUEST_STATE_COMPLETING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA)
#define TRACE_RDMA_REQUEST_STATE_COMPLETED	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xB)
#define TRACE_RDMA_QP_CREATE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xC)
#define TRACE_RDMA_IBV_ASYNC_EVENT	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xD)
#define TRACE_RDMA_CM_ASYNC_EVENT	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xE)
#define TRACE_RDMA_QP_STATE_CHANGE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xF)
#define TRACE_RDMA_QP_DISCONNECT	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x10)
#define TRACE_RDMA_QP_DESTROY	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x11)

SPDK_TRACE_REGISTER_FN(nvmf_trace, "nvmf_rdma", TRACE_GROUP_NVMF_RDMA)
{
	spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r');
	spdk_trace_register_description("RDMA_REQ_NEW", TRACE_RDMA_REQUEST_STATE_NEW,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", TRACE_RDMA_REQUEST_STATE_NEED_BUFFER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_C2H",
					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_H2C",
					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_H2C",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE",
					TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_EXECUTING",
					TRACE_RDMA_REQUEST_STATE_EXECUTING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_EXECUTED",
					TRACE_RDMA_REQUEST_STATE_EXECUTED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPL",
					TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETING_C2H",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETING",
					TRACE_RDMA_REQUEST_STATE_COMPLETING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETED",
					TRACE_RDMA_REQUEST_STATE_COMPLETED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");

	spdk_trace_register_description("RDMA_QP_CREATE", TRACE_RDMA_QP_CREATE,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
	spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", TRACE_RDMA_IBV_ASYNC_EVENT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
	spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", TRACE_RDMA_CM_ASYNC_EVENT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
	spdk_trace_register_description("RDMA_QP_STATE_CHANGE", TRACE_RDMA_QP_STATE_CHANGE,
					OWNER_NONE, OBJECT_NONE, 0, 1, "state: ");
	spdk_trace_register_description("RDMA_QP_DISCONNECT", TRACE_RDMA_QP_DISCONNECT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
	spdk_trace_register_description("RDMA_QP_DESTROY", TRACE_RDMA_QP_DESTROY,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
}

enum spdk_nvmf_rdma_wr_type {
	RDMA_WR_TYPE_RECV,
	RDMA_WR_TYPE_SEND,
	RDMA_WR_TYPE_DATA,
};

struct spdk_nvmf_rdma_wr {
	enum spdk_nvmf_rdma_wr_type type;
};

/* This structure holds commands as they are received off the wire.
 * It must be dynamically paired with a full request object
 * (spdk_nvmf_rdma_request) to service a request. It is separate
 * from the request because RDMA does not appear to order
 * completions, so occasionally we'll get a new incoming
 * command when there aren't any free request objects.
 */
struct spdk_nvmf_rdma_recv {
	struct ibv_recv_wr wr;
	struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE];

	struct spdk_nvmf_rdma_qpair *qpair;

	/* In-capsule data buffer */
	uint8_t *buf;

	struct spdk_nvmf_rdma_wr rdma_wr;
	uint64_t receive_tsc;

	STAILQ_ENTRY(spdk_nvmf_rdma_recv) link;
};

struct spdk_nvmf_rdma_request_data {
	struct spdk_nvmf_rdma_wr rdma_wr;
	struct ibv_send_wr wr;
	struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
};

struct spdk_nvmf_rdma_request {
	struct spdk_nvmf_request req;

	enum spdk_nvmf_rdma_request_state state;

	struct spdk_nvmf_rdma_recv *recv;

	struct {
		struct spdk_nvmf_rdma_wr rdma_wr;
		struct ibv_send_wr wr;
		struct ibv_sge sgl[NVMF_DEFAULT_RSP_SGE];
	} rsp;

	struct spdk_nvmf_rdma_request_data data;

	uint32_t iovpos;

	uint32_t num_outstanding_data_wr;
	uint64_t receive_tsc;

	STAILQ_ENTRY(spdk_nvmf_rdma_request) state_link;
};

enum spdk_nvmf_rdma_qpair_disconnect_flags {
	RDMA_QP_DISCONNECTING	= 1,
	RDMA_QP_RECV_DRAINED	= 1 << 1,
	RDMA_QP_SEND_DRAINED	= 1 << 2
};

struct spdk_nvmf_rdma_resource_opts {
	struct spdk_nvmf_rdma_qpair *qpair;
	/* qp points either to an ibv_qp object or an ibv_srq object depending on the value of shared. */
	void *qp;
	struct ibv_pd *pd;
	uint32_t max_queue_depth;
	uint32_t in_capsule_data_size;
	bool shared;
};

struct spdk_nvmf_send_wr_list {
	struct ibv_send_wr *first;
	struct ibv_send_wr *last;
};

struct spdk_nvmf_recv_wr_list {
	struct ibv_recv_wr *first;
	struct ibv_recv_wr *last;
};

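/* Receive resources for a queue pair, or shared by all qpairs on a poller when
 * a per-poller SRQ is used: the request/recv arrays, the registered memory
 * backing command capsules, completions and in-capsule data, and the queues
 * used to track free requests and received-but-unmatched commands. */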
struct spdk_nvmf_rdma_resources {
	/* Array of size "max_queue_depth" containing RDMA requests. */
	struct spdk_nvmf_rdma_request *reqs;

	/* Array of size "max_queue_depth" containing RDMA recvs. */
	struct spdk_nvmf_rdma_recv *recvs;

	/* Array of size "max_queue_depth" containing 64 byte capsules
	 * used for receive.
	 */
	union nvmf_h2c_msg *cmds;
	struct ibv_mr *cmds_mr;

	/* Array of size "max_queue_depth" containing 16 byte completions
	 * to be sent back to the user.
	 */
	union nvmf_c2h_msg *cpls;
	struct ibv_mr *cpls_mr;

	/* Array of size "max_queue_depth * InCapsuleDataSize" containing
	 * buffers to be used for in capsule data.
	 */
	void *bufs;
	struct ibv_mr *bufs_mr;

	/* The list of pending recvs to transfer */
	struct spdk_nvmf_recv_wr_list recvs_to_post;

	/* Receives that are waiting for a request object */
	STAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue;

	/* Queue to track free requests */
	STAILQ_HEAD(, spdk_nvmf_rdma_request) free_queue;
};

typedef void (*spdk_nvmf_rdma_qpair_ibv_event)(struct spdk_nvmf_rdma_qpair *rqpair);

struct spdk_nvmf_rdma_ibv_event_ctx {
	struct spdk_nvmf_rdma_qpair *rqpair;
	spdk_nvmf_rdma_qpair_ibv_event cb_fn;
	/* Link to other ibv events associated with this qpair */
	STAILQ_ENTRY(spdk_nvmf_rdma_ibv_event_ctx) link;
};

struct spdk_nvmf_rdma_qpair {
	struct spdk_nvmf_qpair qpair;

	struct spdk_nvmf_rdma_device *device;
	struct spdk_nvmf_rdma_poller *poller;

	struct rdma_cm_id *cm_id;
	struct ibv_srq *srq;
	struct rdma_cm_id *listen_id;

	/* The maximum number of I/O outstanding on this connection at one time */
	uint16_t max_queue_depth;

	/* The maximum number of active RDMA READ and ATOMIC operations at one time */
	uint16_t max_read_depth;

	/* The maximum number of RDMA SEND operations at one time */
	uint32_t max_send_depth;

	/* The current number of outstanding WRs from this qpair's
	 * recv queue. Should not exceed device->attr.max_queue_depth.
	 */
	uint16_t current_recv_depth;

	/* The current number of active RDMA READ operations */
	uint16_t current_read_depth;

	/* The current number of posted WRs from this qpair's
	 * send queue. Should not exceed max_send_depth.
	 */
	uint32_t current_send_depth;

	/* The maximum number of SGEs per WR on the send queue */
	uint32_t max_send_sge;

	/* The maximum number of SGEs per WR on the recv queue */
	uint32_t max_recv_sge;

	/* The list of pending send requests for a transfer */
	struct spdk_nvmf_send_wr_list sends_to_post;

	struct spdk_nvmf_rdma_resources *resources;

	STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_read_queue;

	STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_write_queue;

	/* Number of requests not in the free state */
	uint32_t qd;

	TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link;

	STAILQ_ENTRY(spdk_nvmf_rdma_qpair) recv_link;

	STAILQ_ENTRY(spdk_nvmf_rdma_qpair) send_link;

	/* IBV queue pair attributes: they are used to manage
	 * qp state and recover from errors.
	 */
	enum ibv_qp_state ibv_state;

	uint32_t disconnect_flags;

	/* Poller registered in case the qpair doesn't properly
	 * complete the qpair destruct process and becomes defunct.
	 */
	struct spdk_poller *destruct_poller;

	/* List of ibv async events */
	STAILQ_HEAD(, spdk_nvmf_rdma_ibv_event_ctx) ibv_events;

	/* There are several ways a disconnect can start on a qpair
	 * and they are not all mutually exclusive. It is important
	 * that we only initialize one of these paths.
	 */
	bool disconnect_started;
	/* Lets us know that we have received the last_wqe event. */
	bool last_wqe_reached;
};

struct spdk_nvmf_rdma_poller_stat {
	uint64_t completions;
	uint64_t polls;
	uint64_t requests;
	uint64_t request_latency;
	uint64_t pending_free_request;
	uint64_t pending_rdma_read;
	uint64_t pending_rdma_write;
};

struct spdk_nvmf_rdma_poller {
	struct spdk_nvmf_rdma_device *device;
	struct spdk_nvmf_rdma_poll_group *group;

	int num_cqe;
	int required_num_wr;
	struct ibv_cq *cq;

	/* The maximum number of I/O outstanding on the shared receive queue at one time */
	uint16_t max_srq_depth;

	/* Shared receive queue */
	struct ibv_srq *srq;

	struct spdk_nvmf_rdma_resources *resources;
	struct spdk_nvmf_rdma_poller_stat stat;

	TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs;

	STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_recv;

	STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_send;

	TAILQ_ENTRY(spdk_nvmf_rdma_poller) link;
};

struct spdk_nvmf_rdma_poll_group_stat {
	uint64_t pending_data_buffer;
};

struct spdk_nvmf_rdma_poll_group {
	struct spdk_nvmf_transport_poll_group group;
	struct spdk_nvmf_rdma_poll_group_stat stat;
	TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers;
	TAILQ_ENTRY(spdk_nvmf_rdma_poll_group) link;
	/*
	 * buffers which are split across multiple RDMA
	 * memory regions cannot be used by this transport.
	 */
	STAILQ_HEAD(, spdk_nvmf_transport_pg_cache_buf) retired_bufs;
};

struct spdk_nvmf_rdma_conn_sched {
	struct spdk_nvmf_rdma_poll_group *next_admin_pg;
	struct spdk_nvmf_rdma_poll_group *next_io_pg;
};

/* Assuming rdma_cm uses just one protection domain per ibv_context. */
struct spdk_nvmf_rdma_device {
	struct ibv_device_attr attr;
	struct ibv_context *context;

	struct spdk_mem_map *map;
	struct ibv_pd *pd;

	int num_srq;

	TAILQ_ENTRY(spdk_nvmf_rdma_device) link;
};

struct spdk_nvmf_rdma_port {
	struct spdk_nvme_transport_id trid;
	struct rdma_cm_id *id;
	struct spdk_nvmf_rdma_device *device;
	uint32_t ref;
	TAILQ_ENTRY(spdk_nvmf_rdma_port) link;
};

struct spdk_nvmf_rdma_transport {
	struct spdk_nvmf_transport transport;

	struct spdk_nvmf_rdma_conn_sched conn_sched;

	struct rdma_event_channel *event_channel;

	struct spdk_mempool *data_wr_pool;

	pthread_mutex_t lock;

	/* fields used to poll RDMA/IB events */
	nfds_t npoll_fds;
	struct pollfd *poll_fds;

	TAILQ_HEAD(, spdk_nvmf_rdma_device) devices;
	TAILQ_HEAD(, spdk_nvmf_rdma_port) ports;
	TAILQ_HEAD(, spdk_nvmf_rdma_poll_group) poll_groups;
};

static inline void
spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair);

static bool
spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
			       struct spdk_nvmf_rdma_request *rdma_req);

static inline int
spdk_nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
{
	switch (state) {
	case IBV_QPS_RESET:
	case IBV_QPS_INIT:
	case IBV_QPS_RTR:
	case IBV_QPS_RTS:
	case IBV_QPS_SQD:
	case IBV_QPS_SQE:
	case IBV_QPS_ERR:
		return 0;
	default:
		return -1;
	}
}

static inline enum spdk_nvme_media_error_status_code
spdk_nvmf_rdma_dif_error_to_compl_status(uint8_t err_type) {
	enum spdk_nvme_media_error_status_code result;
	switch (err_type)
	{
	case SPDK_DIF_REFTAG_ERROR:
		result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR;
		break;
	case SPDK_DIF_APPTAG_ERROR:
		result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR;
		break;
	case SPDK_DIF_GUARD_ERROR:
		result = SPDK_NVME_SC_GUARD_CHECK_ERROR;
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return result;
}

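/* Query the current state of the underlying ibv_qp, cache it in rqpair->ibv_state,
 * and emit a trace record when the state changes. Returns IBV_QPS_ERR + 1 if the
 * query fails or reports a state outside the valid ibv_qp_state range. */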
static enum ibv_qp_state
spdk_nvmf_rdma_update_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair) {
	enum ibv_qp_state old_state, new_state;
	struct ibv_qp_attr qp_attr;
	struct ibv_qp_init_attr init_attr;
	int rc;

	old_state = rqpair->ibv_state;
	rc = ibv_query_qp(rqpair->cm_id->qp, &qp_attr,
			  g_spdk_nvmf_ibv_query_mask, &init_attr);

	if (rc)
	{
		SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n");
		return IBV_QPS_ERR + 1;
	}

	new_state = qp_attr.qp_state;
	rqpair->ibv_state = new_state;
	qp_attr.ah_attr.port_num = qp_attr.port_num;

	rc = spdk_nvmf_rdma_check_ibv_state(new_state);
	if (rc)
	{
		SPDK_ERRLOG("QP#%d: bad state updated: %u, maybe hardware issue\n", rqpair->qpair.qid, new_state);
		/*
		 * IBV_QPS_UNKNOWN is undefined if the lib version is older than libibverbs-1.1.8;
		 * IBV_QPS_UNKNOWN is the enum element after IBV_QPS_ERR.
		 */
		return IBV_QPS_ERR + 1;
	}

	if (old_state != new_state)
	{
		spdk_trace_record(TRACE_RDMA_QP_STATE_CHANGE, 0, 0,
				  (uintptr_t)rqpair->cm_id, new_state);
	}
	return new_state;
}

static const char *str_ibv_qp_state[] = {
	"IBV_QPS_RESET",
	"IBV_QPS_INIT",
	"IBV_QPS_RTR",
	"IBV_QPS_RTS",
	"IBV_QPS_SQD",
	"IBV_QPS_SQE",
	"IBV_QPS_ERR",
	"IBV_QPS_UNKNOWN"
};

static int
spdk_nvmf_rdma_set_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair,
			     enum ibv_qp_state new_state)
{
	struct ibv_qp_attr qp_attr;
	struct ibv_qp_init_attr init_attr;
	int rc;
	enum ibv_qp_state state;
	static int attr_mask_rc[] = {
		[IBV_QPS_RESET] = IBV_QP_STATE,
		[IBV_QPS_INIT] = (IBV_QP_STATE |
				  IBV_QP_PKEY_INDEX |
				  IBV_QP_PORT |
				  IBV_QP_ACCESS_FLAGS),
		[IBV_QPS_RTR] = (IBV_QP_STATE |
				 IBV_QP_AV |
				 IBV_QP_PATH_MTU |
				 IBV_QP_DEST_QPN |
				 IBV_QP_RQ_PSN |
				 IBV_QP_MAX_DEST_RD_ATOMIC |
				 IBV_QP_MIN_RNR_TIMER),
		[IBV_QPS_RTS] = (IBV_QP_STATE |
				 IBV_QP_SQ_PSN |
				 IBV_QP_TIMEOUT |
				 IBV_QP_RETRY_CNT |
				 IBV_QP_RNR_RETRY |
				 IBV_QP_MAX_QP_RD_ATOMIC),
		[IBV_QPS_SQD] = IBV_QP_STATE,
		[IBV_QPS_SQE] = IBV_QP_STATE,
		[IBV_QPS_ERR] = IBV_QP_STATE,
	};

	rc = spdk_nvmf_rdma_check_ibv_state(new_state);
	if (rc) {
		SPDK_ERRLOG("QP#%d: bad state requested: %u\n",
			    rqpair->qpair.qid, new_state);
		return rc;
	}

	rc = ibv_query_qp(rqpair->cm_id->qp, &qp_attr,
			  g_spdk_nvmf_ibv_query_mask, &init_attr);

	if (rc) {
		SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n");
	}

	qp_attr.cur_qp_state = rqpair->ibv_state;
	qp_attr.qp_state = new_state;

	rc = ibv_modify_qp(rqpair->cm_id->qp, &qp_attr,
			   attr_mask_rc[new_state]);

	if (rc) {
		SPDK_ERRLOG("QP#%d: failed to set state to: %s, %d (%s)\n",
			    rqpair->qpair.qid, str_ibv_qp_state[new_state], errno, strerror(errno));
		return rc;
	}

	state = spdk_nvmf_rdma_update_ibv_state(rqpair);

	if (state != new_state) {
		SPDK_ERRLOG("QP#%d: expected state: %s, actual state: %s\n",
			    rqpair->qpair.qid, str_ibv_qp_state[new_state],
			    str_ibv_qp_state[state]);
		return -1;
	}
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "IBV QP#%u changed to: %s\n", rqpair->qpair.qid,
		      str_ibv_qp_state[state]);
	return 0;
}

static void
nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
			    struct spdk_nvmf_rdma_transport *rtransport)
{
	struct spdk_nvmf_rdma_request_data *data_wr;
	struct ibv_send_wr *next_send_wr;
	uint64_t req_wrid;

	rdma_req->num_outstanding_data_wr = 0;
	data_wr = &rdma_req->data;
	req_wrid = data_wr->wr.wr_id;
	while (data_wr && data_wr->wr.wr_id == req_wrid) {
		memset(data_wr->sgl, 0, sizeof(data_wr->wr.sg_list[0]) * data_wr->wr.num_sge);
		data_wr->wr.num_sge = 0;
		next_send_wr = data_wr->wr.next;
		if (data_wr != &rdma_req->data) {
			spdk_mempool_put(rtransport->data_wr_pool, data_wr);
		}
		data_wr = (!next_send_wr || next_send_wr == &rdma_req->rsp.wr) ? NULL :
			  SPDK_CONTAINEROF(next_send_wr, struct spdk_nvmf_rdma_request_data, wr);
	}
}

static void
nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req)
{
	SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", req->req.data_from_pool);
	if (req->req.cmd) {
		SPDK_ERRLOG("\t\tRequest opcode: %d\n", req->req.cmd->nvmf_cmd.opcode);
	}
	if (req->recv) {
		SPDK_ERRLOG("\t\tRequest recv wr_id%lu\n", req->recv->wr.wr_id);
	}
}

static void
nvmf_rdma_dump_qpair_contents(struct spdk_nvmf_rdma_qpair *rqpair)
{
	int i;

	SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", rqpair->qpair.qid);
	for (i = 0; i < rqpair->max_queue_depth; i++) {
		if (rqpair->resources->reqs[i].state != RDMA_REQUEST_STATE_FREE) {
			nvmf_rdma_dump_request(&rqpair->resources->reqs[i]);
		}
	}
}

static void
nvmf_rdma_resources_destroy(struct spdk_nvmf_rdma_resources *resources)
{
	if (resources->cmds_mr) {
		ibv_dereg_mr(resources->cmds_mr);
	}

	if (resources->cpls_mr) {
		ibv_dereg_mr(resources->cpls_mr);
	}

	if (resources->bufs_mr) {
		ibv_dereg_mr(resources->bufs_mr);
	}

	spdk_free(resources->cmds);
	spdk_free(resources->cpls);
	spdk_free(resources->bufs);
	free(resources->reqs);
	free(resources->recvs);
	free(resources);
}

static struct spdk_nvmf_rdma_resources *
nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts)
{
	struct spdk_nvmf_rdma_resources *resources;
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_rdma_recv *rdma_recv;
	struct ibv_qp *qp;
	struct ibv_srq *srq;
	uint32_t i;
	int rc;

	resources = calloc(1, sizeof(struct spdk_nvmf_rdma_resources));
	if (!resources) {
		SPDK_ERRLOG("Unable to allocate resources for receive queue.\n");
		return NULL;
	}

	resources->reqs = calloc(opts->max_queue_depth, sizeof(*resources->reqs));
	resources->recvs = calloc(opts->max_queue_depth, sizeof(*resources->recvs));
	resources->cmds = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cmds),
				       0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	resources->cpls = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cpls),
				       0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);

	if (opts->in_capsule_data_size > 0) {
		resources->bufs = spdk_zmalloc(opts->max_queue_depth * opts->in_capsule_data_size,
					       0x1000, NULL, SPDK_ENV_LCORE_ID_ANY,
					       SPDK_MALLOC_DMA);
	}

	if (!resources->reqs || !resources->recvs || !resources->cmds ||
	    !resources->cpls || (opts->in_capsule_data_size && !resources->bufs)) {
SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n"); 791 goto cleanup; 792 } 793 794 resources->cmds_mr = ibv_reg_mr(opts->pd, resources->cmds, 795 opts->max_queue_depth * sizeof(*resources->cmds), 796 IBV_ACCESS_LOCAL_WRITE); 797 resources->cpls_mr = ibv_reg_mr(opts->pd, resources->cpls, 798 opts->max_queue_depth * sizeof(*resources->cpls), 799 0); 800 801 if (opts->in_capsule_data_size) { 802 resources->bufs_mr = ibv_reg_mr(opts->pd, resources->bufs, 803 opts->max_queue_depth * 804 opts->in_capsule_data_size, 805 IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); 806 } 807 808 if (!resources->cmds_mr || !resources->cpls_mr || 809 (opts->in_capsule_data_size && 810 !resources->bufs_mr)) { 811 goto cleanup; 812 } 813 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n", 814 resources->cmds, opts->max_queue_depth * sizeof(*resources->cmds), 815 resources->cmds_mr->lkey); 816 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n", 817 resources->cpls, opts->max_queue_depth * sizeof(*resources->cpls), 818 resources->cpls_mr->lkey); 819 if (resources->bufs && resources->bufs_mr) { 820 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n", 821 resources->bufs, opts->max_queue_depth * 822 opts->in_capsule_data_size, resources->bufs_mr->lkey); 823 } 824 825 /* Initialize queues */ 826 STAILQ_INIT(&resources->incoming_queue); 827 STAILQ_INIT(&resources->free_queue); 828 829 for (i = 0; i < opts->max_queue_depth; i++) { 830 struct ibv_recv_wr *bad_wr = NULL; 831 832 rdma_recv = &resources->recvs[i]; 833 rdma_recv->qpair = opts->qpair; 834 835 /* Set up memory to receive commands */ 836 if (resources->bufs) { 837 rdma_recv->buf = (void *)((uintptr_t)resources->bufs + (i * 838 opts->in_capsule_data_size)); 839 } 840 841 rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV; 842 843 rdma_recv->sgl[0].addr = (uintptr_t)&resources->cmds[i]; 844 rdma_recv->sgl[0].length = sizeof(resources->cmds[i]); 845 rdma_recv->sgl[0].lkey = resources->cmds_mr->lkey; 846 rdma_recv->wr.num_sge = 1; 847 848 if (rdma_recv->buf && resources->bufs_mr) { 849 rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf; 850 rdma_recv->sgl[1].length = opts->in_capsule_data_size; 851 rdma_recv->sgl[1].lkey = resources->bufs_mr->lkey; 852 rdma_recv->wr.num_sge++; 853 } 854 855 rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr; 856 rdma_recv->wr.sg_list = rdma_recv->sgl; 857 if (opts->shared) { 858 srq = (struct ibv_srq *)opts->qp; 859 rc = ibv_post_srq_recv(srq, &rdma_recv->wr, &bad_wr); 860 } else { 861 qp = (struct ibv_qp *)opts->qp; 862 rc = ibv_post_recv(qp, &rdma_recv->wr, &bad_wr); 863 } 864 if (rc) { 865 goto cleanup; 866 } 867 } 868 869 for (i = 0; i < opts->max_queue_depth; i++) { 870 rdma_req = &resources->reqs[i]; 871 872 if (opts->qpair != NULL) { 873 rdma_req->req.qpair = &opts->qpair->qpair; 874 } else { 875 rdma_req->req.qpair = NULL; 876 } 877 rdma_req->req.cmd = NULL; 878 879 /* Set up memory to send responses */ 880 rdma_req->req.rsp = &resources->cpls[i]; 881 882 rdma_req->rsp.sgl[0].addr = (uintptr_t)&resources->cpls[i]; 883 rdma_req->rsp.sgl[0].length = sizeof(resources->cpls[i]); 884 rdma_req->rsp.sgl[0].lkey = resources->cpls_mr->lkey; 885 886 rdma_req->rsp.rdma_wr.type = RDMA_WR_TYPE_SEND; 887 rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp.rdma_wr; 888 rdma_req->rsp.wr.next = NULL; 889 rdma_req->rsp.wr.opcode = IBV_WR_SEND; 890 rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED; 891 rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl; 892 
static void
spdk_nvmf_rdma_qpair_clean_ibv_events(struct spdk_nvmf_rdma_qpair *rqpair)
{
	struct spdk_nvmf_rdma_ibv_event_ctx *ctx, *tctx;
	STAILQ_FOREACH_SAFE(ctx, &rqpair->ibv_events, link, tctx) {
		ctx->rqpair = NULL;
		/* Memory allocated for ctx is freed in spdk_nvmf_rdma_qpair_process_ibv_event */
		STAILQ_REMOVE(&rqpair->ibv_events, ctx, spdk_nvmf_rdma_ibv_event_ctx, link);
	}
}

static void
spdk_nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
{
	struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp;
	struct ibv_recv_wr *bad_recv_wr = NULL;
	int rc;

	spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair->cm_id, 0);

	spdk_poller_unregister(&rqpair->destruct_poller);

	if (rqpair->qd != 0) {
		struct spdk_nvmf_qpair *qpair = &rqpair->qpair;
		struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(qpair->transport,
				struct spdk_nvmf_rdma_transport, transport);
		struct spdk_nvmf_rdma_request *req;
		uint32_t i, max_req_count = 0;

		SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", rqpair->qd);

		if (rqpair->srq == NULL) {
			nvmf_rdma_dump_qpair_contents(rqpair);
			max_req_count = rqpair->max_queue_depth;
		} else if (rqpair->poller && rqpair->resources) {
			max_req_count = rqpair->poller->max_srq_depth;
		}

		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Release incomplete requests\n");
		for (i = 0; i < max_req_count; i++) {
			req = &rqpair->resources->reqs[i];
			if (req->req.qpair == qpair && req->state != RDMA_REQUEST_STATE_FREE) {
				/* spdk_nvmf_rdma_request_process checks qpair ibv and internal state
				 * and completes a request */
				spdk_nvmf_rdma_request_process(rtransport, req);
			}
		}
		assert(rqpair->qd == 0);
	}

	if (rqpair->poller) {
		TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link);

		if (rqpair->srq != NULL && rqpair->resources != NULL) {
			/* Drop all received but unprocessed commands for this queue and return them to SRQ */
			STAILQ_FOREACH_SAFE(rdma_recv, &rqpair->resources->incoming_queue, link, recv_tmp) {
				if (rqpair == rdma_recv->qpair) {
					STAILQ_REMOVE(&rqpair->resources->incoming_queue, rdma_recv, spdk_nvmf_rdma_recv, link);
					rc = ibv_post_srq_recv(rqpair->srq, &rdma_recv->wr, &bad_recv_wr);
					if (rc) {
						SPDK_ERRLOG("Unable to re-post rx descriptor\n");
					}
				}
			}
		}
	}

	if (rqpair->cm_id) {
		if (rqpair->cm_id->qp != NULL) {
			rdma_destroy_qp(rqpair->cm_id);
		}
		rdma_destroy_id(rqpair->cm_id);

		if (rqpair->poller != NULL && rqpair->srq == NULL) {
			rqpair->poller->required_num_wr -= MAX_WR_PER_QP(rqpair->max_queue_depth);
		}
	}

	if (rqpair->srq == NULL && rqpair->resources != NULL) {
		nvmf_rdma_resources_destroy(rqpair->resources);
	}

	spdk_nvmf_rdma_qpair_clean_ibv_events(rqpair);

	free(rqpair);
}

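/* Grow the poller's completion queue, if necessary, so it can hold the worst-case
 * number of work requests (MAX_WR_PER_QP) for the qpair being added, capped at the
 * device's max_cqe. */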
static int
nvmf_rdma_resize_cq(struct spdk_nvmf_rdma_qpair *rqpair, struct spdk_nvmf_rdma_device *device)
{
	struct spdk_nvmf_rdma_poller *rpoller;
	int rc, num_cqe, required_num_wr;

	/* Enlarge CQ size dynamically */
	rpoller = rqpair->poller;
	required_num_wr = rpoller->required_num_wr + MAX_WR_PER_QP(rqpair->max_queue_depth);
	num_cqe = rpoller->num_cqe;
	if (num_cqe < required_num_wr) {
		num_cqe = spdk_max(num_cqe * 2, required_num_wr);
		num_cqe = spdk_min(num_cqe, device->attr.max_cqe);
	}

	if (rpoller->num_cqe != num_cqe) {
		if (required_num_wr > device->attr.max_cqe) {
			SPDK_ERRLOG("RDMA CQE requirement (%d) exceeds device max_cqe limitation (%d)\n",
				    required_num_wr, device->attr.max_cqe);
			return -1;
		}

		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Resize RDMA CQ from %d to %d\n", rpoller->num_cqe, num_cqe);
		rc = ibv_resize_cq(rpoller->cq, num_cqe);
		if (rc) {
			SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno));
			return -1;
		}

		rpoller->num_cqe = num_cqe;
	}

	rpoller->required_num_wr = required_num_wr;
	return 0;
}

static int
spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
{
	struct spdk_nvmf_rdma_qpair *rqpair;
	int rc;
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_transport *transport;
	struct spdk_nvmf_rdma_resource_opts opts;
	struct spdk_nvmf_rdma_device *device;
	struct ibv_qp_init_attr ibv_init_attr;

	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
	device = rqpair->device;

	memset(&ibv_init_attr, 0, sizeof(struct ibv_qp_init_attr));
	ibv_init_attr.qp_context = rqpair;
	ibv_init_attr.qp_type = IBV_QPT_RC;
	ibv_init_attr.send_cq = rqpair->poller->cq;
	ibv_init_attr.recv_cq = rqpair->poller->cq;

	if (rqpair->srq) {
		ibv_init_attr.srq = rqpair->srq;
	} else {
		ibv_init_attr.cap.max_recv_wr = rqpair->max_queue_depth +
						1; /* RECV operations + dummy drain WR */
	}

	ibv_init_attr.cap.max_send_wr = rqpair->max_queue_depth *
					2 + 1; /* SEND, READ, and WRITE operations + dummy drain WR */
	ibv_init_attr.cap.max_send_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_TX_SGE);
	ibv_init_attr.cap.max_recv_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE);

	if (rqpair->srq == NULL && nvmf_rdma_resize_cq(rqpair, device) < 0) {
		SPDK_ERRLOG("Failed to resize the completion queue. Cannot initialize qpair.\n");
		goto error;
	}

	rc = rdma_create_qp(rqpair->cm_id, device->pd, &ibv_init_attr);
	if (rc) {
		SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno));
		goto error;
	}

	rqpair->max_send_depth = spdk_min((uint32_t)(rqpair->max_queue_depth * 2 + 1),
					  ibv_init_attr.cap.max_send_wr);
	rqpair->max_send_sge = spdk_min(NVMF_DEFAULT_TX_SGE, ibv_init_attr.cap.max_send_sge);
	rqpair->max_recv_sge = spdk_min(NVMF_DEFAULT_RX_SGE, ibv_init_attr.cap.max_recv_sge);
	spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair);

	rqpair->sends_to_post.first = NULL;
	rqpair->sends_to_post.last = NULL;

	if (rqpair->poller->srq == NULL) {
		rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
		transport = &rtransport->transport;

		opts.qp = rqpair->cm_id->qp;
		opts.pd = rqpair->cm_id->pd;
		opts.qpair = rqpair;
		opts.shared = false;
		opts.max_queue_depth = rqpair->max_queue_depth;
		opts.in_capsule_data_size = transport->opts.in_capsule_data_size;

		rqpair->resources = nvmf_rdma_resources_create(&opts);

		if (!rqpair->resources) {
			SPDK_ERRLOG("Unable to allocate resources for receive queue.\n");
			rdma_destroy_qp(rqpair->cm_id);
			goto error;
		}
	} else {
		rqpair->resources = rqpair->poller->resources;
	}

	rqpair->current_recv_depth = 0;
	STAILQ_INIT(&rqpair->pending_rdma_read_queue);
	STAILQ_INIT(&rqpair->pending_rdma_write_queue);

	return 0;

error:
	rdma_destroy_id(rqpair->cm_id);
	rqpair->cm_id = NULL;
	return -1;
}

/* Append the given recv wr structure to the resource structs outstanding recvs list. */
/* This function accepts either a single wr or the first wr in a linked list. */
static void
nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first)
{
	struct ibv_recv_wr *last;

	last = first;
	while (last->next != NULL) {
		last = last->next;
	}

	if (rqpair->resources->recvs_to_post.first == NULL) {
		rqpair->resources->recvs_to_post.first = first;
		rqpair->resources->recvs_to_post.last = last;
		if (rqpair->srq == NULL) {
			STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_recv, rqpair, recv_link);
		}
	} else {
		rqpair->resources->recvs_to_post.last->next = first;
		rqpair->resources->recvs_to_post.last = last;
	}
}

/* Append the given send wr structure to the qpair's outstanding sends list. */
/* This function accepts either a single wr or the first wr in a linked list. */
static void
nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *first)
{
	struct ibv_send_wr *last;

	last = first;
	while (last->next != NULL) {
		last = last->next;
	}

	if (rqpair->sends_to_post.first == NULL) {
		rqpair->sends_to_post.first = first;
		rqpair->sends_to_post.last = last;
		STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link);
	} else {
		rqpair->sends_to_post.last->next = first;
		rqpair->sends_to_post.last = last;
	}
}

static int
request_transfer_in(struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_qpair *qpair;
	struct spdk_nvmf_rdma_qpair *rqpair;

	qpair = req->qpair;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
	assert(rdma_req != NULL);

	nvmf_rdma_qpair_queue_send_wrs(rqpair, &rdma_req->data.wr);
	rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
	rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
	return 0;
}

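/* Queue the completion for a request: re-post the receive that carried the command
 * capsule, then queue an RDMA SEND of the completion, preceded by RDMA WRITEs when
 * the request carries controller-to-host data. *data_posted is set to 1 if data WRs
 * were queued ahead of the response. */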
static int
request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
{
	int num_outstanding_data_wr = 0;
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_qpair *qpair;
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct spdk_nvme_cpl *rsp;
	struct ibv_send_wr *first = NULL;

	*data_posted = 0;
	qpair = req->qpair;
	rsp = &req->rsp->nvme_cpl;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	/* Advance our sq_head pointer */
	if (qpair->sq_head == qpair->sq_head_max) {
		qpair->sq_head = 0;
	} else {
		qpair->sq_head++;
	}
	rsp->sqhd = qpair->sq_head;

	/* queue the capsule for the recv buffer */
	assert(rdma_req->recv != NULL);

	nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr);

	rdma_req->recv = NULL;
	assert(rqpair->current_recv_depth > 0);
	rqpair->current_recv_depth--;

	/* Build the response which consists of optional
	 * RDMA WRITEs to transfer data, plus an RDMA SEND
	 * containing the response.
	 */
	first = &rdma_req->rsp.wr;

	if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
	    req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		first = &rdma_req->data.wr;
		*data_posted = 1;
		num_outstanding_data_wr = rdma_req->num_outstanding_data_wr;
	}
	nvmf_rdma_qpair_queue_send_wrs(rqpair, first);
	/* +1 for the rsp wr */
	rqpair->current_send_depth += num_outstanding_data_wr + 1;

	return 0;
}

static int
spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair)
{
	struct spdk_nvmf_rdma_accept_private_data accept_data;
	struct rdma_conn_param ctrlr_event_data = {};
	int rc;

	accept_data.recfmt = 0;
	accept_data.crqsize = rqpair->max_queue_depth;

	ctrlr_event_data.private_data = &accept_data;
	ctrlr_event_data.private_data_len = sizeof(accept_data);
	if (id->ps == RDMA_PS_TCP) {
		ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */
		ctrlr_event_data.initiator_depth = rqpair->max_read_depth;
	}

	/* Configure infinite retries for the initiator side qpair.
	 * When using a shared receive queue on the target side,
	 * we need to pass this value to the initiator to prevent the
	 * initiator side NIC from completing SEND requests back to the
	 * initiator with status rnr_retry_count_exceeded. */
	if (rqpair->srq != NULL) {
		ctrlr_event_data.rnr_retry_count = 0x7;
	}

	rc = rdma_accept(id, &ctrlr_event_data);
	if (rc) {
		SPDK_ERRLOG("Error %d on rdma_accept\n", errno);
	} else {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n");
	}

	return rc;
}

static void
spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error)
{
	struct spdk_nvmf_rdma_reject_private_data rej_data;

	rej_data.recfmt = 0;
	rej_data.sts = error;

	rdma_reject(id, &rej_data, sizeof(rej_data));
}

static int
nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event,
		  new_qpair_fn cb_fn, void *cb_arg)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_qpair *rqpair = NULL;
	struct spdk_nvmf_rdma_port *port;
	struct rdma_conn_param *rdma_param = NULL;
	const struct spdk_nvmf_rdma_request_private_data *private_data = NULL;
	uint16_t max_queue_depth;
	uint16_t max_read_depth;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	assert(event->id != NULL); /* Impossible. Can't even reject the connection. */
	assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */

	rdma_param = &event->param.conn;
	if (rdma_param->private_data == NULL ||
	    rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_ERRLOG("connect request: no private data provided\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH);
		return -1;
	}

	private_data = rdma_param->private_data;
	if (private_data->recfmt != 0) {
		SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT);
		return -1;
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n",
		      event->id->verbs->device->name, event->id->verbs->device->dev_name);

	port = event->listen_id->context;
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n",
		      event->listen_id, event->listen_id->verbs, port);

	/* Figure out the supported queue depth. This is a multi-step process
	 * that takes into account hardware maximums, host provided values,
	 * and our target's internal memory limits */

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n");

	/* Start with the maximum queue depth allowed by the target */
	max_queue_depth = rtransport->transport.opts.max_queue_depth;
	max_read_depth = rtransport->transport.opts.max_queue_depth;
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n",
		      rtransport->transport.opts.max_queue_depth);

	/* Next check the local NIC's hardware limitations */
	SPDK_DEBUGLOG(SPDK_LOG_RDMA,
		      "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
		      port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom);
	max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr);
	max_read_depth = spdk_min(max_read_depth, port->device->attr.max_qp_init_rd_atom);

	/* Next check the remote NIC's hardware limitations */
	SPDK_DEBUGLOG(SPDK_LOG_RDMA,
		      "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
		      rdma_param->initiator_depth, rdma_param->responder_resources);
	if (rdma_param->initiator_depth > 0) {
		max_read_depth = spdk_min(max_read_depth, rdma_param->initiator_depth);
	}

	/* Finally check for the host software requested values, which are
	 * optional. */
	if (rdma_param->private_data != NULL &&
	    rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize);
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1);
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
		      max_queue_depth, max_read_depth);

	rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair));
	if (rqpair == NULL) {
		SPDK_ERRLOG("Could not allocate new connection.\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
		return -1;
	}

	rqpair->device = port->device;
	rqpair->max_queue_depth = max_queue_depth;
	rqpair->max_read_depth = max_read_depth;
	rqpair->cm_id = event->id;
	rqpair->listen_id = event->listen_id;
	rqpair->qpair.transport = transport;
	STAILQ_INIT(&rqpair->ibv_events);
	/* use qid from the private data to determine the qpair type
	   qid will be set to the appropriate value when the controller is created */
	rqpair->qpair.qid = private_data->qid;

	event->id->context = &rqpair->qpair;

	cb_fn(&rqpair->qpair, cb_arg);

	return 0;
}

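/* Memory map notification callback: on register, create an ibv_mr for the new
 * region (or store the key returned by g_nvmf_hooks.get_rkey) as the map
 * translation; on unregister, deregister the MR and clear the translation. */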
static int
spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
			  enum spdk_mem_map_notify_action action,
			  void *vaddr, size_t size)
{
	struct ibv_pd *pd = cb_ctx;
	struct ibv_mr *mr;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (!g_nvmf_hooks.get_rkey) {
			mr = ibv_reg_mr(pd, vaddr, size,
					IBV_ACCESS_LOCAL_WRITE |
					IBV_ACCESS_REMOTE_READ |
					IBV_ACCESS_REMOTE_WRITE);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		} else {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  g_nvmf_hooks.get_rkey(pd, vaddr, size));
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (!g_nvmf_hooks.get_rkey) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
spdk_nvmf_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

static inline void
nvmf_rdma_setup_wr(struct ibv_send_wr *wr, struct ibv_send_wr *next,
		   enum spdk_nvme_data_transfer xfer)
{
	if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		wr->opcode = IBV_WR_RDMA_WRITE;
		wr->send_flags = 0;
		wr->next = next;
	} else if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
		wr->opcode = IBV_WR_RDMA_READ;
		wr->send_flags = IBV_SEND_SIGNALED;
		wr->next = NULL;
	} else {
		assert(0);
	}
}

static int
nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
		       struct spdk_nvmf_rdma_request *rdma_req,
		       uint32_t num_sgl_descriptors)
{
	struct spdk_nvmf_rdma_request_data *work_requests[SPDK_NVMF_MAX_SGL_ENTRIES];
	struct spdk_nvmf_rdma_request_data *current_data_wr;
	uint32_t i;

	if (num_sgl_descriptors > SPDK_NVMF_MAX_SGL_ENTRIES) {
		SPDK_ERRLOG("Requested too many entries (%u), the limit is %u\n",
			    num_sgl_descriptors, SPDK_NVMF_MAX_SGL_ENTRIES);
		return -EINVAL;
	}

	if (spdk_mempool_get_bulk(rtransport->data_wr_pool, (void **)work_requests, num_sgl_descriptors)) {
		return -ENOMEM;
	}

	current_data_wr = &rdma_req->data;

	for (i = 0; i < num_sgl_descriptors; i++) {
		nvmf_rdma_setup_wr(&current_data_wr->wr, &work_requests[i]->wr, rdma_req->req.xfer);
		current_data_wr->wr.next = &work_requests[i]->wr;
		current_data_wr = work_requests[i];
		current_data_wr->wr.sg_list = current_data_wr->sgl;
		current_data_wr->wr.wr_id = rdma_req->data.wr.wr_id;
	}

	nvmf_rdma_setup_wr(&current_data_wr->wr, &rdma_req->rsp.wr, rdma_req->req.xfer);

	return 0;
}

static inline void
nvmf_rdma_setup_request(struct spdk_nvmf_rdma_request *rdma_req)
{
	struct ibv_send_wr *wr = &rdma_req->data.wr;
	struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1;

	wr->wr.rdma.rkey = sgl->keyed.key;
	wr->wr.rdma.remote_addr = sgl->address;
	nvmf_rdma_setup_wr(wr, &rdma_req->rsp.wr, rdma_req->req.xfer);
}

static inline void
nvmf_rdma_update_remote_addr(struct spdk_nvmf_rdma_request *rdma_req, uint32_t num_wrs)
{
	struct ibv_send_wr *wr = &rdma_req->data.wr;
	struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1;
	uint32_t i;
	int j;
	uint64_t remote_addr_offset = 0;

	for (i = 0; i < num_wrs; ++i) {
		wr->wr.rdma.rkey = sgl->keyed.key;
		wr->wr.rdma.remote_addr = sgl->address + remote_addr_offset;
		for (j = 0; j < wr->num_sge; ++j) {
			remote_addr_offset += wr->sg_list[j].length;
		}
		wr = wr->next;
	}
}

/* This function is used in the rare case that we have a buffer split over multiple memory regions. */
static int
nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf)
{
	struct spdk_nvmf_transport_poll_group *group = &rgroup->group;
	struct spdk_nvmf_transport *transport = group->transport;
	struct spdk_nvmf_transport_pg_cache_buf *old_buf;
	void *new_buf;

	if (!(STAILQ_EMPTY(&group->buf_cache))) {
		group->buf_cache_count--;
		new_buf = STAILQ_FIRST(&group->buf_cache);
		STAILQ_REMOVE_HEAD(&group->buf_cache, link);
		assert(new_buf != NULL);
	} else {
		new_buf = spdk_mempool_get(transport->data_buf_pool);
	}

	if (new_buf == NULL) {
		return -ENOMEM;
	}

	old_buf = *buf;
	STAILQ_INSERT_HEAD(&rgroup->retired_bufs, old_buf, link);
	*buf = new_buf;
	return 0;
}

static bool
nvmf_rdma_get_lkey(struct spdk_nvmf_rdma_device *device, struct iovec *iov,
		   uint32_t *_lkey)
{
	uint64_t translation_len;
	uint32_t lkey;

	translation_len = iov->iov_len;

	if (!g_nvmf_hooks.get_rkey) {
		lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
				(uint64_t)iov->iov_base, &translation_len))->lkey;
	} else {
		lkey = spdk_mem_map_translate(device->map,
					      (uint64_t)iov->iov_base, &translation_len);
	}

	if (spdk_unlikely(translation_len < iov->iov_len)) {
		return false;
	}

	*_lkey = lkey;
	return true;
}

static bool
nvmf_rdma_fill_wr_sge(struct spdk_nvmf_rdma_device *device,
		      struct iovec *iov, struct ibv_send_wr **_wr,
		      uint32_t *_remaining_data_block, uint32_t *_offset,
		      uint32_t *_num_extra_wrs,
		      const struct spdk_dif_ctx *dif_ctx)
{
	struct ibv_send_wr *wr = *_wr;
	struct ibv_sge *sg_ele = &wr->sg_list[wr->num_sge];
	uint32_t lkey = 0;
	uint32_t remaining, data_block_size, md_size, sge_len;

	if (spdk_unlikely(!nvmf_rdma_get_lkey(device, iov, &lkey))) {
		/* This is a very rare case that can occur when using DPDK version < 19.05 */
		SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. Removing it from circulation.\n");
		return false;
	}

	if (spdk_likely(!dif_ctx)) {
		sg_ele->lkey = lkey;
		sg_ele->addr = (uintptr_t)(iov->iov_base);
		sg_ele->length = iov->iov_len;
		wr->num_sge++;
	} else {
		remaining = iov->iov_len - *_offset;
		data_block_size = dif_ctx->block_size - dif_ctx->md_size;
		md_size = dif_ctx->md_size;

		while (remaining) {
			if (wr->num_sge >= SPDK_NVMF_MAX_SGL_ENTRIES) {
				if (*_num_extra_wrs > 0 && wr->next) {
					*_wr = wr->next;
					wr = *_wr;
					wr->num_sge = 0;
					sg_ele = &wr->sg_list[wr->num_sge];
					(*_num_extra_wrs)--;
				} else {
					break;
				}
			}
			sg_ele->lkey = lkey;
			sg_ele->addr = (uintptr_t)((char *)iov->iov_base + *_offset);
			sge_len = spdk_min(remaining, *_remaining_data_block);
			sg_ele->length = sge_len;
			remaining -= sge_len;
			*_remaining_data_block -= sge_len;
			*_offset += sge_len;

			sg_ele++;
			wr->num_sge++;

			if (*_remaining_data_block == 0) {
				/* skip metadata */
				*_offset += md_size;
				/* Metadata that does not fit this IO buffer will be included in the next IO buffer */
				remaining -= spdk_min(remaining, md_size);
				*_remaining_data_block = data_block_size;
			}

			if (remaining == 0) {
				/* By subtracting the size of the last IOV from the offset, we ensure that we skip
				   the remaining metadata bits at the beginning of the next buffer */
				*_offset -= iov->iov_len;
			}
		}
	}

	return true;
}

static int
nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_poll_group *rgroup,
		      struct spdk_nvmf_rdma_device *device,
		      struct spdk_nvmf_rdma_request *rdma_req,
		      struct ibv_send_wr *wr,
		      uint32_t length,
		      uint32_t num_extra_wrs)
{
	struct spdk_nvmf_request *req = &rdma_req->req;
	struct spdk_dif_ctx *dif_ctx = NULL;
	uint32_t remaining_data_block = 0;
	uint32_t offset = 0;

	if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) {
		dif_ctx = &rdma_req->req.dif.dif_ctx;
		remaining_data_block = dif_ctx->block_size - dif_ctx->md_size;
	}

	wr->num_sge = 0;

	while (length && (num_extra_wrs || wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES)) {
		while (spdk_unlikely(!nvmf_rdma_fill_wr_sge(device, &req->iov[rdma_req->iovpos], &wr,
				     &remaining_data_block, &offset, &num_extra_wrs, dif_ctx))) {
			if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[rdma_req->iovpos]) == -ENOMEM) {
				return -ENOMEM;
			}
			req->iov[rdma_req->iovpos].iov_base = (void *)((uintptr_t)(req->buffers[rdma_req->iovpos] +
							      NVMF_DATA_BUFFER_MASK) &
							      ~NVMF_DATA_BUFFER_MASK);
		}

		length -= req->iov[rdma_req->iovpos].iov_len;
		rdma_req->iovpos++;
	}

	if (length) {
		SPDK_ERRLOG("Not enough SG entries to hold data buffer\n");
		return -EINVAL;
	}

	return 0;
}

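/* Estimate how many send WRs are needed to transfer "length" bytes when data
 * blocks and DIF metadata are interleaved: each io_unit_size buffer may need one
 * SGE per data block (plus one more for a partial block), and the SGE total is
 * spread across WRs of at most SPDK_NVMF_MAX_SGL_ENTRIES entries each. */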
1704 } 1705 num_sge += num_sge_in_block; 1706 length -= buffer_len; 1707 } 1708 return SPDK_CEIL_DIV(num_sge, SPDK_NVMF_MAX_SGL_ENTRIES); 1709 } 1710 1711 static int 1712 spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport, 1713 struct spdk_nvmf_rdma_device *device, 1714 struct spdk_nvmf_rdma_request *rdma_req, 1715 uint32_t length) 1716 { 1717 struct spdk_nvmf_rdma_qpair *rqpair; 1718 struct spdk_nvmf_rdma_poll_group *rgroup; 1719 struct spdk_nvmf_request *req = &rdma_req->req; 1720 struct ibv_send_wr *wr = &rdma_req->data.wr; 1721 int rc; 1722 uint32_t num_wrs = 1; 1723 1724 rqpair = SPDK_CONTAINEROF(req->qpair, struct spdk_nvmf_rdma_qpair, qpair); 1725 rgroup = rqpair->poller->group; 1726 1727 /* rdma wr specifics */ 1728 nvmf_rdma_setup_request(rdma_req); 1729 1730 rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport, 1731 length); 1732 if (rc != 0) { 1733 return rc; 1734 } 1735 1736 assert(req->iovcnt <= rqpair->max_send_sge); 1737 1738 rdma_req->iovpos = 0; 1739 1740 if (spdk_unlikely(req->dif.dif_insert_or_strip)) { 1741 num_wrs = nvmf_rdma_calc_num_wrs(length, rtransport->transport.opts.io_unit_size, 1742 req->dif.dif_ctx.block_size); 1743 if (num_wrs > 1) { 1744 rc = nvmf_request_alloc_wrs(rtransport, rdma_req, num_wrs - 1); 1745 if (rc != 0) { 1746 goto err_exit; 1747 } 1748 } 1749 } 1750 1751 rc = nvmf_rdma_fill_wr_sgl(rgroup, device, rdma_req, wr, length, num_wrs - 1); 1752 if (spdk_unlikely(rc != 0)) { 1753 goto err_exit; 1754 } 1755 1756 if (spdk_unlikely(num_wrs > 1)) { 1757 nvmf_rdma_update_remote_addr(rdma_req, num_wrs); 1758 } 1759 1760 /* set the number of outstanding data WRs for this request. */ 1761 rdma_req->num_outstanding_data_wr = num_wrs; 1762 1763 return rc; 1764 1765 err_exit: 1766 spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport); 1767 nvmf_rdma_request_free_data(rdma_req, rtransport); 1768 req->iovcnt = 0; 1769 return rc; 1770 } 1771 1772 static int 1773 nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtransport, 1774 struct spdk_nvmf_rdma_device *device, 1775 struct spdk_nvmf_rdma_request *rdma_req) 1776 { 1777 struct spdk_nvmf_rdma_qpair *rqpair; 1778 struct spdk_nvmf_rdma_poll_group *rgroup; 1779 struct ibv_send_wr *current_wr; 1780 struct spdk_nvmf_request *req = &rdma_req->req; 1781 struct spdk_nvme_sgl_descriptor *inline_segment, *desc; 1782 uint32_t num_sgl_descriptors; 1783 uint32_t lengths[SPDK_NVMF_MAX_SGL_ENTRIES]; 1784 uint32_t i; 1785 int rc; 1786 1787 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 1788 rgroup = rqpair->poller->group; 1789 1790 inline_segment = &req->cmd->nvme_cmd.dptr.sgl1; 1791 assert(inline_segment->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT); 1792 assert(inline_segment->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET); 1793 1794 num_sgl_descriptors = inline_segment->unkeyed.length / sizeof(struct spdk_nvme_sgl_descriptor); 1795 assert(num_sgl_descriptors <= SPDK_NVMF_MAX_SGL_ENTRIES); 1796 1797 if (nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1) != 0) { 1798 return -ENOMEM; 1799 } 1800 1801 desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address; 1802 for (i = 0; i < num_sgl_descriptors; i++) { 1803 if (spdk_likely(!req->dif.dif_insert_or_strip)) { 1804 lengths[i] = desc->keyed.length; 1805 } else { 1806 req->dif.orig_length += desc->keyed.length; 1807 lengths[i] = spdk_dif_get_length_with_md(desc->keyed.length, &req->dif.dif_ctx); 
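/* Note: elba_length below is the extended LBA size (data plus interleaved metadata) that the backend I/O will use, while orig_length keeps the host-visible data length. */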
1808 req->dif.elba_length += lengths[i]; 1809 } 1810 desc++; 1811 } 1812 1813 rc = spdk_nvmf_request_get_buffers_multi(req, &rgroup->group, &rtransport->transport, 1814 lengths, num_sgl_descriptors); 1815 if (rc != 0) { 1816 nvmf_rdma_request_free_data(rdma_req, rtransport); 1817 return rc; 1818 } 1819 1820 /* The first WR must always be the embedded data WR. This is how we unwind them later. */ 1821 current_wr = &rdma_req->data.wr; 1822 assert(current_wr != NULL); 1823 1824 req->length = 0; 1825 rdma_req->iovpos = 0; 1826 desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address; 1827 for (i = 0; i < num_sgl_descriptors; i++) { 1828 /* The descriptors must be keyed data block descriptors with an address, not an offset. */ 1829 if (spdk_unlikely(desc->generic.type != SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK || 1830 desc->keyed.subtype != SPDK_NVME_SGL_SUBTYPE_ADDRESS)) { 1831 rc = -EINVAL; 1832 goto err_exit; 1833 } 1834 1835 current_wr->num_sge = 0; 1836 1837 rc = nvmf_rdma_fill_wr_sgl(rgroup, device, rdma_req, current_wr, lengths[i], 0); 1838 if (rc != 0) { 1839 rc = -ENOMEM; 1840 goto err_exit; 1841 } 1842 1843 req->length += desc->keyed.length; 1844 current_wr->wr.rdma.rkey = desc->keyed.key; 1845 current_wr->wr.rdma.remote_addr = desc->address; 1846 current_wr = current_wr->next; 1847 desc++; 1848 } 1849 1850 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 1851 /* Go back to the last descriptor in the list. */ 1852 desc--; 1853 if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { 1854 if (desc->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { 1855 rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; 1856 rdma_req->rsp.wr.imm_data = desc->keyed.key; 1857 } 1858 } 1859 #endif 1860 1861 rdma_req->num_outstanding_data_wr = num_sgl_descriptors; 1862 1863 return 0; 1864 1865 err_exit: 1866 spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport); 1867 nvmf_rdma_request_free_data(rdma_req, rtransport); 1868 return rc; 1869 } 1870 1871 static int 1872 spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport, 1873 struct spdk_nvmf_rdma_device *device, 1874 struct spdk_nvmf_rdma_request *rdma_req) 1875 { 1876 struct spdk_nvmf_request *req = &rdma_req->req; 1877 struct spdk_nvme_cpl *rsp; 1878 struct spdk_nvme_sgl_descriptor *sgl; 1879 int rc; 1880 uint32_t length; 1881 1882 rsp = &req->rsp->nvme_cpl; 1883 sgl = &req->cmd->nvme_cmd.dptr.sgl1; 1884 1885 if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK && 1886 (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS || 1887 sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) { 1888 1889 length = sgl->keyed.length; 1890 if (length > rtransport->transport.opts.max_io_size) { 1891 SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", 1892 length, rtransport->transport.opts.max_io_size); 1893 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 1894 return -1; 1895 } 1896 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 1897 if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { 1898 if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { 1899 rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; 1900 rdma_req->rsp.wr.imm_data = sgl->keyed.key; 1901 } 1902 } 1903 #endif 1904 1905 /* fill request length and populate iovs */ 1906 req->length = length; 1907 1908 if (spdk_unlikely(req->dif.dif_insert_or_strip)) { 1909 req->dif.orig_length = length; 1910 length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx); 1911 
req->dif.elba_length = length; 1912 } 1913 1914 rc = spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req, length); 1915 if (spdk_unlikely(rc < 0)) { 1916 if (rc == -EINVAL) { 1917 SPDK_ERRLOG("SGL length exceeds the max I/O size\n"); 1918 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 1919 return -1; 1920 } 1921 /* No available buffers. Queue this request up. */ 1922 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req); 1923 return 0; 1924 } 1925 1926 /* backward compatible */ 1927 req->data = req->iov[0].iov_base; 1928 1929 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, 1930 req->iovcnt); 1931 1932 return 0; 1933 } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && 1934 sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { 1935 uint64_t offset = sgl->address; 1936 uint32_t max_len = rtransport->transport.opts.in_capsule_data_size; 1937 1938 SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", 1939 offset, sgl->unkeyed.length); 1940 1941 if (offset > max_len) { 1942 SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", 1943 offset, max_len); 1944 rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; 1945 return -1; 1946 } 1947 max_len -= (uint32_t)offset; 1948 1949 if (sgl->unkeyed.length > max_len) { 1950 SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", 1951 sgl->unkeyed.length, max_len); 1952 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 1953 return -1; 1954 } 1955 1956 rdma_req->num_outstanding_data_wr = 0; 1957 req->data = rdma_req->recv->buf + offset; 1958 req->data_from_pool = false; 1959 req->length = sgl->unkeyed.length; 1960 1961 req->iov[0].iov_base = req->data; 1962 req->iov[0].iov_len = req->length; 1963 req->iovcnt = 1; 1964 1965 return 0; 1966 } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT && 1967 sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { 1968 1969 rc = nvmf_rdma_request_fill_iovs_multi_sgl(rtransport, device, rdma_req); 1970 if (rc == -ENOMEM) { 1971 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. 
Queueing request %p\n", rdma_req); 1972 return 0; 1973 } else if (rc == -EINVAL) { 1974 SPDK_ERRLOG("Multi SGL element request length exceeds the max I/O size\n"); 1975 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 1976 return -1; 1977 } 1978 1979 /* backward compatible */ 1980 req->data = req->iov[0].iov_base; 1981 1982 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, 1983 req->iovcnt); 1984 1985 return 0; 1986 } 1987 1988 SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", 1989 sgl->generic.type, sgl->generic.subtype); 1990 rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; 1991 return -1; 1992 } 1993 1994 static void 1995 nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req, 1996 struct spdk_nvmf_rdma_transport *rtransport) 1997 { 1998 struct spdk_nvmf_rdma_qpair *rqpair; 1999 struct spdk_nvmf_rdma_poll_group *rgroup; 2000 2001 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2002 if (rdma_req->req.data_from_pool) { 2003 rgroup = rqpair->poller->group; 2004 2005 spdk_nvmf_request_free_buffers(&rdma_req->req, &rgroup->group, &rtransport->transport); 2006 } 2007 nvmf_rdma_request_free_data(rdma_req, rtransport); 2008 rdma_req->req.length = 0; 2009 rdma_req->req.iovcnt = 0; 2010 rdma_req->req.data = NULL; 2011 rdma_req->rsp.wr.next = NULL; 2012 rdma_req->data.wr.next = NULL; 2013 memset(&rdma_req->req.dif, 0, sizeof(rdma_req->req.dif)); 2014 rqpair->qd--; 2015 2016 STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link); 2017 rdma_req->state = RDMA_REQUEST_STATE_FREE; 2018 } 2019 2020 bool 2021 spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, 2022 struct spdk_nvmf_rdma_request *rdma_req) 2023 { 2024 struct spdk_nvmf_rdma_qpair *rqpair; 2025 struct spdk_nvmf_rdma_device *device; 2026 struct spdk_nvmf_rdma_poll_group *rgroup; 2027 struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; 2028 int rc; 2029 struct spdk_nvmf_rdma_recv *rdma_recv; 2030 enum spdk_nvmf_rdma_request_state prev_state; 2031 bool progress = false; 2032 int data_posted; 2033 uint32_t num_blocks; 2034 2035 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2036 device = rqpair->device; 2037 rgroup = rqpair->poller->group; 2038 2039 assert(rdma_req->state != RDMA_REQUEST_STATE_FREE); 2040 2041 /* If the queue pair is in an error state, force the request to the completed state 2042 * to release resources. */ 2043 if (rqpair->ibv_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { 2044 if (rdma_req->state == RDMA_REQUEST_STATE_NEED_BUFFER) { 2045 STAILQ_REMOVE(&rgroup->group.pending_buf_queue, &rdma_req->req, spdk_nvmf_request, buf_link); 2046 } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING) { 2047 STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 2048 } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING) { 2049 STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 2050 } 2051 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2052 } 2053 2054 /* The loop here is to allow for several back-to-back state changes. 
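 * Each iteration handles the current state and, if the handler moved the request to a
 * new state, loops again; it exits once a pass leaves rdma_req->state unchanged.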
*/ 2055 do { 2056 prev_state = rdma_req->state; 2057 2058 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state); 2059 2060 switch (rdma_req->state) { 2061 case RDMA_REQUEST_STATE_FREE: 2062 /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW 2063 * to escape this state. */ 2064 break; 2065 case RDMA_REQUEST_STATE_NEW: 2066 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0, 2067 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2068 rdma_recv = rdma_req->recv; 2069 2070 /* The first element of the SGL is the NVMe command */ 2071 rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr; 2072 memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp)); 2073 2074 if (rqpair->ibv_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { 2075 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2076 break; 2077 } 2078 2079 if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&rdma_req->req, &rdma_req->req.dif.dif_ctx))) { 2080 rdma_req->req.dif.dif_insert_or_strip = true; 2081 } 2082 2083 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 2084 rdma_req->rsp.wr.opcode = IBV_WR_SEND; 2085 rdma_req->rsp.wr.imm_data = 0; 2086 #endif 2087 2088 /* The next state transition depends on the data transfer needs of this request. */ 2089 rdma_req->req.xfer = spdk_nvmf_req_get_xfer(&rdma_req->req); 2090 2091 /* If no data to transfer, ready to execute. */ 2092 if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { 2093 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 2094 break; 2095 } 2096 2097 rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER; 2098 STAILQ_INSERT_TAIL(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link); 2099 break; 2100 case RDMA_REQUEST_STATE_NEED_BUFFER: 2101 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0, 2102 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2103 2104 assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE); 2105 2106 if (&rdma_req->req != STAILQ_FIRST(&rgroup->group.pending_buf_queue)) { 2107 /* This request needs to wait in line to obtain a buffer */ 2108 break; 2109 } 2110 2111 /* Try to get a data buffer */ 2112 rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req); 2113 if (rc < 0) { 2114 STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link); 2115 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 2116 break; 2117 } 2118 2119 if (!rdma_req->req.data) { 2120 /* No buffers available. */ 2121 rgroup->stat.pending_data_buffer++; 2122 break; 2123 } 2124 2125 STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link); 2126 2127 /* If data is transferring from host to controller and the data didn't 2128 * arrive using in capsule data, we need to do a transfer from the host. 
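 * The request is only queued here; the RDMA READ itself is issued later from the
 * DATA_TRANSFER_TO_CONTROLLER_PENDING state, once send/read queue depth allows.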
2129 */ 2130 if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && 2131 rdma_req->req.data_from_pool) { 2132 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_read_queue, rdma_req, state_link); 2133 rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING; 2134 break; 2135 } 2136 2137 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 2138 break; 2139 case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: 2140 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 0, 0, 2141 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2142 2143 if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_read_queue)) { 2144 /* This request needs to wait in line to perform RDMA */ 2145 break; 2146 } 2147 if (rqpair->current_send_depth + rdma_req->num_outstanding_data_wr > rqpair->max_send_depth 2148 || rqpair->current_read_depth + rdma_req->num_outstanding_data_wr > rqpair->max_read_depth) { 2149 /* We can only have so many WRs outstanding. we have to wait until some finish. */ 2150 rqpair->poller->stat.pending_rdma_read++; 2151 break; 2152 } 2153 2154 /* We have already verified that this request is the head of the queue. */ 2155 STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link); 2156 2157 rc = request_transfer_in(&rdma_req->req); 2158 if (!rc) { 2159 rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER; 2160 } else { 2161 rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2162 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 2163 } 2164 break; 2165 case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 2166 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, 2167 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2168 /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE 2169 * to escape this state. */ 2170 break; 2171 case RDMA_REQUEST_STATE_READY_TO_EXECUTE: 2172 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, 2173 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2174 2175 if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) { 2176 if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { 2177 /* generate DIF for write operation */ 2178 num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size); 2179 assert(num_blocks > 0); 2180 2181 rc = spdk_dif_generate(rdma_req->req.iov, rdma_req->req.iovcnt, 2182 num_blocks, &rdma_req->req.dif.dif_ctx); 2183 if (rc != 0) { 2184 SPDK_ERRLOG("DIF generation failed\n"); 2185 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2186 spdk_nvmf_rdma_start_disconnect(rqpair); 2187 break; 2188 } 2189 } 2190 2191 assert(rdma_req->req.dif.elba_length >= rdma_req->req.length); 2192 /* set extended length before IO operation */ 2193 rdma_req->req.length = rdma_req->req.dif.elba_length; 2194 } 2195 2196 rdma_req->state = RDMA_REQUEST_STATE_EXECUTING; 2197 spdk_nvmf_request_exec(&rdma_req->req); 2198 break; 2199 case RDMA_REQUEST_STATE_EXECUTING: 2200 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0, 2201 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2202 /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED 2203 * to escape this state. 
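 * (In practice this is the transport's request-complete callback, invoked when the
 * backend finishes the I/O.)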
*/ 2204 break; 2205 case RDMA_REQUEST_STATE_EXECUTED: 2206 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0, 2207 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2208 if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 2209 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_write_queue, rdma_req, state_link); 2210 rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING; 2211 } else { 2212 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 2213 } 2214 if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) { 2215 /* restore the original length */ 2216 rdma_req->req.length = rdma_req->req.dif.orig_length; 2217 2218 if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 2219 struct spdk_dif_error error_blk; 2220 2221 num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size); 2222 2223 rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks, 2224 &rdma_req->req.dif.dif_ctx, &error_blk); 2225 if (rc) { 2226 struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; 2227 2228 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", error_blk.err_type, 2229 error_blk.err_offset); 2230 rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR; 2231 rsp->status.sc = spdk_nvmf_rdma_dif_error_to_compl_status(error_blk.err_type); 2232 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 2233 STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 2234 } 2235 } 2236 } 2237 break; 2238 case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: 2239 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0, 2240 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2241 2242 if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_write_queue)) { 2243 /* This request needs to wait in line to perform RDMA */ 2244 break; 2245 } 2246 if ((rqpair->current_send_depth + rdma_req->num_outstanding_data_wr + 1) > 2247 rqpair->max_send_depth) { 2248 /* We can only have so many WRs outstanding. we have to wait until some finish. 2249 * +1 since each request has an additional wr in the resp. */ 2250 rqpair->poller->stat.pending_rdma_write++; 2251 break; 2252 } 2253 2254 /* We have already verified that this request is the head of the queue. */ 2255 STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_write_queue, state_link); 2256 2257 /* The data transfer will be kicked off from 2258 * RDMA_REQUEST_STATE_READY_TO_COMPLETE state. 2259 */ 2260 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 2261 break; 2262 case RDMA_REQUEST_STATE_READY_TO_COMPLETE: 2263 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, 2264 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2265 rc = request_transfer_out(&rdma_req->req, &data_posted); 2266 assert(rc == 0); /* No good way to handle this currently */ 2267 if (rc) { 2268 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2269 } else { 2270 rdma_req->state = data_posted ? RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST : 2271 RDMA_REQUEST_STATE_COMPLETING; 2272 } 2273 break; 2274 case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: 2275 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, 2276 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2277 /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 2278 * to escape this state. 
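 * (The completion-queue poller normally does this once the RDMA WRITE and the
 * response send have completed.)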
*/ 2279 break; 2280 case RDMA_REQUEST_STATE_COMPLETING: 2281 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, 2282 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2283 /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 2284 * to escape this state. */ 2285 break; 2286 case RDMA_REQUEST_STATE_COMPLETED: 2287 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, 2288 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2289 2290 rqpair->poller->stat.request_latency += spdk_get_ticks() - rdma_req->receive_tsc; 2291 nvmf_rdma_request_free(rdma_req, rtransport); 2292 break; 2293 case RDMA_REQUEST_NUM_STATES: 2294 default: 2295 assert(0); 2296 break; 2297 } 2298 2299 if (rdma_req->state != prev_state) { 2300 progress = true; 2301 } 2302 } while (rdma_req->state != prev_state); 2303 2304 return progress; 2305 } 2306 2307 /* Public API callbacks begin here */ 2308 2309 #define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128 2310 #define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128 2311 #define SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH 4096 2312 #define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 128 2313 #define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 2314 #define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072 2315 #define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES) 2316 #define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4095 2317 #define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32 2318 #define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false 2319 #define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false 2320 2321 static void 2322 spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) 2323 { 2324 opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH; 2325 opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR; 2326 opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE; 2327 opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE; 2328 opts->io_unit_size = SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE; 2329 opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH; 2330 opts->num_shared_buffers = SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS; 2331 opts->buf_cache_size = SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE; 2332 opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH; 2333 opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ; 2334 opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP; 2335 } 2336 2337 const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = { 2338 .notify_cb = spdk_nvmf_rdma_mem_notify, 2339 .are_contiguous = spdk_nvmf_rdma_check_contiguous_entries 2340 }; 2341 2342 static int spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport); 2343 2344 static struct spdk_nvmf_transport * 2345 spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) 2346 { 2347 int rc; 2348 struct spdk_nvmf_rdma_transport *rtransport; 2349 struct spdk_nvmf_rdma_device *device, *tmp; 2350 struct ibv_context **contexts; 2351 uint32_t i; 2352 int flag; 2353 uint32_t sge_count; 2354 uint32_t min_shared_buffers; 2355 int max_device_sge = SPDK_NVMF_MAX_SGL_ENTRIES; 2356 pthread_mutexattr_t attr; 2357 2358 rtransport = calloc(1, sizeof(*rtransport)); 2359 if (!rtransport) { 2360 return NULL; 2361 } 2362 2363 if (pthread_mutexattr_init(&attr)) { 2364 SPDK_ERRLOG("pthread_mutexattr_init() failed\n"); 2365 free(rtransport); 2366 return NULL; 2367 } 2368 2369 if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) { 2370 SPDK_ERRLOG("pthread_mutexattr_settype() failed\n"); 2371 pthread_mutexattr_destroy(&attr); 2372 
free(rtransport);
2373 return NULL;
2374 }
2375
2376 if (pthread_mutex_init(&rtransport->lock, &attr)) {
2377 SPDK_ERRLOG("pthread_mutex_init() failed\n");
2378 pthread_mutexattr_destroy(&attr);
2379 free(rtransport);
2380 return NULL;
2381 }
2382
2383 pthread_mutexattr_destroy(&attr);
2384
2385 TAILQ_INIT(&rtransport->devices);
2386 TAILQ_INIT(&rtransport->ports);
2387 TAILQ_INIT(&rtransport->poll_groups);
2388
2389 rtransport->transport.ops = &spdk_nvmf_transport_rdma;
2390
2391 SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n"
2392 " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n"
2393 " max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
2394 " in_capsule_data_size=%d, max_aq_depth=%d,\n"
2395 " num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d\n",
2396 opts->max_queue_depth,
2397 opts->max_io_size,
2398 opts->max_qpairs_per_ctrlr,
2399 opts->io_unit_size,
2400 opts->in_capsule_data_size,
2401 opts->max_aq_depth,
2402 opts->num_shared_buffers,
2403 opts->max_srq_depth,
2404 opts->no_srq);
2405
2406 /* I/O unit size cannot be larger than max I/O size */
2407 if (opts->io_unit_size > opts->max_io_size) {
2408 opts->io_unit_size = opts->max_io_size;
2409 }
2410
2411 if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) {
2412 SPDK_ERRLOG("The number of shared data buffers (%d) is less than "
2413 "the minimum number required to guarantee that forward progress can be made (%d)\n",
2414 opts->num_shared_buffers, (SPDK_NVMF_MAX_SGL_ENTRIES * 2));
2415 spdk_nvmf_rdma_destroy(&rtransport->transport);
2416 return NULL;
2417 }
2418
2419 min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size;
2420 if (min_shared_buffers > opts->num_shared_buffers) {
2421 SPDK_ERRLOG("There are not enough buffers to satisfy "
2422 "per-poll group caches for each thread. (%" PRIu32 ") "
2423 "supplied.
(%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers); 2424 SPDK_ERRLOG("Please specify a larger number of shared buffers\n"); 2425 spdk_nvmf_rdma_destroy(&rtransport->transport); 2426 return NULL; 2427 } 2428 2429 sge_count = opts->max_io_size / opts->io_unit_size; 2430 if (sge_count > NVMF_DEFAULT_TX_SGE) { 2431 SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); 2432 spdk_nvmf_rdma_destroy(&rtransport->transport); 2433 return NULL; 2434 } 2435 2436 rtransport->event_channel = rdma_create_event_channel(); 2437 if (rtransport->event_channel == NULL) { 2438 SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno)); 2439 spdk_nvmf_rdma_destroy(&rtransport->transport); 2440 return NULL; 2441 } 2442 2443 flag = fcntl(rtransport->event_channel->fd, F_GETFL); 2444 if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { 2445 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", 2446 rtransport->event_channel->fd, spdk_strerror(errno)); 2447 spdk_nvmf_rdma_destroy(&rtransport->transport); 2448 return NULL; 2449 } 2450 2451 rtransport->data_wr_pool = spdk_mempool_create("spdk_nvmf_rdma_wr_data", 2452 opts->max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES, 2453 sizeof(struct spdk_nvmf_rdma_request_data), 2454 SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, 2455 SPDK_ENV_SOCKET_ID_ANY); 2456 if (!rtransport->data_wr_pool) { 2457 SPDK_ERRLOG("Unable to allocate work request pool for poll group\n"); 2458 spdk_nvmf_rdma_destroy(&rtransport->transport); 2459 return NULL; 2460 } 2461 2462 contexts = rdma_get_devices(NULL); 2463 if (contexts == NULL) { 2464 SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); 2465 spdk_nvmf_rdma_destroy(&rtransport->transport); 2466 return NULL; 2467 } 2468 2469 i = 0; 2470 rc = 0; 2471 while (contexts[i] != NULL) { 2472 device = calloc(1, sizeof(*device)); 2473 if (!device) { 2474 SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); 2475 rc = -ENOMEM; 2476 break; 2477 } 2478 device->context = contexts[i]; 2479 rc = ibv_query_device(device->context, &device->attr); 2480 if (rc < 0) { 2481 SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); 2482 free(device); 2483 break; 2484 2485 } 2486 2487 max_device_sge = spdk_min(max_device_sge, device->attr.max_sge); 2488 2489 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 2490 if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) { 2491 SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,"); 2492 SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id); 2493 } 2494 2495 /** 2496 * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE. 2497 * The Soft-RoCE RXE driver does not currently support send with invalidate, 2498 * but incorrectly reports that it does. There are changes making their way 2499 * through the kernel now that will enable this feature. When they are merged, 2500 * we can conditionally enable this feature. 2501 * 2502 * TODO: enable this for versions of the kernel rxe driver that support it. 
2503 */ 2504 if (device->attr.vendor_id == 0) { 2505 device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS); 2506 } 2507 #endif 2508 2509 /* set up device context async ev fd as NON_BLOCKING */ 2510 flag = fcntl(device->context->async_fd, F_GETFL); 2511 rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); 2512 if (rc < 0) { 2513 SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); 2514 free(device); 2515 break; 2516 } 2517 2518 TAILQ_INSERT_TAIL(&rtransport->devices, device, link); 2519 i++; 2520 2521 if (g_nvmf_hooks.get_ibv_pd) { 2522 device->pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context); 2523 } else { 2524 device->pd = ibv_alloc_pd(device->context); 2525 } 2526 2527 if (!device->pd) { 2528 SPDK_ERRLOG("Unable to allocate protection domain.\n"); 2529 rc = -ENOMEM; 2530 break; 2531 } 2532 2533 assert(device->map == NULL); 2534 2535 device->map = spdk_mem_map_alloc(0, &g_nvmf_rdma_map_ops, device->pd); 2536 if (!device->map) { 2537 SPDK_ERRLOG("Unable to allocate memory map for listen address\n"); 2538 rc = -ENOMEM; 2539 break; 2540 } 2541 2542 assert(device->map != NULL); 2543 assert(device->pd != NULL); 2544 } 2545 rdma_free_devices(contexts); 2546 2547 if (opts->io_unit_size * max_device_sge < opts->max_io_size) { 2548 /* divide and round up. */ 2549 opts->io_unit_size = (opts->max_io_size + max_device_sge - 1) / max_device_sge; 2550 2551 /* round up to the nearest 4k. */ 2552 opts->io_unit_size = (opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT - 1) & ~NVMF_DATA_BUFFER_MASK; 2553 2554 opts->io_unit_size = spdk_max(opts->io_unit_size, SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE); 2555 SPDK_NOTICELOG("Adjusting the io unit size to fit the device's maximum I/O size. New I/O unit size %u\n", 2556 opts->io_unit_size); 2557 } 2558 2559 if (rc < 0) { 2560 spdk_nvmf_rdma_destroy(&rtransport->transport); 2561 return NULL; 2562 } 2563 2564 /* Set up poll descriptor array to monitor events from RDMA and IB 2565 * in a single poll syscall 2566 */ 2567 rtransport->npoll_fds = i + 1; 2568 i = 0; 2569 rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); 2570 if (rtransport->poll_fds == NULL) { 2571 SPDK_ERRLOG("poll_fds allocation failed\n"); 2572 spdk_nvmf_rdma_destroy(&rtransport->transport); 2573 return NULL; 2574 } 2575 2576 rtransport->poll_fds[i].fd = rtransport->event_channel->fd; 2577 rtransport->poll_fds[i++].events = POLLIN; 2578 2579 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 2580 rtransport->poll_fds[i].fd = device->context->async_fd; 2581 rtransport->poll_fds[i++].events = POLLIN; 2582 } 2583 2584 return &rtransport->transport; 2585 } 2586 2587 static int 2588 spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) 2589 { 2590 struct spdk_nvmf_rdma_transport *rtransport; 2591 struct spdk_nvmf_rdma_port *port, *port_tmp; 2592 struct spdk_nvmf_rdma_device *device, *device_tmp; 2593 2594 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2595 2596 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { 2597 TAILQ_REMOVE(&rtransport->ports, port, link); 2598 rdma_destroy_id(port->id); 2599 free(port); 2600 } 2601 2602 if (rtransport->poll_fds != NULL) { 2603 free(rtransport->poll_fds); 2604 } 2605 2606 if (rtransport->event_channel != NULL) { 2607 rdma_destroy_event_channel(rtransport->event_channel); 2608 } 2609 2610 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { 2611 TAILQ_REMOVE(&rtransport->devices, device, link); 2612 if (device->map) { 2613 
spdk_mem_map_free(&device->map); 2614 } 2615 if (device->pd) { 2616 if (!g_nvmf_hooks.get_ibv_pd) { 2617 ibv_dealloc_pd(device->pd); 2618 } 2619 } 2620 free(device); 2621 } 2622 2623 if (rtransport->data_wr_pool != NULL) { 2624 if (spdk_mempool_count(rtransport->data_wr_pool) != 2625 (transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES)) { 2626 SPDK_ERRLOG("transport wr pool count is %zu but should be %u\n", 2627 spdk_mempool_count(rtransport->data_wr_pool), 2628 transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES); 2629 } 2630 } 2631 2632 spdk_mempool_free(rtransport->data_wr_pool); 2633 2634 pthread_mutex_destroy(&rtransport->lock); 2635 free(rtransport); 2636 2637 return 0; 2638 } 2639 2640 static int 2641 spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 2642 struct spdk_nvme_transport_id *trid, 2643 bool peer); 2644 2645 static int 2646 spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport, 2647 const struct spdk_nvme_transport_id *trid, 2648 spdk_nvmf_tgt_listen_done_fn cb_fn, 2649 void *cb_arg) 2650 { 2651 struct spdk_nvmf_rdma_transport *rtransport; 2652 struct spdk_nvmf_rdma_device *device; 2653 struct spdk_nvmf_rdma_port *port; 2654 struct addrinfo *res; 2655 struct addrinfo hints; 2656 int family; 2657 int rc; 2658 2659 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2660 assert(rtransport->event_channel != NULL); 2661 2662 pthread_mutex_lock(&rtransport->lock); 2663 TAILQ_FOREACH(port, &rtransport->ports, link) { 2664 if (spdk_nvme_transport_id_compare(&port->trid, trid) == 0) { 2665 goto success; 2666 } 2667 } 2668 2669 port = calloc(1, sizeof(*port)); 2670 if (!port) { 2671 SPDK_ERRLOG("Port allocation failed\n"); 2672 pthread_mutex_unlock(&rtransport->lock); 2673 return -ENOMEM; 2674 } 2675 2676 /* Selectively copy the trid. Things like NQN don't matter here - that 2677 * mapping is enforced elsewhere. 
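 * Only the transport type, address family, address and service ID are copied;
 * that is all rdma_bind_addr() and listener matching need.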
2678 */ 2679 spdk_nvme_trid_populate_transport(&port->trid, SPDK_NVME_TRANSPORT_RDMA); 2680 port->trid.adrfam = trid->adrfam; 2681 snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr); 2682 snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid); 2683 2684 switch (port->trid.adrfam) { 2685 case SPDK_NVMF_ADRFAM_IPV4: 2686 family = AF_INET; 2687 break; 2688 case SPDK_NVMF_ADRFAM_IPV6: 2689 family = AF_INET6; 2690 break; 2691 default: 2692 SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam); 2693 free(port); 2694 pthread_mutex_unlock(&rtransport->lock); 2695 return -EINVAL; 2696 } 2697 2698 memset(&hints, 0, sizeof(hints)); 2699 hints.ai_family = family; 2700 hints.ai_flags = AI_NUMERICSERV; 2701 hints.ai_socktype = SOCK_STREAM; 2702 hints.ai_protocol = 0; 2703 2704 rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res); 2705 if (rc) { 2706 SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); 2707 free(port); 2708 pthread_mutex_unlock(&rtransport->lock); 2709 return -EINVAL; 2710 } 2711 2712 rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); 2713 if (rc < 0) { 2714 SPDK_ERRLOG("rdma_create_id() failed\n"); 2715 freeaddrinfo(res); 2716 free(port); 2717 pthread_mutex_unlock(&rtransport->lock); 2718 return rc; 2719 } 2720 2721 rc = rdma_bind_addr(port->id, res->ai_addr); 2722 freeaddrinfo(res); 2723 2724 if (rc < 0) { 2725 SPDK_ERRLOG("rdma_bind_addr() failed\n"); 2726 rdma_destroy_id(port->id); 2727 free(port); 2728 pthread_mutex_unlock(&rtransport->lock); 2729 return rc; 2730 } 2731 2732 if (!port->id->verbs) { 2733 SPDK_ERRLOG("ibv_context is null\n"); 2734 rdma_destroy_id(port->id); 2735 free(port); 2736 pthread_mutex_unlock(&rtransport->lock); 2737 return -1; 2738 } 2739 2740 rc = rdma_listen(port->id, 10); /* 10 = backlog */ 2741 if (rc < 0) { 2742 SPDK_ERRLOG("rdma_listen() failed\n"); 2743 rdma_destroy_id(port->id); 2744 free(port); 2745 pthread_mutex_unlock(&rtransport->lock); 2746 return rc; 2747 } 2748 2749 TAILQ_FOREACH(device, &rtransport->devices, link) { 2750 if (device->context == port->id->verbs) { 2751 port->device = device; 2752 break; 2753 } 2754 } 2755 if (!port->device) { 2756 SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", 2757 port->id->verbs); 2758 rdma_destroy_id(port->id); 2759 free(port); 2760 pthread_mutex_unlock(&rtransport->lock); 2761 return -EINVAL; 2762 } 2763 2764 SPDK_NOTICELOG("*** NVMe/RDMA Target Listening on %s port %s ***\n", 2765 trid->traddr, trid->trsvcid); 2766 2767 TAILQ_INSERT_TAIL(&rtransport->ports, port, link); 2768 2769 success: 2770 port->ref++; 2771 pthread_mutex_unlock(&rtransport->lock); 2772 if (cb_fn != NULL) { 2773 cb_fn(cb_arg, 0); 2774 } 2775 return 0; 2776 } 2777 2778 static int 2779 spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, 2780 const struct spdk_nvme_transport_id *_trid) 2781 { 2782 struct spdk_nvmf_rdma_transport *rtransport; 2783 struct spdk_nvmf_rdma_port *port, *tmp; 2784 struct spdk_nvme_transport_id trid = {}; 2785 2786 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2787 2788 /* Selectively copy the trid. Things like NQN don't matter here - that 2789 * mapping is enforced elsewhere. 
2790 */ 2791 spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_RDMA); 2792 trid.adrfam = _trid->adrfam; 2793 snprintf(trid.traddr, sizeof(port->trid.traddr), "%s", _trid->traddr); 2794 snprintf(trid.trsvcid, sizeof(port->trid.trsvcid), "%s", _trid->trsvcid); 2795 2796 pthread_mutex_lock(&rtransport->lock); 2797 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { 2798 if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) { 2799 assert(port->ref > 0); 2800 port->ref--; 2801 if (port->ref == 0) { 2802 TAILQ_REMOVE(&rtransport->ports, port, link); 2803 rdma_destroy_id(port->id); 2804 free(port); 2805 } 2806 break; 2807 } 2808 } 2809 2810 pthread_mutex_unlock(&rtransport->lock); 2811 return 0; 2812 } 2813 2814 static void 2815 spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, 2816 struct spdk_nvmf_rdma_qpair *rqpair, bool drain) 2817 { 2818 struct spdk_nvmf_request *req, *tmp; 2819 struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; 2820 struct spdk_nvmf_rdma_resources *resources; 2821 2822 /* We process I/O in the data transfer pending queue at the highest priority. RDMA reads first */ 2823 STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_read_queue, state_link, req_tmp) { 2824 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 2825 break; 2826 } 2827 } 2828 2829 /* Then RDMA writes since reads have stronger restrictions than writes */ 2830 STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_write_queue, state_link, req_tmp) { 2831 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 2832 break; 2833 } 2834 } 2835 2836 /* The second highest priority is I/O waiting on memory buffers. */ 2837 STAILQ_FOREACH_SAFE(req, &rqpair->poller->group->group.pending_buf_queue, buf_link, tmp) { 2838 rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 2839 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 2840 break; 2841 } 2842 } 2843 2844 resources = rqpair->resources; 2845 while (!STAILQ_EMPTY(&resources->free_queue) && !STAILQ_EMPTY(&resources->incoming_queue)) { 2846 rdma_req = STAILQ_FIRST(&resources->free_queue); 2847 STAILQ_REMOVE_HEAD(&resources->free_queue, state_link); 2848 rdma_req->recv = STAILQ_FIRST(&resources->incoming_queue); 2849 STAILQ_REMOVE_HEAD(&resources->incoming_queue, link); 2850 2851 if (rqpair->srq != NULL) { 2852 rdma_req->req.qpair = &rdma_req->recv->qpair->qpair; 2853 rdma_req->recv->qpair->qd++; 2854 } else { 2855 rqpair->qd++; 2856 } 2857 2858 rdma_req->receive_tsc = rdma_req->recv->receive_tsc; 2859 rdma_req->state = RDMA_REQUEST_STATE_NEW; 2860 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 2861 break; 2862 } 2863 } 2864 if (!STAILQ_EMPTY(&resources->incoming_queue) && STAILQ_EMPTY(&resources->free_queue)) { 2865 rqpair->poller->stat.pending_free_request++; 2866 } 2867 } 2868 2869 static void 2870 _nvmf_rdma_qpair_disconnect(void *ctx) 2871 { 2872 struct spdk_nvmf_qpair *qpair = ctx; 2873 2874 spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); 2875 } 2876 2877 static void 2878 _nvmf_rdma_try_disconnect(void *ctx) 2879 { 2880 struct spdk_nvmf_qpair *qpair = ctx; 2881 struct spdk_nvmf_poll_group *group; 2882 2883 /* Read the group out of the qpair. This is normally set and accessed only from 2884 * the thread that created the group. Here, we're not on that thread necessarily. 
2885 * The data member qpair->group begins it's life as NULL and then is assigned to 2886 * a pointer and never changes. So fortunately reading this and checking for 2887 * non-NULL is thread safe in the x86_64 memory model. */ 2888 group = qpair->group; 2889 2890 if (group == NULL) { 2891 /* The qpair hasn't been assigned to a group yet, so we can't 2892 * process a disconnect. Send a message to ourself and try again. */ 2893 spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_try_disconnect, qpair); 2894 return; 2895 } 2896 2897 spdk_thread_send_msg(group->thread, _nvmf_rdma_qpair_disconnect, qpair); 2898 } 2899 2900 static inline void 2901 spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair) 2902 { 2903 if (!__atomic_test_and_set(&rqpair->disconnect_started, __ATOMIC_RELAXED)) { 2904 _nvmf_rdma_try_disconnect(&rqpair->qpair); 2905 } 2906 } 2907 2908 static void nvmf_rdma_destroy_drained_qpair(void *ctx) 2909 { 2910 struct spdk_nvmf_rdma_qpair *rqpair = ctx; 2911 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, 2912 struct spdk_nvmf_rdma_transport, transport); 2913 2914 /* In non SRQ path, we will reach rqpair->max_queue_depth. In SRQ path, we will get the last_wqe event. */ 2915 if (rqpair->current_send_depth != 0) { 2916 return; 2917 } 2918 2919 if (rqpair->srq == NULL && rqpair->current_recv_depth != rqpair->max_queue_depth) { 2920 return; 2921 } 2922 2923 if (rqpair->srq != NULL && rqpair->last_wqe_reached == false) { 2924 return; 2925 } 2926 2927 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); 2928 2929 /* Qpair will be destroyed after nvmf layer closes this qpair */ 2930 if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ERROR) { 2931 return; 2932 } 2933 2934 spdk_nvmf_rdma_qpair_destroy(rqpair); 2935 } 2936 2937 2938 static int 2939 nvmf_rdma_disconnect(struct rdma_cm_event *evt) 2940 { 2941 struct spdk_nvmf_qpair *qpair; 2942 struct spdk_nvmf_rdma_qpair *rqpair; 2943 2944 if (evt->id == NULL) { 2945 SPDK_ERRLOG("disconnect request: missing cm_id\n"); 2946 return -1; 2947 } 2948 2949 qpair = evt->id->context; 2950 if (qpair == NULL) { 2951 SPDK_ERRLOG("disconnect request: no active connection\n"); 2952 return -1; 2953 } 2954 2955 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2956 2957 spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0); 2958 2959 spdk_nvmf_rdma_start_disconnect(rqpair); 2960 2961 return 0; 2962 } 2963 2964 #ifdef DEBUG 2965 static const char *CM_EVENT_STR[] = { 2966 "RDMA_CM_EVENT_ADDR_RESOLVED", 2967 "RDMA_CM_EVENT_ADDR_ERROR", 2968 "RDMA_CM_EVENT_ROUTE_RESOLVED", 2969 "RDMA_CM_EVENT_ROUTE_ERROR", 2970 "RDMA_CM_EVENT_CONNECT_REQUEST", 2971 "RDMA_CM_EVENT_CONNECT_RESPONSE", 2972 "RDMA_CM_EVENT_CONNECT_ERROR", 2973 "RDMA_CM_EVENT_UNREACHABLE", 2974 "RDMA_CM_EVENT_REJECTED", 2975 "RDMA_CM_EVENT_ESTABLISHED", 2976 "RDMA_CM_EVENT_DISCONNECTED", 2977 "RDMA_CM_EVENT_DEVICE_REMOVAL", 2978 "RDMA_CM_EVENT_MULTICAST_JOIN", 2979 "RDMA_CM_EVENT_MULTICAST_ERROR", 2980 "RDMA_CM_EVENT_ADDR_CHANGE", 2981 "RDMA_CM_EVENT_TIMEWAIT_EXIT" 2982 }; 2983 #endif /* DEBUG */ 2984 2985 static void 2986 nvmf_rdma_disconnect_qpairs_on_port(struct spdk_nvmf_rdma_transport *rtransport, 2987 struct spdk_nvmf_rdma_port *port) 2988 { 2989 struct spdk_nvmf_rdma_poll_group *rgroup; 2990 struct spdk_nvmf_rdma_poller *rpoller; 2991 struct spdk_nvmf_rdma_qpair *rqpair; 2992 2993 TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) { 2994 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 2995 
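/* Walk every qpair owned by this poller and disconnect those accepted on this listener */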
TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) { 2996 if (rqpair->listen_id == port->id) { 2997 spdk_nvmf_rdma_start_disconnect(rqpair); 2998 } 2999 } 3000 } 3001 } 3002 } 3003 3004 static bool 3005 nvmf_rdma_handle_cm_event_addr_change(struct spdk_nvmf_transport *transport, 3006 struct rdma_cm_event *event) 3007 { 3008 struct spdk_nvme_transport_id trid; 3009 struct spdk_nvmf_rdma_port *port; 3010 struct spdk_nvmf_rdma_transport *rtransport; 3011 uint32_t ref, i; 3012 bool event_acked = false; 3013 3014 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3015 TAILQ_FOREACH(port, &rtransport->ports, link) { 3016 if (port->id == event->id) { 3017 SPDK_ERRLOG("ADDR_CHANGE: IP %s:%s migrated\n", port->trid.traddr, port->trid.trsvcid); 3018 rdma_ack_cm_event(event); 3019 event_acked = true; 3020 trid = port->trid; 3021 ref = port->ref; 3022 break; 3023 } 3024 } 3025 if (event_acked) { 3026 nvmf_rdma_disconnect_qpairs_on_port(rtransport, port); 3027 3028 for (i = 0; i < ref; i++) { 3029 spdk_nvmf_rdma_stop_listen(transport, &trid); 3030 } 3031 for (i = 0; i < ref; i++) { 3032 spdk_nvmf_rdma_listen(transport, &trid, NULL, NULL); 3033 } 3034 } 3035 return event_acked; 3036 } 3037 3038 static void 3039 nvmf_rdma_handle_cm_event_port_removal(struct spdk_nvmf_transport *transport, 3040 struct rdma_cm_event *event) 3041 { 3042 struct spdk_nvmf_rdma_port *port; 3043 struct spdk_nvmf_rdma_transport *rtransport; 3044 uint32_t ref, i; 3045 3046 port = event->id->context; 3047 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3048 ref = port->ref; 3049 3050 SPDK_NOTICELOG("Port %s:%s is being removed\n", port->trid.traddr, port->trid.trsvcid); 3051 3052 nvmf_rdma_disconnect_qpairs_on_port(rtransport, port); 3053 3054 rdma_ack_cm_event(event); 3055 3056 for (i = 0; i < ref; i++) { 3057 spdk_nvmf_rdma_stop_listen(transport, &port->trid); 3058 } 3059 } 3060 3061 static void 3062 spdk_nvmf_process_cm_event(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn, void *cb_arg) 3063 { 3064 struct spdk_nvmf_rdma_transport *rtransport; 3065 struct rdma_cm_event *event; 3066 int rc; 3067 bool event_acked; 3068 3069 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3070 3071 if (rtransport->event_channel == NULL) { 3072 return; 3073 } 3074 3075 while (1) { 3076 event_acked = false; 3077 rc = rdma_get_cm_event(rtransport->event_channel, &event); 3078 if (rc) { 3079 if (errno != EAGAIN && errno != EWOULDBLOCK) { 3080 SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); 3081 } 3082 break; 3083 } 3084 3085 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); 3086 3087 spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event); 3088 3089 switch (event->event) { 3090 case RDMA_CM_EVENT_ADDR_RESOLVED: 3091 case RDMA_CM_EVENT_ADDR_ERROR: 3092 case RDMA_CM_EVENT_ROUTE_RESOLVED: 3093 case RDMA_CM_EVENT_ROUTE_ERROR: 3094 /* No action required. The target never attempts to resolve routes. */ 3095 break; 3096 case RDMA_CM_EVENT_CONNECT_REQUEST: 3097 rc = nvmf_rdma_connect(transport, event, cb_fn, cb_arg); 3098 if (rc < 0) { 3099 SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); 3100 break; 3101 } 3102 break; 3103 case RDMA_CM_EVENT_CONNECT_RESPONSE: 3104 /* The target never initiates a new connection. So this will not occur. */ 3105 break; 3106 case RDMA_CM_EVENT_CONNECT_ERROR: 3107 /* Can this happen? The docs say it can, but not sure what causes it. 
*/ 3108 break; 3109 case RDMA_CM_EVENT_UNREACHABLE: 3110 case RDMA_CM_EVENT_REJECTED: 3111 /* These only occur on the client side. */ 3112 break; 3113 case RDMA_CM_EVENT_ESTABLISHED: 3114 /* TODO: Should we be waiting for this event anywhere? */ 3115 break; 3116 case RDMA_CM_EVENT_DISCONNECTED: 3117 rc = nvmf_rdma_disconnect(event); 3118 if (rc < 0) { 3119 SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); 3120 break; 3121 } 3122 break; 3123 case RDMA_CM_EVENT_DEVICE_REMOVAL: 3124 /* In case of device removal, kernel IB part triggers IBV_EVENT_DEVICE_FATAL 3125 * which triggers RDMA_CM_EVENT_DEVICE_REMOVAL on all cma_id’s. 3126 * Once these events are sent to SPDK, we should release all IB resources and 3127 * don't make attempts to call any ibv_query/modify/create functions. We can only call 3128 * ibv_destory* functions to release user space memory allocated by IB. All kernel 3129 * resources are already cleaned. */ 3130 if (event->id->qp) { 3131 /* If rdma_cm event has a valid `qp` pointer then the event refers to the 3132 * corresponding qpair. Otherwise the event refers to a listening device */ 3133 rc = nvmf_rdma_disconnect(event); 3134 if (rc < 0) { 3135 SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); 3136 break; 3137 } 3138 } else { 3139 nvmf_rdma_handle_cm_event_port_removal(transport, event); 3140 event_acked = true; 3141 } 3142 break; 3143 case RDMA_CM_EVENT_MULTICAST_JOIN: 3144 case RDMA_CM_EVENT_MULTICAST_ERROR: 3145 /* Multicast is not used */ 3146 break; 3147 case RDMA_CM_EVENT_ADDR_CHANGE: 3148 event_acked = nvmf_rdma_handle_cm_event_addr_change(transport, event); 3149 break; 3150 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 3151 /* For now, do nothing. The target never re-uses queue pairs. */ 3152 break; 3153 default: 3154 SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); 3155 break; 3156 } 3157 if (!event_acked) { 3158 rdma_ack_cm_event(event); 3159 } 3160 } 3161 } 3162 3163 static void 3164 nvmf_rdma_handle_qp_fatal(struct spdk_nvmf_rdma_qpair *rqpair) 3165 { 3166 spdk_nvmf_rdma_update_ibv_state(rqpair); 3167 spdk_nvmf_rdma_start_disconnect(rqpair); 3168 } 3169 3170 static void 3171 nvmf_rdma_handle_last_wqe_reached(struct spdk_nvmf_rdma_qpair *rqpair) 3172 { 3173 rqpair->last_wqe_reached = true; 3174 nvmf_rdma_destroy_drained_qpair(rqpair); 3175 } 3176 3177 static void 3178 nvmf_rdma_handle_sq_drained(struct spdk_nvmf_rdma_qpair *rqpair) 3179 { 3180 spdk_nvmf_rdma_start_disconnect(rqpair); 3181 } 3182 3183 static void 3184 spdk_nvmf_rdma_qpair_process_ibv_event(void *ctx) 3185 { 3186 struct spdk_nvmf_rdma_ibv_event_ctx *event_ctx = ctx; 3187 3188 if (event_ctx->rqpair) { 3189 STAILQ_REMOVE(&event_ctx->rqpair->ibv_events, event_ctx, spdk_nvmf_rdma_ibv_event_ctx, link); 3190 if (event_ctx->cb_fn) { 3191 event_ctx->cb_fn(event_ctx->rqpair); 3192 } 3193 } 3194 free(event_ctx); 3195 } 3196 3197 static int 3198 spdk_nvmf_rdma_send_qpair_async_event(struct spdk_nvmf_rdma_qpair *rqpair, 3199 spdk_nvmf_rdma_qpair_ibv_event fn) 3200 { 3201 struct spdk_nvmf_rdma_ibv_event_ctx *ctx; 3202 3203 if (!rqpair->qpair.group) { 3204 return EINVAL; 3205 } 3206 3207 ctx = calloc(1, sizeof(*ctx)); 3208 if (!ctx) { 3209 return ENOMEM; 3210 } 3211 3212 ctx->rqpair = rqpair; 3213 ctx->cb_fn = fn; 3214 STAILQ_INSERT_TAIL(&rqpair->ibv_events, ctx, link); 3215 3216 return spdk_thread_send_msg(rqpair->qpair.group->thread, spdk_nvmf_rdma_qpair_process_ibv_event, 3217 ctx); 3218 } 3219 3220 static void 3221 spdk_nvmf_process_ib_event(struct spdk_nvmf_rdma_device 
*device) 3222 { 3223 int rc; 3224 struct spdk_nvmf_rdma_qpair *rqpair = NULL; 3225 struct ibv_async_event event; 3226 3227 rc = ibv_get_async_event(device->context, &event); 3228 3229 if (rc) { 3230 SPDK_ERRLOG("Failed to get async_event (%d): %s\n", 3231 errno, spdk_strerror(errno)); 3232 return; 3233 } 3234 3235 switch (event.event_type) { 3236 case IBV_EVENT_QP_FATAL: 3237 rqpair = event.element.qp->qp_context; 3238 SPDK_ERRLOG("Fatal event received for rqpair %p\n", rqpair); 3239 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 3240 (uintptr_t)rqpair->cm_id, event.event_type); 3241 if (spdk_nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_qp_fatal)) { 3242 SPDK_ERRLOG("Failed to send QP_FATAL event for rqpair %p\n", rqpair); 3243 nvmf_rdma_handle_qp_fatal(rqpair); 3244 } 3245 break; 3246 case IBV_EVENT_QP_LAST_WQE_REACHED: 3247 /* This event only occurs for shared receive queues. */ 3248 rqpair = event.element.qp->qp_context; 3249 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last WQE reached event received for rqpair %p\n", rqpair); 3250 if (spdk_nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_last_wqe_reached)) { 3251 SPDK_ERRLOG("Failed to send LAST_WQE_REACHED event for rqpair %p\n", rqpair); 3252 rqpair->last_wqe_reached = true; 3253 } 3254 break; 3255 case IBV_EVENT_SQ_DRAINED: 3256 /* This event occurs frequently in both error and non-error states. 3257 * Check if the qpair is in an error state before sending a message. */ 3258 rqpair = event.element.qp->qp_context; 3259 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last sq drained event received for rqpair %p\n", rqpair); 3260 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 3261 (uintptr_t)rqpair->cm_id, event.event_type); 3262 if (spdk_nvmf_rdma_update_ibv_state(rqpair) == IBV_QPS_ERR) { 3263 if (spdk_nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_sq_drained)) { 3264 SPDK_ERRLOG("Failed to send SQ_DRAINED event for rqpair %p\n", rqpair); 3265 nvmf_rdma_handle_sq_drained(rqpair); 3266 } 3267 } 3268 break; 3269 case IBV_EVENT_QP_REQ_ERR: 3270 case IBV_EVENT_QP_ACCESS_ERR: 3271 case IBV_EVENT_COMM_EST: 3272 case IBV_EVENT_PATH_MIG: 3273 case IBV_EVENT_PATH_MIG_ERR: 3274 SPDK_NOTICELOG("Async event: %s\n", 3275 ibv_event_type_str(event.event_type)); 3276 rqpair = event.element.qp->qp_context; 3277 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 3278 (uintptr_t)rqpair->cm_id, event.event_type); 3279 spdk_nvmf_rdma_update_ibv_state(rqpair); 3280 break; 3281 case IBV_EVENT_CQ_ERR: 3282 case IBV_EVENT_DEVICE_FATAL: 3283 case IBV_EVENT_PORT_ACTIVE: 3284 case IBV_EVENT_PORT_ERR: 3285 case IBV_EVENT_LID_CHANGE: 3286 case IBV_EVENT_PKEY_CHANGE: 3287 case IBV_EVENT_SM_CHANGE: 3288 case IBV_EVENT_SRQ_ERR: 3289 case IBV_EVENT_SRQ_LIMIT_REACHED: 3290 case IBV_EVENT_CLIENT_REREGISTER: 3291 case IBV_EVENT_GID_CHANGE: 3292 default: 3293 SPDK_NOTICELOG("Async event: %s\n", 3294 ibv_event_type_str(event.event_type)); 3295 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type); 3296 break; 3297 } 3298 ibv_ack_async_event(&event); 3299 } 3300 3301 static void 3302 spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn, void *cb_arg) 3303 { 3304 int nfds, i = 0; 3305 struct spdk_nvmf_rdma_transport *rtransport; 3306 struct spdk_nvmf_rdma_device *device, *tmp; 3307 3308 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3309 nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); 3310 3311 if (nfds <= 0) { 3312 return; 3313 } 3314 3315 /* The first poll 
descriptor is RDMA CM event */ 3316 if (rtransport->poll_fds[i++].revents & POLLIN) { 3317 spdk_nvmf_process_cm_event(transport, cb_fn, cb_arg); 3318 nfds--; 3319 } 3320 3321 if (nfds == 0) { 3322 return; 3323 } 3324 3325 /* Second and subsequent poll descriptors are IB async events */ 3326 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 3327 if (rtransport->poll_fds[i++].revents & POLLIN) { 3328 spdk_nvmf_process_ib_event(device); 3329 nfds--; 3330 } 3331 } 3332 /* check all flagged fd's have been served */ 3333 assert(nfds == 0); 3334 } 3335 3336 static void 3337 spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport, 3338 struct spdk_nvme_transport_id *trid, 3339 struct spdk_nvmf_discovery_log_page_entry *entry) 3340 { 3341 entry->trtype = SPDK_NVMF_TRTYPE_RDMA; 3342 entry->adrfam = trid->adrfam; 3343 entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED; 3344 3345 spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); 3346 spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); 3347 3348 entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; 3349 entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; 3350 entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; 3351 } 3352 3353 static void 3354 spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group); 3355 3356 static struct spdk_nvmf_transport_poll_group * 3357 spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) 3358 { 3359 struct spdk_nvmf_rdma_transport *rtransport; 3360 struct spdk_nvmf_rdma_poll_group *rgroup; 3361 struct spdk_nvmf_rdma_poller *poller; 3362 struct spdk_nvmf_rdma_device *device; 3363 struct ibv_srq_init_attr srq_init_attr; 3364 struct spdk_nvmf_rdma_resource_opts opts; 3365 int num_cqe; 3366 3367 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3368 3369 rgroup = calloc(1, sizeof(*rgroup)); 3370 if (!rgroup) { 3371 return NULL; 3372 } 3373 3374 TAILQ_INIT(&rgroup->pollers); 3375 STAILQ_INIT(&rgroup->retired_bufs); 3376 3377 pthread_mutex_lock(&rtransport->lock); 3378 TAILQ_FOREACH(device, &rtransport->devices, link) { 3379 poller = calloc(1, sizeof(*poller)); 3380 if (!poller) { 3381 SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); 3382 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 3383 pthread_mutex_unlock(&rtransport->lock); 3384 return NULL; 3385 } 3386 3387 poller->device = device; 3388 poller->group = rgroup; 3389 3390 TAILQ_INIT(&poller->qpairs); 3391 STAILQ_INIT(&poller->qpairs_pending_send); 3392 STAILQ_INIT(&poller->qpairs_pending_recv); 3393 3394 TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); 3395 if (transport->opts.no_srq == false && device->num_srq < device->attr.max_srq) { 3396 poller->max_srq_depth = transport->opts.max_srq_depth; 3397 3398 device->num_srq++; 3399 memset(&srq_init_attr, 0, sizeof(struct ibv_srq_init_attr)); 3400 srq_init_attr.attr.max_wr = poller->max_srq_depth; 3401 srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE); 3402 poller->srq = ibv_create_srq(device->pd, &srq_init_attr); 3403 if (!poller->srq) { 3404 SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno); 3405 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 3406 pthread_mutex_unlock(&rtransport->lock); 3407 return NULL; 3408 } 3409 3410 opts.qp = poller->srq; 3411 opts.pd = device->pd; 3412 opts.qpair = NULL; 3413 opts.shared = true; 3414 opts.max_queue_depth = 
poller->max_srq_depth; 3415 opts.in_capsule_data_size = transport->opts.in_capsule_data_size; 3416 3417 poller->resources = nvmf_rdma_resources_create(&opts); 3418 if (!poller->resources) { 3419 SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n"); 3420 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 3421 pthread_mutex_unlock(&rtransport->lock); 3422 return NULL; 3423 } 3424 } 3425 3426 /* 3427 * When using an SRQ, we can limit the completion queue size at startup. 3428 * The following formula represents the calculation: 3429 * num_cqe = num_recv + num_data_wr + num_send_wr, 3430 * where num_recv = num_data_wr = num_send_wr = poller->max_srq_depth (e.g. max_srq_depth = 4096 gives num_cqe = 12288). 3431 */ 3432 if (poller->srq) { 3433 num_cqe = poller->max_srq_depth * 3; 3434 } else { 3435 num_cqe = DEFAULT_NVMF_RDMA_CQ_SIZE; 3436 } 3437 3438 poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0); 3439 if (!poller->cq) { 3440 SPDK_ERRLOG("Unable to create completion queue\n"); 3441 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 3442 pthread_mutex_unlock(&rtransport->lock); 3443 return NULL; 3444 } 3445 poller->num_cqe = num_cqe; 3446 } 3447 3448 TAILQ_INSERT_TAIL(&rtransport->poll_groups, rgroup, link); 3449 if (rtransport->conn_sched.next_admin_pg == NULL) { 3450 rtransport->conn_sched.next_admin_pg = rgroup; 3451 rtransport->conn_sched.next_io_pg = rgroup; 3452 } 3453 3454 pthread_mutex_unlock(&rtransport->lock); 3455 return &rgroup->group; 3456 } 3457 3458 static struct spdk_nvmf_transport_poll_group * 3459 spdk_nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 3460 { 3461 struct spdk_nvmf_rdma_transport *rtransport; 3462 struct spdk_nvmf_rdma_poll_group **pg; 3463 struct spdk_nvmf_transport_poll_group *result; 3464 3465 rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 3466 3467 pthread_mutex_lock(&rtransport->lock); 3468 3469 if (TAILQ_EMPTY(&rtransport->poll_groups)) { 3470 pthread_mutex_unlock(&rtransport->lock); 3471 return NULL; 3472 } 3473 3474 if (qpair->qid == 0) { 3475 pg = &rtransport->conn_sched.next_admin_pg; 3476 } else { 3477 pg = &rtransport->conn_sched.next_io_pg; 3478 } 3479 3480 assert(*pg != NULL); 3481 3482 result = &(*pg)->group; 3483 3484 *pg = TAILQ_NEXT(*pg, link); 3485 if (*pg == NULL) { 3486 *pg = TAILQ_FIRST(&rtransport->poll_groups); 3487 } 3488 3489 pthread_mutex_unlock(&rtransport->lock); 3490 3491 return result; 3492 } 3493 3494 static void 3495 spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 3496 { 3497 struct spdk_nvmf_rdma_poll_group *rgroup, *next_rgroup; 3498 struct spdk_nvmf_rdma_poller *poller, *tmp; 3499 struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; 3500 struct spdk_nvmf_transport_pg_cache_buf *buf, *tmp_buf; 3501 struct spdk_nvmf_rdma_transport *rtransport; 3502 3503 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 3504 if (!rgroup) { 3505 return; 3506 } 3507 3508 /* free all retired buffers back to the transport so we don't short the mempool. 
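 * (Each retired buffer originally came from group->transport->data_buf_pool, so it is returned below with spdk_mempool_put() rather than freed, keeping the shared pool at its full size.)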
*/ 3509 STAILQ_FOREACH_SAFE(buf, &rgroup->retired_bufs, link, tmp_buf) { 3510 STAILQ_REMOVE(&rgroup->retired_bufs, buf, spdk_nvmf_transport_pg_cache_buf, link); 3511 assert(group->transport != NULL); 3512 spdk_mempool_put(group->transport->data_buf_pool, buf); 3513 } 3514 3515 TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { 3516 TAILQ_REMOVE(&rgroup->pollers, poller, link); 3517 3518 TAILQ_FOREACH_SAFE(qpair, &poller->qpairs, link, tmp_qpair) { 3519 spdk_nvmf_rdma_qpair_destroy(qpair); 3520 } 3521 3522 if (poller->srq) { 3523 nvmf_rdma_resources_destroy(poller->resources); 3524 ibv_destroy_srq(poller->srq); 3525 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Destroyed RDMA shared queue %p\n", poller->srq); 3526 } 3527 3528 if (poller->cq) { 3529 ibv_destroy_cq(poller->cq); 3530 } 3531 3532 free(poller); 3533 } 3534 3535 if (rgroup->group.transport == NULL) { 3536 /* Transport can be NULL when spdk_nvmf_rdma_poll_group_create() 3537 * calls this function directly in a failure path. */ 3538 free(rgroup); 3539 return; 3540 } 3541 3542 rtransport = SPDK_CONTAINEROF(rgroup->group.transport, struct spdk_nvmf_rdma_transport, transport); 3543 3544 pthread_mutex_lock(&rtransport->lock); 3545 next_rgroup = TAILQ_NEXT(rgroup, link); 3546 TAILQ_REMOVE(&rtransport->poll_groups, rgroup, link); 3547 if (next_rgroup == NULL) { 3548 next_rgroup = TAILQ_FIRST(&rtransport->poll_groups); 3549 } 3550 if (rtransport->conn_sched.next_admin_pg == rgroup) { 3551 rtransport->conn_sched.next_admin_pg = next_rgroup; 3552 } 3553 if (rtransport->conn_sched.next_io_pg == rgroup) { 3554 rtransport->conn_sched.next_io_pg = next_rgroup; 3555 } 3556 pthread_mutex_unlock(&rtransport->lock); 3557 3558 free(rgroup); 3559 } 3560 3561 static void 3562 spdk_nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair) 3563 { 3564 if (rqpair->cm_id != NULL) { 3565 spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 3566 } 3567 spdk_nvmf_rdma_qpair_destroy(rqpair); 3568 } 3569 3570 static int 3571 spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 3572 struct spdk_nvmf_qpair *qpair) 3573 { 3574 struct spdk_nvmf_rdma_poll_group *rgroup; 3575 struct spdk_nvmf_rdma_qpair *rqpair; 3576 struct spdk_nvmf_rdma_device *device; 3577 struct spdk_nvmf_rdma_poller *poller; 3578 int rc; 3579 3580 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 3581 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3582 3583 device = rqpair->device; 3584 3585 TAILQ_FOREACH(poller, &rgroup->pollers, link) { 3586 if (poller->device == device) { 3587 break; 3588 } 3589 } 3590 3591 if (!poller) { 3592 SPDK_ERRLOG("No poller found for device.\n"); 3593 return -1; 3594 } 3595 3596 TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); 3597 rqpair->poller = poller; 3598 rqpair->srq = rqpair->poller->srq; 3599 3600 rc = spdk_nvmf_rdma_qpair_initialize(qpair); 3601 if (rc < 0) { 3602 SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); 3603 return -1; 3604 } 3605 3606 rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair); 3607 if (rc) { 3608 /* Try to reject, but we probably can't */ 3609 spdk_nvmf_rdma_qpair_reject_connection(rqpair); 3610 return -1; 3611 } 3612 3613 spdk_nvmf_rdma_update_ibv_state(rqpair); 3614 3615 return 0; 3616 } 3617 3618 static int 3619 spdk_nvmf_rdma_request_free(struct spdk_nvmf_request *req) 3620 { 3621 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 3622 struct 
spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 3623 struct spdk_nvmf_rdma_transport, transport); 3624 3625 nvmf_rdma_request_free(rdma_req, rtransport); 3626 return 0; 3627 } 3628 3629 static int 3630 spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req) 3631 { 3632 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 3633 struct spdk_nvmf_rdma_transport, transport); 3634 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, 3635 struct spdk_nvmf_rdma_request, req); 3636 struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, 3637 struct spdk_nvmf_rdma_qpair, qpair); 3638 3639 if (rqpair->ibv_state != IBV_QPS_ERR) { 3640 /* The connection is alive, so process the request as normal */ 3641 rdma_req->state = RDMA_REQUEST_STATE_EXECUTED; 3642 } else { 3643 /* The connection is dead. Move the request directly to the completed state. */ 3644 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 3645 } 3646 3647 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 3648 3649 return 0; 3650 } 3651 3652 static int 3653 spdk_nvmf_rdma_destroy_defunct_qpair(void *ctx) 3654 { 3655 struct spdk_nvmf_rdma_qpair *rqpair = ctx; 3656 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, 3657 struct spdk_nvmf_rdma_transport, transport); 3658 3659 SPDK_INFOLOG(SPDK_LOG_RDMA, "QP#%d hasn't been drained as expected, manually destroy it\n", 3660 rqpair->qpair.qid); 3661 3662 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); 3663 spdk_nvmf_rdma_qpair_destroy(rqpair); 3664 3665 return 0; 3666 } 3667 3668 static void 3669 spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) 3670 { 3671 struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3672 3673 if (rqpair->disconnect_flags & RDMA_QP_DISCONNECTING) { 3674 return; 3675 } 3676 3677 rqpair->disconnect_flags |= RDMA_QP_DISCONNECTING; 3678 3679 /* This happens only when the qpair is disconnected before 3680 * it is added to the poll group. Since there is no poll group, 3681 * the RDMA qp has not been initialized yet and the RDMA CM 3682 * event has not yet been acknowledged, so we need to reject it. 
3683 */ 3684 if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) { 3685 spdk_nvmf_rdma_qpair_reject_connection(rqpair); 3686 return; 3687 } 3688 3689 if (rqpair->ibv_state != IBV_QPS_ERR) { 3690 spdk_nvmf_rdma_set_ibv_state(rqpair, IBV_QPS_ERR); 3691 } 3692 3693 rqpair->destruct_poller = spdk_poller_register(spdk_nvmf_rdma_destroy_defunct_qpair, (void *)rqpair, 3694 NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US); 3695 } 3696 3697 static struct spdk_nvmf_rdma_qpair * 3698 get_rdma_qpair_from_wc(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_wc *wc) 3699 { 3700 struct spdk_nvmf_rdma_qpair *rqpair; 3701 /* @todo: improve QP search */ 3702 TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) { 3703 if (wc->qp_num == rqpair->cm_id->qp->qp_num) { 3704 return rqpair; 3705 } 3706 } 3707 SPDK_ERRLOG("Didn't find QP with qp_num %u\n", wc->qp_num); 3708 return NULL; 3709 } 3710 3711 #ifdef DEBUG 3712 static int 3713 spdk_nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req) 3714 { 3715 return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST || 3716 rdma_req->state == RDMA_REQUEST_STATE_COMPLETING; 3717 } 3718 #endif 3719 3720 static void 3721 _poller_reset_failed_recvs(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_recv_wr *bad_recv_wr, 3722 int rc) 3723 { 3724 struct spdk_nvmf_rdma_recv *rdma_recv; 3725 struct spdk_nvmf_rdma_wr *bad_rdma_wr; 3726 3727 SPDK_ERRLOG("Failed to post a recv for the poller %p with errno %d\n", rpoller, -rc); 3728 while (bad_recv_wr != NULL) { 3729 bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_recv_wr->wr_id; 3730 rdma_recv = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 3731 3732 rdma_recv->qpair->current_recv_depth++; 3733 bad_recv_wr = bad_recv_wr->next; 3734 SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rdma_recv->qpair, -rc); 3735 spdk_nvmf_rdma_start_disconnect(rdma_recv->qpair); 3736 } 3737 } 3738 3739 static void 3740 _qp_reset_failed_recvs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *bad_recv_wr, int rc) 3741 { 3742 SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rqpair, -rc); 3743 while (bad_recv_wr != NULL) { 3744 bad_recv_wr = bad_recv_wr->next; 3745 rqpair->current_recv_depth++; 3746 } 3747 spdk_nvmf_rdma_start_disconnect(rqpair); 3748 } 3749 3750 static void 3751 _poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport, 3752 struct spdk_nvmf_rdma_poller *rpoller) 3753 { 3754 struct spdk_nvmf_rdma_qpair *rqpair; 3755 struct ibv_recv_wr *bad_recv_wr; 3756 int rc; 3757 3758 if (rpoller->srq) { 3759 if (rpoller->resources->recvs_to_post.first != NULL) { 3760 rc = ibv_post_srq_recv(rpoller->srq, rpoller->resources->recvs_to_post.first, &bad_recv_wr); 3761 if (rc) { 3762 _poller_reset_failed_recvs(rpoller, bad_recv_wr, rc); 3763 } 3764 rpoller->resources->recvs_to_post.first = NULL; 3765 rpoller->resources->recvs_to_post.last = NULL; 3766 } 3767 } else { 3768 while (!STAILQ_EMPTY(&rpoller->qpairs_pending_recv)) { 3769 rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_recv); 3770 assert(rqpair->resources->recvs_to_post.first != NULL); 3771 rc = ibv_post_recv(rqpair->cm_id->qp, rqpair->resources->recvs_to_post.first, &bad_recv_wr); 3772 if (rc) { 3773 _qp_reset_failed_recvs(rqpair, bad_recv_wr, rc); 3774 } 3775 rqpair->resources->recvs_to_post.first = NULL; 3776 rqpair->resources->recvs_to_post.last = NULL; 3777 STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_recv, recv_link); 3778 } 3779 } 3780 } 3781 3782 static void 3783 _qp_reset_failed_sends(struct 
spdk_nvmf_rdma_transport *rtransport, 3784 struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *bad_wr, int rc) 3785 { 3786 struct spdk_nvmf_rdma_wr *bad_rdma_wr; 3787 struct spdk_nvmf_rdma_request *prev_rdma_req = NULL, *cur_rdma_req = NULL; 3788 3789 SPDK_ERRLOG("Failed to post a send for the qpair %p with errno %d\n", rqpair, -rc); 3790 for (; bad_wr != NULL; bad_wr = bad_wr->next) { 3791 bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_wr->wr_id; 3792 assert(rqpair->current_send_depth > 0); 3793 rqpair->current_send_depth--; 3794 switch (bad_rdma_wr->type) { 3795 case RDMA_WR_TYPE_DATA: 3796 cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 3797 if (bad_wr->opcode == IBV_WR_RDMA_READ) { 3798 assert(rqpair->current_read_depth > 0); 3799 rqpair->current_read_depth--; 3800 } 3801 break; 3802 case RDMA_WR_TYPE_SEND: 3803 cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); 3804 break; 3805 default: 3806 SPDK_ERRLOG("Found a RECV in the list of pending SEND requests for qpair %p\n", rqpair); 3807 prev_rdma_req = cur_rdma_req; 3808 continue; 3809 } 3810 3811 if (prev_rdma_req == cur_rdma_req) { 3812 /* This request was already handled by an earlier WR, i.e. we were performing an NVMe read. */ 3813 /* We only have to check against prev_rdma_req since each request's WRs are contiguous in this list. */ 3814 continue; 3815 } 3816 3817 switch (cur_rdma_req->state) { 3818 case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 3819 cur_rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 3820 cur_rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 3821 break; 3822 case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: 3823 case RDMA_REQUEST_STATE_COMPLETING: 3824 cur_rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 3825 break; 3826 default: 3827 SPDK_ERRLOG("Found a request in a bad state %d when draining pending SEND requests for qpair %p\n", 3828 cur_rdma_req->state, rqpair); 3829 continue; 3830 } 3831 3832 spdk_nvmf_rdma_request_process(rtransport, cur_rdma_req); 3833 prev_rdma_req = cur_rdma_req; 3834 } 3835 3836 if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { 3837 /* Disconnect the connection. */ 3838 spdk_nvmf_rdma_start_disconnect(rqpair); 3839 } 3840 3841 } 3842 3843 static void 3844 _poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport, 3845 struct spdk_nvmf_rdma_poller *rpoller) 3846 { 3847 struct spdk_nvmf_rdma_qpair *rqpair; 3848 struct ibv_send_wr *bad_wr = NULL; 3849 int rc; 3850 3851 while (!STAILQ_EMPTY(&rpoller->qpairs_pending_send)) { 3852 rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_send); 3853 assert(rqpair->sends_to_post.first != NULL); 3854 rc = ibv_post_send(rqpair->cm_id->qp, rqpair->sends_to_post.first, &bad_wr); 3855 3856 /* bad_wr always points to the first WR that failed. 
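 * That WR and everything chained after it was never posted, so _qp_reset_failed_sends() below walks the rest of the chain, releases the send-queue depth it had reserved and fails the affected requests.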
*/ 3857 if (rc) { 3858 _qp_reset_failed_sends(rtransport, rqpair, bad_wr, rc); 3859 } 3860 rqpair->sends_to_post.first = NULL; 3861 rqpair->sends_to_post.last = NULL; 3862 STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_send, send_link); 3863 } 3864 } 3865 3866 static int 3867 spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, 3868 struct spdk_nvmf_rdma_poller *rpoller) 3869 { 3870 struct ibv_wc wc[32]; 3871 struct spdk_nvmf_rdma_wr *rdma_wr; 3872 struct spdk_nvmf_rdma_request *rdma_req; 3873 struct spdk_nvmf_rdma_recv *rdma_recv; 3874 struct spdk_nvmf_rdma_qpair *rqpair; 3875 int reaped, i; 3876 int count = 0; 3877 bool error = false; 3878 uint64_t poll_tsc = spdk_get_ticks(); 3879 3880 /* Poll for completing operations. */ 3881 reaped = ibv_poll_cq(rpoller->cq, 32, wc); 3882 if (reaped < 0) { 3883 SPDK_ERRLOG("Error polling CQ! (%d): %s\n", 3884 errno, spdk_strerror(errno)); 3885 return -1; 3886 } 3887 3888 rpoller->stat.polls++; 3889 rpoller->stat.completions += reaped; 3890 3891 for (i = 0; i < reaped; i++) { 3892 3893 rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id; 3894 3895 switch (rdma_wr->type) { 3896 case RDMA_WR_TYPE_SEND: 3897 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); 3898 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 3899 3900 if (!wc[i].status) { 3901 count++; 3902 assert(wc[i].opcode == IBV_WC_SEND); 3903 assert(spdk_nvmf_rdma_req_is_completing(rdma_req)); 3904 } else { 3905 SPDK_ERRLOG("data=%p length=%u\n", rdma_req->req.data, rdma_req->req.length); 3906 } 3907 3908 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 3909 /* +1 for the response wr */ 3910 rqpair->current_send_depth -= rdma_req->num_outstanding_data_wr + 1; 3911 rdma_req->num_outstanding_data_wr = 0; 3912 3913 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 3914 break; 3915 case RDMA_WR_TYPE_RECV: 3916 /* rdma_recv->qpair will be invalid if using an SRQ. In that case we have to get the qpair from the wc. */ 3917 rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 3918 if (rpoller->srq != NULL) { 3919 rdma_recv->qpair = get_rdma_qpair_from_wc(rpoller, &wc[i]); 3920 /* It is possible that there are still some completions for destroyed QP 3921 * associated with SRQ. We just ignore these late completions and re-post 3922 * receive WRs back to SRQ. 
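 * (The receive buffer belongs to the poller's shared SRQ resources rather than to the destroyed qpair, so it can simply be handed back to the SRQ.)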
3923 */ 3924 if (spdk_unlikely(NULL == rdma_recv->qpair)) { 3925 struct ibv_recv_wr *bad_wr; 3926 int rc; 3927 3928 rdma_recv->wr.next = NULL; 3929 rc = ibv_post_srq_recv(rpoller->srq, 3930 &rdma_recv->wr, 3931 &bad_wr); 3932 if (rc) { 3933 SPDK_ERRLOG("Failed to re-post recv WR to SRQ, err %d\n", rc); 3934 } 3935 continue; 3936 } 3937 } 3938 rqpair = rdma_recv->qpair; 3939 3940 assert(rqpair != NULL); 3941 if (!wc[i].status) { 3942 assert(wc[i].opcode == IBV_WC_RECV); 3943 if (rqpair->current_recv_depth >= rqpair->max_queue_depth) { 3944 spdk_nvmf_rdma_start_disconnect(rqpair); 3945 break; 3946 } 3947 } 3948 3949 rdma_recv->wr.next = NULL; 3950 rqpair->current_recv_depth++; 3951 rdma_recv->receive_tsc = poll_tsc; 3952 rpoller->stat.requests++; 3953 STAILQ_INSERT_TAIL(&rqpair->resources->incoming_queue, rdma_recv, link); 3954 break; 3955 case RDMA_WR_TYPE_DATA: 3956 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 3957 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 3958 3959 assert(rdma_req->num_outstanding_data_wr > 0); 3960 3961 rqpair->current_send_depth--; 3962 rdma_req->num_outstanding_data_wr--; 3963 if (!wc[i].status) { 3964 assert(wc[i].opcode == IBV_WC_RDMA_READ); 3965 rqpair->current_read_depth--; 3966 /* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */ 3967 if (rdma_req->num_outstanding_data_wr == 0) { 3968 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 3969 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 3970 } 3971 } else { 3972 /* The data transfer failed; the qpair will still be forced into the error state below. 3973 * If we were performing an RDMA_READ, we also need to force the request into the 3974 * completed state here since it was not linked to a SEND. In the RDMA_WRITE 3975 * case, we wait for the SEND completion instead. */ 3976 SPDK_ERRLOG("data=%p length=%u\n", rdma_req->req.data, rdma_req->req.length); 3977 if (rdma_req->data.wr.opcode == IBV_WR_RDMA_READ) { 3978 rqpair->current_read_depth--; 3979 if (rdma_req->num_outstanding_data_wr == 0) { 3980 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 3981 } 3982 } 3983 } 3984 break; 3985 default: 3986 SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); 3987 continue; 3988 } 3989 3990 /* Handle error conditions */ 3991 if (wc[i].status) { 3992 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "CQ error on CQ %p, Request 0x%lu (%d): %s\n", 3993 rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); 3994 3995 error = true; 3996 3997 if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { 3998 /* Disconnect the connection. */ 3999 spdk_nvmf_rdma_start_disconnect(rqpair); 4000 } else { 4001 nvmf_rdma_destroy_drained_qpair(rqpair); 4002 } 4003 continue; 4004 } 4005 4006 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 4007 4008 if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { 4009 nvmf_rdma_destroy_drained_qpair(rqpair); 4010 } 4011 } 4012 4013 if (error == true) { 4014 return -1; 4015 } 4016 4017 /* submit outstanding work requests. 
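 * (Recv and send WRs are chained onto per-poller and per-qpair lists while requests are processed and are flushed here with a single post per queue, presumably to amortize doorbell writes over the whole poll batch.)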
*/ 4018 _poller_submit_recvs(rtransport, rpoller); 4019 _poller_submit_sends(rtransport, rpoller); 4020 4021 return count; 4022 } 4023 4024 static int 4025 spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 4026 { 4027 struct spdk_nvmf_rdma_transport *rtransport; 4028 struct spdk_nvmf_rdma_poll_group *rgroup; 4029 struct spdk_nvmf_rdma_poller *rpoller; 4030 int count, rc; 4031 4032 rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); 4033 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 4034 4035 count = 0; 4036 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 4037 rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller); 4038 if (rc < 0) { 4039 return rc; 4040 } 4041 count += rc; 4042 } 4043 4044 return count; 4045 } 4046 4047 static int 4048 spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 4049 struct spdk_nvme_transport_id *trid, 4050 bool peer) 4051 { 4052 struct sockaddr *saddr; 4053 uint16_t port; 4054 4055 spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_RDMA); 4056 4057 if (peer) { 4058 saddr = rdma_get_peer_addr(id); 4059 } else { 4060 saddr = rdma_get_local_addr(id); 4061 } 4062 switch (saddr->sa_family) { 4063 case AF_INET: { 4064 struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr; 4065 4066 trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; 4067 inet_ntop(AF_INET, &saddr_in->sin_addr, 4068 trid->traddr, sizeof(trid->traddr)); 4069 if (peer) { 4070 port = ntohs(rdma_get_dst_port(id)); 4071 } else { 4072 port = ntohs(rdma_get_src_port(id)); 4073 } 4074 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 4075 break; 4076 } 4077 case AF_INET6: { 4078 struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr; 4079 trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; 4080 inet_ntop(AF_INET6, &saddr_in->sin6_addr, 4081 trid->traddr, sizeof(trid->traddr)); 4082 if (peer) { 4083 port = ntohs(rdma_get_dst_port(id)); 4084 } else { 4085 port = ntohs(rdma_get_src_port(id)); 4086 } 4087 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 4088 break; 4089 } 4090 default: 4091 return -1; 4092 4093 } 4094 4095 return 0; 4096 } 4097 4098 static int 4099 spdk_nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 4100 struct spdk_nvme_transport_id *trid) 4101 { 4102 struct spdk_nvmf_rdma_qpair *rqpair; 4103 4104 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 4105 4106 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true); 4107 } 4108 4109 static int 4110 spdk_nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 4111 struct spdk_nvme_transport_id *trid) 4112 { 4113 struct spdk_nvmf_rdma_qpair *rqpair; 4114 4115 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 4116 4117 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false); 4118 } 4119 4120 static int 4121 spdk_nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 4122 struct spdk_nvme_transport_id *trid) 4123 { 4124 struct spdk_nvmf_rdma_qpair *rqpair; 4125 4126 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 4127 4128 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false); 4129 } 4130 4131 void 4132 spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) 4133 { 4134 g_nvmf_hooks = *hooks; 4135 } 4136 4137 static int 4138 spdk_nvmf_rdma_poll_group_get_stat(struct spdk_nvmf_tgt *tgt, 4139 struct spdk_nvmf_transport_poll_group_stat **stat) 4140 { 4141 struct spdk_io_channel *ch; 4142 struct spdk_nvmf_poll_group 
*group; 4143 struct spdk_nvmf_transport_poll_group *tgroup; 4144 struct spdk_nvmf_rdma_poll_group *rgroup; 4145 struct spdk_nvmf_rdma_poller *rpoller; 4146 struct spdk_nvmf_rdma_device_stat *device_stat; 4147 uint64_t num_devices = 0; 4148 4149 if (tgt == NULL || stat == NULL) { 4150 return -EINVAL; 4151 } 4152 4153 ch = spdk_get_io_channel(tgt); 4154 group = spdk_io_channel_get_ctx(ch); 4155 spdk_put_io_channel(ch); 4156 TAILQ_FOREACH(tgroup, &group->tgroups, link) { 4157 if (SPDK_NVME_TRANSPORT_RDMA == tgroup->transport->ops->type) { 4158 *stat = calloc(1, sizeof(struct spdk_nvmf_transport_poll_group_stat)); 4159 if (!*stat) { 4160 SPDK_ERRLOG("Failed to allocate memory for NVMf RDMA statistics\n"); 4161 return -ENOMEM; 4162 } 4163 (*stat)->trtype = SPDK_NVME_TRANSPORT_RDMA; 4164 4165 rgroup = SPDK_CONTAINEROF(tgroup, struct spdk_nvmf_rdma_poll_group, group); 4166 /* Count devices to allocate enough memory */ 4167 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 4168 ++num_devices; 4169 } 4170 (*stat)->rdma.devices = calloc(num_devices, sizeof(struct spdk_nvmf_rdma_device_stat)); 4171 if (!(*stat)->rdma.devices) { 4172 SPDK_ERRLOG("Failed to allocate NVMf RDMA devices statistics\n"); 4173 free(*stat); 4174 return -ENOMEM; 4175 } 4176 4177 (*stat)->rdma.pending_data_buffer = rgroup->stat.pending_data_buffer; 4178 (*stat)->rdma.num_devices = num_devices; 4179 num_devices = 0; 4180 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 4181 device_stat = &(*stat)->rdma.devices[num_devices++]; 4182 device_stat->name = ibv_get_device_name(rpoller->device->context->device); 4183 device_stat->polls = rpoller->stat.polls; 4184 device_stat->completions = rpoller->stat.completions; 4185 device_stat->requests = rpoller->stat.requests; 4186 device_stat->request_latency = rpoller->stat.request_latency; 4187 device_stat->pending_free_request = rpoller->stat.pending_free_request; 4188 device_stat->pending_rdma_read = rpoller->stat.pending_rdma_read; 4189 device_stat->pending_rdma_write = rpoller->stat.pending_rdma_write; 4190 } 4191 return 0; 4192 } 4193 } 4194 return -ENOENT; 4195 } 4196 4197 static void 4198 spdk_nvmf_rdma_poll_group_free_stat(struct spdk_nvmf_transport_poll_group_stat *stat) 4199 { 4200 if (stat) { 4201 free(stat->rdma.devices); 4202 } 4203 free(stat); 4204 } 4205 4206 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { 4207 .name = "RDMA", 4208 .type = SPDK_NVME_TRANSPORT_RDMA, 4209 .opts_init = spdk_nvmf_rdma_opts_init, 4210 .create = spdk_nvmf_rdma_create, 4211 .destroy = spdk_nvmf_rdma_destroy, 4212 4213 .listen = spdk_nvmf_rdma_listen, 4214 .stop_listen = spdk_nvmf_rdma_stop_listen, 4215 .accept = spdk_nvmf_rdma_accept, 4216 4217 .listener_discover = spdk_nvmf_rdma_discover, 4218 4219 .poll_group_create = spdk_nvmf_rdma_poll_group_create, 4220 .get_optimal_poll_group = spdk_nvmf_rdma_get_optimal_poll_group, 4221 .poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy, 4222 .poll_group_add = spdk_nvmf_rdma_poll_group_add, 4223 .poll_group_poll = spdk_nvmf_rdma_poll_group_poll, 4224 4225 .req_free = spdk_nvmf_rdma_request_free, 4226 .req_complete = spdk_nvmf_rdma_request_complete, 4227 4228 .qpair_fini = spdk_nvmf_rdma_close_qpair, 4229 .qpair_get_peer_trid = spdk_nvmf_rdma_qpair_get_peer_trid, 4230 .qpair_get_local_trid = spdk_nvmf_rdma_qpair_get_local_trid, 4231 .qpair_get_listen_trid = spdk_nvmf_rdma_qpair_get_listen_trid, 4232 4233 .poll_group_get_stat = spdk_nvmf_rdma_poll_group_get_stat, 4234 .poll_group_free_stat = spdk_nvmf_rdma_poll_group_free_stat, 4235 }; 4236 
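/* The ops table above is what ties this file into the generic NVMe-oF target layer:
 * SPDK_NVMF_TRANSPORT_REGISTER() below adds it to the global transport list, and the target
 * core reaches spdk_nvmf_rdma_create(), spdk_nvmf_rdma_listen(), etc. through these callbacks.
 * A minimal usage sketch, assuming the standard SPDK RPC script is available (exact command
 * names and flags can vary between releases):
 *
 *     scripts/rpc.py nvmf_create_transport -t RDMA
 *
 * which ends up in the .create callback registered here.
 */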
4237 SPDK_NVMF_TRANSPORT_REGISTER(rdma, &spdk_nvmf_transport_rdma); 4238 SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA) 4239
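/* Usage sketch for spdk_nvmf_rdma_init_hooks() above (hypothetical caller code; my_get_ibv_pd
 * is a placeholder for an application callback whose prototype must match the get_ibv_pd member
 * of struct spdk_nvme_rdma_hooks as declared in the public SPDK headers):
 *
 *     struct spdk_nvme_rdma_hooks hooks = { .get_ibv_pd = my_get_ibv_pd };
 *     spdk_nvmf_rdma_init_hooks(&hooks);
 *
 * An application that manages its own protection domains or memory registration can install
 * hooks like this before the RDMA transport is created; the values are stored in g_nvmf_hooks
 * for later use by the transport code.
 */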