/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

#include "nvmf_internal.h"
#include "transport.h"

#include "spdk/config.h"
#include "spdk/thread.h"
#include "spdk/nvmf.h"
#include "spdk/nvmf_spec.h"
#include "spdk/string.h"
#include "spdk/trace.h"
#include "spdk/util.h"

#include "spdk_internal/assert.h"
#include "spdk_internal/log.h"

struct spdk_nvme_rdma_hooks g_nvmf_hooks = {};

/*
 RDMA Connection Resource Defaults
 */
#define NVMF_DEFAULT_TX_SGE		SPDK_NVMF_MAX_SGL_ENTRIES
#define NVMF_DEFAULT_RSP_SGE		1
#define NVMF_DEFAULT_RX_SGE		2

/* The RDMA completion queue size */
#define DEFAULT_NVMF_RDMA_CQ_SIZE	4096
#define MAX_WR_PER_QP(queue_depth)	(queue_depth * 3 + 2)

/* Timeout for destroying defunct rqpairs */
#define NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US	4000000

/* The maximum number of buffers per request */
#define NVMF_REQ_MAX_BUFFERS	(SPDK_NVMF_MAX_SGL_ENTRIES * 2)

static int g_spdk_nvmf_ibv_query_mask =
	IBV_QP_STATE |
	IBV_QP_PKEY_INDEX |
	IBV_QP_PORT |
	IBV_QP_ACCESS_FLAGS |
	IBV_QP_AV |
	IBV_QP_PATH_MTU |
	IBV_QP_DEST_QPN |
	IBV_QP_RQ_PSN |
	IBV_QP_MAX_DEST_RD_ATOMIC |
	IBV_QP_MIN_RNR_TIMER |
	IBV_QP_SQ_PSN |
	IBV_QP_TIMEOUT |
	IBV_QP_RETRY_CNT |
	IBV_QP_RNR_RETRY |
	IBV_QP_MAX_QP_RD_ATOMIC;
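/* Request state machine. Every spdk_nvmf_rdma_request moves through the
 * states below as it is received, paired with buffers, transferred over
 * RDMA, executed at the block device, and completed back to the host.
 * The TRACE_RDMA_REQUEST_STATE_* tracepoints defined after this enum
 * mirror these states one-to-one.
 */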
enum spdk_nvmf_rdma_request_state {
	/* The request is not currently in use */
	RDMA_REQUEST_STATE_FREE = 0,

	/* Initial state when request first received */
	RDMA_REQUEST_STATE_NEW,

	/* The request is queued until a data buffer is available. */
	RDMA_REQUEST_STATE_NEED_BUFFER,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the host to the controller.
	 */
	RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,

	/* The request is currently transferring data from the host to the controller. */
	RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,

	/* The request is ready to execute at the block device */
	RDMA_REQUEST_STATE_READY_TO_EXECUTE,

	/* The request is currently executing at the block device */
	RDMA_REQUEST_STATE_EXECUTING,

	/* The request finished executing at the block device */
	RDMA_REQUEST_STATE_EXECUTED,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data from the controller to the host.
	 */
	RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,

	/* The request is ready to send a completion */
	RDMA_REQUEST_STATE_READY_TO_COMPLETE,

	/* The request is currently transferring data from the controller to the host. */
	RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,

	/* The request currently has an outstanding completion without an
	 * associated data transfer.
	 */
	RDMA_REQUEST_STATE_COMPLETING,

	/* The request completed and can be marked free. */
	RDMA_REQUEST_STATE_COMPLETED,

	/* Terminator */
	RDMA_REQUEST_NUM_STATES,
};

#define OBJECT_NVMF_RDMA_IO	0x40

#define TRACE_GROUP_NVMF_RDMA	0x4
#define TRACE_RDMA_REQUEST_STATE_NEW					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0)
#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1)
#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE			SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4)
#define TRACE_RDMA_REQUEST_STATE_EXECUTING				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5)
#define TRACE_RDMA_REQUEST_STATE_EXECUTED				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6)
#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING		SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE			SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9)
#define TRACE_RDMA_REQUEST_STATE_COMPLETING				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA)
#define TRACE_RDMA_REQUEST_STATE_COMPLETED				SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xB)
#define TRACE_RDMA_QP_CREATE						SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xC)
#define TRACE_RDMA_IBV_ASYNC_EVENT					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xD)
#define TRACE_RDMA_CM_ASYNC_EVENT					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xE)
#define TRACE_RDMA_QP_STATE_CHANGE					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xF)
#define TRACE_RDMA_QP_DISCONNECT					SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x10)
#define TRACE_RDMA_QP_DESTROY						SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x11)

SPDK_TRACE_REGISTER_FN(nvmf_trace, "nvmf_rdma", TRACE_GROUP_NVMF_RDMA)
{
	spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r');
	spdk_trace_register_description("RDMA_REQ_NEW", TRACE_RDMA_REQUEST_STATE_NEW,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", TRACE_RDMA_REQUEST_STATE_NEED_BUFFER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_C2H",
					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_H2C",
					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_H2C",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE",
					TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_EXECUTING",
					TRACE_RDMA_REQUEST_STATE_EXECUTING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_EXECUTED",
					TRACE_RDMA_REQUEST_STATE_EXECUTED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPL",
					TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETING_C2H",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETING",
					TRACE_RDMA_REQUEST_STATE_COMPLETING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETED",
					TRACE_RDMA_REQUEST_STATE_COMPLETED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");

	spdk_trace_register_description("RDMA_QP_CREATE", TRACE_RDMA_QP_CREATE,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
	spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", TRACE_RDMA_IBV_ASYNC_EVENT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
	spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", TRACE_RDMA_CM_ASYNC_EVENT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
	spdk_trace_register_description("RDMA_QP_STATE_CHANGE", TRACE_RDMA_QP_STATE_CHANGE,
					OWNER_NONE, OBJECT_NONE, 0, 1, "state: ");
	spdk_trace_register_description("RDMA_QP_DISCONNECT", TRACE_RDMA_QP_DISCONNECT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
	spdk_trace_register_description("RDMA_QP_DESTROY", TRACE_RDMA_QP_DESTROY,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
}

enum spdk_nvmf_rdma_wr_type {
	RDMA_WR_TYPE_RECV,
	RDMA_WR_TYPE_SEND,
	RDMA_WR_TYPE_DATA,
};
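/* Every work request posted by this transport carries a pointer to one of
 * these structures in its wr_id, so a completion can be classified as a
 * RECV, a response SEND, or a data transfer without any additional lookup.
 */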
struct spdk_nvmf_rdma_wr {
	enum spdk_nvmf_rdma_wr_type type;
};

/* This structure holds commands as they are received off the wire.
 * It must be dynamically paired with a full request object
 * (spdk_nvmf_rdma_request) to service a request. It is separate
 * from the request because RDMA does not appear to order
 * completions, so occasionally we'll get a new incoming
 * command when there aren't any free request objects.
 */
struct spdk_nvmf_rdma_recv {
	struct ibv_recv_wr wr;
	struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE];

	struct spdk_nvmf_rdma_qpair *qpair;

	/* In-capsule data buffer */
	uint8_t *buf;

	struct spdk_nvmf_rdma_wr rdma_wr;
	uint64_t receive_tsc;

	STAILQ_ENTRY(spdk_nvmf_rdma_recv) link;
};

struct spdk_nvmf_rdma_request_data {
	struct spdk_nvmf_rdma_wr rdma_wr;
	struct ibv_send_wr wr;
	struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
};

struct spdk_nvmf_rdma_request {
	struct spdk_nvmf_request req;

	enum spdk_nvmf_rdma_request_state state;

	struct spdk_nvmf_rdma_recv *recv;

	struct {
		struct spdk_nvmf_rdma_wr rdma_wr;
		struct ibv_send_wr wr;
		struct ibv_sge sgl[NVMF_DEFAULT_RSP_SGE];
	} rsp;

	struct spdk_nvmf_rdma_request_data data;

	uint32_t num_outstanding_data_wr;
	uint64_t receive_tsc;

	struct spdk_dif_ctx dif_ctx;
	bool dif_insert_or_strip;
	uint32_t elba_length;
	uint32_t orig_length;

	STAILQ_ENTRY(spdk_nvmf_rdma_request) state_link;
};

enum spdk_nvmf_rdma_qpair_disconnect_flags {
	RDMA_QP_DISCONNECTING	= 1,
	RDMA_QP_RECV_DRAINED	= 1 << 1,
	RDMA_QP_SEND_DRAINED	= 1 << 2
};

struct spdk_nvmf_rdma_resource_opts {
	struct spdk_nvmf_rdma_qpair *qpair;
	/* qp points either to an ibv_qp object or an ibv_srq object depending on the value of shared. */
	void *qp;
	struct ibv_pd *pd;
	uint32_t max_queue_depth;
	uint32_t in_capsule_data_size;
	bool shared;
};

struct spdk_nvmf_send_wr_list {
	struct ibv_send_wr *first;
	struct ibv_send_wr *last;
};

struct spdk_nvmf_recv_wr_list {
	struct ibv_recv_wr *first;
	struct ibv_recv_wr *last;
};

struct spdk_nvmf_rdma_resources {
	/* Array of size "max_queue_depth" containing RDMA requests. */
	struct spdk_nvmf_rdma_request *reqs;

	/* Array of size "max_queue_depth" containing RDMA recvs. */
	struct spdk_nvmf_rdma_recv *recvs;

	/* Array of size "max_queue_depth" containing 64 byte capsules
	 * used for receive.
	 */
	union nvmf_h2c_msg *cmds;
	struct ibv_mr *cmds_mr;

	/* Array of size "max_queue_depth" containing 16 byte completions
	 * to be sent back to the user.
	 */
	union nvmf_c2h_msg *cpls;
	struct ibv_mr *cpls_mr;

	/* Array of size "max_queue_depth * InCapsuleDataSize" containing
	 * buffers to be used for in capsule data.
	 */
	void *bufs;
	struct ibv_mr *bufs_mr;

	/* The list of pending recvs to transfer */
	struct spdk_nvmf_recv_wr_list recvs_to_post;

	/* Receives that are waiting for a request object */
	STAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue;

	/* Queue to track free requests */
	STAILQ_HEAD(, spdk_nvmf_rdma_request) free_queue;
};
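/* A qpair either owns a private set of spdk_nvmf_rdma_resources or, when a
 * shared receive queue is in use, points at the resources owned by its
 * poller (see spdk_nvmf_rdma_qpair_initialize below).
 */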
struct spdk_nvmf_rdma_qpair {
	struct spdk_nvmf_qpair qpair;

	struct spdk_nvmf_rdma_port *port;
	struct spdk_nvmf_rdma_poller *poller;

	struct rdma_cm_id *cm_id;
	struct ibv_srq *srq;
	struct rdma_cm_id *listen_id;

	/* The maximum number of I/O outstanding on this connection at one time */
	uint16_t max_queue_depth;

	/* The maximum number of active RDMA READ and ATOMIC operations at one time */
	uint16_t max_read_depth;

	/* The maximum number of RDMA SEND operations at one time */
	uint32_t max_send_depth;

	/* The current number of outstanding WRs from this qpair's
	 * recv queue. Should not exceed device->attr.max_queue_depth.
	 */
	uint16_t current_recv_depth;

	/* The current number of active RDMA READ operations */
	uint16_t current_read_depth;

	/* The current number of posted WRs from this qpair's
	 * send queue. Should not exceed max_send_depth.
	 */
	uint32_t current_send_depth;

	/* The maximum number of SGEs per WR on the send queue */
	uint32_t max_send_sge;

	/* The maximum number of SGEs per WR on the recv queue */
	uint32_t max_recv_sge;

	/* The list of pending send requests for a transfer */
	struct spdk_nvmf_send_wr_list sends_to_post;

	struct spdk_nvmf_rdma_resources *resources;

	STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_read_queue;

	STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_write_queue;

	/* Number of requests not in the free state */
	uint32_t qd;

	TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link;

	STAILQ_ENTRY(spdk_nvmf_rdma_qpair) recv_link;

	STAILQ_ENTRY(spdk_nvmf_rdma_qpair) send_link;

	/* IBV queue pair attributes: they are used to manage
	 * qp state and recover from errors.
	 */
	enum ibv_qp_state ibv_state;

	uint32_t disconnect_flags;

	/* Poller registered in case the qpair doesn't properly
	 * complete the qpair destruct process and becomes defunct.
	 */
	struct spdk_poller *destruct_poller;

	/* There are several ways a disconnect can start on a qpair
	 * and they are not all mutually exclusive. It is important
	 * that we only initialize one of these paths.
	 */
	bool disconnect_started;
	/* Lets us know that we have received the last_wqe event. */
	bool last_wqe_reached;
};

struct spdk_nvmf_rdma_poller_stat {
	uint64_t completions;
	uint64_t polls;
	uint64_t requests;
	uint64_t request_latency;
	uint64_t pending_free_request;
	uint64_t pending_rdma_read;
	uint64_t pending_rdma_write;
};

struct spdk_nvmf_rdma_poller {
	struct spdk_nvmf_rdma_device *device;
	struct spdk_nvmf_rdma_poll_group *group;

	int num_cqe;
	int required_num_wr;
	struct ibv_cq *cq;

	/* The maximum number of I/O outstanding on the shared receive queue at one time */
	uint16_t max_srq_depth;

	/* Shared receive queue */
	struct ibv_srq *srq;

	struct spdk_nvmf_rdma_resources *resources;
	struct spdk_nvmf_rdma_poller_stat stat;

	TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs;

	STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_recv;

	STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_send;

	TAILQ_ENTRY(spdk_nvmf_rdma_poller) link;
};

struct spdk_nvmf_rdma_poll_group_stat {
	uint64_t pending_data_buffer;
};

struct spdk_nvmf_rdma_poll_group {
	struct spdk_nvmf_transport_poll_group group;
	struct spdk_nvmf_rdma_poll_group_stat stat;
	TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers;
	TAILQ_ENTRY(spdk_nvmf_rdma_poll_group) link;
	/*
	 * buffers which are split across multiple RDMA
	 * memory regions cannot be used by this transport.
	 */
	STAILQ_HEAD(, spdk_nvmf_transport_pg_cache_buf) retired_bufs;
};
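/* Round-robin cursors used when scheduling new admin and I/O qpairs onto
 * poll groups.
 */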
struct spdk_nvmf_rdma_conn_sched {
	struct spdk_nvmf_rdma_poll_group *next_admin_pg;
	struct spdk_nvmf_rdma_poll_group *next_io_pg;
};

/* Assuming rdma_cm uses just one protection domain per ibv_context. */
struct spdk_nvmf_rdma_device {
	struct ibv_device_attr attr;
	struct ibv_context *context;

	struct spdk_mem_map *map;
	struct ibv_pd *pd;

	int num_srq;

	TAILQ_ENTRY(spdk_nvmf_rdma_device) link;
};

struct spdk_nvmf_rdma_port {
	struct spdk_nvme_transport_id trid;
	struct rdma_cm_id *id;
	struct spdk_nvmf_rdma_device *device;
	uint32_t ref;
	TAILQ_ENTRY(spdk_nvmf_rdma_port) link;
};

struct spdk_nvmf_rdma_transport {
	struct spdk_nvmf_transport transport;

	struct spdk_nvmf_rdma_conn_sched conn_sched;

	struct rdma_event_channel *event_channel;

	struct spdk_mempool *data_wr_pool;

	pthread_mutex_t lock;

	/* fields used to poll RDMA/IB events */
	nfds_t npoll_fds;
	struct pollfd *poll_fds;

	TAILQ_HEAD(, spdk_nvmf_rdma_device) devices;
	TAILQ_HEAD(, spdk_nvmf_rdma_port) ports;
	TAILQ_HEAD(, spdk_nvmf_rdma_poll_group) poll_groups;
};

static inline void
spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair);

static inline int
spdk_nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
{
	switch (state) {
	case IBV_QPS_RESET:
	case IBV_QPS_INIT:
	case IBV_QPS_RTR:
	case IBV_QPS_RTS:
	case IBV_QPS_SQD:
	case IBV_QPS_SQE:
	case IBV_QPS_ERR:
		return 0;
	default:
		return -1;
	}
}

static inline enum spdk_nvme_media_error_status_code
spdk_nvmf_rdma_dif_error_to_compl_status(uint8_t err_type)
{
	enum spdk_nvme_media_error_status_code result;

	switch (err_type) {
	case SPDK_DIF_REFTAG_ERROR:
		result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR;
		break;
	case SPDK_DIF_APPTAG_ERROR:
		result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR;
		break;
	case SPDK_DIF_GUARD_ERROR:
		result = SPDK_NVME_SC_GUARD_CHECK_ERROR;
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return result;
}

static enum ibv_qp_state
spdk_nvmf_rdma_update_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair)
{
	enum ibv_qp_state old_state, new_state;
	struct ibv_qp_attr qp_attr;
	struct ibv_qp_init_attr init_attr;
	int rc;

	old_state = rqpair->ibv_state;
	rc = ibv_query_qp(rqpair->cm_id->qp, &qp_attr,
			  g_spdk_nvmf_ibv_query_mask, &init_attr);

	if (rc) {
		SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n");
		return IBV_QPS_ERR + 1;
	}

	new_state = qp_attr.qp_state;
	rqpair->ibv_state = new_state;
	qp_attr.ah_attr.port_num = qp_attr.port_num;

	rc = spdk_nvmf_rdma_check_ibv_state(new_state);
	if (rc) {
		SPDK_ERRLOG("QP#%d: bad state updated: %u, maybe hardware issue\n",
			    rqpair->qpair.qid, new_state);
		/*
		 * IBV_QPS_UNKNOWN undefined if lib version smaller than libibverbs-1.1.8
		 * IBV_QPS_UNKNOWN is the enum element after IBV_QPS_ERR
		 */
		return IBV_QPS_ERR + 1;
	}

	if (old_state != new_state) {
		spdk_trace_record(TRACE_RDMA_QP_STATE_CHANGE, 0, 0,
				  (uintptr_t)rqpair->cm_id, new_state);
	}
	return new_state;
}

static const char *str_ibv_qp_state[] = {
	"IBV_QPS_RESET",
	"IBV_QPS_INIT",
	"IBV_QPS_RTR",
	"IBV_QPS_RTS",
	"IBV_QPS_SQD",
	"IBV_QPS_SQE",
	"IBV_QPS_ERR",
	"IBV_QPS_UNKNOWN"
};
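/* Transition the IBV queue pair to new_state: validate the requested state,
 * query the current attributes, issue ibv_modify_qp() with the attribute
 * mask required for that transition, and then re-query the queue pair to
 * confirm the state actually changed.
 */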
static int
spdk_nvmf_rdma_set_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair,
			     enum ibv_qp_state new_state)
{
	struct ibv_qp_attr qp_attr;
	struct ibv_qp_init_attr init_attr;
	int rc;
	enum ibv_qp_state state;
	static int attr_mask_rc[] = {
		[IBV_QPS_RESET] = IBV_QP_STATE,
		[IBV_QPS_INIT] = (IBV_QP_STATE |
				  IBV_QP_PKEY_INDEX |
				  IBV_QP_PORT |
				  IBV_QP_ACCESS_FLAGS),
		[IBV_QPS_RTR] = (IBV_QP_STATE |
				 IBV_QP_AV |
				 IBV_QP_PATH_MTU |
				 IBV_QP_DEST_QPN |
				 IBV_QP_RQ_PSN |
				 IBV_QP_MAX_DEST_RD_ATOMIC |
				 IBV_QP_MIN_RNR_TIMER),
		[IBV_QPS_RTS] = (IBV_QP_STATE |
				 IBV_QP_SQ_PSN |
				 IBV_QP_TIMEOUT |
				 IBV_QP_RETRY_CNT |
				 IBV_QP_RNR_RETRY |
				 IBV_QP_MAX_QP_RD_ATOMIC),
		[IBV_QPS_SQD] = IBV_QP_STATE,
		[IBV_QPS_SQE] = IBV_QP_STATE,
		[IBV_QPS_ERR] = IBV_QP_STATE,
	};

	rc = spdk_nvmf_rdma_check_ibv_state(new_state);
	if (rc) {
		SPDK_ERRLOG("QP#%d: bad state requested: %u\n",
			    rqpair->qpair.qid, new_state);
		return rc;
	}

	rc = ibv_query_qp(rqpair->cm_id->qp, &qp_attr,
			  g_spdk_nvmf_ibv_query_mask, &init_attr);

	if (rc) {
		SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n");
		assert(false);
	}

	qp_attr.cur_qp_state = rqpair->ibv_state;
	qp_attr.qp_state = new_state;

	rc = ibv_modify_qp(rqpair->cm_id->qp, &qp_attr,
			   attr_mask_rc[new_state]);

	if (rc) {
		SPDK_ERRLOG("QP#%d: failed to set state to: %s, %d (%s)\n",
			    rqpair->qpair.qid, str_ibv_qp_state[new_state], errno, strerror(errno));
		return rc;
	}

	state = spdk_nvmf_rdma_update_ibv_state(rqpair);

	if (state != new_state) {
		SPDK_ERRLOG("QP#%d: expected state: %s, actual state: %s\n",
			    rqpair->qpair.qid, str_ibv_qp_state[new_state],
			    str_ibv_qp_state[state]);
		return -1;
	}
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "IBV QP#%u changed to: %s\n", rqpair->qpair.qid,
		      str_ibv_qp_state[state]);
	return 0;
}
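/* Walk the chain of data WRs that belong to rdma_req (they all share the
 * request's wr_id), zero their scatter-gather lists, and return every WR
 * except the one embedded in the request back to the transport's
 * data_wr_pool.
 */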
static void
nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
			    struct spdk_nvmf_rdma_transport *rtransport)
{
	struct spdk_nvmf_rdma_request_data *data_wr;
	struct ibv_send_wr *next_send_wr;
	uint64_t req_wrid;

	rdma_req->num_outstanding_data_wr = 0;
	data_wr = &rdma_req->data;
	req_wrid = data_wr->wr.wr_id;
	while (data_wr && data_wr->wr.wr_id == req_wrid) {
		memset(data_wr->sgl, 0, sizeof(data_wr->wr.sg_list[0]) * data_wr->wr.num_sge);
		data_wr->wr.num_sge = 0;
		next_send_wr = data_wr->wr.next;
		if (data_wr != &rdma_req->data) {
			spdk_mempool_put(rtransport->data_wr_pool, data_wr);
		}
		data_wr = (!next_send_wr || next_send_wr == &rdma_req->rsp.wr) ? NULL :
			  SPDK_CONTAINEROF(next_send_wr, struct spdk_nvmf_rdma_request_data, wr);
	}
}

static void
nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req)
{
	SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", req->req.data_from_pool);
	if (req->req.cmd) {
		SPDK_ERRLOG("\t\tRequest opcode: %d\n", req->req.cmd->nvmf_cmd.opcode);
	}
	if (req->recv) {
		SPDK_ERRLOG("\t\tRequest recv wr_id%lu\n", req->recv->wr.wr_id);
	}
}

static void
nvmf_rdma_dump_qpair_contents(struct spdk_nvmf_rdma_qpair *rqpair)
{
	int i;

	SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", rqpair->qpair.qid);
	for (i = 0; i < rqpair->max_queue_depth; i++) {
		if (rqpair->resources->reqs[i].state != RDMA_REQUEST_STATE_FREE) {
			nvmf_rdma_dump_request(&rqpair->resources->reqs[i]);
		}
	}
}

static void
nvmf_rdma_resources_destroy(struct spdk_nvmf_rdma_resources *resources)
{
	if (resources->cmds_mr) {
		ibv_dereg_mr(resources->cmds_mr);
	}

	if (resources->cpls_mr) {
		ibv_dereg_mr(resources->cpls_mr);
	}

	if (resources->bufs_mr) {
		ibv_dereg_mr(resources->bufs_mr);
	}

	spdk_free(resources->cmds);
	spdk_free(resources->cpls);
	spdk_free(resources->bufs);
	free(resources->reqs);
	free(resources->recvs);
	free(resources);
}
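/* Allocate the receive resources for a queue (or for a shared receive queue):
 * the request and recv arrays, DMA-able command, completion, and in-capsule
 * data buffers, and their memory registrations. Every recv WR is pre-posted
 * to either the SRQ or the QP depending on opts->shared, and every request's
 * response and data WRs are initialized before the request is placed on the
 * free queue.
 */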
static struct spdk_nvmf_rdma_resources *
nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts)
{
	struct spdk_nvmf_rdma_resources *resources;
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_rdma_recv *rdma_recv;
	struct ibv_qp *qp;
	struct ibv_srq *srq;
	uint32_t i;
	int rc;

	resources = calloc(1, sizeof(struct spdk_nvmf_rdma_resources));
	if (!resources) {
		SPDK_ERRLOG("Unable to allocate resources for receive queue.\n");
		return NULL;
	}

	resources->reqs = calloc(opts->max_queue_depth, sizeof(*resources->reqs));
	resources->recvs = calloc(opts->max_queue_depth, sizeof(*resources->recvs));
	resources->cmds = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cmds),
				       0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	resources->cpls = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cpls),
				       0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);

	if (opts->in_capsule_data_size > 0) {
		resources->bufs = spdk_zmalloc(opts->max_queue_depth * opts->in_capsule_data_size,
					       0x1000, NULL, SPDK_ENV_LCORE_ID_ANY,
					       SPDK_MALLOC_DMA);
	}

	if (!resources->reqs || !resources->recvs || !resources->cmds ||
	    !resources->cpls || (opts->in_capsule_data_size && !resources->bufs)) {
		SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n");
		goto cleanup;
	}

	resources->cmds_mr = ibv_reg_mr(opts->pd, resources->cmds,
					opts->max_queue_depth * sizeof(*resources->cmds),
					IBV_ACCESS_LOCAL_WRITE);
	resources->cpls_mr = ibv_reg_mr(opts->pd, resources->cpls,
					opts->max_queue_depth * sizeof(*resources->cpls),
					0);

	if (opts->in_capsule_data_size) {
		resources->bufs_mr = ibv_reg_mr(opts->pd, resources->bufs,
						opts->max_queue_depth *
						opts->in_capsule_data_size,
						IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
	}

	if (!resources->cmds_mr || !resources->cpls_mr ||
	    (opts->in_capsule_data_size && !resources->bufs_mr)) {
		goto cleanup;
	}
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n",
		      resources->cmds, opts->max_queue_depth * sizeof(*resources->cmds),
		      resources->cmds_mr->lkey);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n",
		      resources->cpls, opts->max_queue_depth * sizeof(*resources->cpls),
		      resources->cpls_mr->lkey);
	if (resources->bufs && resources->bufs_mr) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n",
			      resources->bufs, opts->max_queue_depth *
			      opts->in_capsule_data_size, resources->bufs_mr->lkey);
	}

	/* Initialize queues */
	STAILQ_INIT(&resources->incoming_queue);
	STAILQ_INIT(&resources->free_queue);

	for (i = 0; i < opts->max_queue_depth; i++) {
		struct ibv_recv_wr *bad_wr = NULL;

		rdma_recv = &resources->recvs[i];
		rdma_recv->qpair = opts->qpair;

		/* Set up memory to receive commands */
		if (resources->bufs) {
			rdma_recv->buf = (void *)((uintptr_t)resources->bufs + (i *
						  opts->in_capsule_data_size));
		}

		rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV;

		rdma_recv->sgl[0].addr = (uintptr_t)&resources->cmds[i];
		rdma_recv->sgl[0].length = sizeof(resources->cmds[i]);
		rdma_recv->sgl[0].lkey = resources->cmds_mr->lkey;
		rdma_recv->wr.num_sge = 1;

		if (rdma_recv->buf && resources->bufs_mr) {
			rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf;
			rdma_recv->sgl[1].length = opts->in_capsule_data_size;
			rdma_recv->sgl[1].lkey = resources->bufs_mr->lkey;
			rdma_recv->wr.num_sge++;
		}

		rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr;
		rdma_recv->wr.sg_list = rdma_recv->sgl;
		if (opts->shared) {
			srq = (struct ibv_srq *)opts->qp;
			rc = ibv_post_srq_recv(srq, &rdma_recv->wr, &bad_wr);
		} else {
			qp = (struct ibv_qp *)opts->qp;
			rc = ibv_post_recv(qp, &rdma_recv->wr, &bad_wr);
		}
		if (rc) {
			goto cleanup;
		}
	}

	for (i = 0; i < opts->max_queue_depth; i++) {
		rdma_req = &resources->reqs[i];

		if (opts->qpair != NULL) {
			rdma_req->req.qpair = &opts->qpair->qpair;
		} else {
			rdma_req->req.qpair = NULL;
		}
		rdma_req->req.cmd = NULL;

		/* Set up memory to send responses */
		rdma_req->req.rsp = &resources->cpls[i];

		rdma_req->rsp.sgl[0].addr = (uintptr_t)&resources->cpls[i];
		rdma_req->rsp.sgl[0].length = sizeof(resources->cpls[i]);
		rdma_req->rsp.sgl[0].lkey = resources->cpls_mr->lkey;

		rdma_req->rsp.rdma_wr.type = RDMA_WR_TYPE_SEND;
		rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp.rdma_wr;
		rdma_req->rsp.wr.next = NULL;
		rdma_req->rsp.wr.opcode = IBV_WR_SEND;
		rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl;
		rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl);

		/* Set up memory for data buffers */
		rdma_req->data.rdma_wr.type = RDMA_WR_TYPE_DATA;
		rdma_req->data.wr.wr_id = (uintptr_t)&rdma_req->data.rdma_wr;
		rdma_req->data.wr.next = NULL;
		rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->data.wr.sg_list = rdma_req->data.sgl;
		rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl);

		/* Initialize request state to FREE */
		rdma_req->state = RDMA_REQUEST_STATE_FREE;
		STAILQ_INSERT_TAIL(&resources->free_queue, rdma_req, state_link);
	}

	return resources;

cleanup:
	nvmf_rdma_resources_destroy(resources);
	return NULL;
}
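/* Tear down a queue pair: stop the destruct poller, dump any requests that
 * are still outstanding, return unprocessed SRQ recvs owned by this qpair,
 * destroy the RDMA QP and CM ID, and free the per-qpair resources when no
 * shared receive queue is in use.
 */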
static void
spdk_nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
{
	struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp;
	struct ibv_recv_wr *bad_recv_wr = NULL;
	int rc;

	spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair->cm_id, 0);

	spdk_poller_unregister(&rqpair->destruct_poller);

	if (rqpair->qd != 0) {
		if (rqpair->srq == NULL) {
			nvmf_rdma_dump_qpair_contents(rqpair);
		}
		SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", rqpair->qd);
	}

	if (rqpair->poller) {
		TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link);

		if (rqpair->srq != NULL && rqpair->resources != NULL) {
			/* Drop all received but unprocessed commands for this queue and return them to SRQ */
			STAILQ_FOREACH_SAFE(rdma_recv, &rqpair->resources->incoming_queue, link, recv_tmp) {
				if (rqpair == rdma_recv->qpair) {
					STAILQ_REMOVE_HEAD(&rqpair->resources->incoming_queue, link);
					rc = ibv_post_srq_recv(rqpair->srq, &rdma_recv->wr, &bad_recv_wr);
					if (rc) {
						SPDK_ERRLOG("Unable to re-post rx descriptor\n");
					}
				}
			}
		}
	}

	if (rqpair->cm_id) {
		if (rqpair->cm_id->qp != NULL) {
			rdma_destroy_qp(rqpair->cm_id);
		}
		rdma_destroy_id(rqpair->cm_id);

		if (rqpair->poller != NULL && rqpair->srq == NULL) {
			rqpair->poller->required_num_wr -= MAX_WR_PER_QP(rqpair->max_queue_depth);
		}
	}

	if (rqpair->srq == NULL && rqpair->resources != NULL) {
		nvmf_rdma_resources_destroy(rqpair->resources);
	}

	free(rqpair);
}

static int
nvmf_rdma_resize_cq(struct spdk_nvmf_rdma_qpair *rqpair, struct spdk_nvmf_rdma_device *device)
{
	struct spdk_nvmf_rdma_poller *rpoller;
	int rc, num_cqe, required_num_wr;

	/* Enlarge CQ size dynamically */
	rpoller = rqpair->poller;
	required_num_wr = rpoller->required_num_wr + MAX_WR_PER_QP(rqpair->max_queue_depth);
	num_cqe = rpoller->num_cqe;
	if (num_cqe < required_num_wr) {
		num_cqe = spdk_max(num_cqe * 2, required_num_wr);
		num_cqe = spdk_min(num_cqe, device->attr.max_cqe);
	}

	if (rpoller->num_cqe != num_cqe) {
		if (required_num_wr > device->attr.max_cqe) {
			SPDK_ERRLOG("RDMA CQE requirement (%d) exceeds device max_cqe limitation (%d)\n",
				    required_num_wr, device->attr.max_cqe);
			return -1;
		}

		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Resize RDMA CQ from %d to %d\n", rpoller->num_cqe, num_cqe);
		rc = ibv_resize_cq(rpoller->cq, num_cqe);
		if (rc) {
			SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno));
			return -1;
		}

		rpoller->num_cqe = num_cqe;
	}

	rpoller->required_num_wr = required_num_wr;
	return 0;
}
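/* Create the RDMA queue pair for a new connection. The send queue is sized
 * for SEND, READ, and WRITE operations plus a drain WR, the completion queue
 * is grown if needed, and the qpair either allocates private receive
 * resources or attaches to the poller's shared receive queue resources.
 */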
static int
spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
{
	struct spdk_nvmf_rdma_qpair *rqpair;
	int rc;
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_transport *transport;
	struct spdk_nvmf_rdma_resource_opts opts;
	struct spdk_nvmf_rdma_device *device;
	struct ibv_qp_init_attr ibv_init_attr;

	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
	device = rqpair->port->device;

	memset(&ibv_init_attr, 0, sizeof(struct ibv_qp_init_attr));
	ibv_init_attr.qp_context = rqpair;
	ibv_init_attr.qp_type = IBV_QPT_RC;
	ibv_init_attr.send_cq = rqpair->poller->cq;
	ibv_init_attr.recv_cq = rqpair->poller->cq;

	if (rqpair->srq) {
		ibv_init_attr.srq = rqpair->srq;
	} else {
		ibv_init_attr.cap.max_recv_wr = rqpair->max_queue_depth + 1; /* RECV operations + dummy drain WR */
	}

	ibv_init_attr.cap.max_send_wr = rqpair->max_queue_depth * 2 + 1; /* SEND, READ, and WRITE operations + dummy drain WR */
	ibv_init_attr.cap.max_send_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_TX_SGE);
	ibv_init_attr.cap.max_recv_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE);

	if (rqpair->srq == NULL && nvmf_rdma_resize_cq(rqpair, device) < 0) {
		SPDK_ERRLOG("Failed to resize the completion queue. Cannot initialize qpair.\n");
		goto error;
	}

	rc = rdma_create_qp(rqpair->cm_id, rqpair->port->device->pd, &ibv_init_attr);
	if (rc) {
		SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno));
		goto error;
	}

	rqpair->max_send_depth = spdk_min((uint32_t)(rqpair->max_queue_depth * 2 + 1),
					  ibv_init_attr.cap.max_send_wr);
	rqpair->max_send_sge = spdk_min(NVMF_DEFAULT_TX_SGE, ibv_init_attr.cap.max_send_sge);
	rqpair->max_recv_sge = spdk_min(NVMF_DEFAULT_RX_SGE, ibv_init_attr.cap.max_recv_sge);
	spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair);

	rqpair->sends_to_post.first = NULL;
	rqpair->sends_to_post.last = NULL;

	if (rqpair->poller->srq == NULL) {
		rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
		transport = &rtransport->transport;

		opts.qp = rqpair->cm_id->qp;
		opts.pd = rqpair->cm_id->pd;
		opts.qpair = rqpair;
		opts.shared = false;
		opts.max_queue_depth = rqpair->max_queue_depth;
		opts.in_capsule_data_size = transport->opts.in_capsule_data_size;

		rqpair->resources = nvmf_rdma_resources_create(&opts);

		if (!rqpair->resources) {
			SPDK_ERRLOG("Unable to allocate resources for receive queue.\n");
			rdma_destroy_qp(rqpair->cm_id);
			goto error;
		}
	} else {
		rqpair->resources = rqpair->poller->resources;
	}

	rqpair->current_recv_depth = 0;
	STAILQ_INIT(&rqpair->pending_rdma_read_queue);
	STAILQ_INIT(&rqpair->pending_rdma_write_queue);

	return 0;

error:
	rdma_destroy_id(rqpair->cm_id);
	rqpair->cm_id = NULL;
	return -1;
}

/* Append the given recv wr structure to the resource structs outstanding recvs list. */
/* This function accepts either a single wr or the first wr in a linked list. */
static void
nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first)
{
	struct ibv_recv_wr *last;

	last = first;
	while (last->next != NULL) {
		last = last->next;
	}

	if (rqpair->resources->recvs_to_post.first == NULL) {
		rqpair->resources->recvs_to_post.first = first;
		rqpair->resources->recvs_to_post.last = last;
		if (rqpair->srq == NULL) {
			STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_recv, rqpair, recv_link);
		}
	} else {
		rqpair->resources->recvs_to_post.last->next = first;
		rqpair->resources->recvs_to_post.last = last;
	}
}
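/* Note that the queueing helpers above and below only link WRs into a
 * pending list and mark the qpair as having work to do; nothing is posted
 * to the NIC from these functions.
 */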
/* Append the given send wr structure to the qpair's outstanding sends list. */
/* This function accepts either a single wr or the first wr in a linked list. */
static void
nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *first)
{
	struct ibv_send_wr *last;

	last = first;
	while (last->next != NULL) {
		last = last->next;
	}

	if (rqpair->sends_to_post.first == NULL) {
		rqpair->sends_to_post.first = first;
		rqpair->sends_to_post.last = last;
		STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link);
	} else {
		rqpair->sends_to_post.last->next = first;
		rqpair->sends_to_post.last = last;
	}
}

static int
request_transfer_in(struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_qpair *qpair;
	struct spdk_nvmf_rdma_qpair *rqpair;

	qpair = req->qpair;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
	assert(rdma_req != NULL);

	nvmf_rdma_qpair_queue_send_wrs(rqpair, &rdma_req->data.wr);
	rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
	rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
	return 0;
}
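/* Queue the work requests needed to respond to a request: advance the
 * submission queue head, re-post the request's recv WR, and queue an
 * optional chain of RDMA WRITEs followed by the SEND that carries the
 * completion back to the host.
 */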
static int
request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
{
	int num_outstanding_data_wr = 0;
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_qpair *qpair;
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct spdk_nvme_cpl *rsp;
	struct ibv_send_wr *first = NULL;

	*data_posted = 0;
	qpair = req->qpair;
	rsp = &req->rsp->nvme_cpl;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	/* Advance our sq_head pointer */
	if (qpair->sq_head == qpair->sq_head_max) {
		qpair->sq_head = 0;
	} else {
		qpair->sq_head++;
	}
	rsp->sqhd = qpair->sq_head;

	/* queue the capsule for the recv buffer */
	assert(rdma_req->recv != NULL);

	nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr);

	rdma_req->recv = NULL;
	assert(rqpair->current_recv_depth > 0);
	rqpair->current_recv_depth--;

	/* Build the response which consists of optional
	 * RDMA WRITEs to transfer data, plus an RDMA SEND
	 * containing the response.
	 */
	first = &rdma_req->rsp.wr;

	if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
	    req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		first = &rdma_req->data.wr;
		*data_posted = 1;
		num_outstanding_data_wr = rdma_req->num_outstanding_data_wr;
	}
	nvmf_rdma_qpair_queue_send_wrs(rqpair, first);
	/* +1 for the rsp wr */
	rqpair->current_send_depth += num_outstanding_data_wr + 1;

	return 0;
}

static int
spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair)
{
	struct spdk_nvmf_rdma_accept_private_data accept_data;
	struct rdma_conn_param ctrlr_event_data = {};
	int rc;

	accept_data.recfmt = 0;
	accept_data.crqsize = rqpair->max_queue_depth;

	ctrlr_event_data.private_data = &accept_data;
	ctrlr_event_data.private_data_len = sizeof(accept_data);
	if (id->ps == RDMA_PS_TCP) {
		ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */
		ctrlr_event_data.initiator_depth = rqpair->max_read_depth;
	}

	/* Configure infinite retries for the initiator side qpair.
	 * When using a shared receive queue on the target side,
	 * we need to pass this value to the initiator to prevent the
	 * initiator side NIC from completing SEND requests back to the
	 * initiator with status rnr_retry_count_exceeded. */
	if (rqpair->srq != NULL) {
		ctrlr_event_data.rnr_retry_count = 0x7;
	}

	rc = rdma_accept(id, &ctrlr_event_data);
	if (rc) {
		SPDK_ERRLOG("Error %d on rdma_accept\n", errno);
	} else {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n");
	}

	return rc;
}

static void
spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error)
{
	struct spdk_nvmf_rdma_reject_private_data rej_data;

	rej_data.recfmt = 0;
	rej_data.sts = error;

	rdma_reject(id, &rej_data, sizeof(rej_data));
}
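/* Handle an RDMA CM connect request event: validate the NVMe-oF RDMA private
 * data, negotiate the queue depth and RDMA READ depth against the target's
 * limits, both NICs' limits, and the host-provided HRQSIZE/HSQSIZE values,
 * then allocate the qpair and hand it to the new qpair callback.
 */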
static int
nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event,
		  new_qpair_fn cb_fn)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_qpair *rqpair = NULL;
	struct spdk_nvmf_rdma_port *port;
	struct rdma_conn_param *rdma_param = NULL;
	const struct spdk_nvmf_rdma_request_private_data *private_data = NULL;
	uint16_t max_queue_depth;
	uint16_t max_read_depth;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	assert(event->id != NULL); /* Impossible. Can't even reject the connection. */
	assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */

	rdma_param = &event->param.conn;
	if (rdma_param->private_data == NULL ||
	    rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_ERRLOG("connect request: no private data provided\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH);
		return -1;
	}

	private_data = rdma_param->private_data;
	if (private_data->recfmt != 0) {
		SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT);
		return -1;
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n",
		      event->id->verbs->device->name, event->id->verbs->device->dev_name);

	port = event->listen_id->context;
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n",
		      event->listen_id, event->listen_id->verbs, port);

	/* Figure out the supported queue depth. This is a multi-step process
	 * that takes into account hardware maximums, host provided values,
	 * and our target's internal memory limits */

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n");

	/* Start with the maximum queue depth allowed by the target */
	max_queue_depth = rtransport->transport.opts.max_queue_depth;
	max_read_depth = rtransport->transport.opts.max_queue_depth;
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n",
		      rtransport->transport.opts.max_queue_depth);

	/* Next check the local NIC's hardware limitations */
	SPDK_DEBUGLOG(SPDK_LOG_RDMA,
		      "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
		      port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom);
	max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr);
	max_read_depth = spdk_min(max_read_depth, port->device->attr.max_qp_init_rd_atom);

	/* Next check the remote NIC's hardware limitations */
	SPDK_DEBUGLOG(SPDK_LOG_RDMA,
		      "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
		      rdma_param->initiator_depth, rdma_param->responder_resources);
	if (rdma_param->initiator_depth > 0) {
		max_read_depth = spdk_min(max_read_depth, rdma_param->initiator_depth);
	}

	/* Finally check for the host software requested values, which are
	 * optional. */
	if (rdma_param->private_data != NULL &&
	    rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize);
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1);
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
		      max_queue_depth, max_read_depth);

	rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair));
	if (rqpair == NULL) {
		SPDK_ERRLOG("Could not allocate new connection.\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
		return -1;
	}

	rqpair->port = port;
	rqpair->max_queue_depth = max_queue_depth;
	rqpair->max_read_depth = max_read_depth;
	rqpair->cm_id = event->id;
	rqpair->listen_id = event->listen_id;
	rqpair->qpair.transport = transport;
	/* use qid from the private data to determine the qpair type
	   qid will be set to the appropriate value when the controller is created */
	rqpair->qpair.qid = private_data->qid;

	event->id->context = &rqpair->qpair;

	cb_fn(&rqpair->qpair);

	return 0;
}
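/* Memory map notification callback: when a region is registered with SPDK,
 * register it with the RDMA device via ibv_reg_mr() (or record the key
 * returned by the user-supplied g_nvmf_hooks.get_rkey hook); when the region
 * is unregistered, tear the registration down and clear the translation.
 */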
static int
spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
			  enum spdk_mem_map_notify_action action,
			  void *vaddr, size_t size)
{
	struct ibv_pd *pd = cb_ctx;
	struct ibv_mr *mr;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (!g_nvmf_hooks.get_rkey) {
			mr = ibv_reg_mr(pd, vaddr, size,
					IBV_ACCESS_LOCAL_WRITE |
					IBV_ACCESS_REMOTE_READ |
					IBV_ACCESS_REMOTE_WRITE);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		} else {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  g_nvmf_hooks.get_rkey(pd, vaddr, size));
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (!g_nvmf_hooks.get_rkey) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
spdk_nvmf_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

typedef enum spdk_nvme_data_transfer spdk_nvme_data_transfer_t;

static spdk_nvme_data_transfer_t
spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req)
{
	enum spdk_nvme_data_transfer xfer;
	struct spdk_nvme_cmd *cmd = &rdma_req->req.cmd->nvme_cmd;
	struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1;

#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
	rdma_req->rsp.wr.opcode = IBV_WR_SEND;
	rdma_req->rsp.wr.imm_data = 0;
#endif

	/* Figure out data transfer direction */
	if (cmd->opc == SPDK_NVME_OPC_FABRIC) {
		xfer = spdk_nvme_opc_get_data_transfer(rdma_req->req.cmd->nvmf_cmd.fctype);
	} else {
		xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);

		/* Some admin commands are special cases */
		if ((rdma_req->req.qpair->qid == 0) &&
		    ((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) ||
		     (cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) {
			switch (cmd->cdw10 & 0xff) {
			case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
			case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
			case SPDK_NVME_FEAT_HOST_IDENTIFIER:
				break;
			default:
				xfer = SPDK_NVME_DATA_NONE;
			}
		}
	}

	if (xfer == SPDK_NVME_DATA_NONE) {
		return xfer;
	}

	/* Even for commands that may transfer data, they could have specified 0 length.
	 * We want those to show up with xfer SPDK_NVME_DATA_NONE.
	 */
	switch (sgl->generic.type) {
	case SPDK_NVME_SGL_TYPE_DATA_BLOCK:
	case SPDK_NVME_SGL_TYPE_BIT_BUCKET:
	case SPDK_NVME_SGL_TYPE_SEGMENT:
	case SPDK_NVME_SGL_TYPE_LAST_SEGMENT:
	case SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK:
		if (sgl->unkeyed.length == 0) {
			xfer = SPDK_NVME_DATA_NONE;
		}
		break;
	case SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK:
		if (sgl->keyed.length == 0) {
			xfer = SPDK_NVME_DATA_NONE;
		}
		break;
	}

	return xfer;
}
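/* Pull num_sgl_descriptors additional data WR containers from the
 * data_wr_pool and chain them behind the WR embedded in the request. The
 * opcode and signal flags are chosen by transfer direction, and the final
 * WR either chains to the response WR (controller-to-host) or terminates
 * the chain (host-to-controller).
 */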
static int
nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
		       struct spdk_nvmf_rdma_request *rdma_req,
		       uint32_t num_sgl_descriptors)
{
	struct spdk_nvmf_rdma_request_data *work_requests[SPDK_NVMF_MAX_SGL_ENTRIES];
	struct spdk_nvmf_rdma_request_data *current_data_wr;
	uint32_t i;

	if (spdk_mempool_get_bulk(rtransport->data_wr_pool, (void **)work_requests, num_sgl_descriptors)) {
		return -ENOMEM;
	}

	current_data_wr = &rdma_req->data;

	for (i = 0; i < num_sgl_descriptors; i++) {
		if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
			current_data_wr->wr.opcode = IBV_WR_RDMA_WRITE;
			current_data_wr->wr.send_flags = 0;
		} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
			current_data_wr->wr.opcode = IBV_WR_RDMA_READ;
			current_data_wr->wr.send_flags = IBV_SEND_SIGNALED;
		} else {
			assert(false);
		}
		work_requests[i]->wr.sg_list = work_requests[i]->sgl;
		work_requests[i]->wr.wr_id = rdma_req->data.wr.wr_id;
		current_data_wr->wr.next = &work_requests[i]->wr;
		current_data_wr = work_requests[i];
	}

	if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		current_data_wr->wr.opcode = IBV_WR_RDMA_WRITE;
		current_data_wr->wr.next = &rdma_req->rsp.wr;
		current_data_wr->wr.send_flags = 0;
	} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
		current_data_wr->wr.opcode = IBV_WR_RDMA_READ;
		current_data_wr->wr.next = NULL;
		current_data_wr->wr.send_flags = IBV_SEND_SIGNALED;
	}
	return 0;
}

/* This function is used in the rare case that we have a buffer split over multiple memory regions. */
static int
nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf)
{
	struct spdk_nvmf_transport_poll_group *group = &rgroup->group;
	struct spdk_nvmf_transport *transport = group->transport;
	struct spdk_nvmf_transport_pg_cache_buf *old_buf;
	void *new_buf;

	if (!(STAILQ_EMPTY(&group->buf_cache))) {
		group->buf_cache_count--;
		new_buf = STAILQ_FIRST(&group->buf_cache);
		STAILQ_REMOVE_HEAD(&group->buf_cache, link);
		assert(*buf != NULL);
	} else {
		new_buf = spdk_mempool_get(transport->data_buf_pool);
	}

	if (new_buf == NULL) {
		return -ENOMEM;
	}

	old_buf = *buf;
	STAILQ_INSERT_HEAD(&rgroup->retired_bufs, old_buf, link);
	*buf = new_buf;
	return 0;
}
/*
 * Fills iov and SGL, iov[i] points to buffer[i], SGE[i] is limited in length to data block size
 * and points to part of buffer
 */
static int
nvmf_rdma_fill_buffers_with_md_interleave(struct spdk_nvmf_rdma_transport *rtransport,
		struct spdk_nvmf_rdma_poll_group *rgroup,
		struct spdk_nvmf_rdma_device *device,
		struct spdk_nvmf_request *req,
		struct ibv_send_wr *wr,
		uint32_t length,
		uint32_t data_block_size,
		uint32_t md_size)
{
	uint32_t remaining_length = length;
	uint32_t remaining_io_buffer_length;
	uint32_t remaining_data_block = data_block_size;
	uint32_t offset = 0;
	uint32_t sge_len;
	uint64_t translation_len;
	struct iovec *iovec;
	struct ibv_sge *sg_list;
	uint32_t lkey = 0;

	wr->num_sge = 0;

	while (remaining_length && wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES) {
		iovec = &req->iov[req->iovcnt];
		iovec->iov_base = (void *)((uintptr_t)(req->buffers[req->iovcnt] + NVMF_DATA_BUFFER_MASK)
					   & ~NVMF_DATA_BUFFER_MASK);
		iovec->iov_len = spdk_min(remaining_length, rtransport->transport.opts.io_unit_size);
		remaining_io_buffer_length = iovec->iov_len - offset;
		translation_len = iovec->iov_len;

		if (!g_nvmf_hooks.get_rkey) {
			lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map, (uint64_t)iovec->iov_base,
					&translation_len))->lkey;
		} else {
			lkey = spdk_mem_map_translate(device->map, (uint64_t)iovec->iov_base, &translation_len);
		}
		/* This is a very rare case that can occur when using DPDK version < 19.05 */
		if (spdk_unlikely(translation_len < iovec->iov_len)) {
			SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. Removing it from circulation.\n");
			if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[req->iovcnt]) == -ENOMEM) {
				return -ENOMEM;
			}
			continue;
		}

		req->iovcnt++;

		while (remaining_io_buffer_length && wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES) {
			sg_list = &wr->sg_list[wr->num_sge];
			sg_list->addr = (uintptr_t)((char *) iovec->iov_base + offset);
			sge_len = spdk_min(remaining_io_buffer_length, remaining_data_block);
			sg_list->length = sge_len;
			sg_list->lkey = lkey;
			remaining_io_buffer_length -= sge_len;
			remaining_data_block -= sge_len;
			offset += sge_len;
			wr->num_sge++;

			if (remaining_data_block == 0) {
				/* skip metadata */
				offset += md_size;
				/* Metadata that do not fit this IO buffer will be included in the next IO buffer */
				remaining_io_buffer_length -= spdk_min(remaining_io_buffer_length, md_size);
				remaining_data_block = data_block_size;
			}

			if (remaining_io_buffer_length == 0) {
				/* By subtracting the size of the last IOV from the offset, we ensure that we skip
				   the remaining metadata bits at the beginning of the next buffer */
				offset -= iovec->iov_len;
			}
		}
		remaining_length -= iovec->iov_len;
	}

	if (remaining_length) {
		SPDK_ERRLOG("Not enough SG entries to hold data buffer\n");
		return -EINVAL;
	}

	return 0;
}

static bool
nvmf_rdma_get_lkey(struct spdk_nvmf_rdma_device *device, struct iovec *iov,
		   uint32_t *_lkey)
{
	uint64_t translation_len;
	uint32_t lkey;

	translation_len = iov->iov_len;

	if (!g_nvmf_hooks.get_rkey) {
		lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
				(uint64_t)iov->iov_base, &translation_len))->lkey;
	} else {
		lkey = spdk_mem_map_translate(device->map,
					      (uint64_t)iov->iov_base, &translation_len);
	}

	if (spdk_unlikely(translation_len < iov->iov_len)) {
		return false;
	}

	*_lkey = lkey;
	return true;
}
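/* Fill the next SGE of the given WR from the request's current iovec. This
 * fails only when the buffer is split across multiple RDMA memory regions,
 * in which case the caller replaces the buffer and retries.
 */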
static bool
nvmf_rdma_fill_wr_sge(struct spdk_nvmf_rdma_device *device,
		      struct spdk_nvmf_request *req, struct ibv_send_wr *wr)
{
	struct iovec *iov = &req->iov[req->iovcnt];
	struct ibv_sge *sg_ele = &wr->sg_list[wr->num_sge];

	if (spdk_unlikely(!nvmf_rdma_get_lkey(device, iov, &sg_ele->lkey))) {
		/* This is a very rare case that can occur when using DPDK version < 19.05 */
		SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. Removing it from circulation.\n");
		return false;
	}

	sg_ele->addr = (uintptr_t)(iov->iov_base);
	sg_ele->length = iov->iov_len;
	wr->num_sge++;

	return true;
}

static int
nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
		       struct spdk_nvmf_rdma_poll_group *rgroup,
		       struct spdk_nvmf_rdma_device *device,
		       struct spdk_nvmf_request *req,
		       struct ibv_send_wr *wr,
		       uint32_t length)
{
	wr->num_sge = 0;

	while (length) {
		req->iov[req->iovcnt].iov_base = (void *)((uintptr_t)(req->buffers[req->iovcnt] +
						 NVMF_DATA_BUFFER_MASK) &
						 ~NVMF_DATA_BUFFER_MASK);
		req->iov[req->iovcnt].iov_len = spdk_min(length,
						rtransport->transport.opts.io_unit_size);
		if (spdk_unlikely(!nvmf_rdma_fill_wr_sge(device, req, wr))) {
			if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[req->iovcnt]) == -ENOMEM) {
				return -ENOMEM;
			}
			continue;
		}

		length -= req->iov[req->iovcnt].iov_len;
		req->iovcnt++;
	}

	return 0;
}

static int
spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
				 struct spdk_nvmf_rdma_device *device,
				 struct spdk_nvmf_rdma_request *rdma_req,
				 uint32_t length)
{
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct spdk_nvmf_rdma_poll_group *rgroup;
	struct spdk_nvmf_request *req = &rdma_req->req;
	struct ibv_send_wr *wr = &rdma_req->data.wr;
	int rc = 0;

	rqpair = SPDK_CONTAINEROF(req->qpair, struct spdk_nvmf_rdma_qpair, qpair);
	rgroup = rqpair->poller->group;

	if (spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport,
					  length)) {
		return -ENOMEM;
	}

	req->iovcnt = 0;

	if (spdk_unlikely(rdma_req->dif_insert_or_strip)) {
		rc = nvmf_rdma_fill_buffers_with_md_interleave(rtransport,
				rgroup,
				device,
				&rdma_req->req,
				wr,
				length,
				rdma_req->dif_ctx.block_size - rdma_req->dif_ctx.md_size,
				rdma_req->dif_ctx.md_size);
	} else {
		rc = nvmf_rdma_fill_buffers(rtransport, rgroup, device, req, wr, length);
	}
	if (rc != 0) {
		goto err_exit;
	}

	assert(req->iovcnt <= rqpair->max_send_sge);

	req->data_from_pool = true;

	return rc;

err_exit:
	spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport);
	memset(wr->sg_list, 0, sizeof(wr->sg_list[0]) * wr->num_sge);
	wr->num_sge = 0;
	req->iovcnt = 0;
	return rc;
}
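/* Fill iovecs and data WRs for a command whose SGL is a last-segment
 * descriptor carried in the capsule: allocate one extra data WR per keyed
 * data block descriptor and populate buffers for each descriptor in turn.
 */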
assert(num_sgl_descriptors <= SPDK_NVMF_MAX_SGL_ENTRIES); 1770 1771 if (nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1) != 0) { 1772 return -ENOMEM; 1773 } 1774 1775 /* The first WR must always be the embedded data WR. This is how we unwind them later. */ 1776 current_wr = &rdma_req->data.wr; 1777 assert(current_wr != NULL); 1778 1779 req->iovcnt = 0; 1780 req->length = 0; 1781 desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address; 1782 for (i = 0; i < num_sgl_descriptors; i++) { 1783 /* The descriptors must be keyed data block descriptors with an address, not an offset. */ 1784 if (spdk_unlikely(desc->generic.type != SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK || 1785 desc->keyed.subtype != SPDK_NVME_SGL_SUBTYPE_ADDRESS)) { 1786 rc = -EINVAL; 1787 goto err_exit; 1788 } 1789 1790 rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport, 1791 desc->keyed.length); 1792 if (rc != 0) { 1793 goto err_exit; 1794 } 1795 1796 current_wr->num_sge = 0; 1797 1798 rc = nvmf_rdma_fill_buffers(rtransport, rgroup, device, req, current_wr, 1799 desc->keyed.length); 1800 if (rc != 0) { 1801 rc = -ENOMEM; 1802 goto err_exit; 1803 } 1804 1805 req->length += desc->keyed.length; 1806 current_wr->wr.rdma.rkey = desc->keyed.key; 1807 current_wr->wr.rdma.remote_addr = desc->address; 1808 current_wr = current_wr->next; 1809 desc++; 1810 } 1811 1812 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 1813 /* Go back to the last descriptor in the list. */ 1814 desc--; 1815 if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { 1816 if (desc->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { 1817 rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; 1818 rdma_req->rsp.wr.imm_data = desc->keyed.key; 1819 } 1820 } 1821 #endif 1822 1823 rdma_req->num_outstanding_data_wr = num_sgl_descriptors; 1824 req->data_from_pool = true; 1825 1826 return 0; 1827 1828 err_exit: 1829 spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport); 1830 nvmf_rdma_request_free_data(rdma_req, rtransport); 1831 return rc; 1832 } 1833 1834 static int 1835 spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport, 1836 struct spdk_nvmf_rdma_device *device, 1837 struct spdk_nvmf_rdma_request *rdma_req) 1838 { 1839 struct spdk_nvme_cmd *cmd; 1840 struct spdk_nvme_cpl *rsp; 1841 struct spdk_nvme_sgl_descriptor *sgl; 1842 int rc; 1843 uint32_t length; 1844 1845 cmd = &rdma_req->req.cmd->nvme_cmd; 1846 rsp = &rdma_req->req.rsp->nvme_cpl; 1847 sgl = &cmd->dptr.sgl1; 1848 1849 if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK && 1850 (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS || 1851 sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) { 1852 1853 length = sgl->keyed.length; 1854 if (length > rtransport->transport.opts.max_io_size) { 1855 SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", 1856 length, rtransport->transport.opts.max_io_size); 1857 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 1858 return -1; 1859 } 1860 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 1861 if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { 1862 if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { 1863 rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; 1864 rdma_req->rsp.wr.imm_data = sgl->keyed.key; 1865 } 1866 } 1867 #endif 1868 1869 /* fill request length and populate iovs */ 1870 rdma_req->req.length = length; 1871 1872 if (spdk_unlikely(rdma_req->dif_insert_or_strip)) { 1873 
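/* Size the data buffers for the extended-LBA layout that includes per-block metadata. For example, assuming 512-byte data blocks with 8 bytes of DIF metadata (block_size = 520), 4096 bytes of host data occupies 8 * 520 = 4160 bytes of buffer space. */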
rdma_req->orig_length = length; 1874 length = spdk_dif_get_length_with_md(length, &rdma_req->dif_ctx); 1875 rdma_req->elba_length = length; 1876 } 1877 1878 if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req, length) < 0) { 1879 /* No available buffers. Queue this request up. */ 1880 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req); 1881 return 0; 1882 } 1883 1884 /* backward compatible */ 1885 rdma_req->req.data = rdma_req->req.iov[0].iov_base; 1886 1887 /* rdma wr specifics */ 1888 rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key; 1889 rdma_req->data.wr.wr.rdma.remote_addr = sgl->address; 1890 if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 1891 rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE; 1892 rdma_req->data.wr.next = &rdma_req->rsp.wr; 1893 rdma_req->data.wr.send_flags &= ~IBV_SEND_SIGNALED; 1894 } else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { 1895 rdma_req->data.wr.opcode = IBV_WR_RDMA_READ; 1896 rdma_req->data.wr.next = NULL; 1897 rdma_req->data.wr.send_flags |= IBV_SEND_SIGNALED; 1898 } 1899 1900 /* set the number of outstanding data WRs for this request. */ 1901 rdma_req->num_outstanding_data_wr = 1; 1902 1903 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, 1904 rdma_req->req.iovcnt); 1905 1906 return 0; 1907 } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && 1908 sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { 1909 uint64_t offset = sgl->address; 1910 uint32_t max_len = rtransport->transport.opts.in_capsule_data_size; 1911 1912 SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", 1913 offset, sgl->unkeyed.length); 1914 1915 if (offset > max_len) { 1916 SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", 1917 offset, max_len); 1918 rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; 1919 return -1; 1920 } 1921 max_len -= (uint32_t)offset; 1922 1923 if (sgl->unkeyed.length > max_len) { 1924 SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", 1925 sgl->unkeyed.length, max_len); 1926 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 1927 return -1; 1928 } 1929 1930 rdma_req->num_outstanding_data_wr = 0; 1931 rdma_req->req.data = rdma_req->recv->buf + offset; 1932 rdma_req->req.data_from_pool = false; 1933 rdma_req->req.length = sgl->unkeyed.length; 1934 1935 rdma_req->req.iov[0].iov_base = rdma_req->req.data; 1936 rdma_req->req.iov[0].iov_len = rdma_req->req.length; 1937 rdma_req->req.iovcnt = 1; 1938 1939 return 0; 1940 } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT && 1941 sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { 1942 1943 rc = nvmf_rdma_request_fill_iovs_multi_sgl(rtransport, device, rdma_req); 1944 if (rc == -ENOMEM) { 1945 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. 
Queueing request %p\n", rdma_req); 1946 return 0; 1947 } else if (rc == -EINVAL) { 1948 SPDK_ERRLOG("Multi SGL element request length exceeds the max I/O size\n"); 1949 return -1; 1950 } 1951 1952 /* backward compatible */ 1953 rdma_req->req.data = rdma_req->req.iov[0].iov_base; 1954 1955 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, 1956 rdma_req->req.iovcnt); 1957 1958 return 0; 1959 } 1960 1961 SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", 1962 sgl->generic.type, sgl->generic.subtype); 1963 rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; 1964 return -1; 1965 } 1966 1967 static void 1968 nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req, 1969 struct spdk_nvmf_rdma_transport *rtransport) 1970 { 1971 struct spdk_nvmf_rdma_qpair *rqpair; 1972 struct spdk_nvmf_rdma_poll_group *rgroup; 1973 1974 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 1975 if (rdma_req->req.data_from_pool) { 1976 rgroup = rqpair->poller->group; 1977 1978 spdk_nvmf_request_free_buffers(&rdma_req->req, &rgroup->group, &rtransport->transport); 1979 } 1980 nvmf_rdma_request_free_data(rdma_req, rtransport); 1981 rdma_req->req.length = 0; 1982 rdma_req->req.iovcnt = 0; 1983 rdma_req->req.data = NULL; 1984 rdma_req->rsp.wr.next = NULL; 1985 rdma_req->data.wr.next = NULL; 1986 rdma_req->dif_insert_or_strip = false; 1987 rdma_req->elba_length = 0; 1988 rdma_req->orig_length = 0; 1989 memset(&rdma_req->dif_ctx, 0, sizeof(rdma_req->dif_ctx)); 1990 rqpair->qd--; 1991 1992 STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link); 1993 rdma_req->state = RDMA_REQUEST_STATE_FREE; 1994 } 1995 1996 static bool 1997 spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, 1998 struct spdk_nvmf_rdma_request *rdma_req) 1999 { 2000 struct spdk_nvmf_rdma_qpair *rqpair; 2001 struct spdk_nvmf_rdma_device *device; 2002 struct spdk_nvmf_rdma_poll_group *rgroup; 2003 struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; 2004 int rc; 2005 struct spdk_nvmf_rdma_recv *rdma_recv; 2006 enum spdk_nvmf_rdma_request_state prev_state; 2007 bool progress = false; 2008 int data_posted; 2009 uint32_t num_blocks; 2010 2011 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2012 device = rqpair->port->device; 2013 rgroup = rqpair->poller->group; 2014 2015 assert(rdma_req->state != RDMA_REQUEST_STATE_FREE); 2016 2017 /* If the queue pair is in an error state, force the request to the completed state 2018 * to release resources. */ 2019 if (rqpair->ibv_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { 2020 if (rdma_req->state == RDMA_REQUEST_STATE_NEED_BUFFER) { 2021 STAILQ_REMOVE(&rgroup->group.pending_buf_queue, &rdma_req->req, spdk_nvmf_request, buf_link); 2022 } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING) { 2023 STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 2024 } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING) { 2025 STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 2026 } 2027 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2028 } 2029 2030 /* The loop here is to allow for several back-to-back state changes. 
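* For example, a command whose data arrived in the capsule can move from NEW through NEED_BUFFER and READY_TO_EXECUTE to EXECUTING within a single call; the loop exits once a pass through the switch leaves the state unchanged.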
*/ 2031 do { 2032 prev_state = rdma_req->state; 2033 2034 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state); 2035 2036 switch (rdma_req->state) { 2037 case RDMA_REQUEST_STATE_FREE: 2038 /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW 2039 * to escape this state. */ 2040 break; 2041 case RDMA_REQUEST_STATE_NEW: 2042 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0, 2043 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2044 rdma_recv = rdma_req->recv; 2045 2046 /* The first element of the SGL is the NVMe command */ 2047 rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr; 2048 memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp)); 2049 2050 if (rqpair->ibv_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { 2051 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2052 break; 2053 } 2054 2055 if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&rdma_req->req, &rdma_req->dif_ctx))) { 2056 rdma_req->dif_insert_or_strip = true; 2057 } 2058 2059 /* The next state transition depends on the data transfer needs of this request. */ 2060 rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req); 2061 2062 /* If no data to transfer, ready to execute. */ 2063 if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { 2064 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 2065 break; 2066 } 2067 2068 rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER; 2069 STAILQ_INSERT_TAIL(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link); 2070 break; 2071 case RDMA_REQUEST_STATE_NEED_BUFFER: 2072 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0, 2073 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2074 2075 assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE); 2076 2077 if (&rdma_req->req != STAILQ_FIRST(&rgroup->group.pending_buf_queue)) { 2078 /* This request needs to wait in line to obtain a buffer */ 2079 break; 2080 } 2081 2082 /* Try to get a data buffer */ 2083 rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req); 2084 if (rc < 0) { 2085 STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link); 2086 rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2087 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 2088 break; 2089 } 2090 2091 if (!rdma_req->req.data) { 2092 /* No buffers available. */ 2093 rgroup->stat.pending_data_buffer++; 2094 break; 2095 } 2096 2097 STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link); 2098 2099 /* If data is transferring from host to controller and the data didn't 2100 * arrive using in capsule data, we need to do a transfer from the host. 
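* Such requests queue behind any other pending RDMA reads and are only started once enough send and read depth is available on the queue pair.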
2101 */ 2102 if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && 2103 rdma_req->req.data_from_pool) { 2104 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_read_queue, rdma_req, state_link); 2105 rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING; 2106 break; 2107 } 2108 2109 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 2110 break; 2111 case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: 2112 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 0, 0, 2113 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2114 2115 if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_read_queue)) { 2116 /* This request needs to wait in line to perform RDMA */ 2117 break; 2118 } 2119 if (rqpair->current_send_depth + rdma_req->num_outstanding_data_wr > rqpair->max_send_depth 2120 || rqpair->current_read_depth + rdma_req->num_outstanding_data_wr > rqpair->max_read_depth) { 2121 /* We can only have so many WRs outstanding. we have to wait until some finish. */ 2122 rqpair->poller->stat.pending_rdma_read++; 2123 break; 2124 } 2125 2126 /* We have already verified that this request is the head of the queue. */ 2127 STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link); 2128 2129 rc = request_transfer_in(&rdma_req->req); 2130 if (!rc) { 2131 rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER; 2132 } else { 2133 rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2134 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 2135 } 2136 break; 2137 case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 2138 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, 2139 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2140 /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE 2141 * to escape this state. */ 2142 break; 2143 case RDMA_REQUEST_STATE_READY_TO_EXECUTE: 2144 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, 2145 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2146 2147 if (spdk_unlikely(rdma_req->dif_insert_or_strip)) { 2148 if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { 2149 /* generate DIF for write operation */ 2150 num_blocks = SPDK_CEIL_DIV(rdma_req->elba_length, rdma_req->dif_ctx.block_size); 2151 assert(num_blocks > 0); 2152 2153 rc = spdk_dif_generate(rdma_req->req.iov, rdma_req->req.iovcnt, 2154 num_blocks, &rdma_req->dif_ctx); 2155 if (rc != 0) { 2156 SPDK_ERRLOG("DIF generation failed\n"); 2157 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2158 spdk_nvmf_rdma_start_disconnect(rqpair); 2159 break; 2160 } 2161 } 2162 2163 assert(rdma_req->elba_length >= rdma_req->req.length); 2164 /* set extended length before IO operation */ 2165 rdma_req->req.length = rdma_req->elba_length; 2166 } 2167 2168 rdma_req->state = RDMA_REQUEST_STATE_EXECUTING; 2169 spdk_nvmf_request_exec(&rdma_req->req); 2170 break; 2171 case RDMA_REQUEST_STATE_EXECUTING: 2172 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0, 2173 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2174 /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED 2175 * to escape this state. 
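* In practice that happens in spdk_nvmf_rdma_request_complete() once the command finishes at the block device (or the request goes straight to COMPLETED if the connection has already failed).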
*/ 2176 break; 2177 case RDMA_REQUEST_STATE_EXECUTED: 2178 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0, 2179 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2180 if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 2181 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_write_queue, rdma_req, state_link); 2182 rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING; 2183 } else { 2184 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 2185 } 2186 if (spdk_unlikely(rdma_req->dif_insert_or_strip)) { 2187 /* restore the original length */ 2188 rdma_req->req.length = rdma_req->orig_length; 2189 2190 if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 2191 struct spdk_dif_error error_blk; 2192 2193 num_blocks = SPDK_CEIL_DIV(rdma_req->elba_length, rdma_req->dif_ctx.block_size); 2194 2195 rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks, &rdma_req->dif_ctx, 2196 &error_blk); 2197 if (rc) { 2198 struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; 2199 2200 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", error_blk.err_type, 2201 error_blk.err_offset); 2202 rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR; 2203 rsp->status.sc = spdk_nvmf_rdma_dif_error_to_compl_status(error_blk.err_type); 2204 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 2205 STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 2206 } 2207 } 2208 } 2209 break; 2210 case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: 2211 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0, 2212 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2213 2214 if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_write_queue)) { 2215 /* This request needs to wait in line to perform RDMA */ 2216 break; 2217 } 2218 if ((rqpair->current_send_depth + rdma_req->num_outstanding_data_wr + 1) > 2219 rqpair->max_send_depth) { 2220 /* We can only have so many WRs outstanding. we have to wait until some finish. 2221 * +1 since each request has an additional wr in the resp. */ 2222 rqpair->poller->stat.pending_rdma_write++; 2223 break; 2224 } 2225 2226 /* We have already verified that this request is the head of the queue. */ 2227 STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_write_queue, state_link); 2228 2229 /* The data transfer will be kicked off from 2230 * RDMA_REQUEST_STATE_READY_TO_COMPLETE state. 2231 */ 2232 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 2233 break; 2234 case RDMA_REQUEST_STATE_READY_TO_COMPLETE: 2235 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, 2236 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2237 rc = request_transfer_out(&rdma_req->req, &data_posted); 2238 assert(rc == 0); /* No good way to handle this currently */ 2239 if (rc) { 2240 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2241 } else { 2242 rdma_req->state = data_posted ? RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST : 2243 RDMA_REQUEST_STATE_COMPLETING; 2244 } 2245 break; 2246 case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: 2247 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, 2248 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2249 /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 2250 * to escape this state. 
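* That transition is made when the completion for the response send is reaped from the completion queue.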
*/ 2251 break; 2252 case RDMA_REQUEST_STATE_COMPLETING: 2253 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, 2254 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2255 /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 2256 * to escape this state. */ 2257 break; 2258 case RDMA_REQUEST_STATE_COMPLETED: 2259 spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, 2260 (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); 2261 2262 rqpair->poller->stat.request_latency += spdk_get_ticks() - rdma_req->receive_tsc; 2263 nvmf_rdma_request_free(rdma_req, rtransport); 2264 break; 2265 case RDMA_REQUEST_NUM_STATES: 2266 default: 2267 assert(0); 2268 break; 2269 } 2270 2271 if (rdma_req->state != prev_state) { 2272 progress = true; 2273 } 2274 } while (rdma_req->state != prev_state); 2275 2276 return progress; 2277 } 2278 2279 /* Public API callbacks begin here */ 2280 2281 #define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128 2282 #define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128 2283 #define SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH 4096 2284 #define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 128 2285 #define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 2286 #define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072 2287 #define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES) 2288 #define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4095 2289 #define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32 2290 #define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false 2291 #define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false 2292 2293 static void 2294 spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) 2295 { 2296 opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH; 2297 opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR; 2298 opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE; 2299 opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE; 2300 opts->io_unit_size = SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE; 2301 opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH; 2302 opts->num_shared_buffers = SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS; 2303 opts->buf_cache_size = SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE; 2304 opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH; 2305 opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ; 2306 opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP; 2307 } 2308 2309 const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = { 2310 .notify_cb = spdk_nvmf_rdma_mem_notify, 2311 .are_contiguous = spdk_nvmf_rdma_check_contiguous_entries 2312 }; 2313 2314 static int spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport); 2315 2316 static struct spdk_nvmf_transport * 2317 spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) 2318 { 2319 int rc; 2320 struct spdk_nvmf_rdma_transport *rtransport; 2321 struct spdk_nvmf_rdma_device *device, *tmp; 2322 struct ibv_context **contexts; 2323 uint32_t i; 2324 int flag; 2325 uint32_t sge_count; 2326 uint32_t min_shared_buffers; 2327 int max_device_sge = SPDK_NVMF_MAX_SGL_ENTRIES; 2328 pthread_mutexattr_t attr; 2329 2330 rtransport = calloc(1, sizeof(*rtransport)); 2331 if (!rtransport) { 2332 return NULL; 2333 } 2334 2335 if (pthread_mutexattr_init(&attr)) { 2336 SPDK_ERRLOG("pthread_mutexattr_init() failed\n"); 2337 free(rtransport); 2338 return NULL; 2339 } 2340 2341 if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) { 2342 SPDK_ERRLOG("pthread_mutexattr_settype() failed\n"); 2343 pthread_mutexattr_destroy(&attr); 2344 
free(rtransport); 2345 return NULL; 2346 } 2347 2348 if (pthread_mutex_init(&rtransport->lock, &attr)) { 2349 SPDK_ERRLOG("pthread_mutex_init() failed\n"); 2350 pthread_mutexattr_destroy(&attr); 2351 free(rtransport); 2352 return NULL; 2353 } 2354 2355 pthread_mutexattr_destroy(&attr); 2356 2357 TAILQ_INIT(&rtransport->devices); 2358 TAILQ_INIT(&rtransport->ports); 2359 TAILQ_INIT(&rtransport->poll_groups); 2360 2361 rtransport->transport.ops = &spdk_nvmf_transport_rdma; 2362 2363 SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n" 2364 " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" 2365 " max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" 2366 " in_capsule_data_size=%d, max_aq_depth=%d,\n" 2367 " num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d\n", 2368 opts->max_queue_depth, 2369 opts->max_io_size, 2370 opts->max_qpairs_per_ctrlr, 2371 opts->io_unit_size, 2372 opts->in_capsule_data_size, 2373 opts->max_aq_depth, 2374 opts->num_shared_buffers, 2375 opts->max_srq_depth, 2376 opts->no_srq); 2377 2378 /* I/O unit size cannot be larger than max I/O size */ 2379 if (opts->io_unit_size > opts->max_io_size) { 2380 opts->io_unit_size = opts->max_io_size; 2381 } 2382 2383 if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) { 2384 SPDK_ERRLOG("The number of shared data buffers (%d) is less than " 2385 "the minimum number required to guarantee that forward progress can be made (%d)\n", 2386 opts->num_shared_buffers, (SPDK_NVMF_MAX_SGL_ENTRIES * 2)); 2387 spdk_nvmf_rdma_destroy(&rtransport->transport); 2388 return NULL; 2389 } 2390 2391 min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size; 2392 if (min_shared_buffers > opts->num_shared_buffers) { 2393 SPDK_ERRLOG("There are not enough buffers to satisfy " 2394 "per-poll group caches for each thread. (%" PRIu32 ") " 2395 "supplied. 
(%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers); 2396 SPDK_ERRLOG("Please specify a larger number of shared buffers\n"); 2397 spdk_nvmf_rdma_destroy(&rtransport->transport); 2398 return NULL; 2399 } 2400 2401 sge_count = opts->max_io_size / opts->io_unit_size; 2402 if (sge_count > NVMF_DEFAULT_TX_SGE) { 2403 SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); 2404 spdk_nvmf_rdma_destroy(&rtransport->transport); 2405 return NULL; 2406 } 2407 2408 rtransport->event_channel = rdma_create_event_channel(); 2409 if (rtransport->event_channel == NULL) { 2410 SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno)); 2411 spdk_nvmf_rdma_destroy(&rtransport->transport); 2412 return NULL; 2413 } 2414 2415 flag = fcntl(rtransport->event_channel->fd, F_GETFL); 2416 if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { 2417 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", 2418 rtransport->event_channel->fd, spdk_strerror(errno)); 2419 spdk_nvmf_rdma_destroy(&rtransport->transport); 2420 return NULL; 2421 } 2422 2423 rtransport->data_wr_pool = spdk_mempool_create("spdk_nvmf_rdma_wr_data", 2424 opts->max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES, 2425 sizeof(struct spdk_nvmf_rdma_request_data), 2426 SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, 2427 SPDK_ENV_SOCKET_ID_ANY); 2428 if (!rtransport->data_wr_pool) { 2429 SPDK_ERRLOG("Unable to allocate work request pool for poll group\n"); 2430 spdk_nvmf_rdma_destroy(&rtransport->transport); 2431 return NULL; 2432 } 2433 2434 contexts = rdma_get_devices(NULL); 2435 if (contexts == NULL) { 2436 SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); 2437 spdk_nvmf_rdma_destroy(&rtransport->transport); 2438 return NULL; 2439 } 2440 2441 i = 0; 2442 rc = 0; 2443 while (contexts[i] != NULL) { 2444 device = calloc(1, sizeof(*device)); 2445 if (!device) { 2446 SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); 2447 rc = -ENOMEM; 2448 break; 2449 } 2450 device->context = contexts[i]; 2451 rc = ibv_query_device(device->context, &device->attr); 2452 if (rc < 0) { 2453 SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); 2454 free(device); 2455 break; 2456 2457 } 2458 2459 max_device_sge = spdk_min(max_device_sge, device->attr.max_sge); 2460 2461 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 2462 if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) { 2463 SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,"); 2464 SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id); 2465 } 2466 2467 /** 2468 * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE. 2469 * The Soft-RoCE RXE driver does not currently support send with invalidate, 2470 * but incorrectly reports that it does. There are changes making their way 2471 * through the kernel now that will enable this feature. When they are merged, 2472 * we can conditionally enable this feature. 2473 * 2474 * TODO: enable this for versions of the kernel rxe driver that support it. 
2475 */ 2476 if (device->attr.vendor_id == 0) { 2477 device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS); 2478 } 2479 #endif 2480 2481 /* set up device context async ev fd as NON_BLOCKING */ 2482 flag = fcntl(device->context->async_fd, F_GETFL); 2483 rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); 2484 if (rc < 0) { 2485 SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); 2486 free(device); 2487 break; 2488 } 2489 2490 TAILQ_INSERT_TAIL(&rtransport->devices, device, link); 2491 i++; 2492 2493 if (g_nvmf_hooks.get_ibv_pd) { 2494 device->pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context); 2495 } else { 2496 device->pd = ibv_alloc_pd(device->context); 2497 } 2498 2499 if (!device->pd) { 2500 SPDK_ERRLOG("Unable to allocate protection domain.\n"); 2501 rc = -ENOMEM; 2502 break; 2503 } 2504 2505 assert(device->map == NULL); 2506 2507 device->map = spdk_mem_map_alloc(0, &g_nvmf_rdma_map_ops, device->pd); 2508 if (!device->map) { 2509 SPDK_ERRLOG("Unable to allocate memory map for listen address\n"); 2510 rc = -ENOMEM; 2511 break; 2512 } 2513 2514 assert(device->map != NULL); 2515 assert(device->pd != NULL); 2516 } 2517 rdma_free_devices(contexts); 2518 2519 if (opts->io_unit_size * max_device_sge < opts->max_io_size) { 2520 /* divide and round up. */ 2521 opts->io_unit_size = (opts->max_io_size + max_device_sge - 1) / max_device_sge; 2522 2523 /* round up to the nearest 4k. */ 2524 opts->io_unit_size = (opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT - 1) & ~NVMF_DATA_BUFFER_MASK; 2525 2526 opts->io_unit_size = spdk_max(opts->io_unit_size, SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE); 2527 SPDK_NOTICELOG("Adjusting the io unit size to fit the device's maximum I/O size. New I/O unit size %u\n", 2528 opts->io_unit_size); 2529 } 2530 2531 if (rc < 0) { 2532 spdk_nvmf_rdma_destroy(&rtransport->transport); 2533 return NULL; 2534 } 2535 2536 /* Set up poll descriptor array to monitor events from RDMA and IB 2537 * in a single poll syscall 2538 */ 2539 rtransport->npoll_fds = i + 1; 2540 i = 0; 2541 rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); 2542 if (rtransport->poll_fds == NULL) { 2543 SPDK_ERRLOG("poll_fds allocation failed\n"); 2544 spdk_nvmf_rdma_destroy(&rtransport->transport); 2545 return NULL; 2546 } 2547 2548 rtransport->poll_fds[i].fd = rtransport->event_channel->fd; 2549 rtransport->poll_fds[i++].events = POLLIN; 2550 2551 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 2552 rtransport->poll_fds[i].fd = device->context->async_fd; 2553 rtransport->poll_fds[i++].events = POLLIN; 2554 } 2555 2556 return &rtransport->transport; 2557 } 2558 2559 static int 2560 spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) 2561 { 2562 struct spdk_nvmf_rdma_transport *rtransport; 2563 struct spdk_nvmf_rdma_port *port, *port_tmp; 2564 struct spdk_nvmf_rdma_device *device, *device_tmp; 2565 2566 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2567 2568 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { 2569 TAILQ_REMOVE(&rtransport->ports, port, link); 2570 rdma_destroy_id(port->id); 2571 free(port); 2572 } 2573 2574 if (rtransport->poll_fds != NULL) { 2575 free(rtransport->poll_fds); 2576 } 2577 2578 if (rtransport->event_channel != NULL) { 2579 rdma_destroy_event_channel(rtransport->event_channel); 2580 } 2581 2582 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { 2583 TAILQ_REMOVE(&rtransport->devices, device, link); 2584 if (device->map) { 2585 
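/* Freeing the map unregisters the memory regions that were registered against device->pd, so it must be done before the protection domain is released below. */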
spdk_mem_map_free(&device->map); 2586 } 2587 if (device->pd) { 2588 if (!g_nvmf_hooks.get_ibv_pd) { 2589 ibv_dealloc_pd(device->pd); 2590 } 2591 } 2592 free(device); 2593 } 2594 2595 if (rtransport->data_wr_pool != NULL) { 2596 if (spdk_mempool_count(rtransport->data_wr_pool) != 2597 (transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES)) { 2598 SPDK_ERRLOG("transport wr pool count is %zu but should be %u\n", 2599 spdk_mempool_count(rtransport->data_wr_pool), 2600 transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES); 2601 } 2602 } 2603 2604 spdk_mempool_free(rtransport->data_wr_pool); 2605 2606 pthread_mutex_destroy(&rtransport->lock); 2607 free(rtransport); 2608 2609 return 0; 2610 } 2611 2612 static int 2613 spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 2614 struct spdk_nvme_transport_id *trid, 2615 bool peer); 2616 2617 static int 2618 spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport, 2619 const struct spdk_nvme_transport_id *trid) 2620 { 2621 struct spdk_nvmf_rdma_transport *rtransport; 2622 struct spdk_nvmf_rdma_device *device; 2623 struct spdk_nvmf_rdma_port *port_tmp, *port; 2624 struct addrinfo *res; 2625 struct addrinfo hints; 2626 int family; 2627 int rc; 2628 2629 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2630 2631 port = calloc(1, sizeof(*port)); 2632 if (!port) { 2633 return -ENOMEM; 2634 } 2635 2636 /* Selectively copy the trid. Things like NQN don't matter here - that 2637 * mapping is enforced elsewhere. 2638 */ 2639 port->trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 2640 port->trid.adrfam = trid->adrfam; 2641 snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr); 2642 snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid); 2643 2644 switch (port->trid.adrfam) { 2645 case SPDK_NVMF_ADRFAM_IPV4: 2646 family = AF_INET; 2647 break; 2648 case SPDK_NVMF_ADRFAM_IPV6: 2649 family = AF_INET6; 2650 break; 2651 default: 2652 SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam); 2653 free(port); 2654 return -EINVAL; 2655 } 2656 2657 memset(&hints, 0, sizeof(hints)); 2658 hints.ai_family = family; 2659 hints.ai_flags = AI_NUMERICSERV; 2660 hints.ai_socktype = SOCK_STREAM; 2661 hints.ai_protocol = 0; 2662 2663 rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res); 2664 if (rc) { 2665 SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); 2666 free(port); 2667 return -EINVAL; 2668 } 2669 2670 pthread_mutex_lock(&rtransport->lock); 2671 assert(rtransport->event_channel != NULL); 2672 TAILQ_FOREACH(port_tmp, &rtransport->ports, link) { 2673 if (spdk_nvme_transport_id_compare(&port_tmp->trid, &port->trid) == 0) { 2674 port_tmp->ref++; 2675 freeaddrinfo(res); 2676 free(port); 2677 /* Already listening at this address */ 2678 pthread_mutex_unlock(&rtransport->lock); 2679 return 0; 2680 } 2681 } 2682 2683 rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); 2684 if (rc < 0) { 2685 SPDK_ERRLOG("rdma_create_id() failed\n"); 2686 freeaddrinfo(res); 2687 free(port); 2688 pthread_mutex_unlock(&rtransport->lock); 2689 return rc; 2690 } 2691 2692 rc = rdma_bind_addr(port->id, res->ai_addr); 2693 freeaddrinfo(res); 2694 2695 if (rc < 0) { 2696 SPDK_ERRLOG("rdma_bind_addr() failed\n"); 2697 rdma_destroy_id(port->id); 2698 free(port); 2699 pthread_mutex_unlock(&rtransport->lock); 2700 return rc; 2701 } 2702 2703 if (!port->id->verbs) { 2704 SPDK_ERRLOG("ibv_context is null\n"); 2705 rdma_destroy_id(port->id); 2706 free(port); 2707 
pthread_mutex_unlock(&rtransport->lock); 2708 return -1; 2709 } 2710 2711 rc = rdma_listen(port->id, 10); /* 10 = backlog */ 2712 if (rc < 0) { 2713 SPDK_ERRLOG("rdma_listen() failed\n"); 2714 rdma_destroy_id(port->id); 2715 free(port); 2716 pthread_mutex_unlock(&rtransport->lock); 2717 return rc; 2718 } 2719 2720 TAILQ_FOREACH(device, &rtransport->devices, link) { 2721 if (device->context == port->id->verbs) { 2722 port->device = device; 2723 break; 2724 } 2725 } 2726 if (!port->device) { 2727 SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", 2728 port->id->verbs); 2729 rdma_destroy_id(port->id); 2730 free(port); 2731 pthread_mutex_unlock(&rtransport->lock); 2732 return -EINVAL; 2733 } 2734 2735 SPDK_INFOLOG(SPDK_LOG_RDMA, "*** NVMf Target Listening on %s port %d ***\n", 2736 port->trid.traddr, ntohs(rdma_get_src_port(port->id))); 2737 2738 port->ref = 1; 2739 2740 TAILQ_INSERT_TAIL(&rtransport->ports, port, link); 2741 pthread_mutex_unlock(&rtransport->lock); 2742 2743 return 0; 2744 } 2745 2746 static int 2747 spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, 2748 const struct spdk_nvme_transport_id *_trid) 2749 { 2750 struct spdk_nvmf_rdma_transport *rtransport; 2751 struct spdk_nvmf_rdma_port *port, *tmp; 2752 struct spdk_nvme_transport_id trid = {}; 2753 2754 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2755 2756 /* Selectively copy the trid. Things like NQN don't matter here - that 2757 * mapping is enforced elsewhere. 2758 */ 2759 trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 2760 trid.adrfam = _trid->adrfam; 2761 snprintf(trid.traddr, sizeof(port->trid.traddr), "%s", _trid->traddr); 2762 snprintf(trid.trsvcid, sizeof(port->trid.trsvcid), "%s", _trid->trsvcid); 2763 2764 pthread_mutex_lock(&rtransport->lock); 2765 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { 2766 if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) { 2767 assert(port->ref > 0); 2768 port->ref--; 2769 if (port->ref == 0) { 2770 TAILQ_REMOVE(&rtransport->ports, port, link); 2771 rdma_destroy_id(port->id); 2772 free(port); 2773 } 2774 break; 2775 } 2776 } 2777 2778 pthread_mutex_unlock(&rtransport->lock); 2779 return 0; 2780 } 2781 2782 static void 2783 spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, 2784 struct spdk_nvmf_rdma_qpair *rqpair, bool drain) 2785 { 2786 struct spdk_nvmf_request *req, *tmp; 2787 struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; 2788 struct spdk_nvmf_rdma_resources *resources; 2789 2790 /* We process I/O in the data transfer pending queue at the highest priority. RDMA reads first */ 2791 STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_read_queue, state_link, req_tmp) { 2792 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 2793 break; 2794 } 2795 } 2796 2797 /* Then RDMA writes since reads have stronger restrictions than writes */ 2798 STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_write_queue, state_link, req_tmp) { 2799 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 2800 break; 2801 } 2802 } 2803 2804 /* The second highest priority is I/O waiting on memory buffers. 
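* The lowest priority is admitting brand new commands from the incoming queue, which is only done below while free request contexts remain.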
*/ 2805 STAILQ_FOREACH_SAFE(req, &rqpair->poller->group->group.pending_buf_queue, buf_link, tmp) { 2806 rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 2807 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 2808 break; 2809 } 2810 } 2811 2812 resources = rqpair->resources; 2813 while (!STAILQ_EMPTY(&resources->free_queue) && !STAILQ_EMPTY(&resources->incoming_queue)) { 2814 rdma_req = STAILQ_FIRST(&resources->free_queue); 2815 STAILQ_REMOVE_HEAD(&resources->free_queue, state_link); 2816 rdma_req->recv = STAILQ_FIRST(&resources->incoming_queue); 2817 STAILQ_REMOVE_HEAD(&resources->incoming_queue, link); 2818 2819 if (rqpair->srq != NULL) { 2820 rdma_req->req.qpair = &rdma_req->recv->qpair->qpair; 2821 rdma_req->recv->qpair->qd++; 2822 } else { 2823 rqpair->qd++; 2824 } 2825 2826 rdma_req->receive_tsc = rdma_req->recv->receive_tsc; 2827 rdma_req->state = RDMA_REQUEST_STATE_NEW; 2828 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 2829 break; 2830 } 2831 } 2832 if (!STAILQ_EMPTY(&resources->incoming_queue) && STAILQ_EMPTY(&resources->free_queue)) { 2833 rqpair->poller->stat.pending_free_request++; 2834 } 2835 } 2836 2837 static void 2838 _nvmf_rdma_qpair_disconnect(void *ctx) 2839 { 2840 struct spdk_nvmf_qpair *qpair = ctx; 2841 2842 spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); 2843 } 2844 2845 static void 2846 _nvmf_rdma_try_disconnect(void *ctx) 2847 { 2848 struct spdk_nvmf_qpair *qpair = ctx; 2849 struct spdk_nvmf_poll_group *group; 2850 2851 /* Read the group out of the qpair. This is normally set and accessed only from 2852 * the thread that created the group. Here, we're not on that thread necessarily. 2853 * The data member qpair->group begins it's life as NULL and then is assigned to 2854 * a pointer and never changes. So fortunately reading this and checking for 2855 * non-NULL is thread safe in the x86_64 memory model. */ 2856 group = qpair->group; 2857 2858 if (group == NULL) { 2859 /* The qpair hasn't been assigned to a group yet, so we can't 2860 * process a disconnect. Send a message to ourself and try again. */ 2861 spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_try_disconnect, qpair); 2862 return; 2863 } 2864 2865 spdk_thread_send_msg(group->thread, _nvmf_rdma_qpair_disconnect, qpair); 2866 } 2867 2868 static inline void 2869 spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair) 2870 { 2871 if (__sync_bool_compare_and_swap(&rqpair->disconnect_started, false, true)) { 2872 _nvmf_rdma_try_disconnect(&rqpair->qpair); 2873 } 2874 } 2875 2876 static void nvmf_rdma_destroy_drained_qpair(void *ctx) 2877 { 2878 struct spdk_nvmf_rdma_qpair *rqpair = ctx; 2879 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, 2880 struct spdk_nvmf_rdma_transport, transport); 2881 2882 /* In non SRQ path, we will reach rqpair->max_queue_depth. In SRQ path, we will get the last_wqe event. 
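* In both cases the qpair is only destroyed once every posted send has also completed, i.e. current_send_depth has dropped back to zero.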
*/ 2883 if (rqpair->current_send_depth != 0) { 2884 return; 2885 } 2886 2887 if (rqpair->srq == NULL && rqpair->current_recv_depth != rqpair->max_queue_depth) { 2888 return; 2889 } 2890 2891 if (rqpair->srq != NULL && rqpair->last_wqe_reached == false) { 2892 return; 2893 } 2894 2895 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); 2896 2897 /* Qpair will be destroyed after nvmf layer closes this qpair */ 2898 if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ERROR) { 2899 return; 2900 } 2901 2902 spdk_nvmf_rdma_qpair_destroy(rqpair); 2903 } 2904 2905 2906 static int 2907 nvmf_rdma_disconnect(struct rdma_cm_event *evt) 2908 { 2909 struct spdk_nvmf_qpair *qpair; 2910 struct spdk_nvmf_rdma_qpair *rqpair; 2911 2912 if (evt->id == NULL) { 2913 SPDK_ERRLOG("disconnect request: missing cm_id\n"); 2914 return -1; 2915 } 2916 2917 qpair = evt->id->context; 2918 if (qpair == NULL) { 2919 SPDK_ERRLOG("disconnect request: no active connection\n"); 2920 return -1; 2921 } 2922 2923 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2924 2925 spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0); 2926 2927 spdk_nvmf_rdma_update_ibv_state(rqpair); 2928 2929 spdk_nvmf_rdma_start_disconnect(rqpair); 2930 2931 return 0; 2932 } 2933 2934 #ifdef DEBUG 2935 static const char *CM_EVENT_STR[] = { 2936 "RDMA_CM_EVENT_ADDR_RESOLVED", 2937 "RDMA_CM_EVENT_ADDR_ERROR", 2938 "RDMA_CM_EVENT_ROUTE_RESOLVED", 2939 "RDMA_CM_EVENT_ROUTE_ERROR", 2940 "RDMA_CM_EVENT_CONNECT_REQUEST", 2941 "RDMA_CM_EVENT_CONNECT_RESPONSE", 2942 "RDMA_CM_EVENT_CONNECT_ERROR", 2943 "RDMA_CM_EVENT_UNREACHABLE", 2944 "RDMA_CM_EVENT_REJECTED", 2945 "RDMA_CM_EVENT_ESTABLISHED", 2946 "RDMA_CM_EVENT_DISCONNECTED", 2947 "RDMA_CM_EVENT_DEVICE_REMOVAL", 2948 "RDMA_CM_EVENT_MULTICAST_JOIN", 2949 "RDMA_CM_EVENT_MULTICAST_ERROR", 2950 "RDMA_CM_EVENT_ADDR_CHANGE", 2951 "RDMA_CM_EVENT_TIMEWAIT_EXIT" 2952 }; 2953 #endif /* DEBUG */ 2954 2955 static void 2956 nvmf_rdma_handle_last_wqe_reached(void *ctx) 2957 { 2958 struct spdk_nvmf_rdma_qpair *rqpair = ctx; 2959 rqpair->last_wqe_reached = true; 2960 2961 nvmf_rdma_destroy_drained_qpair(rqpair); 2962 } 2963 2964 static void 2965 spdk_nvmf_process_cm_event(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 2966 { 2967 struct spdk_nvmf_rdma_transport *rtransport; 2968 struct rdma_cm_event *event; 2969 int rc; 2970 2971 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2972 2973 if (rtransport->event_channel == NULL) { 2974 return; 2975 } 2976 2977 while (1) { 2978 rc = rdma_get_cm_event(rtransport->event_channel, &event); 2979 if (rc == 0) { 2980 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); 2981 2982 spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event); 2983 2984 switch (event->event) { 2985 case RDMA_CM_EVENT_ADDR_RESOLVED: 2986 case RDMA_CM_EVENT_ADDR_ERROR: 2987 case RDMA_CM_EVENT_ROUTE_RESOLVED: 2988 case RDMA_CM_EVENT_ROUTE_ERROR: 2989 /* No action required. The target never attempts to resolve routes. */ 2990 break; 2991 case RDMA_CM_EVENT_CONNECT_REQUEST: 2992 rc = nvmf_rdma_connect(transport, event, cb_fn); 2993 if (rc < 0) { 2994 SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); 2995 break; 2996 } 2997 break; 2998 case RDMA_CM_EVENT_CONNECT_RESPONSE: 2999 /* The target never initiates a new connection. So this will not occur. */ 3000 break; 3001 case RDMA_CM_EVENT_CONNECT_ERROR: 3002 /* Can this happen? 
The docs say it can, but not sure what causes it. */ 3003 break; 3004 case RDMA_CM_EVENT_UNREACHABLE: 3005 case RDMA_CM_EVENT_REJECTED: 3006 /* These only occur on the client side. */ 3007 break; 3008 case RDMA_CM_EVENT_ESTABLISHED: 3009 /* TODO: Should we be waiting for this event anywhere? */ 3010 break; 3011 case RDMA_CM_EVENT_DISCONNECTED: 3012 case RDMA_CM_EVENT_DEVICE_REMOVAL: 3013 rc = nvmf_rdma_disconnect(event); 3014 if (rc < 0) { 3015 SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); 3016 break; 3017 } 3018 break; 3019 case RDMA_CM_EVENT_MULTICAST_JOIN: 3020 case RDMA_CM_EVENT_MULTICAST_ERROR: 3021 /* Multicast is not used */ 3022 break; 3023 case RDMA_CM_EVENT_ADDR_CHANGE: 3024 /* Not utilizing this event */ 3025 break; 3026 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 3027 /* For now, do nothing. The target never re-uses queue pairs. */ 3028 break; 3029 default: 3030 SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); 3031 break; 3032 } 3033 3034 rdma_ack_cm_event(event); 3035 } else { 3036 if (errno != EAGAIN && errno != EWOULDBLOCK) { 3037 SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); 3038 } 3039 break; 3040 } 3041 } 3042 } 3043 3044 static void 3045 spdk_nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) 3046 { 3047 int rc; 3048 struct spdk_nvmf_rdma_qpair *rqpair = NULL; 3049 struct ibv_async_event event; 3050 enum ibv_qp_state state; 3051 3052 rc = ibv_get_async_event(device->context, &event); 3053 3054 if (rc) { 3055 SPDK_ERRLOG("Failed to get async_event (%d): %s\n", 3056 errno, spdk_strerror(errno)); 3057 return; 3058 } 3059 3060 switch (event.event_type) { 3061 case IBV_EVENT_QP_FATAL: 3062 rqpair = event.element.qp->qp_context; 3063 SPDK_ERRLOG("Fatal event received for rqpair %p\n", rqpair); 3064 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 3065 (uintptr_t)rqpair->cm_id, event.event_type); 3066 spdk_nvmf_rdma_update_ibv_state(rqpair); 3067 spdk_nvmf_rdma_start_disconnect(rqpair); 3068 break; 3069 case IBV_EVENT_QP_LAST_WQE_REACHED: 3070 /* This event only occurs for shared receive queues. */ 3071 rqpair = event.element.qp->qp_context; 3072 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last WQE reached event received for rqpair %p\n", rqpair); 3073 /* This must be handled on the polling thread if it exists. Otherwise the timeout will catch it. */ 3074 if (rqpair->qpair.group) { 3075 spdk_thread_send_msg(rqpair->qpair.group->thread, nvmf_rdma_handle_last_wqe_reached, rqpair); 3076 } else { 3077 SPDK_ERRLOG("Unable to destroy the qpair %p since it does not have a poll group.\n", rqpair); 3078 rqpair->last_wqe_reached = true; 3079 } 3080 3081 break; 3082 case IBV_EVENT_SQ_DRAINED: 3083 /* This event occurs frequently in both error and non-error states. 3084 * Check if the qpair is in an error state before sending a message. 3085 * Note that we're not on the correct thread to access the qpair, but 3086 * the operations that the below calls make all happen to be thread 3087 * safe. 
*/ 3088 rqpair = event.element.qp->qp_context; 3089 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last sq drained event received for rqpair %p\n", rqpair); 3090 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 3091 (uintptr_t)rqpair->cm_id, event.event_type); 3092 state = spdk_nvmf_rdma_update_ibv_state(rqpair); 3093 if (state == IBV_QPS_ERR) { 3094 spdk_nvmf_rdma_start_disconnect(rqpair); 3095 } 3096 break; 3097 case IBV_EVENT_QP_REQ_ERR: 3098 case IBV_EVENT_QP_ACCESS_ERR: 3099 case IBV_EVENT_COMM_EST: 3100 case IBV_EVENT_PATH_MIG: 3101 case IBV_EVENT_PATH_MIG_ERR: 3102 SPDK_NOTICELOG("Async event: %s\n", 3103 ibv_event_type_str(event.event_type)); 3104 rqpair = event.element.qp->qp_context; 3105 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 3106 (uintptr_t)rqpair->cm_id, event.event_type); 3107 spdk_nvmf_rdma_update_ibv_state(rqpair); 3108 break; 3109 case IBV_EVENT_CQ_ERR: 3110 case IBV_EVENT_DEVICE_FATAL: 3111 case IBV_EVENT_PORT_ACTIVE: 3112 case IBV_EVENT_PORT_ERR: 3113 case IBV_EVENT_LID_CHANGE: 3114 case IBV_EVENT_PKEY_CHANGE: 3115 case IBV_EVENT_SM_CHANGE: 3116 case IBV_EVENT_SRQ_ERR: 3117 case IBV_EVENT_SRQ_LIMIT_REACHED: 3118 case IBV_EVENT_CLIENT_REREGISTER: 3119 case IBV_EVENT_GID_CHANGE: 3120 default: 3121 SPDK_NOTICELOG("Async event: %s\n", 3122 ibv_event_type_str(event.event_type)); 3123 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type); 3124 break; 3125 } 3126 ibv_ack_async_event(&event); 3127 } 3128 3129 static void 3130 spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 3131 { 3132 int nfds, i = 0; 3133 struct spdk_nvmf_rdma_transport *rtransport; 3134 struct spdk_nvmf_rdma_device *device, *tmp; 3135 3136 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3137 nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); 3138 3139 if (nfds <= 0) { 3140 return; 3141 } 3142 3143 /* The first poll descriptor is RDMA CM event */ 3144 if (rtransport->poll_fds[i++].revents & POLLIN) { 3145 spdk_nvmf_process_cm_event(transport, cb_fn); 3146 nfds--; 3147 } 3148 3149 if (nfds == 0) { 3150 return; 3151 } 3152 3153 /* Second and subsequent poll descriptors are IB async events */ 3154 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 3155 if (rtransport->poll_fds[i++].revents & POLLIN) { 3156 spdk_nvmf_process_ib_event(device); 3157 nfds--; 3158 } 3159 } 3160 /* check all flagged fd's have been served */ 3161 assert(nfds == 0); 3162 } 3163 3164 static void 3165 spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport, 3166 struct spdk_nvme_transport_id *trid, 3167 struct spdk_nvmf_discovery_log_page_entry *entry) 3168 { 3169 entry->trtype = SPDK_NVMF_TRTYPE_RDMA; 3170 entry->adrfam = trid->adrfam; 3171 entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED; 3172 3173 spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); 3174 spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); 3175 3176 entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; 3177 entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; 3178 entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; 3179 } 3180 3181 static void 3182 spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group); 3183 3184 static struct spdk_nvmf_transport_poll_group * 3185 spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) 3186 { 3187 struct spdk_nvmf_rdma_transport *rtransport; 3188 struct spdk_nvmf_rdma_poll_group *rgroup; 
3189 struct spdk_nvmf_rdma_poller *poller; 3190 struct spdk_nvmf_rdma_device *device; 3191 struct ibv_srq_init_attr srq_init_attr; 3192 struct spdk_nvmf_rdma_resource_opts opts; 3193 int num_cqe; 3194 3195 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3196 3197 rgroup = calloc(1, sizeof(*rgroup)); 3198 if (!rgroup) { 3199 return NULL; 3200 } 3201 3202 TAILQ_INIT(&rgroup->pollers); 3203 STAILQ_INIT(&rgroup->retired_bufs); 3204 3205 pthread_mutex_lock(&rtransport->lock); 3206 TAILQ_FOREACH(device, &rtransport->devices, link) { 3207 poller = calloc(1, sizeof(*poller)); 3208 if (!poller) { 3209 SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); 3210 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 3211 pthread_mutex_unlock(&rtransport->lock); 3212 return NULL; 3213 } 3214 3215 poller->device = device; 3216 poller->group = rgroup; 3217 3218 TAILQ_INIT(&poller->qpairs); 3219 STAILQ_INIT(&poller->qpairs_pending_send); 3220 STAILQ_INIT(&poller->qpairs_pending_recv); 3221 3222 TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); 3223 if (transport->opts.no_srq == false && device->num_srq < device->attr.max_srq) { 3224 poller->max_srq_depth = transport->opts.max_srq_depth; 3225 3226 device->num_srq++; 3227 memset(&srq_init_attr, 0, sizeof(struct ibv_srq_init_attr)); 3228 srq_init_attr.attr.max_wr = poller->max_srq_depth; 3229 srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE); 3230 poller->srq = ibv_create_srq(device->pd, &srq_init_attr); 3231 if (!poller->srq) { 3232 SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno); 3233 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 3234 pthread_mutex_unlock(&rtransport->lock); 3235 return NULL; 3236 } 3237 3238 opts.qp = poller->srq; 3239 opts.pd = device->pd; 3240 opts.qpair = NULL; 3241 opts.shared = true; 3242 opts.max_queue_depth = poller->max_srq_depth; 3243 opts.in_capsule_data_size = transport->opts.in_capsule_data_size; 3244 3245 poller->resources = nvmf_rdma_resources_create(&opts); 3246 if (!poller->resources) { 3247 SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n"); 3248 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 3249 pthread_mutex_unlock(&rtransport->lock); 3250 return NULL; 3251 } 3252 } 3253 3254 /* 3255 * When using an srq, we can limit the completion queue at startup. 3256 * The following formula represents the calculation: 3257 * num_cqe = num_recv + num_data_wr + num_send_wr. 
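* (with the default max_srq_depth of 4096 this works out to num_cqe = 3 * 4096 = 12288)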
3258 * where num_recv=num_data_wr=and num_send_wr=poller->max_srq_depth 3259 */ 3260 if (poller->srq) { 3261 num_cqe = poller->max_srq_depth * 3; 3262 } else { 3263 num_cqe = DEFAULT_NVMF_RDMA_CQ_SIZE; 3264 } 3265 3266 poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0); 3267 if (!poller->cq) { 3268 SPDK_ERRLOG("Unable to create completion queue\n"); 3269 spdk_nvmf_rdma_poll_group_destroy(&rgroup->group); 3270 pthread_mutex_unlock(&rtransport->lock); 3271 return NULL; 3272 } 3273 poller->num_cqe = num_cqe; 3274 } 3275 3276 TAILQ_INSERT_TAIL(&rtransport->poll_groups, rgroup, link); 3277 if (rtransport->conn_sched.next_admin_pg == NULL) { 3278 rtransport->conn_sched.next_admin_pg = rgroup; 3279 rtransport->conn_sched.next_io_pg = rgroup; 3280 } 3281 3282 pthread_mutex_unlock(&rtransport->lock); 3283 return &rgroup->group; 3284 } 3285 3286 static struct spdk_nvmf_transport_poll_group * 3287 spdk_nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 3288 { 3289 struct spdk_nvmf_rdma_transport *rtransport; 3290 struct spdk_nvmf_rdma_poll_group **pg; 3291 struct spdk_nvmf_transport_poll_group *result; 3292 3293 rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 3294 3295 pthread_mutex_lock(&rtransport->lock); 3296 3297 if (TAILQ_EMPTY(&rtransport->poll_groups)) { 3298 pthread_mutex_unlock(&rtransport->lock); 3299 return NULL; 3300 } 3301 3302 if (qpair->qid == 0) { 3303 pg = &rtransport->conn_sched.next_admin_pg; 3304 } else { 3305 pg = &rtransport->conn_sched.next_io_pg; 3306 } 3307 3308 assert(*pg != NULL); 3309 3310 result = &(*pg)->group; 3311 3312 *pg = TAILQ_NEXT(*pg, link); 3313 if (*pg == NULL) { 3314 *pg = TAILQ_FIRST(&rtransport->poll_groups); 3315 } 3316 3317 pthread_mutex_unlock(&rtransport->lock); 3318 3319 return result; 3320 } 3321 3322 static void 3323 spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 3324 { 3325 struct spdk_nvmf_rdma_poll_group *rgroup, *next_rgroup; 3326 struct spdk_nvmf_rdma_poller *poller, *tmp; 3327 struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; 3328 struct spdk_nvmf_transport_pg_cache_buf *buf, *tmp_buf; 3329 struct spdk_nvmf_rdma_transport *rtransport; 3330 3331 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 3332 rtransport = SPDK_CONTAINEROF(rgroup->group.transport, struct spdk_nvmf_rdma_transport, transport); 3333 3334 if (!rgroup) { 3335 return; 3336 } 3337 3338 /* free all retired buffers back to the transport so we don't short the mempool. 
*/ 3339 STAILQ_FOREACH_SAFE(buf, &rgroup->retired_bufs, link, tmp_buf) { 3340 STAILQ_REMOVE(&rgroup->retired_bufs, buf, spdk_nvmf_transport_pg_cache_buf, link); 3341 assert(group->transport != NULL); 3342 spdk_mempool_put(group->transport->data_buf_pool, buf); 3343 } 3344 3345 TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { 3346 TAILQ_REMOVE(&rgroup->pollers, poller, link); 3347 3348 TAILQ_FOREACH_SAFE(qpair, &poller->qpairs, link, tmp_qpair) { 3349 spdk_nvmf_rdma_qpair_destroy(qpair); 3350 } 3351 3352 if (poller->srq) { 3353 nvmf_rdma_resources_destroy(poller->resources); 3354 ibv_destroy_srq(poller->srq); 3355 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Destroyed RDMA shared queue %p\n", poller->srq); 3356 } 3357 3358 if (poller->cq) { 3359 ibv_destroy_cq(poller->cq); 3360 } 3361 3362 free(poller); 3363 } 3364 3365 pthread_mutex_lock(&rtransport->lock); 3366 next_rgroup = TAILQ_NEXT(rgroup, link); 3367 TAILQ_REMOVE(&rtransport->poll_groups, rgroup, link); 3368 if (next_rgroup == NULL) { 3369 next_rgroup = TAILQ_FIRST(&rtransport->poll_groups); 3370 } 3371 if (rtransport->conn_sched.next_admin_pg == rgroup) { 3372 rtransport->conn_sched.next_admin_pg = next_rgroup; 3373 } 3374 if (rtransport->conn_sched.next_io_pg == rgroup) { 3375 rtransport->conn_sched.next_io_pg = next_rgroup; 3376 } 3377 pthread_mutex_unlock(&rtransport->lock); 3378 3379 free(rgroup); 3380 } 3381 3382 static void 3383 spdk_nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair) 3384 { 3385 if (rqpair->cm_id != NULL) { 3386 spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 3387 } 3388 spdk_nvmf_rdma_qpair_destroy(rqpair); 3389 } 3390 3391 static int 3392 spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 3393 struct spdk_nvmf_qpair *qpair) 3394 { 3395 struct spdk_nvmf_rdma_poll_group *rgroup; 3396 struct spdk_nvmf_rdma_qpair *rqpair; 3397 struct spdk_nvmf_rdma_device *device; 3398 struct spdk_nvmf_rdma_poller *poller; 3399 int rc; 3400 3401 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 3402 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3403 3404 device = rqpair->port->device; 3405 3406 TAILQ_FOREACH(poller, &rgroup->pollers, link) { 3407 if (poller->device == device) { 3408 break; 3409 } 3410 } 3411 3412 if (!poller) { 3413 SPDK_ERRLOG("No poller found for device.\n"); 3414 return -1; 3415 } 3416 3417 TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); 3418 rqpair->poller = poller; 3419 rqpair->srq = rqpair->poller->srq; 3420 3421 rc = spdk_nvmf_rdma_qpair_initialize(qpair); 3422 if (rc < 0) { 3423 SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); 3424 return -1; 3425 } 3426 3427 rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair); 3428 if (rc) { 3429 /* Try to reject, but we probably can't */ 3430 spdk_nvmf_rdma_qpair_reject_connection(rqpair); 3431 return -1; 3432 } 3433 3434 spdk_nvmf_rdma_update_ibv_state(rqpair); 3435 3436 return 0; 3437 } 3438 3439 static int 3440 spdk_nvmf_rdma_request_free(struct spdk_nvmf_request *req) 3441 { 3442 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 3443 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 3444 struct spdk_nvmf_rdma_transport, transport); 3445 3446 nvmf_rdma_request_free(rdma_req, rtransport); 3447 return 0; 3448 } 3449 3450 static int 3451 spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req) 3452 { 3453 struct 
spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 3454 struct spdk_nvmf_rdma_transport, transport); 3455 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, 3456 struct spdk_nvmf_rdma_request, req); 3457 struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, 3458 struct spdk_nvmf_rdma_qpair, qpair); 3459 3460 if (rqpair->ibv_state != IBV_QPS_ERR) { 3461 /* The connection is alive, so process the request as normal */ 3462 rdma_req->state = RDMA_REQUEST_STATE_EXECUTED; 3463 } else { 3464 /* The connection is dead. Move the request directly to the completed state. */ 3465 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 3466 } 3467 3468 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 3469 3470 return 0; 3471 } 3472 3473 static int 3474 spdk_nvmf_rdma_destroy_defunct_qpair(void *ctx) 3475 { 3476 struct spdk_nvmf_rdma_qpair *rqpair = ctx; 3477 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, 3478 struct spdk_nvmf_rdma_transport, transport); 3479 3480 SPDK_INFOLOG(SPDK_LOG_RDMA, "QP#%d hasn't been drained as expected, manually destroy it\n", 3481 rqpair->qpair.qid); 3482 3483 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); 3484 spdk_nvmf_rdma_qpair_destroy(rqpair); 3485 3486 return 0; 3487 } 3488 3489 static void 3490 spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) 3491 { 3492 struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3493 3494 if (rqpair->disconnect_flags & RDMA_QP_DISCONNECTING) { 3495 return; 3496 } 3497 3498 rqpair->disconnect_flags |= RDMA_QP_DISCONNECTING; 3499 3500 /* This happens only when the qpair is disconnected before 3501 * it is added to the poll group. Since there is no poll group, 3502 * the RDMA qp has not been initialized yet and the RDMA CM 3503 * event has not yet been acknowledged, so we need to reject it. 
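*
* spdk_nvmf_rdma_event_reject() (defined earlier in this file) is what declines the
* connection request in that case. Conceptually the rejection comes down to the
* librdmacm call sketched below; this is only an illustration of the underlying
* verb, not the exact private data SPDK builds:
*
*     rdma_reject(rqpair->cm_id, private_data, private_data_len);
*
* where private_data would carry an NVMe-oF RDMA status such as
* SPDK_NVMF_RDMA_ERROR_NO_RESOURCES and private_data_len its size.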
3504 */ 3505 if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) { 3506 spdk_nvmf_rdma_qpair_reject_connection(rqpair); 3507 return; 3508 } 3509 3510 if (rqpair->ibv_state != IBV_QPS_ERR) { 3511 spdk_nvmf_rdma_set_ibv_state(rqpair, IBV_QPS_ERR); 3512 } 3513 3514 rqpair->destruct_poller = spdk_poller_register(spdk_nvmf_rdma_destroy_defunct_qpair, (void *)rqpair, 3515 NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US); 3516 } 3517 3518 static struct spdk_nvmf_rdma_qpair * 3519 get_rdma_qpair_from_wc(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_wc *wc) 3520 { 3521 struct spdk_nvmf_rdma_qpair *rqpair; 3522 /* @todo: improve QP search */ 3523 TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) { 3524 if (wc->qp_num == rqpair->cm_id->qp->qp_num) { 3525 return rqpair; 3526 } 3527 } 3528 SPDK_ERRLOG("Didn't find QP with qp_num %u\n", wc->qp_num); 3529 return NULL; 3530 } 3531 3532 #ifdef DEBUG 3533 static int 3534 spdk_nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req) 3535 { 3536 return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST || 3537 rdma_req->state == RDMA_REQUEST_STATE_COMPLETING; 3538 } 3539 #endif 3540 3541 static void 3542 _poller_reset_failed_recvs(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_recv_wr *bad_recv_wr, 3543 int rc) 3544 { 3545 struct spdk_nvmf_rdma_recv *rdma_recv; 3546 struct spdk_nvmf_rdma_wr *bad_rdma_wr; 3547 3548 SPDK_ERRLOG("Failed to post a recv for the poller %p with errno %d\n", rpoller, -rc); 3549 while (bad_recv_wr != NULL) { 3550 bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_recv_wr->wr_id; 3551 rdma_recv = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 3552 3553 rdma_recv->qpair->current_recv_depth++; 3554 bad_recv_wr = bad_recv_wr->next; 3555 SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rdma_recv->qpair, -rc); 3556 spdk_nvmf_rdma_start_disconnect(rdma_recv->qpair); 3557 } 3558 } 3559 3560 static void 3561 _qp_reset_failed_recvs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *bad_recv_wr, int rc) 3562 { 3563 SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rqpair, -rc); 3564 while (bad_recv_wr != NULL) { 3565 bad_recv_wr = bad_recv_wr->next; 3566 rqpair->current_recv_depth++; 3567 } 3568 spdk_nvmf_rdma_start_disconnect(rqpair); 3569 } 3570 3571 static void 3572 _poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport, 3573 struct spdk_nvmf_rdma_poller *rpoller) 3574 { 3575 struct spdk_nvmf_rdma_qpair *rqpair; 3576 struct ibv_recv_wr *bad_recv_wr; 3577 int rc; 3578 3579 if (rpoller->srq) { 3580 if (rpoller->resources->recvs_to_post.first != NULL) { 3581 rc = ibv_post_srq_recv(rpoller->srq, rpoller->resources->recvs_to_post.first, &bad_recv_wr); 3582 if (rc) { 3583 _poller_reset_failed_recvs(rpoller, bad_recv_wr, rc); 3584 } 3585 rpoller->resources->recvs_to_post.first = NULL; 3586 rpoller->resources->recvs_to_post.last = NULL; 3587 } 3588 } else { 3589 while (!STAILQ_EMPTY(&rpoller->qpairs_pending_recv)) { 3590 rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_recv); 3591 assert(rqpair->resources->recvs_to_post.first != NULL); 3592 rc = ibv_post_recv(rqpair->cm_id->qp, rqpair->resources->recvs_to_post.first, &bad_recv_wr); 3593 if (rc) { 3594 _qp_reset_failed_recvs(rqpair, bad_recv_wr, rc); 3595 } 3596 rqpair->resources->recvs_to_post.first = NULL; 3597 rqpair->resources->recvs_to_post.last = NULL; 3598 STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_recv, recv_link); 3599 } 3600 } 3601 } 3602 3603 static void 3604 _qp_reset_failed_sends(struct 
spdk_nvmf_rdma_transport *rtransport, 3605 struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *bad_wr, int rc) 3606 { 3607 struct spdk_nvmf_rdma_wr *bad_rdma_wr; 3608 struct spdk_nvmf_rdma_request *prev_rdma_req = NULL, *cur_rdma_req = NULL; 3609 3610 SPDK_ERRLOG("Failed to post a send for the qpair %p with errno %d\n", rqpair, -rc); 3611 for (; bad_wr != NULL; bad_wr = bad_wr->next) { 3612 bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_wr->wr_id; 3613 assert(rqpair->current_send_depth > 0); 3614 rqpair->current_send_depth--; 3615 switch (bad_rdma_wr->type) { 3616 case RDMA_WR_TYPE_DATA: 3617 cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 3618 if (bad_wr->opcode == IBV_WR_RDMA_READ) { 3619 assert(rqpair->current_read_depth > 0); 3620 rqpair->current_read_depth--; 3621 } 3622 break; 3623 case RDMA_WR_TYPE_SEND: 3624 cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); 3625 break; 3626 default: 3627 SPDK_ERRLOG("Found a RECV in the list of pending SEND requests for qpair %p\n", rqpair); 3628 prev_rdma_req = cur_rdma_req; 3629 continue; 3630 } 3631 3632 if (prev_rdma_req == cur_rdma_req) { 3633 /* This request was handled by an earlier WR, i.e. we were performing an NVMe read. */ 3634 /* We only have to check against prev_rdma_req since each request's WRs are contiguous in this list. */ 3635 continue; 3636 } 3637 3638 switch (cur_rdma_req->state) { 3639 case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 3640 cur_rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 3641 cur_rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 3642 break; 3643 case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: 3644 case RDMA_REQUEST_STATE_COMPLETING: 3645 cur_rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 3646 break; 3647 default: 3648 SPDK_ERRLOG("Found a request in a bad state %d when draining pending SEND requests for qpair %p\n", 3649 cur_rdma_req->state, rqpair); 3650 continue; 3651 } 3652 3653 spdk_nvmf_rdma_request_process(rtransport, cur_rdma_req); 3654 prev_rdma_req = cur_rdma_req; 3655 } 3656 3657 if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { 3658 /* Disconnect the connection. */ 3659 spdk_nvmf_rdma_start_disconnect(rqpair); 3660 } 3661 3662 } 3663 3664 static void 3665 _poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport, 3666 struct spdk_nvmf_rdma_poller *rpoller) 3667 { 3668 struct spdk_nvmf_rdma_qpair *rqpair; 3669 struct ibv_send_wr *bad_wr = NULL; 3670 int rc; 3671 3672 while (!STAILQ_EMPTY(&rpoller->qpairs_pending_send)) { 3673 rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_send); 3674 assert(rqpair->sends_to_post.first != NULL); 3675 rc = ibv_post_send(rqpair->cm_id->qp, rqpair->sends_to_post.first, &bad_wr); 3676 3677 /* bad_wr always points to the first WR that failed.
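*
* Per the documented ibv_post_send() behavior, the WRs ahead of *bad_wr were
* accepted by the device, while *bad_wr and everything chained after it were not
* posted. That is why _qp_reset_failed_sends() above starts its rollback at
* bad_wr and walks forward; the essential shape of that walk is:
*
*     struct ibv_send_wr *wr;
*
*     for (wr = bad_wr; wr != NULL; wr = wr->next) {
*             rqpair->current_send_depth--;
*     }
*
* with the per-request state transitions layered on top of the depth accounting.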
*/ 3678 if (rc) { 3679 _qp_reset_failed_sends(rtransport, rqpair, bad_wr, rc); 3680 } 3681 rqpair->sends_to_post.first = NULL; 3682 rqpair->sends_to_post.last = NULL; 3683 STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_send, send_link); 3684 } 3685 } 3686 3687 static int 3688 spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, 3689 struct spdk_nvmf_rdma_poller *rpoller) 3690 { 3691 struct ibv_wc wc[32]; 3692 struct spdk_nvmf_rdma_wr *rdma_wr; 3693 struct spdk_nvmf_rdma_request *rdma_req; 3694 struct spdk_nvmf_rdma_recv *rdma_recv; 3695 struct spdk_nvmf_rdma_qpair *rqpair; 3696 int reaped, i; 3697 int count = 0; 3698 bool error = false; 3699 uint64_t poll_tsc = spdk_get_ticks(); 3700 3701 /* Poll for completing operations. */ 3702 reaped = ibv_poll_cq(rpoller->cq, 32, wc); 3703 if (reaped < 0) { 3704 SPDK_ERRLOG("Error polling CQ! (%d): %s\n", 3705 errno, spdk_strerror(errno)); 3706 return -1; 3707 } 3708 3709 rpoller->stat.polls++; 3710 rpoller->stat.completions += reaped; 3711 3712 for (i = 0; i < reaped; i++) { 3713 3714 rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id; 3715 3716 switch (rdma_wr->type) { 3717 case RDMA_WR_TYPE_SEND: 3718 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); 3719 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 3720 3721 if (!wc[i].status) { 3722 count++; 3723 assert(wc[i].opcode == IBV_WC_SEND); 3724 assert(spdk_nvmf_rdma_req_is_completing(rdma_req)); 3725 } else { 3726 SPDK_ERRLOG("data=%p length=%u\n", rdma_req->req.data, rdma_req->req.length); 3727 } 3728 3729 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 3730 /* +1 for the response wr */ 3731 rqpair->current_send_depth -= rdma_req->num_outstanding_data_wr + 1; 3732 rdma_req->num_outstanding_data_wr = 0; 3733 3734 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 3735 break; 3736 case RDMA_WR_TYPE_RECV: 3737 /* rdma_recv->qpair will be invalid if using an SRQ. In that case we have to get the qpair from the wc. */ 3738 rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 3739 if (rpoller->srq != NULL) { 3740 rdma_recv->qpair = get_rdma_qpair_from_wc(rpoller, &wc[i]); 3741 /* It is possible that there are still some completions for destroyed QP 3742 * associated with SRQ. We just ignore these late completions and re-post 3743 * receive WRs back to SRQ. 
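*
* Note that rdma_recv->wr.next can still point at another receive WR from an
* earlier batched post, so it is cleared below before the WR is re-posted on its
* own. ibv_post_srq_recv() consumes a NULL-terminated chain of WRs and, on
* failure, sets bad_wr to the first WR it could not post.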
3744 */ 3745 if (spdk_unlikely(NULL == rdma_recv->qpair)) { 3746 struct ibv_recv_wr *bad_wr; 3747 int rc; 3748 3749 rdma_recv->wr.next = NULL; 3750 rc = ibv_post_srq_recv(rpoller->srq, 3751 &rdma_recv->wr, 3752 &bad_wr); 3753 if (rc) { 3754 SPDK_ERRLOG("Failed to re-post recv WR to SRQ, err %d\n", rc); 3755 } 3756 continue; 3757 } 3758 } 3759 rqpair = rdma_recv->qpair; 3760 3761 assert(rqpair != NULL); 3762 if (!wc[i].status) { 3763 assert(wc[i].opcode == IBV_WC_RECV); 3764 if (rqpair->current_recv_depth >= rqpair->max_queue_depth) { 3765 spdk_nvmf_rdma_start_disconnect(rqpair); 3766 break; 3767 } 3768 } 3769 3770 rdma_recv->wr.next = NULL; 3771 rqpair->current_recv_depth++; 3772 rdma_recv->receive_tsc = poll_tsc; 3773 rpoller->stat.requests++; 3774 STAILQ_INSERT_TAIL(&rqpair->resources->incoming_queue, rdma_recv, link); 3775 break; 3776 case RDMA_WR_TYPE_DATA: 3777 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 3778 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 3779 3780 assert(rdma_req->num_outstanding_data_wr > 0); 3781 3782 rqpair->current_send_depth--; 3783 rdma_req->num_outstanding_data_wr--; 3784 if (!wc[i].status) { 3785 assert(wc[i].opcode == IBV_WC_RDMA_READ); 3786 rqpair->current_read_depth--; 3787 /* Wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */ 3788 if (rdma_req->num_outstanding_data_wr == 0) { 3789 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 3790 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 3791 } 3792 } else { 3793 /* If the data transfer fails, still force the queue into the error state. 3794 * If we were performing an RDMA_READ, we need to force the request into a 3795 * completed state since it wasn't linked to a send. However, in the RDMA_WRITE 3796 * case, we should wait for the SEND to complete. */ 3797 SPDK_ERRLOG("data=%p length=%u\n", rdma_req->req.data, rdma_req->req.length); 3798 if (rdma_req->data.wr.opcode == IBV_WR_RDMA_READ) { 3799 rqpair->current_read_depth--; 3800 if (rdma_req->num_outstanding_data_wr == 0) { 3801 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 3802 } 3803 } 3804 } 3805 break; 3806 default: 3807 SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); 3808 continue; 3809 } 3810 3811 /* Handle error conditions */ 3812 if (wc[i].status) { 3813 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "CQ error on CQ %p, Request 0x%lu (%d): %s\n", 3814 rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); 3815 3816 error = true; 3817 3818 if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { 3819 /* Disconnect the connection. */ 3820 spdk_nvmf_rdma_start_disconnect(rqpair); 3821 } else { 3822 nvmf_rdma_destroy_drained_qpair(rqpair); 3823 } 3824 continue; 3825 } 3826 3827 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 3828 3829 if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { 3830 nvmf_rdma_destroy_drained_qpair(rqpair); 3831 } 3832 } 3833 3834 if (error == true) { 3835 return -1; 3836 } 3837 3838 /* Submit outstanding work requests.
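*
* During completion processing the transport only chains new WRs onto the
* per-qpair sends_to_post/recvs_to_post lists (or the SRQ-wide list when a shared
* receive queue is used); the two helpers below flush those chains so that each
* queue is drained with a single ibv_post_send()/ibv_post_recv() call per poll
* instead of one call per WR. An illustrative sketch of the chaining side, not a
* copy of the helpers earlier in this file:
*
*     if (rqpair->sends_to_post.first == NULL) {
*             rqpair->sends_to_post.first = wr;
*     } else {
*             rqpair->sends_to_post.last->next = wr;
*     }
*     rqpair->sends_to_post.last = wr;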
*/ 3839 _poller_submit_recvs(rtransport, rpoller); 3840 _poller_submit_sends(rtransport, rpoller); 3841 3842 return count; 3843 } 3844 3845 static int 3846 spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 3847 { 3848 struct spdk_nvmf_rdma_transport *rtransport; 3849 struct spdk_nvmf_rdma_poll_group *rgroup; 3850 struct spdk_nvmf_rdma_poller *rpoller; 3851 int count, rc; 3852 3853 rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); 3854 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 3855 3856 count = 0; 3857 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 3858 rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller); 3859 if (rc < 0) { 3860 return rc; 3861 } 3862 count += rc; 3863 } 3864 3865 return count; 3866 } 3867 3868 static int 3869 spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 3870 struct spdk_nvme_transport_id *trid, 3871 bool peer) 3872 { 3873 struct sockaddr *saddr; 3874 uint16_t port; 3875 3876 trid->trtype = SPDK_NVME_TRANSPORT_RDMA; 3877 3878 if (peer) { 3879 saddr = rdma_get_peer_addr(id); 3880 } else { 3881 saddr = rdma_get_local_addr(id); 3882 } 3883 switch (saddr->sa_family) { 3884 case AF_INET: { 3885 struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr; 3886 3887 trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; 3888 inet_ntop(AF_INET, &saddr_in->sin_addr, 3889 trid->traddr, sizeof(trid->traddr)); 3890 if (peer) { 3891 port = ntohs(rdma_get_dst_port(id)); 3892 } else { 3893 port = ntohs(rdma_get_src_port(id)); 3894 } 3895 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 3896 break; 3897 } 3898 case AF_INET6: { 3899 struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr; 3900 trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; 3901 inet_ntop(AF_INET6, &saddr_in->sin6_addr, 3902 trid->traddr, sizeof(trid->traddr)); 3903 if (peer) { 3904 port = ntohs(rdma_get_dst_port(id)); 3905 } else { 3906 port = ntohs(rdma_get_src_port(id)); 3907 } 3908 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 3909 break; 3910 } 3911 default: 3912 return -1; 3913 3914 } 3915 3916 return 0; 3917 } 3918 3919 static int 3920 spdk_nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 3921 struct spdk_nvme_transport_id *trid) 3922 { 3923 struct spdk_nvmf_rdma_qpair *rqpair; 3924 3925 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3926 3927 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true); 3928 } 3929 3930 static int 3931 spdk_nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 3932 struct spdk_nvme_transport_id *trid) 3933 { 3934 struct spdk_nvmf_rdma_qpair *rqpair; 3935 3936 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3937 3938 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false); 3939 } 3940 3941 static int 3942 spdk_nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 3943 struct spdk_nvme_transport_id *trid) 3944 { 3945 struct spdk_nvmf_rdma_qpair *rqpair; 3946 3947 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3948 3949 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false); 3950 } 3951 3952 void 3953 spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) 3954 { 3955 g_nvmf_hooks = *hooks; 3956 } 3957 3958 static int 3959 spdk_nvmf_rdma_poll_group_get_stat(struct spdk_nvmf_tgt *tgt, 3960 struct spdk_nvmf_transport_poll_group_stat **stat) 3961 { 3962 struct spdk_io_channel *ch; 3963 struct spdk_nvmf_poll_group *group; 3964 struct 
spdk_nvmf_transport_poll_group *tgroup; 3965 struct spdk_nvmf_rdma_poll_group *rgroup; 3966 struct spdk_nvmf_rdma_poller *rpoller; 3967 struct spdk_nvmf_rdma_device_stat *device_stat; 3968 uint64_t num_devices = 0; 3969 3970 if (tgt == NULL || stat == NULL) { 3971 return -EINVAL; 3972 } 3973 3974 ch = spdk_get_io_channel(tgt); 3975 group = spdk_io_channel_get_ctx(ch); 3976 spdk_put_io_channel(ch); 3977 TAILQ_FOREACH(tgroup, &group->tgroups, link) { 3978 if (SPDK_NVME_TRANSPORT_RDMA == tgroup->transport->ops->type) { 3979 *stat = calloc(1, sizeof(struct spdk_nvmf_transport_poll_group_stat)); 3980 if (!*stat) { 3981 SPDK_ERRLOG("Failed to allocate memory for NVMf RDMA statistics\n"); 3982 return -ENOMEM; 3983 } 3984 (*stat)->trtype = SPDK_NVME_TRANSPORT_RDMA; 3985 3986 rgroup = SPDK_CONTAINEROF(tgroup, struct spdk_nvmf_rdma_poll_group, group); 3987 /* Count devices to allocate enough memory */ 3988 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 3989 ++num_devices; 3990 } 3991 (*stat)->rdma.devices = calloc(num_devices, sizeof(struct spdk_nvmf_rdma_device_stat)); 3992 if (!(*stat)->rdma.devices) { 3993 SPDK_ERRLOG("Failed to allocate NVMf RDMA devices statistics\n"); 3994 free(*stat); 3995 return -ENOMEM; 3996 } 3997 3998 (*stat)->rdma.pending_data_buffer = rgroup->stat.pending_data_buffer; 3999 (*stat)->rdma.num_devices = num_devices; 4000 num_devices = 0; 4001 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 4002 device_stat = &(*stat)->rdma.devices[num_devices++]; 4003 device_stat->name = ibv_get_device_name(rpoller->device->context->device); 4004 device_stat->polls = rpoller->stat.polls; 4005 device_stat->completions = rpoller->stat.completions; 4006 device_stat->requests = rpoller->stat.requests; 4007 device_stat->request_latency = rpoller->stat.request_latency; 4008 device_stat->pending_free_request = rpoller->stat.pending_free_request; 4009 device_stat->pending_rdma_read = rpoller->stat.pending_rdma_read; 4010 device_stat->pending_rdma_write = rpoller->stat.pending_rdma_write; 4011 } 4012 return 0; 4013 } 4014 } 4015 return -ENOENT; 4016 } 4017 4018 static void 4019 spdk_nvmf_rdma_poll_group_free_stat(struct spdk_nvmf_transport_poll_group_stat *stat) 4020 { 4021 if (stat) { 4022 free(stat->rdma.devices); 4023 } 4024 free(stat); 4025 } 4026 4027 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { 4028 .type = SPDK_NVME_TRANSPORT_RDMA, 4029 .opts_init = spdk_nvmf_rdma_opts_init, 4030 .create = spdk_nvmf_rdma_create, 4031 .destroy = spdk_nvmf_rdma_destroy, 4032 4033 .listen = spdk_nvmf_rdma_listen, 4034 .stop_listen = spdk_nvmf_rdma_stop_listen, 4035 .accept = spdk_nvmf_rdma_accept, 4036 4037 .listener_discover = spdk_nvmf_rdma_discover, 4038 4039 .poll_group_create = spdk_nvmf_rdma_poll_group_create, 4040 .get_optimal_poll_group = spdk_nvmf_rdma_get_optimal_poll_group, 4041 .poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy, 4042 .poll_group_add = spdk_nvmf_rdma_poll_group_add, 4043 .poll_group_poll = spdk_nvmf_rdma_poll_group_poll, 4044 4045 .req_free = spdk_nvmf_rdma_request_free, 4046 .req_complete = spdk_nvmf_rdma_request_complete, 4047 4048 .qpair_fini = spdk_nvmf_rdma_close_qpair, 4049 .qpair_get_peer_trid = spdk_nvmf_rdma_qpair_get_peer_trid, 4050 .qpair_get_local_trid = spdk_nvmf_rdma_qpair_get_local_trid, 4051 .qpair_get_listen_trid = spdk_nvmf_rdma_qpair_get_listen_trid, 4052 4053 .poll_group_get_stat = spdk_nvmf_rdma_poll_group_get_stat, 4054 .poll_group_free_stat = spdk_nvmf_rdma_poll_group_free_stat, 4055 }; 4056 4057 SPDK_LOG_REGISTER_COMPONENT("rdma",
SPDK_LOG_RDMA) 4058
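
/*
 * Usage note: spdk_nvmf_rdma_init_hooks() above lets a target application supply
 * its own RDMA resources (for example a pre-created protection domain) and is
 * intended to be called before the RDMA transport is created. A hedged sketch of
 * how an application might wire this up; the .get_ibv_pd member and the
 * my_get_ibv_pd() callback are illustrative and should be checked against the
 * struct spdk_nvme_rdma_hooks definition shipped with the SPDK version in use:
 *
 *     static struct spdk_nvme_rdma_hooks g_my_hooks = {
 *             .get_ibv_pd = my_get_ibv_pd,
 *     };
 *
 *     spdk_nvmf_rdma_init_hooks(&g_my_hooks);
 */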