/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2018 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

#include "nvmf_internal.h"
#include "transport.h"

#include "spdk/config.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf.h"
#include "spdk/nvmf_spec.h"
#include "spdk/string.h"
#include "spdk/trace.h"
#include "spdk/util.h"

#include "spdk_internal/log.h"

/*
 RDMA Connection Resource Defaults
 */
#define NVMF_DEFAULT_TX_SGE	1
#define NVMF_DEFAULT_RX_SGE	2
#define NVMF_DEFAULT_DATA_SGE	16

/* The RDMA completion queue size */
#define NVMF_RDMA_CQ_SIZE	4096

/* The AIO backend requires block size aligned data buffers;
 * an extra 4KiB aligned data buffer should work for most devices.
 */
#define SHIFT_4KB			12
#define NVMF_DATA_BUFFER_ALIGNMENT	(1 << SHIFT_4KB)
#define NVMF_DATA_BUFFER_MASK		(NVMF_DATA_BUFFER_ALIGNMENT - 1)

enum spdk_nvmf_rdma_request_state {
	/* The request is not currently in use */
	RDMA_REQUEST_STATE_FREE = 0,

	/* Initial state when request first received */
	RDMA_REQUEST_STATE_NEW,

	/* The request is queued until a data buffer is available. */
	RDMA_REQUEST_STATE_NEED_BUFFER,

	/* The request is waiting on RDMA queue depth availability
	 * to transfer data between the host and the controller.
	 */
	RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING,

	/* The request is currently transferring data from the host to the controller. */
	RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,

	/* The request is ready to execute at the block device */
	RDMA_REQUEST_STATE_READY_TO_EXECUTE,

	/* The request is currently executing at the block device */
	RDMA_REQUEST_STATE_EXECUTING,

	/* The request finished executing at the block device */
	RDMA_REQUEST_STATE_EXECUTED,

	/* The request is ready to send a completion */
	RDMA_REQUEST_STATE_READY_TO_COMPLETE,

	/* The request is currently transferring data from the controller to the host. */
	RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,

	/* The request currently has an outstanding completion without an
	 * associated data transfer.
	 */
	RDMA_REQUEST_STATE_COMPLETING,

	/* The request completed and can be marked free. */
	RDMA_REQUEST_STATE_COMPLETED,

	/* Terminator */
	RDMA_REQUEST_NUM_STATES,
};

#define OBJECT_NVMF_RDMA_IO	0x40

#define TRACE_GROUP_NVMF_RDMA	0x4
#define TRACE_RDMA_REQUEST_STATE_NEW	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0)
#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1)
#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4)
#define TRACE_RDMA_REQUEST_STATE_EXECUTING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5)
#define TRACE_RDMA_REQUEST_STATE_EXECUTED	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6)
#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7)
#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8)
#define TRACE_RDMA_REQUEST_STATE_COMPLETING	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9)
#define TRACE_RDMA_REQUEST_STATE_COMPLETED	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA)
#define TRACE_RDMA_QP_CREATE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xB)
#define TRACE_RDMA_IBV_ASYNC_EVENT	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xC)
#define TRACE_RDMA_CM_ASYNC_EVENT	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xD)
#define TRACE_RDMA_QP_STATE_CHANGE	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xE)
#define TRACE_RDMA_QP_DISCONNECT	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xF)
#define TRACE_RDMA_QP_DESTROY	SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x10)

SPDK_TRACE_REGISTER_FN(nvmf_trace)
{
	spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r');
	spdk_trace_register_description("RDMA_REQ_NEW", "",
					TRACE_RDMA_REQUEST_STATE_NEW,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", "",
					TRACE_RDMA_REQUEST_STATE_NEED_BUFFER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_PENDING_H_TO_C", "",
					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_TX_H_TO_C", "",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE", "",
					TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_EXECUTING", "",
					TRACE_RDMA_REQUEST_STATE_EXECUTING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_EXECUTED", "",
					TRACE_RDMA_REQUEST_STATE_EXECUTED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPLETE", "",
					TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETING_CONTROLLER_TO_HOST", "",
					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETING_INCAPSULE", "",
					TRACE_RDMA_REQUEST_STATE_COMPLETING,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
	spdk_trace_register_description("RDMA_REQ_COMPLETED", "",
					TRACE_RDMA_REQUEST_STATE_COMPLETED,
					OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");

	spdk_trace_register_description("RDMA_QP_CREATE", "", TRACE_RDMA_QP_CREATE,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
	spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", "", TRACE_RDMA_IBV_ASYNC_EVENT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
	spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", "", TRACE_RDMA_CM_ASYNC_EVENT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
	spdk_trace_register_description("RDMA_QP_STATE_CHANGE", "", TRACE_RDMA_QP_STATE_CHANGE,
					OWNER_NONE, OBJECT_NONE, 0, 1, "state: ");
	spdk_trace_register_description("RDMA_QP_DISCONNECT", "", TRACE_RDMA_QP_DISCONNECT,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
	spdk_trace_register_description("RDMA_QP_DESTROY", "", TRACE_RDMA_QP_DESTROY,
					OWNER_NONE, OBJECT_NONE, 0, 0, "");
}

enum spdk_nvmf_rdma_wr_type {
	RDMA_WR_TYPE_RECV,
	RDMA_WR_TYPE_SEND,
	RDMA_WR_TYPE_DATA,
	RDMA_WR_TYPE_DRAIN_SEND,
	RDMA_WR_TYPE_DRAIN_RECV
};

struct spdk_nvmf_rdma_wr {
	enum spdk_nvmf_rdma_wr_type type;
};

/* This structure holds commands as they are received off the wire.
 * It must be dynamically paired with a full request object
 * (spdk_nvmf_rdma_request) to service a request. It is separate
 * from the request because RDMA does not appear to order
 * completions, so occasionally we'll get a new incoming
 * command when there aren't any free request objects.
 */
struct spdk_nvmf_rdma_recv {
	struct ibv_recv_wr wr;
	struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE];

	struct spdk_nvmf_rdma_qpair *qpair;

	/* In-capsule data buffer */
	uint8_t *buf;

	struct spdk_nvmf_rdma_wr rdma_wr;

	TAILQ_ENTRY(spdk_nvmf_rdma_recv) link;
};

struct spdk_nvmf_rdma_request {
	struct spdk_nvmf_request req;
	bool data_from_pool;

	enum spdk_nvmf_rdma_request_state state;

	struct spdk_nvmf_rdma_recv *recv;

	struct {
		struct spdk_nvmf_rdma_wr rdma_wr;
		struct ibv_send_wr wr;
		struct ibv_sge sgl[NVMF_DEFAULT_TX_SGE];
	} rsp;

	struct {
		struct spdk_nvmf_rdma_wr rdma_wr;
		struct ibv_send_wr wr;
		struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
		void *buffers[SPDK_NVMF_MAX_SGL_ENTRIES];
	} data;

	struct spdk_nvmf_rdma_wr rdma_wr;

	TAILQ_ENTRY(spdk_nvmf_rdma_request) link;
	TAILQ_ENTRY(spdk_nvmf_rdma_request) state_link;
};

enum spdk_nvmf_rdma_qpair_disconnect_flags {
	RDMA_QP_DISCONNECTING	= 1,
	RDMA_QP_RECV_DRAINED	= 1 << 1,
	RDMA_QP_SEND_DRAINED	= 1 << 2
};

struct spdk_nvmf_rdma_qpair {
	struct spdk_nvmf_qpair qpair;

	struct spdk_nvmf_rdma_port *port;
	struct spdk_nvmf_rdma_poller *poller;

	struct rdma_cm_id *cm_id;
	struct rdma_cm_id *listen_id;

	/* The maximum number of I/O outstanding on this connection at one time */
	uint16_t max_queue_depth;

	/* The maximum number of active RDMA READ and WRITE operations at one time */
	uint16_t max_rw_depth;

	/* Receives that are waiting for a request object */
	TAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue;

	/* Queues to track the requests in all states */
	TAILQ_HEAD(, spdk_nvmf_rdma_request) state_queue[RDMA_REQUEST_NUM_STATES];

	/* Number of requests in each state */
	uint32_t state_cntr[RDMA_REQUEST_NUM_STATES];

	int max_sge;

	/* Array of size "max_queue_depth" containing RDMA requests. */
	struct spdk_nvmf_rdma_request *reqs;

	/* Array of size "max_queue_depth" containing RDMA recvs. */
	struct spdk_nvmf_rdma_recv *recvs;

	/* Array of size "max_queue_depth" containing 64 byte capsules
	 * used for receive.
	 */
	union nvmf_h2c_msg *cmds;
	struct ibv_mr *cmds_mr;

	/* Array of size "max_queue_depth" containing 16 byte completions
	 * to be sent back to the user.
	 */
	union nvmf_c2h_msg *cpls;
	struct ibv_mr *cpls_mr;

	/* Array of size "max_queue_depth * InCapsuleDataSize" containing
	 * buffers to be used for in capsule data.
	 */
	void *bufs;
	struct ibv_mr *bufs_mr;

	TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link;

	/* Mgmt channel */
	struct spdk_io_channel *mgmt_channel;
	struct spdk_nvmf_rdma_mgmt_channel *ch;

	/* IBV queue pair attributes: they are used to manage
	 * qp state and recover from errors.
	 */
	struct ibv_qp_init_attr ibv_init_attr;
	struct ibv_qp_attr ibv_attr;

	uint32_t disconnect_flags;
	struct spdk_nvmf_rdma_wr drain_send_wr;
	struct spdk_nvmf_rdma_wr drain_recv_wr;

	/* Reference counter for how many unprocessed messages
	 * from other threads are currently outstanding. The
	 * qpair cannot be destroyed until this is 0. This is
	 * atomically incremented from any thread, but only
	 * decremented and read from the thread that owns this
	 * qpair.
	 */
	uint32_t refcnt;
};

struct spdk_nvmf_rdma_poller {
	struct spdk_nvmf_rdma_device *device;
	struct spdk_nvmf_rdma_poll_group *group;

	struct ibv_cq *cq;

	TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs;

	TAILQ_ENTRY(spdk_nvmf_rdma_poller) link;
};

struct spdk_nvmf_rdma_poll_group {
	struct spdk_nvmf_transport_poll_group group;

	TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers;
};

/* Assuming rdma_cm uses just one protection domain per ibv_context. */
struct spdk_nvmf_rdma_device {
	struct ibv_device_attr attr;
	struct ibv_context *context;

	struct spdk_mem_map *map;
	struct ibv_pd *pd;

	TAILQ_ENTRY(spdk_nvmf_rdma_device) link;
};

struct spdk_nvmf_rdma_port {
	struct spdk_nvme_transport_id trid;
	struct rdma_cm_id *id;
	struct spdk_nvmf_rdma_device *device;
	uint32_t ref;
	TAILQ_ENTRY(spdk_nvmf_rdma_port) link;
};

struct spdk_nvmf_rdma_transport {
	struct spdk_nvmf_transport transport;

	struct rdma_event_channel *event_channel;

	struct spdk_mempool *data_buf_pool;

	pthread_mutex_t lock;

	/* fields used to poll RDMA/IB events */
	nfds_t npoll_fds;
	struct pollfd *poll_fds;

	TAILQ_HEAD(, spdk_nvmf_rdma_device) devices;
	TAILQ_HEAD(, spdk_nvmf_rdma_port) ports;
};

struct spdk_nvmf_rdma_mgmt_channel {
	/* Requests that are waiting to obtain a data buffer */
	TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_data_buf_queue;
};

static inline void
spdk_nvmf_rdma_qpair_inc_refcnt(struct spdk_nvmf_rdma_qpair *rqpair)
{
	__sync_fetch_and_add(&rqpair->refcnt, 1);
}

static inline uint32_t
spdk_nvmf_rdma_qpair_dec_refcnt(struct spdk_nvmf_rdma_qpair *rqpair)
{
	uint32_t old_refcnt, new_refcnt;

	do {
		old_refcnt = rqpair->refcnt;
		assert(old_refcnt > 0);
		new_refcnt = old_refcnt - 1;
	} while (__sync_bool_compare_and_swap(&rqpair->refcnt, old_refcnt, new_refcnt) == false);

	return new_refcnt;
}

static enum ibv_qp_state
spdk_nvmf_rdma_update_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair) {
	enum ibv_qp_state old_state, new_state;
	int rc;

	/* All the attributes needed for recovery */
	static int spdk_nvmf_ibv_attr_mask =
		IBV_QP_STATE |
		IBV_QP_PKEY_INDEX |
		IBV_QP_PORT |
		IBV_QP_ACCESS_FLAGS |
		IBV_QP_AV |
		IBV_QP_PATH_MTU |
		IBV_QP_DEST_QPN |
		IBV_QP_RQ_PSN |
		IBV_QP_MAX_DEST_RD_ATOMIC |
		IBV_QP_MIN_RNR_TIMER |
		IBV_QP_SQ_PSN |
		IBV_QP_TIMEOUT |
		IBV_QP_RETRY_CNT |
		IBV_QP_RNR_RETRY |
		IBV_QP_MAX_QP_RD_ATOMIC;

	old_state = rqpair->ibv_attr.qp_state;
	rc = ibv_query_qp(rqpair->cm_id->qp, &rqpair->ibv_attr,
			  spdk_nvmf_ibv_attr_mask, &rqpair->ibv_init_attr);

	if (rc)
	{
		SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n");
		assert(false);
	}

	new_state = rqpair->ibv_attr.qp_state;
	if (old_state != new_state)
	{
		spdk_trace_record(TRACE_RDMA_QP_STATE_CHANGE, 0, 0,
				  (uintptr_t)rqpair->cm_id, new_state);
	}
	return new_state;
}

static const char *str_ibv_qp_state[] = {
	"IBV_QPS_RESET",
	"IBV_QPS_INIT",
	"IBV_QPS_RTR",
	"IBV_QPS_RTS",
	"IBV_QPS_SQD",
	"IBV_QPS_SQE",
	"IBV_QPS_ERR"
};

static int
spdk_nvmf_rdma_set_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair,
			     enum ibv_qp_state new_state)
{
	int rc;
	enum ibv_qp_state state;
	static int attr_mask_rc[] = {
		[IBV_QPS_RESET] = IBV_QP_STATE,
		[IBV_QPS_INIT] = (IBV_QP_STATE |
				  IBV_QP_PKEY_INDEX |
				  IBV_QP_PORT |
				  IBV_QP_ACCESS_FLAGS),
		[IBV_QPS_RTR] = (IBV_QP_STATE |
				 IBV_QP_AV |
				 IBV_QP_PATH_MTU |
				 IBV_QP_DEST_QPN |
				 IBV_QP_RQ_PSN |
				 IBV_QP_MAX_DEST_RD_ATOMIC |
				 IBV_QP_MIN_RNR_TIMER),
		[IBV_QPS_RTS] = (IBV_QP_STATE |
				 IBV_QP_SQ_PSN |
				 IBV_QP_TIMEOUT |
				 IBV_QP_RETRY_CNT |
				 IBV_QP_RNR_RETRY |
				 IBV_QP_MAX_QP_RD_ATOMIC),
		[IBV_QPS_SQD] = IBV_QP_STATE,
		[IBV_QPS_SQE] = IBV_QP_STATE,
		[IBV_QPS_ERR] = IBV_QP_STATE,
	};

	switch (new_state) {
	case IBV_QPS_RESET:
	case IBV_QPS_INIT:
	case IBV_QPS_RTR:
	case IBV_QPS_RTS:
	case IBV_QPS_SQD:
	case IBV_QPS_SQE:
	case IBV_QPS_ERR:
		break;
	default:
		SPDK_ERRLOG("QP#%d: bad state requested: %u\n",
			    rqpair->qpair.qid, new_state);
		return -1;
	}
	rqpair->ibv_attr.cur_qp_state = rqpair->ibv_attr.qp_state;
	rqpair->ibv_attr.qp_state = new_state;
	rqpair->ibv_attr.ah_attr.port_num = rqpair->ibv_attr.port_num;

	rc = ibv_modify_qp(rqpair->cm_id->qp, &rqpair->ibv_attr,
			   attr_mask_rc[new_state]);

	if (rc) {
		SPDK_ERRLOG("QP#%d: failed to set state to: %s, %d (%s)\n",
			    rqpair->qpair.qid, str_ibv_qp_state[new_state], errno, strerror(errno));
		return rc;
	}

	state = spdk_nvmf_rdma_update_ibv_state(rqpair);

	if (state != new_state) {
		SPDK_ERRLOG("QP#%d: expected state: %s, actual state: %s\n",
			    rqpair->qpair.qid, str_ibv_qp_state[new_state],
			    str_ibv_qp_state[state]);
		return -1;
	}
	SPDK_NOTICELOG("IBV QP#%u changed to: %s\n", rqpair->qpair.qid,
		       str_ibv_qp_state[state]);
	return 0;
}

static void
spdk_nvmf_rdma_request_set_state(struct spdk_nvmf_rdma_request *rdma_req,
				 enum spdk_nvmf_rdma_request_state state)
{
	struct spdk_nvmf_qpair *qpair;
	struct spdk_nvmf_rdma_qpair *rqpair;

	qpair = rdma_req->req.qpair;
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	TAILQ_REMOVE(&rqpair->state_queue[rdma_req->state], rdma_req, state_link);
	rqpair->state_cntr[rdma_req->state]--;

	rdma_req->state = state;

	TAILQ_INSERT_TAIL(&rqpair->state_queue[rdma_req->state], rdma_req, state_link);
	rqpair->state_cntr[rdma_req->state]++;
}

static int
spdk_nvmf_rdma_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf;

	TAILQ_INIT(&ch->pending_data_buf_queue);
	return 0;
}

static void
spdk_nvmf_rdma_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf;

	if (!TAILQ_EMPTY(&ch->pending_data_buf_queue)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on channel destruction\n");
	}
}

static int
spdk_nvmf_rdma_cur_rw_depth(struct spdk_nvmf_rdma_qpair *rqpair)
{
	return rqpair->state_cntr[RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER] +
	       rqpair->state_cntr[RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST];
}

static int
spdk_nvmf_rdma_cur_queue_depth(struct spdk_nvmf_rdma_qpair *rqpair)
{
	return rqpair->max_queue_depth -
	       rqpair->state_cntr[RDMA_REQUEST_STATE_FREE];
}

static void
spdk_nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
{
	int qd;

	/* Defer destruction while other threads still hold references;
	 * the qpair cannot be destroyed until refcnt reaches 0.
	 */
	if (rqpair->refcnt != 0) {
		return;
	}

	spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0,
			  (uintptr_t)rqpair->cm_id, 0);

	qd = spdk_nvmf_rdma_cur_queue_depth(rqpair);
	if (qd != 0) {
		SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", qd);
	}

	if (rqpair->poller) {
		TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link);
	}

	if (rqpair->cmds_mr) {
		ibv_dereg_mr(rqpair->cmds_mr);
	}

	if (rqpair->cpls_mr) {
		ibv_dereg_mr(rqpair->cpls_mr);
	}

	if (rqpair->bufs_mr) {
		ibv_dereg_mr(rqpair->bufs_mr);
	}

	if (rqpair->cm_id) {
		rdma_destroy_qp(rqpair->cm_id);
		rdma_destroy_id(rqpair->cm_id);
	}

	if (rqpair->mgmt_channel) {
		spdk_put_io_channel(rqpair->mgmt_channel);
	}

	/* Free all memory */
	spdk_dma_free(rqpair->cmds);
	spdk_dma_free(rqpair->cpls);
	spdk_dma_free(rqpair->bufs);
	free(rqpair->reqs);
	free(rqpair->recvs);
	free(rqpair);
}

static int
spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_qpair *rqpair;
	int rc, i;
	struct spdk_nvmf_rdma_recv *rdma_recv;
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_transport *transport;

	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
	rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
	transport = &rtransport->transport;

	memset(&rqpair->ibv_init_attr, 0, sizeof(struct ibv_qp_init_attr));
	rqpair->ibv_init_attr.qp_context = rqpair;
	rqpair->ibv_init_attr.qp_type = IBV_QPT_RC;
	rqpair->ibv_init_attr.send_cq = rqpair->poller->cq;
	rqpair->ibv_init_attr.recv_cq = rqpair->poller->cq;
	rqpair->ibv_init_attr.cap.max_send_wr = rqpair->max_queue_depth *
						2 + 1; /* SEND, READ, and WRITE operations + dummy drain WR */
	rqpair->ibv_init_attr.cap.max_recv_wr = rqpair->max_queue_depth +
						1; /* RECV operations + dummy drain WR */
	rqpair->ibv_init_attr.cap.max_send_sge = rqpair->max_sge;
	rqpair->ibv_init_attr.cap.max_recv_sge = NVMF_DEFAULT_RX_SGE;

	rc = rdma_create_qp(rqpair->cm_id, rqpair->port->device->pd, &rqpair->ibv_init_attr);
	if (rc) {
		SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno));
		rdma_destroy_id(rqpair->cm_id);
		rqpair->cm_id = NULL;
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}

	spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair);

	rqpair->reqs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->reqs));
	rqpair->recvs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->recvs));
	rqpair->cmds = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cmds),
					0x1000, NULL);
	rqpair->cpls = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cpls),
					0x1000, NULL);


	if (transport->opts.in_capsule_data_size > 0) {
		rqpair->bufs = spdk_dma_zmalloc(rqpair->max_queue_depth *
						transport->opts.in_capsule_data_size,
						0x1000, NULL);
	}

	if (!rqpair->reqs || !rqpair->recvs || !rqpair->cmds ||
	    !rqpair->cpls || (transport->opts.in_capsule_data_size && !rqpair->bufs)) {
		SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n");
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}

	rqpair->cmds_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cmds,
				     rqpair->max_queue_depth * sizeof(*rqpair->cmds),
				     IBV_ACCESS_LOCAL_WRITE);
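
	/* The completion entries are only read by the local HCA when the
	 * response SENDs are posted, so this registration needs no access flags.
	 */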
	rqpair->cpls_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cpls,
				     rqpair->max_queue_depth * sizeof(*rqpair->cpls),
				     0);

	if (transport->opts.in_capsule_data_size) {
		rqpair->bufs_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->bufs,
					     rqpair->max_queue_depth *
					     transport->opts.in_capsule_data_size,
					     IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
	}

	if (!rqpair->cmds_mr || !rqpair->cpls_mr || (transport->opts.in_capsule_data_size &&
			!rqpair->bufs_mr)) {
		SPDK_ERRLOG("Unable to register required memory for RDMA queue.\n");
		spdk_nvmf_rdma_qpair_destroy(rqpair);
		return -1;
	}
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n",
		      rqpair->cmds, rqpair->max_queue_depth * sizeof(*rqpair->cmds), rqpair->cmds_mr->lkey);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n",
		      rqpair->cpls, rqpair->max_queue_depth * sizeof(*rqpair->cpls), rqpair->cpls_mr->lkey);
	if (rqpair->bufs && rqpair->bufs_mr) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n",
			      rqpair->bufs, rqpair->max_queue_depth *
			      transport->opts.in_capsule_data_size, rqpair->bufs_mr->lkey);
	}

	/* Initialise request state queues and counters of the queue pair */
	for (i = RDMA_REQUEST_STATE_FREE; i < RDMA_REQUEST_NUM_STATES; i++) {
		TAILQ_INIT(&rqpair->state_queue[i]);
		rqpair->state_cntr[i] = 0;
	}

	for (i = 0; i < rqpair->max_queue_depth; i++) {
		struct ibv_recv_wr *bad_wr = NULL;

		rdma_recv = &rqpair->recvs[i];
		rdma_recv->qpair = rqpair;

		/* Set up memory to receive commands */
		if (rqpair->bufs) {
			rdma_recv->buf = (void *)((uintptr_t)rqpair->bufs + (i *
						  transport->opts.in_capsule_data_size));
		}

		rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV;

		rdma_recv->sgl[0].addr = (uintptr_t)&rqpair->cmds[i];
		rdma_recv->sgl[0].length = sizeof(rqpair->cmds[i]);
		rdma_recv->sgl[0].lkey = rqpair->cmds_mr->lkey;
		rdma_recv->wr.num_sge = 1;

		if (rdma_recv->buf && rqpair->bufs_mr) {
			rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf;
			rdma_recv->sgl[1].length = transport->opts.in_capsule_data_size;
			rdma_recv->sgl[1].lkey = rqpair->bufs_mr->lkey;
			rdma_recv->wr.num_sge++;
		}

		rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr;
		rdma_recv->wr.sg_list = rdma_recv->sgl;

		rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_recv->wr, &bad_wr);
		if (rc) {
			SPDK_ERRLOG("Unable to post capsule for RDMA RECV\n");
			spdk_nvmf_rdma_qpair_destroy(rqpair);
			return -1;
		}
	}

	for (i = 0; i < rqpair->max_queue_depth; i++) {
		rdma_req = &rqpair->reqs[i];

		rdma_req->req.qpair = &rqpair->qpair;
		rdma_req->req.cmd = NULL;

		/* Set up memory to send responses */
		rdma_req->req.rsp = &rqpair->cpls[i];

		rdma_req->rsp.sgl[0].addr = (uintptr_t)&rqpair->cpls[i];
		rdma_req->rsp.sgl[0].length = sizeof(rqpair->cpls[i]);
		rdma_req->rsp.sgl[0].lkey = rqpair->cpls_mr->lkey;

		rdma_req->rsp.rdma_wr.type = RDMA_WR_TYPE_SEND;
		rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp.rdma_wr;
		rdma_req->rsp.wr.next = NULL;
		rdma_req->rsp.wr.opcode = IBV_WR_SEND;
		rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl;
		rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl);

		/* Set up memory for data buffers */
		rdma_req->data.rdma_wr.type = RDMA_WR_TYPE_DATA;
		rdma_req->data.wr.wr_id = (uintptr_t)&rdma_req->data.rdma_wr;
		rdma_req->data.wr.next = NULL;
		rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->data.wr.sg_list = rdma_req->data.sgl;
		rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl);

		/* Initialize request state to FREE */
		rdma_req->state = RDMA_REQUEST_STATE_FREE;
		TAILQ_INSERT_TAIL(&rqpair->state_queue[rdma_req->state], rdma_req, state_link);
		rqpair->state_cntr[rdma_req->state]++;
	}

	return 0;
}

static int
request_transfer_in(struct spdk_nvmf_request *req)
{
	int rc;
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_qpair *qpair;
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct ibv_send_wr *bad_wr = NULL;

	qpair = req->qpair;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair);

	rdma_req->data.wr.opcode = IBV_WR_RDMA_READ;
	rdma_req->data.wr.next = NULL;
	rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to transfer data from host to target\n");
		return -1;
	}
	return 0;
}

static int
request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
{
	int rc;
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_qpair *qpair;
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct spdk_nvme_cpl *rsp;
	struct ibv_recv_wr *bad_recv_wr = NULL;
	struct ibv_send_wr *send_wr, *bad_send_wr = NULL;

	*data_posted = 0;
	qpair = req->qpair;
	rsp = &req->rsp->nvme_cpl;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);

	/* Advance our sq_head pointer */
	if (qpair->sq_head == qpair->sq_head_max) {
		qpair->sq_head = 0;
	} else {
		qpair->sq_head++;
	}
	rsp->sqhd = qpair->sq_head;

	/* Post the capsule to the recv buffer */
	assert(rdma_req->recv != NULL);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA RECV POSTED. Recv: %p Connection: %p\n", rdma_req->recv,
		      rqpair);
	rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_req->recv->wr, &bad_recv_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to re-post rx descriptor\n");
		return rc;
	}
	rdma_req->recv = NULL;

	/* Build the response which consists of an optional
	 * RDMA WRITE to transfer data, plus an RDMA SEND
	 * containing the response.
	 */
	send_wr = &rdma_req->rsp.wr;

	if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
	    req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair);

		rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE;

		rdma_req->data.wr.next = send_wr;
		*data_posted = 1;
		send_wr = &rdma_req->data.wr;
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. Request: %p Connection: %p\n", req, qpair);
Request: %p Connection: %p\n", req, qpair); 883 884 /* Send the completion */ 885 rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr); 886 if (rc) { 887 SPDK_ERRLOG("Unable to send response capsule\n"); 888 } 889 890 return rc; 891 } 892 893 static int 894 spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair) 895 { 896 struct spdk_nvmf_rdma_accept_private_data accept_data; 897 struct rdma_conn_param ctrlr_event_data = {}; 898 int rc; 899 900 accept_data.recfmt = 0; 901 accept_data.crqsize = rqpair->max_queue_depth; 902 903 ctrlr_event_data.private_data = &accept_data; 904 ctrlr_event_data.private_data_len = sizeof(accept_data); 905 if (id->ps == RDMA_PS_TCP) { 906 ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */ 907 ctrlr_event_data.initiator_depth = rqpair->max_rw_depth; 908 } 909 910 rc = rdma_accept(id, &ctrlr_event_data); 911 if (rc) { 912 SPDK_ERRLOG("Error %d on rdma_accept\n", errno); 913 } else { 914 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n"); 915 } 916 917 return rc; 918 } 919 920 static void 921 spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error) 922 { 923 struct spdk_nvmf_rdma_reject_private_data rej_data; 924 925 rej_data.recfmt = 0; 926 rej_data.sts = error; 927 928 rdma_reject(id, &rej_data, sizeof(rej_data)); 929 } 930 931 static int 932 nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event, 933 new_qpair_fn cb_fn) 934 { 935 struct spdk_nvmf_rdma_transport *rtransport; 936 struct spdk_nvmf_rdma_qpair *rqpair = NULL; 937 struct spdk_nvmf_rdma_port *port; 938 struct rdma_conn_param *rdma_param = NULL; 939 const struct spdk_nvmf_rdma_request_private_data *private_data = NULL; 940 uint16_t max_queue_depth; 941 uint16_t max_rw_depth; 942 943 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 944 945 assert(event->id != NULL); /* Impossible. Can't even reject the connection. */ 946 assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */ 947 948 rdma_param = &event->param.conn; 949 if (rdma_param->private_data == NULL || 950 rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) { 951 SPDK_ERRLOG("connect request: no private data provided\n"); 952 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH); 953 return -1; 954 } 955 956 private_data = rdma_param->private_data; 957 if (private_data->recfmt != 0) { 958 SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n"); 959 spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT); 960 return -1; 961 } 962 963 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n", 964 event->id->verbs->device->name, event->id->verbs->device->dev_name); 965 966 port = event->listen_id->context; 967 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n", 968 event->listen_id, event->listen_id->verbs, port); 969 970 /* Figure out the supported queue depth. 

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n");

	/* Start with the maximum queue depth allowed by the target */
	max_queue_depth = rtransport->transport.opts.max_queue_depth;
	max_rw_depth = rtransport->transport.opts.max_queue_depth;
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n",
		      rtransport->transport.opts.max_queue_depth);

	/* Next check the local NIC's hardware limitations */
	SPDK_DEBUGLOG(SPDK_LOG_RDMA,
		      "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
		      port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom);
	max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr);
	max_rw_depth = spdk_min(max_rw_depth, port->device->attr.max_qp_rd_atom);

	/* Next check the remote NIC's hardware limitations */
	SPDK_DEBUGLOG(SPDK_LOG_RDMA,
		      "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
		      rdma_param->initiator_depth, rdma_param->responder_resources);
	if (rdma_param->initiator_depth > 0) {
		max_rw_depth = spdk_min(max_rw_depth, rdma_param->initiator_depth);
	}

	/* Finally check for the host software requested values, which are
	 * optional. */
	if (rdma_param->private_data != NULL &&
	    rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize);
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1);
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
		      max_queue_depth, max_rw_depth);

	rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair));
	if (rqpair == NULL) {
		SPDK_ERRLOG("Could not allocate new connection.\n");
		spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
		return -1;
	}

	rqpair->port = port;
	rqpair->max_queue_depth = max_queue_depth;
	rqpair->max_rw_depth = max_rw_depth;
	rqpair->cm_id = event->id;
	rqpair->listen_id = event->listen_id;
	rqpair->qpair.transport = transport;
	rqpair->max_sge = spdk_min(port->device->attr.max_sge, SPDK_NVMF_MAX_SGL_ENTRIES);
	TAILQ_INIT(&rqpair->incoming_queue);
	event->id->context = &rqpair->qpair;

	cb_fn(&rqpair->qpair);

	return 0;
}

static int
spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
			  enum spdk_mem_map_notify_action action,
			  void *vaddr, size_t size)
{
	struct spdk_nvmf_rdma_device *device = cb_ctx;
	struct ibv_pd *pd = device->pd;
	struct ibv_mr *mr;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		mr = ibv_reg_mr(pd, vaddr, size,
				IBV_ACCESS_LOCAL_WRITE |
				IBV_ACCESS_REMOTE_READ |
				IBV_ACCESS_REMOTE_WRITE);
		if (mr == NULL) {
			SPDK_ERRLOG("ibv_reg_mr() failed\n");
			return -1;
		} else {
			spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
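		/* The REGISTER case above stored the ibv_mr as this range's
		 * translation; clear the translation, then deregister that MR.
		 */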
		spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		if (mr) {
			ibv_dereg_mr(mr);
		}
		break;
	}

	return 0;
}

typedef enum spdk_nvme_data_transfer spdk_nvme_data_transfer_t;

static spdk_nvme_data_transfer_t
spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req)
{
	enum spdk_nvme_data_transfer xfer;
	struct spdk_nvme_cmd *cmd = &rdma_req->req.cmd->nvme_cmd;
	struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1;

#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
	rdma_req->rsp.wr.opcode = IBV_WR_SEND;
	rdma_req->rsp.wr.imm_data = 0;
#endif

	/* Figure out data transfer direction */
	if (cmd->opc == SPDK_NVME_OPC_FABRIC) {
		xfer = spdk_nvme_opc_get_data_transfer(rdma_req->req.cmd->nvmf_cmd.fctype);
	} else {
		xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);

		/* Some admin commands are special cases */
		if ((rdma_req->req.qpair->qid == 0) &&
		    ((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) ||
		     (cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) {
			switch (cmd->cdw10 & 0xff) {
			case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
			case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
			case SPDK_NVME_FEAT_HOST_IDENTIFIER:
				break;
			default:
				xfer = SPDK_NVME_DATA_NONE;
			}
		}
	}

	if (xfer == SPDK_NVME_DATA_NONE) {
		return xfer;
	}

	/* Even for commands that may transfer data, they could have specified 0 length.
	 * We want those to show up with xfer SPDK_NVME_DATA_NONE.
	 */
	switch (sgl->generic.type) {
	case SPDK_NVME_SGL_TYPE_DATA_BLOCK:
	case SPDK_NVME_SGL_TYPE_BIT_BUCKET:
	case SPDK_NVME_SGL_TYPE_SEGMENT:
	case SPDK_NVME_SGL_TYPE_LAST_SEGMENT:
	case SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK:
		if (sgl->unkeyed.length == 0) {
			xfer = SPDK_NVME_DATA_NONE;
		}
		break;
	case SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK:
		if (sgl->keyed.length == 0) {
			xfer = SPDK_NVME_DATA_NONE;
		}
		break;
	}

	return xfer;
}

static int
spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
				 struct spdk_nvmf_rdma_device *device,
				 struct spdk_nvmf_rdma_request *rdma_req)
{
	void *buf = NULL;
	uint32_t length = rdma_req->req.length;
	uint32_t i = 0;

	rdma_req->req.iovcnt = 0;
	while (length) {
		buf = spdk_mempool_get(rtransport->data_buf_pool);
		if (!buf) {
			goto nomem;
		}

		rdma_req->req.iov[i].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) &
						~NVMF_DATA_BUFFER_MASK);
		rdma_req->req.iov[i].iov_len = spdk_min(length, rtransport->transport.opts.io_unit_size);
		rdma_req->req.iovcnt++;
		rdma_req->data.buffers[i] = buf;
		rdma_req->data.wr.sg_list[i].addr = (uintptr_t)(rdma_req->req.iov[i].iov_base);
		rdma_req->data.wr.sg_list[i].length = rdma_req->req.iov[i].iov_len;
		rdma_req->data.wr.sg_list[i].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
						     (uint64_t)buf, NULL))->lkey;

		length -= rdma_req->req.iov[i].iov_len;
		i++;
	}

	rdma_req->data_from_pool = true;

	return 0;

nomem:
	while (i) {
		i--;
		spdk_mempool_put(rtransport->data_buf_pool, rdma_req->req.iov[i].iov_base);
		rdma_req->req.iov[i].iov_base = NULL;
		rdma_req->req.iov[i].iov_len = 0;

		rdma_req->data.wr.sg_list[i].addr = 0;
		rdma_req->data.wr.sg_list[i].length = 0;
		rdma_req->data.wr.sg_list[i].lkey = 0;
	}
	rdma_req->req.iovcnt = 0;
	return -ENOMEM;
}

static int
spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
				 struct spdk_nvmf_rdma_device *device,
				 struct spdk_nvmf_rdma_request *rdma_req)
{
	struct spdk_nvme_cmd *cmd;
	struct spdk_nvme_cpl *rsp;
	struct spdk_nvme_sgl_descriptor *sgl;

	cmd = &rdma_req->req.cmd->nvme_cmd;
	rsp = &rdma_req->req.rsp->nvme_cpl;
	sgl = &cmd->dptr.sgl1;

	if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
	    (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
	     sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
		if (sgl->keyed.length > rtransport->transport.opts.max_io_size) {
			SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
				    sgl->keyed.length, rtransport->transport.opts.max_io_size);
			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
			return -1;
		}
#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
		if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) {
			if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) {
				rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV;
				rdma_req->rsp.wr.imm_data = sgl->keyed.key;
			}
		}
#endif

		/* fill request length and populate iovs */
		rdma_req->req.length = sgl->keyed.length;

		if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) {
			/* No available buffers. Queue this request up. */
			SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
			return 0;
		}

		/* backward compatible */
		rdma_req->req.data = rdma_req->req.iov[0].iov_base;

		/* rdma wr specifics */
		rdma_req->data.wr.num_sge = rdma_req->req.iovcnt;
		rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key;
		rdma_req->data.wr.wr.rdma.remote_addr = sgl->address;

		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req,
			      rdma_req->req.iovcnt);

		return 0;
	} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
		   sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
		uint64_t offset = sgl->address;
		uint32_t max_len = rtransport->transport.opts.in_capsule_data_size;

		SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
			      offset, sgl->unkeyed.length);

		if (offset > max_len) {
			SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n",
				    offset, max_len);
			rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET;
			return -1;
		}
		max_len -= (uint32_t)offset;

		if (sgl->unkeyed.length > max_len) {
			SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
				    sgl->unkeyed.length, max_len);
			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
			return -1;
		}

		rdma_req->req.data = rdma_req->recv->buf + offset;
		rdma_req->data_from_pool = false;
		rdma_req->req.length = sgl->unkeyed.length;

		rdma_req->req.iov[0].iov_base = rdma_req->req.data;
		rdma_req->req.iov[0].iov_len = rdma_req->req.length;
		rdma_req->req.iovcnt = 1;

		return 0;
	}

	SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n",
		    sgl->generic.type, sgl->generic.subtype);
	rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
	return -1;
}
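
/*
 * Advance rdma_req through the request state machine. The loop below allows
 * several back-to-back state transitions to be handled in a single call.
 * Returns true if the request changed state (made forward progress).
 */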
static bool
spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
			       struct spdk_nvmf_rdma_request *rdma_req)
{
	struct spdk_nvmf_rdma_qpair *rqpair;
	struct spdk_nvmf_rdma_device *device;
	struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl;
	int rc;
	struct spdk_nvmf_rdma_recv *rdma_recv;
	enum spdk_nvmf_rdma_request_state prev_state;
	bool progress = false;
	int data_posted;
	int cur_rdma_rw_depth;

	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
	device = rqpair->port->device;

	assert(rdma_req->state != RDMA_REQUEST_STATE_FREE);

	/* If the queue pair is in an error state, force the request to the completed state
	 * to release resources. */
	if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) {
		if (rdma_req->state == RDMA_REQUEST_STATE_NEED_BUFFER) {
			TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link);
		}
		spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED);
	}

	/* The loop here is to allow for several back-to-back state changes. */
	do {
		prev_state = rdma_req->state;

		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state);

		switch (rdma_req->state) {
		case RDMA_REQUEST_STATE_FREE:
			/* Some external code must kick a request into RDMA_REQUEST_STATE_NEW
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_NEW:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			rdma_recv = rdma_req->recv;

			/* The first element of the SGL is the NVMe command */
			rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr;
			memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp));

			TAILQ_REMOVE(&rqpair->incoming_queue, rdma_recv, link);

			if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR) {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED);
				break;
			}

			/* The next state transition depends on the data transfer needs of this request. */
			rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req);

			/* If no data to transfer, ready to execute. */
			if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE);
				break;
			}

			spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_NEED_BUFFER);
			TAILQ_INSERT_TAIL(&rqpair->ch->pending_data_buf_queue, rdma_req, link);
			break;
		case RDMA_REQUEST_STATE_NEED_BUFFER:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);

			assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE);

			if (rdma_req != TAILQ_FIRST(&rqpair->ch->pending_data_buf_queue)) {
				/* This request needs to wait in line to obtain a buffer */
				break;
			}

			/* Try to get a data buffer */
			rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req);
			if (rc < 0) {
				TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link);
				rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_COMPLETE);
				break;
			}

			if (!rdma_req->req.data) {
				/* No buffers available. */
				break;
			}

			TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link);

			/* If data is transferring from host to controller and the data didn't
			 * arrive using in capsule data, we need to do a transfer from the host.
			 */
			if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool) {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING);
				break;
			}

			spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE);
			break;
		case RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);

			if (rdma_req != TAILQ_FIRST(&rqpair->state_queue[RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING])) {
				/* This request needs to wait in line to perform RDMA */
				break;
			}
			cur_rdma_rw_depth = spdk_nvmf_rdma_cur_rw_depth(rqpair);

			if (cur_rdma_rw_depth >= rqpair->max_rw_depth) {
				/* R/W queue is full, need to wait */
				break;
			}

			if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
				rc = request_transfer_in(&rdma_req->req);
				if (!rc) {
					spdk_nvmf_rdma_request_set_state(rdma_req,
									 RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER);
				} else {
					rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
					spdk_nvmf_rdma_request_set_state(rdma_req,
									 RDMA_REQUEST_STATE_READY_TO_COMPLETE);
				}
			} else if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
				/* The data transfer will be kicked off from
				 * RDMA_REQUEST_STATE_READY_TO_COMPLETE state.
				 */
				spdk_nvmf_rdma_request_set_state(rdma_req,
								 RDMA_REQUEST_STATE_READY_TO_COMPLETE);
			} else {
				SPDK_ERRLOG("Cannot perform data transfer, unknown state: %u\n",
					    rdma_req->req.xfer);
				assert(0);
			}
			break;
		case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			/* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_READY_TO_EXECUTE:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_EXECUTING);
			spdk_nvmf_request_exec(&rdma_req->req);
			break;
		case RDMA_REQUEST_STATE_EXECUTING:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			/* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_EXECUTED:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING);
			} else {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_COMPLETE);
			}
			break;
		case RDMA_REQUEST_STATE_READY_TO_COMPLETE:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			rc = request_transfer_out(&rdma_req->req, &data_posted);
			assert(rc == 0); /* No good way to handle this currently */
			if (rc) {
				spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED);
			} else {
				spdk_nvmf_rdma_request_set_state(rdma_req,
								 data_posted ?
								 RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST :
								 RDMA_REQUEST_STATE_COMPLETING);
			}
			break;
		case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			/* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_COMPLETING:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
			/* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED
			 * to escape this state. */
			break;
		case RDMA_REQUEST_STATE_COMPLETED:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0,
					  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);

			if (rdma_req->data_from_pool) {
				/* Put the buffer/s back in the pool */
				for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) {
					spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]);
					rdma_req->req.iov[i].iov_base = NULL;
					rdma_req->data.buffers[i] = NULL;
				}
				rdma_req->data_from_pool = false;
			}
			rdma_req->req.length = 0;
			rdma_req->req.iovcnt = 0;
			rdma_req->req.data = NULL;
			spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_FREE);
			break;
		case RDMA_REQUEST_NUM_STATES:
		default:
			assert(0);
			break;
		}

		if (rdma_req->state != prev_state) {
			progress = true;
		}
	} while (rdma_req->state != prev_state);

	return progress;
}

/* Public API callbacks begin here */

#define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128
#define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128
#define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 64
#define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096
#define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072
#define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE 4096
#define SPDK_NVMF_RDMA_DEFAULT_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES)

static void
spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
{
	opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH;
	opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR;
	opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE;
	opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE;
	opts->io_unit_size = spdk_max(SPDK_NVMF_RDMA_DEFAULT_IO_BUFFER_SIZE,
				      SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE);
	opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH;
}

static int spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport);

static struct spdk_nvmf_transport *
spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
{
	int rc;
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_device *device, *tmp;
	struct ibv_context **contexts;
	uint32_t i;
	int flag;
	uint32_t sge_count;

	const struct spdk_mem_map_ops nvmf_rdma_map_ops = {
		.notify_cb = spdk_nvmf_rdma_mem_notify,
		.are_contiguous = NULL
	};

	rtransport = calloc(1, sizeof(*rtransport));
	if (!rtransport) {
		return NULL;
	}

	if (pthread_mutex_init(&rtransport->lock, NULL)) {
		SPDK_ERRLOG("pthread_mutex_init() failed\n");
		free(rtransport);
		return NULL;
	}

	spdk_io_device_register(rtransport, spdk_nvmf_rdma_mgmt_channel_create,
				spdk_nvmf_rdma_mgmt_channel_destroy,
				sizeof(struct spdk_nvmf_rdma_mgmt_channel),
				"rdma_transport");

	TAILQ_INIT(&rtransport->devices);
	TAILQ_INIT(&rtransport->ports);

	rtransport->transport.ops = &spdk_nvmf_transport_rdma;

	SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n"
		     " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n"
		     " max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
		     " in_capsule_data_size=%d, max_aq_depth=%d\n",
		     opts->max_queue_depth,
		     opts->max_io_size,
		     opts->max_qpairs_per_ctrlr,
		     opts->io_unit_size,
		     opts->in_capsule_data_size,
		     opts->max_aq_depth);

	/* I/O unit size cannot be larger than max I/O size */
	if (opts->io_unit_size > opts->max_io_size) {
		opts->io_unit_size = opts->max_io_size;
	}

	sge_count = opts->max_io_size / opts->io_unit_size;
	if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) {
		SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size);
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	rtransport->event_channel = rdma_create_event_channel();
	if (rtransport->event_channel == NULL) {
		SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno));
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	flag = fcntl(rtransport->event_channel->fd, F_GETFL);
	if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
		SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n",
			    rtransport->event_channel->fd, spdk_strerror(errno));
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	/* The maximum number of buffers we will need for a given request is equal to just less than double the number of SGL elements */
	rtransport->data_buf_pool = spdk_mempool_create("spdk_nvmf_rdma",
				    opts->max_queue_depth * (SPDK_NVMF_MAX_SGL_ENTRIES * 2) * 4,
				    opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT,
				    SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!rtransport->data_buf_pool) {
		SPDK_ERRLOG("Unable to allocate buffer pool for poll group\n");
		spdk_nvmf_rdma_destroy(&rtransport->transport);
		return NULL;
	}

	contexts = rdma_get_devices(NULL);
	if (contexts == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		spdk_nvmf_rdma_destroy(&rtransport->transport);
return NULL; 1611 } 1612 1613 i = 0; 1614 rc = 0; 1615 while (contexts[i] != NULL) { 1616 device = calloc(1, sizeof(*device)); 1617 if (!device) { 1618 SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); 1619 rc = -ENOMEM; 1620 break; 1621 } 1622 device->context = contexts[i]; 1623 rc = ibv_query_device(device->context, &device->attr); 1624 if (rc < 0) { 1625 SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); 1626 free(device); 1627 break; 1628 1629 } 1630 1631 #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 1632 if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) { 1633 SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,"); 1634 SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id); 1635 } 1636 1637 /** 1638 * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE. 1639 * The Soft-RoCE RXE driver does not currently support send with invalidate, 1640 * but incorrectly reports that it does. There are changes making their way 1641 * through the kernel now that will enable this feature. When they are merged, 1642 * we can conditionally enable this feature. 1643 * 1644 * TODO: enable this for versions of the kernel rxe driver that support it. 1645 */ 1646 if (device->attr.vendor_id == 0) { 1647 device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS); 1648 } 1649 #endif 1650 1651 /* set up device context async ev fd as NON_BLOCKING */ 1652 flag = fcntl(device->context->async_fd, F_GETFL); 1653 rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); 1654 if (rc < 0) { 1655 SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); 1656 free(device); 1657 break; 1658 } 1659 1660 device->pd = ibv_alloc_pd(device->context); 1661 if (!device->pd) { 1662 SPDK_ERRLOG("Unable to allocate protection domain.\n"); 1663 free(device); 1664 rc = -1; 1665 break; 1666 } 1667 1668 device->map = spdk_mem_map_alloc(0, &nvmf_rdma_map_ops, device); 1669 if (!device->map) { 1670 SPDK_ERRLOG("Unable to allocate memory map for new poll group\n"); 1671 ibv_dealloc_pd(device->pd); 1672 free(device); 1673 rc = -1; 1674 break; 1675 } 1676 1677 TAILQ_INSERT_TAIL(&rtransport->devices, device, link); 1678 i++; 1679 } 1680 rdma_free_devices(contexts); 1681 1682 if (rc < 0) { 1683 spdk_nvmf_rdma_destroy(&rtransport->transport); 1684 return NULL; 1685 } 1686 1687 /* Set up poll descriptor array to monitor events from RDMA and IB 1688 * in a single poll syscall 1689 */ 1690 rtransport->npoll_fds = i + 1; 1691 i = 0; 1692 rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); 1693 if (rtransport->poll_fds == NULL) { 1694 SPDK_ERRLOG("poll_fds allocation failed\n"); 1695 spdk_nvmf_rdma_destroy(&rtransport->transport); 1696 return NULL; 1697 } 1698 1699 rtransport->poll_fds[i].fd = rtransport->event_channel->fd; 1700 rtransport->poll_fds[i++].events = POLLIN; 1701 1702 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 1703 rtransport->poll_fds[i].fd = device->context->async_fd; 1704 rtransport->poll_fds[i++].events = POLLIN; 1705 } 1706 1707 return &rtransport->transport; 1708 } 1709 1710 static int 1711 spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) 1712 { 1713 struct spdk_nvmf_rdma_transport *rtransport; 1714 struct spdk_nvmf_rdma_port *port, *port_tmp; 1715 struct spdk_nvmf_rdma_device *device, *device_tmp; 1716 1717 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1718 1719 TAILQ_FOREACH_SAFE(port, &rtransport->ports, 
link, port_tmp) { 1720 TAILQ_REMOVE(&rtransport->ports, port, link); 1721 rdma_destroy_id(port->id); 1722 free(port); 1723 } 1724 1725 if (rtransport->poll_fds != NULL) { 1726 free(rtransport->poll_fds); 1727 } 1728 1729 if (rtransport->event_channel != NULL) { 1730 rdma_destroy_event_channel(rtransport->event_channel); 1731 } 1732 1733 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { 1734 TAILQ_REMOVE(&rtransport->devices, device, link); 1735 if (device->map) { 1736 spdk_mem_map_free(&device->map); 1737 } 1738 if (device->pd) { 1739 ibv_dealloc_pd(device->pd); 1740 } 1741 free(device); 1742 } 1743 1744 if (rtransport->data_buf_pool != NULL) { 1745 if (spdk_mempool_count(rtransport->data_buf_pool) != 1746 (transport->opts.max_queue_depth * (SPDK_NVMF_MAX_SGL_ENTRIES * 2) * 4)) { 1747 SPDK_ERRLOG("transport buffer pool count is %zu but should be %u\n", 1748 spdk_mempool_count(rtransport->data_buf_pool), 1749 transport->opts.max_queue_depth * (SPDK_NVMF_MAX_SGL_ENTRIES * 2) * 4); 1750 } 1751 } 1752 1753 spdk_mempool_free(rtransport->data_buf_pool); 1754 spdk_io_device_unregister(rtransport, NULL); 1755 pthread_mutex_destroy(&rtransport->lock); 1756 free(rtransport); 1757 1758 return 0; 1759 } 1760 1761 static int 1762 spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport, 1763 const struct spdk_nvme_transport_id *trid) 1764 { 1765 struct spdk_nvmf_rdma_transport *rtransport; 1766 struct spdk_nvmf_rdma_device *device; 1767 struct spdk_nvmf_rdma_port *port_tmp, *port; 1768 struct addrinfo *res; 1769 struct addrinfo hints; 1770 int family; 1771 int rc; 1772 1773 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1774 1775 port = calloc(1, sizeof(*port)); 1776 if (!port) { 1777 return -ENOMEM; 1778 } 1779 1780 /* Selectively copy the trid. Things like NQN don't matter here - that 1781 * mapping is enforced elsewhere. 
1782 */ 1783 port->trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 1784 port->trid.adrfam = trid->adrfam; 1785 snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr); 1786 snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid); 1787 1788 pthread_mutex_lock(&rtransport->lock); 1789 assert(rtransport->event_channel != NULL); 1790 TAILQ_FOREACH(port_tmp, &rtransport->ports, link) { 1791 if (spdk_nvme_transport_id_compare(&port_tmp->trid, &port->trid) == 0) { 1792 port_tmp->ref++; 1793 free(port); 1794 /* Already listening at this address */ 1795 pthread_mutex_unlock(&rtransport->lock); 1796 return 0; 1797 } 1798 } 1799 1800 rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); 1801 if (rc < 0) { 1802 SPDK_ERRLOG("rdma_create_id() failed\n"); 1803 free(port); 1804 pthread_mutex_unlock(&rtransport->lock); 1805 return rc; 1806 } 1807 1808 switch (port->trid.adrfam) { 1809 case SPDK_NVMF_ADRFAM_IPV4: 1810 family = AF_INET; 1811 break; 1812 case SPDK_NVMF_ADRFAM_IPV6: 1813 family = AF_INET6; 1814 break; 1815 default: 1816 SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam); 1817 free(port); 1818 pthread_mutex_unlock(&rtransport->lock); 1819 return -EINVAL; 1820 } 1821 1822 memset(&hints, 0, sizeof(hints)); 1823 hints.ai_family = family; 1824 hints.ai_flags = AI_NUMERICSERV; 1825 hints.ai_socktype = SOCK_STREAM; 1826 hints.ai_protocol = 0; 1827 1828 rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res); 1829 if (rc) { 1830 SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); 1831 free(port); 1832 pthread_mutex_unlock(&rtransport->lock); 1833 return -EINVAL; 1834 } 1835 1836 rc = rdma_bind_addr(port->id, res->ai_addr); 1837 freeaddrinfo(res); 1838 1839 if (rc < 0) { 1840 SPDK_ERRLOG("rdma_bind_addr() failed\n"); 1841 rdma_destroy_id(port->id); 1842 free(port); 1843 pthread_mutex_unlock(&rtransport->lock); 1844 return rc; 1845 } 1846 1847 if (!port->id->verbs) { 1848 SPDK_ERRLOG("ibv_context is null\n"); 1849 rdma_destroy_id(port->id); 1850 free(port); 1851 pthread_mutex_unlock(&rtransport->lock); 1852 return -1; 1853 } 1854 1855 rc = rdma_listen(port->id, 10); /* 10 = backlog */ 1856 if (rc < 0) { 1857 SPDK_ERRLOG("rdma_listen() failed\n"); 1858 rdma_destroy_id(port->id); 1859 free(port); 1860 pthread_mutex_unlock(&rtransport->lock); 1861 return rc; 1862 } 1863 1864 TAILQ_FOREACH(device, &rtransport->devices, link) { 1865 if (device->context == port->id->verbs) { 1866 port->device = device; 1867 break; 1868 } 1869 } 1870 if (!port->device) { 1871 SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", 1872 port->id->verbs); 1873 rdma_destroy_id(port->id); 1874 free(port); 1875 pthread_mutex_unlock(&rtransport->lock); 1876 return -EINVAL; 1877 } 1878 1879 SPDK_INFOLOG(SPDK_LOG_RDMA, "*** NVMf Target Listening on %s port %d ***\n", 1880 port->trid.traddr, ntohs(rdma_get_src_port(port->id))); 1881 1882 port->ref = 1; 1883 1884 TAILQ_INSERT_TAIL(&rtransport->ports, port, link); 1885 pthread_mutex_unlock(&rtransport->lock); 1886 1887 return 0; 1888 } 1889 1890 static int 1891 spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, 1892 const struct spdk_nvme_transport_id *_trid) 1893 { 1894 struct spdk_nvmf_rdma_transport *rtransport; 1895 struct spdk_nvmf_rdma_port *port, *tmp; 1896 struct spdk_nvme_transport_id trid = {}; 1897 1898 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1899 1900 /* Selectively copy the trid. 
Things like NQN don't matter here - that 1901 * mapping is enforced elsewhere. 1902 */ 1903 trid.trtype = SPDK_NVME_TRANSPORT_RDMA; 1904 trid.adrfam = _trid->adrfam;
1905 snprintf(trid.traddr, sizeof(trid.traddr), "%s", _trid->traddr); 1906 snprintf(trid.trsvcid, sizeof(trid.trsvcid), "%s", _trid->trsvcid); 1907
1908 pthread_mutex_lock(&rtransport->lock); 1909 TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { 1910 if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) { 1911 assert(port->ref > 0); 1912 port->ref--; 1913 if (port->ref == 0) { 1914 TAILQ_REMOVE(&rtransport->ports, port, link); 1915 rdma_destroy_id(port->id); 1916 free(port); 1917 } 1918 break; 1919 } 1920 } 1921
1922 pthread_mutex_unlock(&rtransport->lock); 1923 return 0; 1924 } 1925
1926 static bool 1927 spdk_nvmf_rdma_qpair_is_idle(struct spdk_nvmf_qpair *qpair) 1928 { 1929 int cur_queue_depth, cur_rdma_rw_depth; 1930 struct spdk_nvmf_rdma_qpair *rqpair; 1931
1932 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 1933 cur_queue_depth = spdk_nvmf_rdma_cur_queue_depth(rqpair); 1934 cur_rdma_rw_depth = spdk_nvmf_rdma_cur_rw_depth(rqpair); 1935
1936 if (cur_queue_depth == 0 && cur_rdma_rw_depth == 0) { 1937 return true; 1938 } 1939 return false; 1940 } 1941
1942 static void 1943 spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, 1944 struct spdk_nvmf_rdma_qpair *rqpair) 1945 { 1946 struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp; 1947 struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; 1948
1949 /* We process I/O in the data transfer pending queue at the highest priority. */ 1950 TAILQ_FOREACH_SAFE(rdma_req, &rqpair->state_queue[RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING], 1951 state_link, req_tmp) { 1952 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 1953 break; 1954 } 1955 } 1956
1957 /* The second highest priority is I/O waiting on memory buffers. */ 1958 TAILQ_FOREACH_SAFE(rdma_req, &rqpair->ch->pending_data_buf_queue, link, 1959 req_tmp) { 1960 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 1961 break; 1962 } 1963 } 1964
1965 /* The lowest priority is processing newly received commands */ 1966 TAILQ_FOREACH_SAFE(rdma_recv, &rqpair->incoming_queue, link, recv_tmp) { 1967 if (TAILQ_EMPTY(&rqpair->state_queue[RDMA_REQUEST_STATE_FREE])) { 1968 break; 1969 } 1970
1971 rdma_req = TAILQ_FIRST(&rqpair->state_queue[RDMA_REQUEST_STATE_FREE]); 1972 rdma_req->recv = rdma_recv; 1973 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_NEW); 1974 if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { 1975 break; 1976 } 1977 } 1978 } 1979
1980 static void 1981 _nvmf_rdma_qpair_disconnect(void *ctx) 1982 { 1983 struct spdk_nvmf_qpair *qpair = ctx; 1984 struct spdk_nvmf_rdma_qpair *rqpair; 1985
1986 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 1987
1988 spdk_nvmf_rdma_qpair_dec_refcnt(rqpair); 1989
1990 spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); 1991 } 1992
1993 static void 1994 _nvmf_rdma_disconnect_retry(void *ctx) 1995 { 1996 struct spdk_nvmf_qpair *qpair = ctx; 1997 struct spdk_nvmf_poll_group *group; 1998
1999 /* Read the group out of the qpair. This is normally set and accessed only from 2000 * the thread that created the group. Here, we're not on that thread necessarily. 2001 * The data member qpair->group begins its life as NULL and then is assigned to 2002 * a pointer and never changes.
So fortunately reading this and checking for 2003 * non-NULL is thread safe in the x86_64 memory model. */ 2004 group = qpair->group; 2005 2006 if (group == NULL) { 2007 /* The qpair hasn't been assigned to a group yet, so we can't 2008 * process a disconnect. Send a message to ourself and try again. */ 2009 spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_disconnect_retry, qpair); 2010 return; 2011 } 2012 2013 spdk_thread_send_msg(group->thread, _nvmf_rdma_qpair_disconnect, qpair); 2014 } 2015 2016 static int 2017 nvmf_rdma_disconnect(struct rdma_cm_event *evt) 2018 { 2019 struct spdk_nvmf_qpair *qpair; 2020 struct spdk_nvmf_rdma_qpair *rqpair; 2021 2022 if (evt->id == NULL) { 2023 SPDK_ERRLOG("disconnect request: missing cm_id\n"); 2024 return -1; 2025 } 2026 2027 qpair = evt->id->context; 2028 if (qpair == NULL) { 2029 SPDK_ERRLOG("disconnect request: no active connection\n"); 2030 return -1; 2031 } 2032 2033 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2034 2035 spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0); 2036 2037 spdk_nvmf_rdma_update_ibv_state(rqpair); 2038 spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); 2039 2040 _nvmf_rdma_disconnect_retry(qpair); 2041 2042 return 0; 2043 } 2044 2045 #ifdef DEBUG 2046 static const char *CM_EVENT_STR[] = { 2047 "RDMA_CM_EVENT_ADDR_RESOLVED", 2048 "RDMA_CM_EVENT_ADDR_ERROR", 2049 "RDMA_CM_EVENT_ROUTE_RESOLVED", 2050 "RDMA_CM_EVENT_ROUTE_ERROR", 2051 "RDMA_CM_EVENT_CONNECT_REQUEST", 2052 "RDMA_CM_EVENT_CONNECT_RESPONSE", 2053 "RDMA_CM_EVENT_CONNECT_ERROR", 2054 "RDMA_CM_EVENT_UNREACHABLE", 2055 "RDMA_CM_EVENT_REJECTED", 2056 "RDMA_CM_EVENT_ESTABLISHED", 2057 "RDMA_CM_EVENT_DISCONNECTED", 2058 "RDMA_CM_EVENT_DEVICE_REMOVAL", 2059 "RDMA_CM_EVENT_MULTICAST_JOIN", 2060 "RDMA_CM_EVENT_MULTICAST_ERROR", 2061 "RDMA_CM_EVENT_ADDR_CHANGE", 2062 "RDMA_CM_EVENT_TIMEWAIT_EXIT" 2063 }; 2064 #endif /* DEBUG */ 2065 2066 static void 2067 spdk_nvmf_process_cm_event(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 2068 { 2069 struct spdk_nvmf_rdma_transport *rtransport; 2070 struct rdma_cm_event *event; 2071 int rc; 2072 2073 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2074 2075 if (rtransport->event_channel == NULL) { 2076 return; 2077 } 2078 2079 while (1) { 2080 rc = rdma_get_cm_event(rtransport->event_channel, &event); 2081 if (rc == 0) { 2082 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); 2083 2084 spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event); 2085 2086 switch (event->event) { 2087 case RDMA_CM_EVENT_ADDR_RESOLVED: 2088 case RDMA_CM_EVENT_ADDR_ERROR: 2089 case RDMA_CM_EVENT_ROUTE_RESOLVED: 2090 case RDMA_CM_EVENT_ROUTE_ERROR: 2091 /* No action required. The target never attempts to resolve routes. */ 2092 break; 2093 case RDMA_CM_EVENT_CONNECT_REQUEST: 2094 rc = nvmf_rdma_connect(transport, event, cb_fn); 2095 if (rc < 0) { 2096 SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); 2097 break; 2098 } 2099 break; 2100 case RDMA_CM_EVENT_CONNECT_RESPONSE: 2101 /* The target never initiates a new connection. So this will not occur. */ 2102 break; 2103 case RDMA_CM_EVENT_CONNECT_ERROR: 2104 /* Can this happen? The docs say it can, but not sure what causes it. */ 2105 break; 2106 case RDMA_CM_EVENT_UNREACHABLE: 2107 case RDMA_CM_EVENT_REJECTED: 2108 /* These only occur on the client side. 
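 * (i.e. on the host/initiator side; this target never initiates a connection,
 * so there is nothing to do for these events here.)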
*/ 2109 break; 2110 case RDMA_CM_EVENT_ESTABLISHED: 2111 /* TODO: Should we be waiting for this event anywhere? */ 2112 break; 2113 case RDMA_CM_EVENT_DISCONNECTED: 2114 case RDMA_CM_EVENT_DEVICE_REMOVAL: 2115 rc = nvmf_rdma_disconnect(event); 2116 if (rc < 0) { 2117 SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); 2118 break; 2119 } 2120 break; 2121 case RDMA_CM_EVENT_MULTICAST_JOIN: 2122 case RDMA_CM_EVENT_MULTICAST_ERROR: 2123 /* Multicast is not used */ 2124 break; 2125 case RDMA_CM_EVENT_ADDR_CHANGE: 2126 /* Not utilizing this event */ 2127 break; 2128 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 2129 /* For now, do nothing. The target never re-uses queue pairs. */ 2130 break; 2131 default: 2132 SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); 2133 break; 2134 } 2135 2136 rdma_ack_cm_event(event); 2137 } else { 2138 if (errno != EAGAIN && errno != EWOULDBLOCK) { 2139 SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); 2140 } 2141 break; 2142 } 2143 } 2144 } 2145 2146 static void 2147 spdk_nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) 2148 { 2149 int rc; 2150 struct spdk_nvmf_rdma_qpair *rqpair; 2151 struct ibv_async_event event; 2152 enum ibv_qp_state state; 2153 2154 rc = ibv_get_async_event(device->context, &event); 2155 2156 if (rc) { 2157 SPDK_ERRLOG("Failed to get async_event (%d): %s\n", 2158 errno, spdk_strerror(errno)); 2159 return; 2160 } 2161 2162 SPDK_NOTICELOG("Async event: %s\n", 2163 ibv_event_type_str(event.event_type)); 2164 2165 switch (event.event_type) { 2166 case IBV_EVENT_QP_FATAL: 2167 rqpair = event.element.qp->qp_context; 2168 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 2169 (uintptr_t)rqpair->cm_id, event.event_type); 2170 spdk_nvmf_rdma_update_ibv_state(rqpair); 2171 spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); 2172 spdk_thread_send_msg(rqpair->qpair.group->thread, _nvmf_rdma_qpair_disconnect, &rqpair->qpair); 2173 break; 2174 case IBV_EVENT_QP_LAST_WQE_REACHED: 2175 /* This event only occurs for shared receive queues, which are not currently supported. */ 2176 break; 2177 case IBV_EVENT_SQ_DRAINED: 2178 /* This event occurs frequently in both error and non-error states. 2179 * Check if the qpair is in an error state before sending a message. 2180 * Note that we're not on the correct thread to access the qpair, but 2181 * the operations that the below calls make all happen to be thread 2182 * safe. 
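 * (Specifically, spdk_nvmf_rdma_update_ibv_state() is assumed here to only
 * query and cache the verbs QP state, and spdk_thread_send_msg() is safe to
 * call from any thread.)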
*/ 2183 rqpair = event.element.qp->qp_context; 2184 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 2185 (uintptr_t)rqpair->cm_id, event.event_type); 2186 state = spdk_nvmf_rdma_update_ibv_state(rqpair); 2187 if (state == IBV_QPS_ERR) { 2188 spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); 2189 spdk_thread_send_msg(rqpair->qpair.group->thread, _nvmf_rdma_qpair_disconnect, &rqpair->qpair); 2190 } 2191 break; 2192 case IBV_EVENT_QP_REQ_ERR: 2193 case IBV_EVENT_QP_ACCESS_ERR: 2194 case IBV_EVENT_COMM_EST: 2195 case IBV_EVENT_PATH_MIG: 2196 case IBV_EVENT_PATH_MIG_ERR: 2197 rqpair = event.element.qp->qp_context; 2198 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 2199 (uintptr_t)rqpair->cm_id, event.event_type); 2200 spdk_nvmf_rdma_update_ibv_state(rqpair); 2201 break; 2202 case IBV_EVENT_CQ_ERR: 2203 case IBV_EVENT_DEVICE_FATAL: 2204 case IBV_EVENT_PORT_ACTIVE: 2205 case IBV_EVENT_PORT_ERR: 2206 case IBV_EVENT_LID_CHANGE: 2207 case IBV_EVENT_PKEY_CHANGE: 2208 case IBV_EVENT_SM_CHANGE: 2209 case IBV_EVENT_SRQ_ERR: 2210 case IBV_EVENT_SRQ_LIMIT_REACHED: 2211 case IBV_EVENT_CLIENT_REREGISTER: 2212 case IBV_EVENT_GID_CHANGE: 2213 default: 2214 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type); 2215 break; 2216 } 2217 ibv_ack_async_event(&event); 2218 } 2219 2220 static void 2221 spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) 2222 { 2223 int nfds, i = 0; 2224 struct spdk_nvmf_rdma_transport *rtransport; 2225 struct spdk_nvmf_rdma_device *device, *tmp; 2226 2227 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2228 nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); 2229 2230 if (nfds <= 0) { 2231 return; 2232 } 2233 2234 /* The first poll descriptor is RDMA CM event */ 2235 if (rtransport->poll_fds[i++].revents & POLLIN) { 2236 spdk_nvmf_process_cm_event(transport, cb_fn); 2237 nfds--; 2238 } 2239 2240 if (nfds == 0) { 2241 return; 2242 } 2243 2244 /* Second and subsequent poll descriptors are IB async events */ 2245 TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 2246 if (rtransport->poll_fds[i++].revents & POLLIN) { 2247 spdk_nvmf_process_ib_event(device); 2248 nfds--; 2249 } 2250 } 2251 /* check all flagged fd's have been served */ 2252 assert(nfds == 0); 2253 } 2254 2255 static void 2256 spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport, 2257 struct spdk_nvme_transport_id *trid, 2258 struct spdk_nvmf_discovery_log_page_entry *entry) 2259 { 2260 entry->trtype = SPDK_NVMF_TRTYPE_RDMA; 2261 entry->adrfam = trid->adrfam; 2262 entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED; 2263 2264 spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); 2265 spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); 2266 2267 entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; 2268 entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; 2269 entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; 2270 } 2271 2272 static struct spdk_nvmf_transport_poll_group * 2273 spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) 2274 { 2275 struct spdk_nvmf_rdma_transport *rtransport; 2276 struct spdk_nvmf_rdma_poll_group *rgroup; 2277 struct spdk_nvmf_rdma_poller *poller; 2278 struct spdk_nvmf_rdma_device *device; 2279 2280 rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2281 2282 rgroup = calloc(1, sizeof(*rgroup)); 2283 if (!rgroup) { 2284 return NULL; 
2285 } 2286 2287 TAILQ_INIT(&rgroup->pollers); 2288 2289 pthread_mutex_lock(&rtransport->lock); 2290 TAILQ_FOREACH(device, &rtransport->devices, link) { 2291 poller = calloc(1, sizeof(*poller)); 2292 if (!poller) { 2293 SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); 2294 free(rgroup); 2295 pthread_mutex_unlock(&rtransport->lock); 2296 return NULL; 2297 } 2298 2299 poller->device = device; 2300 poller->group = rgroup; 2301 2302 TAILQ_INIT(&poller->qpairs); 2303 2304 poller->cq = ibv_create_cq(device->context, NVMF_RDMA_CQ_SIZE, poller, NULL, 0); 2305 if (!poller->cq) { 2306 SPDK_ERRLOG("Unable to create completion queue\n"); 2307 free(poller); 2308 free(rgroup); 2309 pthread_mutex_unlock(&rtransport->lock); 2310 return NULL; 2311 } 2312 2313 TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); 2314 } 2315 2316 pthread_mutex_unlock(&rtransport->lock); 2317 return &rgroup->group; 2318 } 2319 2320 static void 2321 spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 2322 { 2323 struct spdk_nvmf_rdma_poll_group *rgroup; 2324 struct spdk_nvmf_rdma_poller *poller, *tmp; 2325 struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; 2326 2327 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 2328 2329 if (!rgroup) { 2330 return; 2331 } 2332 2333 TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { 2334 TAILQ_REMOVE(&rgroup->pollers, poller, link); 2335 2336 if (poller->cq) { 2337 ibv_destroy_cq(poller->cq); 2338 } 2339 TAILQ_FOREACH_SAFE(qpair, &poller->qpairs, link, tmp_qpair) { 2340 spdk_nvmf_rdma_qpair_destroy(qpair); 2341 } 2342 2343 free(poller); 2344 } 2345 2346 free(rgroup); 2347 } 2348 2349 static int 2350 spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 2351 struct spdk_nvmf_qpair *qpair) 2352 { 2353 struct spdk_nvmf_rdma_transport *rtransport; 2354 struct spdk_nvmf_rdma_poll_group *rgroup; 2355 struct spdk_nvmf_rdma_qpair *rqpair; 2356 struct spdk_nvmf_rdma_device *device; 2357 struct spdk_nvmf_rdma_poller *poller; 2358 int rc; 2359 2360 rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 2361 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 2362 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2363 2364 device = rqpair->port->device; 2365 2366 TAILQ_FOREACH(poller, &rgroup->pollers, link) { 2367 if (poller->device == device) { 2368 break; 2369 } 2370 } 2371 2372 if (!poller) { 2373 SPDK_ERRLOG("No poller found for device.\n"); 2374 return -1; 2375 } 2376 2377 TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); 2378 rqpair->poller = poller; 2379 2380 rc = spdk_nvmf_rdma_qpair_initialize(qpair); 2381 if (rc < 0) { 2382 SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); 2383 return -1; 2384 } 2385 2386 rqpair->mgmt_channel = spdk_get_io_channel(rtransport); 2387 if (!rqpair->mgmt_channel) { 2388 spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 2389 spdk_nvmf_rdma_qpair_destroy(rqpair); 2390 return -1; 2391 } 2392 2393 rqpair->ch = spdk_io_channel_get_ctx(rqpair->mgmt_channel); 2394 assert(rqpair->ch != NULL); 2395 2396 rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair); 2397 if (rc) { 2398 /* Try to reject, but we probably can't */ 2399 spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 2400 spdk_nvmf_rdma_qpair_destroy(rqpair); 2401 return -1; 2402 } 2403 2404 spdk_nvmf_rdma_update_ibv_state(rqpair); 2405 2406 return 0; 2407 } 
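/* Each poll group owns one poller, and therefore one completion queue of
 * NVMF_RDMA_CQ_SIZE entries, per RDMA device. spdk_nvmf_rdma_poll_group_add()
 * above attaches a new qpair to the poller whose device matches the port the
 * connection arrived on, so all of that qpair's completions are reaped by
 * spdk_nvmf_rdma_poller_poll() on the poll group's thread.
 */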
2408 2409 static int 2410 spdk_nvmf_rdma_request_free(struct spdk_nvmf_request *req) 2411 { 2412 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 2413 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 2414 struct spdk_nvmf_rdma_transport, transport); 2415 2416 if (rdma_req->data_from_pool) { 2417 /* Put the buffer/s back in the pool */ 2418 for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) { 2419 spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]); 2420 rdma_req->req.iov[i].iov_base = NULL; 2421 rdma_req->data.buffers[i] = NULL; 2422 } 2423 rdma_req->data_from_pool = false; 2424 } 2425 rdma_req->req.length = 0; 2426 rdma_req->req.iovcnt = 0; 2427 rdma_req->req.data = NULL; 2428 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_FREE); 2429 return 0; 2430 } 2431 2432 static int 2433 spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req) 2434 { 2435 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 2436 struct spdk_nvmf_rdma_transport, transport); 2437 struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, 2438 struct spdk_nvmf_rdma_request, req); 2439 struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, 2440 struct spdk_nvmf_rdma_qpair, qpair); 2441 2442 if (rqpair->ibv_attr.qp_state != IBV_QPS_ERR) { 2443 /* The connection is alive, so process the request as normal */ 2444 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_EXECUTED); 2445 } else { 2446 /* The connection is dead. Move the request directly to the completed state. */ 2447 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); 2448 } 2449 2450 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2451 2452 return 0; 2453 } 2454 2455 static void 2456 spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) 2457 { 2458 struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2459 struct ibv_recv_wr recv_wr = {}; 2460 struct ibv_recv_wr *bad_recv_wr; 2461 struct ibv_send_wr send_wr = {}; 2462 struct ibv_send_wr *bad_send_wr; 2463 int rc; 2464 2465 if (rqpair->disconnect_flags & RDMA_QP_DISCONNECTING) { 2466 return; 2467 } 2468 2469 rqpair->disconnect_flags |= RDMA_QP_DISCONNECTING; 2470 2471 if (rqpair->ibv_attr.qp_state != IBV_QPS_ERR) { 2472 spdk_nvmf_rdma_set_ibv_state(rqpair, IBV_QPS_ERR); 2473 } 2474 2475 rqpair->drain_recv_wr.type = RDMA_WR_TYPE_DRAIN_RECV; 2476 recv_wr.wr_id = (uintptr_t)&rqpair->drain_recv_wr; 2477 rc = ibv_post_recv(rqpair->cm_id->qp, &recv_wr, &bad_recv_wr); 2478 if (rc) { 2479 SPDK_ERRLOG("Failed to post dummy receive WR, errno %d\n", errno); 2480 assert(false); 2481 return; 2482 } 2483 2484 rqpair->drain_send_wr.type = RDMA_WR_TYPE_DRAIN_SEND; 2485 send_wr.wr_id = (uintptr_t)&rqpair->drain_send_wr; 2486 send_wr.opcode = IBV_WR_SEND; 2487 rc = ibv_post_send(rqpair->cm_id->qp, &send_wr, &bad_send_wr); 2488 if (rc) { 2489 SPDK_ERRLOG("Failed to post dummy send WR, errno %d\n", errno); 2490 assert(false); 2491 return; 2492 } 2493 } 2494 2495 #ifdef DEBUG 2496 static int 2497 spdk_nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req) 2498 { 2499 return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST || 2500 rdma_req->state == RDMA_REQUEST_STATE_COMPLETING; 2501 } 2502 #endif 2503 2504 static int 2505 spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, 2506 struct spdk_nvmf_rdma_poller 
*rpoller) 2507 { 2508 struct ibv_wc wc[32]; 2509 struct spdk_nvmf_rdma_wr *rdma_wr; 2510 struct spdk_nvmf_rdma_request *rdma_req; 2511 struct spdk_nvmf_rdma_recv *rdma_recv; 2512 struct spdk_nvmf_rdma_qpair *rqpair; 2513 int reaped, i; 2514 int count = 0; 2515 bool error = false; 2516 2517 /* Poll for completing operations. */ 2518 reaped = ibv_poll_cq(rpoller->cq, 32, wc); 2519 if (reaped < 0) { 2520 SPDK_ERRLOG("Error polling CQ! (%d): %s\n", 2521 errno, spdk_strerror(errno)); 2522 return -1; 2523 } 2524 2525 for (i = 0; i < reaped; i++) { 2526 2527 rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id; 2528 2529 /* Handle error conditions */ 2530 if (wc[i].status) { 2531 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "CQ error on CQ %p, Request 0x%lu (%d): %s\n", 2532 rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); 2533 2534 error = true; 2535 2536 switch (rdma_wr->type) { 2537 case RDMA_WR_TYPE_SEND: 2538 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); 2539 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2540 2541 /* We're going to attempt an error recovery, so force the request into 2542 * the completed state. */ 2543 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); 2544 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2545 break; 2546 case RDMA_WR_TYPE_RECV: 2547 rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 2548 rqpair = rdma_recv->qpair; 2549 2550 /* Dump this into the incoming queue. This gets cleaned up when 2551 * the queue pair disconnects or recovers. */ 2552 TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link); 2553 break; 2554 case RDMA_WR_TYPE_DATA: 2555 /* If the data transfer fails still force the queue into the error state, 2556 * but the rdma_req objects should only be manipulated in response to 2557 * SEND and RECV operations. */ 2558 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 2559 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2560 break; 2561 case RDMA_WR_TYPE_DRAIN_RECV: 2562 rqpair = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_qpair, drain_recv_wr); 2563 assert(rqpair->disconnect_flags & RDMA_QP_DISCONNECTING); 2564 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Drained QP RECV %u (%p)\n", rqpair->qpair.qid, rqpair); 2565 rqpair->disconnect_flags |= RDMA_QP_RECV_DRAINED; 2566 if (rqpair->disconnect_flags & RDMA_QP_SEND_DRAINED) { 2567 spdk_nvmf_rdma_qpair_destroy(rqpair); 2568 } 2569 /* Continue so that this does not trigger the disconnect path below. */ 2570 continue; 2571 case RDMA_WR_TYPE_DRAIN_SEND: 2572 rqpair = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_qpair, drain_send_wr); 2573 assert(rqpair->disconnect_flags & RDMA_QP_DISCONNECTING); 2574 SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Drained QP SEND %u (%p)\n", rqpair->qpair.qid, rqpair); 2575 rqpair->disconnect_flags |= RDMA_QP_SEND_DRAINED; 2576 if (rqpair->disconnect_flags & RDMA_QP_RECV_DRAINED) { 2577 spdk_nvmf_rdma_qpair_destroy(rqpair); 2578 } 2579 /* Continue so that this does not trigger the disconnect path below. */ 2580 continue; 2581 default: 2582 SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); 2583 continue; 2584 } 2585 2586 if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { 2587 /* Disconnect the connection. 
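 * spdk_nvmf_qpair_disconnect() drives the generic teardown path, which is
 * expected to call back into this transport's qpair_fini callback
 * (spdk_nvmf_rdma_close_qpair) to drain and destroy the RDMA queue pair.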
*/ 2588 spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); 2589 } 2590 continue; 2591 } 2592 2593 switch (wc[i].opcode) { 2594 case IBV_WC_SEND: 2595 assert(rdma_wr->type == RDMA_WR_TYPE_SEND); 2596 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); 2597 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2598 2599 assert(spdk_nvmf_rdma_req_is_completing(rdma_req)); 2600 2601 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); 2602 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2603 2604 count++; 2605 2606 /* Try to process other queued requests */ 2607 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 2608 break; 2609 2610 case IBV_WC_RDMA_WRITE: 2611 assert(rdma_wr->type == RDMA_WR_TYPE_DATA); 2612 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 2613 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2614 2615 /* Try to process other queued requests */ 2616 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 2617 break; 2618 2619 case IBV_WC_RDMA_READ: 2620 assert(rdma_wr->type == RDMA_WR_TYPE_DATA); 2621 rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); 2622 rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 2623 2624 assert(rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); 2625 spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE); 2626 spdk_nvmf_rdma_request_process(rtransport, rdma_req); 2627 2628 /* Try to process other queued requests */ 2629 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 2630 break; 2631 2632 case IBV_WC_RECV: 2633 assert(rdma_wr->type == RDMA_WR_TYPE_RECV); 2634 rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 2635 rqpair = rdma_recv->qpair; 2636 2637 TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link); 2638 /* Try to process other queued requests */ 2639 spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); 2640 break; 2641 2642 default: 2643 SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); 2644 continue; 2645 } 2646 } 2647 2648 if (error == true) { 2649 return -1; 2650 } 2651 2652 return count; 2653 } 2654 2655 static int 2656 spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 2657 { 2658 struct spdk_nvmf_rdma_transport *rtransport; 2659 struct spdk_nvmf_rdma_poll_group *rgroup; 2660 struct spdk_nvmf_rdma_poller *rpoller; 2661 int count, rc; 2662 2663 rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); 2664 rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 2665 2666 count = 0; 2667 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 2668 rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller); 2669 if (rc < 0) { 2670 return rc; 2671 } 2672 count += rc; 2673 } 2674 2675 return count; 2676 } 2677 2678 static int 2679 spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 2680 struct spdk_nvme_transport_id *trid, 2681 bool peer) 2682 { 2683 struct sockaddr *saddr; 2684 uint16_t port; 2685 2686 trid->trtype = SPDK_NVME_TRANSPORT_RDMA; 2687 2688 if (peer) { 2689 saddr = rdma_get_peer_addr(id); 2690 } else { 2691 saddr = rdma_get_local_addr(id); 2692 } 2693 switch (saddr->sa_family) { 2694 case AF_INET: { 2695 struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr; 2696 2697 trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; 2698 
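/* Render the binary address and port into the printable traddr/trsvcid
 * strings used by transport IDs. */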
inet_ntop(AF_INET, &saddr_in->sin_addr, 2699 trid->traddr, sizeof(trid->traddr)); 2700 if (peer) { 2701 port = ntohs(rdma_get_dst_port(id)); 2702 } else { 2703 port = ntohs(rdma_get_src_port(id)); 2704 } 2705 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 2706 break; 2707 } 2708 case AF_INET6: { 2709 struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr; 2710 trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; 2711 inet_ntop(AF_INET6, &saddr_in->sin6_addr, 2712 trid->traddr, sizeof(trid->traddr)); 2713 if (peer) { 2714 port = ntohs(rdma_get_dst_port(id)); 2715 } else { 2716 port = ntohs(rdma_get_src_port(id)); 2717 } 2718 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 2719 break; 2720 } 2721 default: 2722 return -1; 2723 2724 } 2725 2726 return 0; 2727 } 2728 2729 static int 2730 spdk_nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 2731 struct spdk_nvme_transport_id *trid) 2732 { 2733 struct spdk_nvmf_rdma_qpair *rqpair; 2734 2735 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2736 2737 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true); 2738 } 2739 2740 static int 2741 spdk_nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 2742 struct spdk_nvme_transport_id *trid) 2743 { 2744 struct spdk_nvmf_rdma_qpair *rqpair; 2745 2746 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2747 2748 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false); 2749 } 2750 2751 static int 2752 spdk_nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 2753 struct spdk_nvme_transport_id *trid) 2754 { 2755 struct spdk_nvmf_rdma_qpair *rqpair; 2756 2757 rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 2758 2759 return spdk_nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false); 2760 } 2761 2762 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { 2763 .type = SPDK_NVME_TRANSPORT_RDMA, 2764 .opts_init = spdk_nvmf_rdma_opts_init, 2765 .create = spdk_nvmf_rdma_create, 2766 .destroy = spdk_nvmf_rdma_destroy, 2767 2768 .listen = spdk_nvmf_rdma_listen, 2769 .stop_listen = spdk_nvmf_rdma_stop_listen, 2770 .accept = spdk_nvmf_rdma_accept, 2771 2772 .listener_discover = spdk_nvmf_rdma_discover, 2773 2774 .poll_group_create = spdk_nvmf_rdma_poll_group_create, 2775 .poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy, 2776 .poll_group_add = spdk_nvmf_rdma_poll_group_add, 2777 .poll_group_poll = spdk_nvmf_rdma_poll_group_poll, 2778 2779 .req_free = spdk_nvmf_rdma_request_free, 2780 .req_complete = spdk_nvmf_rdma_request_complete, 2781 2782 .qpair_fini = spdk_nvmf_rdma_close_qpair, 2783 .qpair_is_idle = spdk_nvmf_rdma_qpair_is_idle, 2784 .qpair_get_peer_trid = spdk_nvmf_rdma_qpair_get_peer_trid, 2785 .qpair_get_local_trid = spdk_nvmf_rdma_qpair_get_local_trid, 2786 .qpair_get_listen_trid = spdk_nvmf_rdma_qpair_get_listen_trid, 2787 2788 }; 2789 2790 SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA) 2791
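/*
 * Illustrative sketch only (an assumption about how the generic target layer
 * consumes the ops table above; the call sites below are not defined in this
 * file). All interaction with this transport is expected to go through
 * spdk_nvmf_transport_rdma:
 *
 *   struct spdk_nvmf_transport_opts opts;
 *   struct spdk_nvmf_transport *transport;
 *
 *   spdk_nvmf_transport_rdma.opts_init(&opts);           // fill in default opts
 *   transport = spdk_nvmf_transport_rdma.create(&opts);  // spdk_nvmf_rdma_create()
 *   spdk_nvmf_transport_rdma.listen(transport, &trid);   // rdma_bind_addr() + rdma_listen()
 *
 *   // Per poller thread: poll_group_create(), then poll_group_poll() in a loop.
 *   // Acceptor thread: accept(transport, cb_fn) periodically to process CM events.
 */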