/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

#include "nvmf_internal.h"
#include "request.h"
#include "session.h"
#include "subsystem.h"
#include "transport.h"

#include "spdk/assert.h"
#include "spdk/nvmf.h"
#include "spdk/nvmf_spec.h"
#include "spdk/string.h"
#include "spdk/trace.h"
#include "spdk/util.h"
#include "spdk/likely.h"

#include "spdk_internal/log.h"

/*
 * RDMA Connection Resource Defaults
 */
#define NVMF_DEFAULT_TX_SGE	1
#define NVMF_DEFAULT_RX_SGE	2

struct spdk_nvmf_rdma_buf {
	SLIST_ENTRY(spdk_nvmf_rdma_buf) link;
};

/* This structure holds commands as they are received off the wire.
 * It must be dynamically paired with a full request object
 * (spdk_nvmf_rdma_request) to service a request. It is separate
 * from the request because RDMA does not appear to order
 * completions, so occasionally we'll get a new incoming
 * command when there aren't any free request objects.
 */
struct spdk_nvmf_rdma_recv {
	struct ibv_recv_wr wr;
	struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE];

	/* In-capsule data buffer */
	uint8_t *buf;

	TAILQ_ENTRY(spdk_nvmf_rdma_recv) link;

#ifdef DEBUG
	bool in_use;
#endif
};

struct spdk_nvmf_rdma_request {
	struct spdk_nvmf_request req;
	bool data_from_pool;

	struct spdk_nvmf_rdma_recv *recv;

	struct {
		struct ibv_send_wr wr;
		struct ibv_sge sgl[NVMF_DEFAULT_TX_SGE];
	} rsp;

	struct {
		struct ibv_send_wr wr;
		struct ibv_sge sgl[NVMF_DEFAULT_TX_SGE];
	} data;

	TAILQ_ENTRY(spdk_nvmf_rdma_request) link;
};

struct spdk_nvmf_rdma_conn {
	struct spdk_nvmf_conn conn;

	struct rdma_cm_id *cm_id;
	struct ibv_cq *cq;

	/* The maximum number of I/O outstanding on this connection at one time */
	uint16_t max_queue_depth;

	/* The maximum number of active RDMA READ and WRITE operations at one time */
	uint16_t max_rw_depth;

	/* The current number of I/O outstanding on this connection. This number
	 * includes all I/O from the time the capsule is first received until it is
	 * completed.
	 */
	uint16_t cur_queue_depth;

	/* The number of RDMA READ and WRITE requests that are outstanding */
	uint16_t cur_rdma_rw_depth;

	/* Receives that are waiting for a request object */
	TAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue;

	/* Requests that are not in use */
	TAILQ_HEAD(, spdk_nvmf_rdma_request) free_queue;

	/* Requests that are waiting to obtain a data buffer */
	TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_data_buf_queue;

	/* Requests that are waiting to perform an RDMA READ or WRITE */
	TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_rw_queue;

	/* Array of size "max_queue_depth" containing RDMA requests. */
	struct spdk_nvmf_rdma_request *reqs;

	/* Array of size "max_queue_depth" containing RDMA recvs. */
	struct spdk_nvmf_rdma_recv *recvs;

	/* Array of size "max_queue_depth" containing 64 byte capsules
	 * used for receive.
	 */
	union nvmf_h2c_msg *cmds;
	struct ibv_mr *cmds_mr;

	/* Array of size "max_queue_depth" containing 16 byte completions
	 * to be sent back to the user.
	 */
	union nvmf_c2h_msg *cpls;
	struct ibv_mr *cpls_mr;

	/* Array of size "max_queue_depth * InCapsuleDataSize" containing
	 * buffers to be used for in capsule data.
	 */
	void *bufs;
	struct ibv_mr *bufs_mr;

	TAILQ_ENTRY(spdk_nvmf_rdma_conn) link;
};
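/* Summary of the request lifecycle across the queues above (the exact flow
 * lives in process_incoming_queue() and spdk_nvmf_rdma_poll()), roughly:
 *
 *	IBV_WC_RECV -> incoming_queue -> paired with an entry from free_queue
 *	  -> (optionally) pending_data_buf_queue, waiting for a large buffer
 *	  -> (optionally) pending_rdma_rw_queue, waiting for R/W credits
 *	  -> RDMA READ/WRITE plus SEND of the completion -> back to free_queue
 */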
/* List of RDMA connections that have not yet received a CONNECT capsule */
static TAILQ_HEAD(, spdk_nvmf_rdma_conn) g_pending_conns = TAILQ_HEAD_INITIALIZER(g_pending_conns);

struct spdk_nvmf_rdma_session {
	struct spdk_nvmf_session session;

	SLIST_HEAD(, spdk_nvmf_rdma_buf) data_buf_pool;

	struct ibv_context *verbs;

	uint8_t *buf;
	struct ibv_mr *buf_mr;
};

struct spdk_nvmf_rdma_listen_addr {
	char *traddr;
	char *trsvcid;
	struct rdma_cm_id *id;
	struct ibv_device_attr attr;
	struct ibv_comp_channel *comp_channel;
	uint32_t ref;
	bool is_listened;
	TAILQ_ENTRY(spdk_nvmf_rdma_listen_addr) link;
};

struct spdk_nvmf_rdma {
	struct rdma_event_channel *event_channel;

	pthread_mutex_t lock;

	uint16_t max_queue_depth;
	uint32_t max_io_size;
	uint32_t in_capsule_data_size;

	TAILQ_HEAD(, spdk_nvmf_rdma_listen_addr) listen_addrs;
};

static struct spdk_nvmf_rdma g_rdma = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.listen_addrs = TAILQ_HEAD_INITIALIZER(g_rdma.listen_addrs),
};

static inline struct spdk_nvmf_rdma_conn *
get_rdma_conn(struct spdk_nvmf_conn *conn)
{
	return (struct spdk_nvmf_rdma_conn *)((uintptr_t)conn - offsetof(struct spdk_nvmf_rdma_conn, conn));
}

static inline struct spdk_nvmf_rdma_request *
get_rdma_req(struct spdk_nvmf_request *req)
{
	return (struct spdk_nvmf_rdma_request *)((uintptr_t)req - offsetof(struct spdk_nvmf_rdma_request,
			req));
}

static inline struct spdk_nvmf_rdma_session *
get_rdma_sess(struct spdk_nvmf_session *sess)
{
	return (struct spdk_nvmf_rdma_session *)((uintptr_t)sess - offsetof(struct spdk_nvmf_rdma_session,
			session));
}
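/* The get_rdma_*() helpers above are the usual "container_of" idiom: given a
 * pointer to a generic object that is embedded inside a transport-specific
 * wrapper, the wrapper is recovered by subtracting the member offset.
 * For example, for a struct spdk_nvmf_conn *conn known to live inside a
 * struct spdk_nvmf_rdma_conn:
 *
 *	struct spdk_nvmf_rdma_conn *rconn =
 *		(struct spdk_nvmf_rdma_conn *)((uintptr_t)conn -
 *			offsetof(struct spdk_nvmf_rdma_conn, conn));
 *
 * This is only valid for pointers that really do point at the embedded
 * member.
 */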
static void
spdk_nvmf_rdma_conn_destroy(struct spdk_nvmf_rdma_conn *rdma_conn)
{
	if (rdma_conn->cmds_mr) {
		ibv_dereg_mr(rdma_conn->cmds_mr);
	}

	if (rdma_conn->cpls_mr) {
		ibv_dereg_mr(rdma_conn->cpls_mr);
	}

	if (rdma_conn->bufs_mr) {
		ibv_dereg_mr(rdma_conn->bufs_mr);
	}

	if (rdma_conn->cm_id) {
		rdma_destroy_qp(rdma_conn->cm_id);
		rdma_destroy_id(rdma_conn->cm_id);
	}

	if (rdma_conn->cq) {
		ibv_destroy_cq(rdma_conn->cq);
	}

	/* Free all memory */
	spdk_free(rdma_conn->cmds);
	spdk_free(rdma_conn->cpls);
	spdk_free(rdma_conn->bufs);
	free(rdma_conn->reqs);
	free(rdma_conn->recvs);
	free(rdma_conn);
}

static struct spdk_nvmf_rdma_conn *
spdk_nvmf_rdma_conn_create(struct rdma_cm_id *id, struct ibv_comp_channel *channel,
			   uint16_t max_queue_depth, uint16_t max_rw_depth, uint32_t subsystem_id)
{
	struct spdk_nvmf_rdma_conn *rdma_conn;
	struct spdk_nvmf_conn *conn;
	int rc, i;
	struct ibv_qp_init_attr attr;
	struct spdk_nvmf_rdma_recv *rdma_recv;
	struct spdk_nvmf_rdma_request *rdma_req;

	rdma_conn = calloc(1, sizeof(struct spdk_nvmf_rdma_conn));
	if (rdma_conn == NULL) {
		SPDK_ERRLOG("Could not allocate new connection.\n");
		return NULL;
	}

	rdma_conn->max_queue_depth = max_queue_depth;
	rdma_conn->max_rw_depth = max_rw_depth;
	TAILQ_INIT(&rdma_conn->incoming_queue);
	TAILQ_INIT(&rdma_conn->free_queue);
	TAILQ_INIT(&rdma_conn->pending_data_buf_queue);
	TAILQ_INIT(&rdma_conn->pending_rdma_rw_queue);

	rdma_conn->cq = ibv_create_cq(id->verbs, max_queue_depth * 3, rdma_conn, channel, 0);
	if (!rdma_conn->cq) {
		SPDK_ERRLOG("Unable to create completion queue\n");
		SPDK_ERRLOG("Completion Channel: %p Id: %p Verbs: %p\n", channel, id, id->verbs);
		SPDK_ERRLOG("Errno %d: %s\n", errno, strerror(errno));
		rdma_destroy_id(id);
		spdk_nvmf_rdma_conn_destroy(rdma_conn);
		return NULL;
	}

	memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
	attr.qp_type = IBV_QPT_RC;
	attr.send_cq = rdma_conn->cq;
	attr.recv_cq = rdma_conn->cq;
	attr.cap.max_send_wr = max_queue_depth * 2; /* SEND, READ, and WRITE operations */
	attr.cap.max_recv_wr = max_queue_depth; /* RECV operations */
	attr.cap.max_send_sge = NVMF_DEFAULT_TX_SGE;
	attr.cap.max_recv_sge = NVMF_DEFAULT_RX_SGE;

	rc = rdma_create_qp(id, NULL, &attr);
	if (rc) {
		SPDK_ERRLOG("rdma_create_qp failed\n");
		SPDK_ERRLOG("Errno %d: %s\n", errno, strerror(errno));
		rdma_destroy_id(id);
		spdk_nvmf_rdma_conn_destroy(rdma_conn);
		return NULL;
	}

	conn = &rdma_conn->conn;
	conn->transport = &spdk_nvmf_transport_rdma;
	id->context = conn;
	rdma_conn->cm_id = id;

	SPDK_TRACELOG(SPDK_TRACE_RDMA, "New RDMA Connection: %p\n", conn);

	rdma_conn->reqs = calloc(max_queue_depth, sizeof(*rdma_conn->reqs));
	rdma_conn->recvs = calloc(max_queue_depth, sizeof(*rdma_conn->recvs));
	rdma_conn->cmds = spdk_zmalloc(max_queue_depth * sizeof(*rdma_conn->cmds),
				       0x1000, NULL);
	rdma_conn->cpls = spdk_zmalloc(max_queue_depth * sizeof(*rdma_conn->cpls),
				       0x1000, NULL);
	rdma_conn->bufs = spdk_zmalloc(max_queue_depth * g_rdma.in_capsule_data_size,
				       0x1000, NULL);
	if (!rdma_conn->reqs || !rdma_conn->recvs || !rdma_conn->cmds ||
	    !rdma_conn->cpls || !rdma_conn->bufs) {
		SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n");
		spdk_nvmf_rdma_conn_destroy(rdma_conn);
		return NULL;
	}

	rdma_conn->cmds_mr = ibv_reg_mr(id->pd, rdma_conn->cmds,
					max_queue_depth * sizeof(*rdma_conn->cmds),
					IBV_ACCESS_LOCAL_WRITE);
	rdma_conn->cpls_mr = ibv_reg_mr(id->pd, rdma_conn->cpls,
					max_queue_depth * sizeof(*rdma_conn->cpls),
					0);
	rdma_conn->bufs_mr = ibv_reg_mr(id->pd, rdma_conn->bufs,
					max_queue_depth * g_rdma.in_capsule_data_size,
					IBV_ACCESS_LOCAL_WRITE |
					IBV_ACCESS_REMOTE_WRITE);
	if (!rdma_conn->cmds_mr || !rdma_conn->cpls_mr || !rdma_conn->bufs_mr) {
		SPDK_ERRLOG("Unable to register required memory for RDMA queue.\n");
		spdk_nvmf_rdma_conn_destroy(rdma_conn);
		return NULL;
	}
	SPDK_TRACELOG(SPDK_TRACE_RDMA, "Command Array: %p Length: %lx LKey: %x\n",
		      rdma_conn->cmds, max_queue_depth * sizeof(*rdma_conn->cmds), rdma_conn->cmds_mr->lkey);
	SPDK_TRACELOG(SPDK_TRACE_RDMA, "Completion Array: %p Length: %lx LKey: %x\n",
		      rdma_conn->cpls, max_queue_depth * sizeof(*rdma_conn->cpls), rdma_conn->cpls_mr->lkey);
	SPDK_TRACELOG(SPDK_TRACE_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n",
		      rdma_conn->bufs, max_queue_depth * g_rdma.in_capsule_data_size, rdma_conn->bufs_mr->lkey);
	for (i = 0; i < max_queue_depth; i++) {
		struct ibv_recv_wr *bad_wr = NULL;

		rdma_recv = &rdma_conn->recvs[i];

		/* Set up memory to receive commands */
		rdma_recv->buf = (void *)((uintptr_t)rdma_conn->bufs + (i * g_rdma.in_capsule_data_size));

		rdma_recv->sgl[0].addr = (uintptr_t)&rdma_conn->cmds[i];
		rdma_recv->sgl[0].length = sizeof(rdma_conn->cmds[i]);
		rdma_recv->sgl[0].lkey = rdma_conn->cmds_mr->lkey;

		rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf;
		rdma_recv->sgl[1].length = g_rdma.in_capsule_data_size;
		rdma_recv->sgl[1].lkey = rdma_conn->bufs_mr->lkey;

		rdma_recv->wr.wr_id = (uintptr_t)rdma_recv;
		rdma_recv->wr.sg_list = rdma_recv->sgl;
		rdma_recv->wr.num_sge = SPDK_COUNTOF(rdma_recv->sgl);
#ifdef DEBUG
		rdma_recv->in_use = false;
#endif

		rc = ibv_post_recv(rdma_conn->cm_id->qp, &rdma_recv->wr, &bad_wr);
		if (rc) {
			SPDK_ERRLOG("Unable to post capsule for RDMA RECV\n");
			spdk_nvmf_rdma_conn_destroy(rdma_conn);
			return NULL;
		}
	}

	for (i = 0; i < max_queue_depth; i++) {
		rdma_req = &rdma_conn->reqs[i];

		rdma_req->req.conn = &rdma_conn->conn;
		rdma_req->req.cmd = NULL;

		/* Set up memory to send responses */
		rdma_req->req.rsp = &rdma_conn->cpls[i];

		rdma_req->rsp.sgl[0].addr = (uintptr_t)&rdma_conn->cpls[i];
		rdma_req->rsp.sgl[0].length = sizeof(rdma_conn->cpls[i]);
		rdma_req->rsp.sgl[0].lkey = rdma_conn->cpls_mr->lkey;

		rdma_req->rsp.wr.wr_id = (uintptr_t)rdma_req;
		rdma_req->rsp.wr.next = NULL;
		rdma_req->rsp.wr.opcode = IBV_WR_SEND;
		rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl;
		rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl);

		/* Set up memory for data buffers */
		rdma_req->data.wr.wr_id = (uint64_t)rdma_req;
		rdma_req->data.wr.next = NULL;
		rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->data.wr.sg_list = rdma_req->data.sgl;
		rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl);

		TAILQ_INSERT_TAIL(&rdma_conn->free_queue, rdma_req, link);
	}

	return rdma_conn;
}
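/* Every work request built above stores a pointer to its owning object in
 * wr_id, a 64-bit cookie that the verbs layer echoes back in the completion.
 * spdk_nvmf_rdma_poll() simply casts wr_id back to a struct
 * spdk_nvmf_rdma_recv or struct spdk_nvmf_rdma_request, so no lookup table
 * is required, e.g.:
 *
 *	rdma_req = (struct spdk_nvmf_rdma_request *)wc->wr_id;
 */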
static int
request_transfer_in(struct spdk_nvmf_request *req)
{
	int rc;
	struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
	struct spdk_nvmf_conn *conn = req->conn;
	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
	struct ibv_send_wr *bad_wr = NULL;

	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);

	rdma_conn->cur_rdma_rw_depth++;

	SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, conn);
	spdk_trace_record(TRACE_RDMA_READ_START, 0, 0, (uintptr_t)req, 0);

	rdma_req->data.wr.opcode = IBV_WR_RDMA_READ;
	rdma_req->data.wr.next = NULL;
	rc = ibv_post_send(rdma_conn->cm_id->qp, &rdma_req->data.wr, &bad_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to transfer data from host to target\n");
		return -1;
	}

	return 0;
}

static int
request_transfer_out(struct spdk_nvmf_request *req)
{
	int rc;
	struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
	struct spdk_nvmf_conn *conn = req->conn;
	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
	struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
	struct ibv_recv_wr *bad_recv_wr = NULL;
	struct ibv_send_wr *send_wr, *bad_send_wr = NULL;

	/* Advance our sq_head pointer */
	if (conn->sq_head == conn->sq_head_max) {
		conn->sq_head = 0;
	} else {
		conn->sq_head++;
	}
	rsp->sqhd = conn->sq_head;

	/* Post the capsule to the recv buffer */
	assert(rdma_req->recv != NULL);
#ifdef DEBUG
	assert(rdma_req->recv->in_use == true);
	rdma_req->recv->in_use = false;
#endif
	SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA RECV POSTED. Recv: %p Connection: %p\n", rdma_req->recv,
		      rdma_conn);
	rc = ibv_post_recv(rdma_conn->cm_id->qp, &rdma_req->recv->wr, &bad_recv_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to re-post rx descriptor\n");
		return rc;
	}
	rdma_req->recv = NULL;

	/* Build the response which consists of an optional
	 * RDMA WRITE to transfer data, plus an RDMA SEND
	 * containing the response.
	 */
	send_wr = &rdma_req->rsp.wr;

	if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
	    req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, conn);
		spdk_trace_record(TRACE_RDMA_WRITE_START, 0, 0, (uintptr_t)req, 0);

		rdma_conn->cur_rdma_rw_depth++;
		rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE;

		rdma_req->data.wr.next = send_wr;
		send_wr = &rdma_req->data.wr;
	}

	SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA SEND POSTED. Request: %p Connection: %p\n", req, conn);
	spdk_trace_record(TRACE_NVMF_IO_COMPLETE, 0, 0, (uintptr_t)req, 0);

	/* Send the completion */
	rc = ibv_post_send(rdma_conn->cm_id->qp, send_wr, &bad_send_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to send response capsule\n");
	}

	return rc;
}
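/* For a read-type command the two work requests above are chained through
 * ibv_send_wr.next and handed to a single ibv_post_send() call, roughly:
 *
 *	data.wr  (IBV_WR_RDMA_WRITE, payload -> host buffer)
 *	  .next ->
 *	rsp.wr   (IBV_WR_SEND, 16-byte NVMe completion capsule)
 *
 * Because the queue pair is reliable-connected, the SEND is executed after
 * the WRITE, so the host never sees a completion before its data.
 */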
static int
spdk_nvmf_rdma_request_transfer_data(struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
	struct spdk_nvmf_conn *conn = req->conn;
	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);

	if (req->xfer == SPDK_NVME_DATA_NONE) {
		/* If no data transfer, this can bypass the queue */
		return request_transfer_out(req);
	}

	if (rdma_conn->cur_rdma_rw_depth < rdma_conn->max_rw_depth) {
		if (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
			return request_transfer_out(req);
		} else if (req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
			return request_transfer_in(req);
		}
	} else {
		TAILQ_INSERT_TAIL(&rdma_conn->pending_rdma_rw_queue, rdma_req, link);
	}

	return 0;
}

static int
nvmf_rdma_connect(struct rdma_cm_event *event)
{
	struct spdk_nvmf_rdma_conn *rdma_conn = NULL;
	struct spdk_nvmf_rdma_listen_addr *addr;
	struct rdma_conn_param *rdma_param = NULL;
	struct rdma_conn_param ctrlr_event_data;
	const struct spdk_nvmf_rdma_request_private_data *private_data = NULL;
	struct spdk_nvmf_rdma_accept_private_data accept_data;
	uint16_t sts = 0;
	uint16_t max_queue_depth;
	uint16_t max_rw_depth;
	uint32_t subsystem_id = 0;
	int rc;

	if (event->id == NULL) {
		SPDK_ERRLOG("connect request: missing cm_id\n");
		goto err0;
	}

	if (event->id->verbs == NULL) {
		SPDK_ERRLOG("connect request: missing cm_id ibv_context\n");
		goto err0;
	}

	rdma_param = &event->param.conn;
	if (rdma_param->private_data == NULL ||
	    rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_ERRLOG("connect request: no private data provided\n");
		goto err0;
	}
	private_data = rdma_param->private_data;

	SPDK_TRACELOG(SPDK_TRACE_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n",
		      event->id->verbs->device->name, event->id->verbs->device->dev_name);

	addr = event->listen_id->context;
	SPDK_TRACELOG(SPDK_TRACE_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n",
		      event->listen_id, event->listen_id->verbs, addr);

	/* Figure out the supported queue depth. This is a multi-step process
	 * that takes into account hardware maximums, host provided values,
	 * and our target's internal memory limits */

	SPDK_TRACELOG(SPDK_TRACE_RDMA, "Calculating Queue Depth\n");

	/* Start with the maximum queue depth allowed by the target */
	max_queue_depth = g_rdma.max_queue_depth;
	max_rw_depth = g_rdma.max_queue_depth;
	SPDK_TRACELOG(SPDK_TRACE_RDMA, "Target Max Queue Depth: %d\n", g_rdma.max_queue_depth);

	/* Next check the local NIC's hardware limitations */
	SPDK_TRACELOG(SPDK_TRACE_RDMA,
		      "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
		      addr->attr.max_qp_wr, addr->attr.max_qp_rd_atom);
	max_queue_depth = spdk_min(max_queue_depth, addr->attr.max_qp_wr);
	max_rw_depth = spdk_min(max_rw_depth, addr->attr.max_qp_rd_atom);

	/* Next check the remote NIC's hardware limitations */
	SPDK_TRACELOG(SPDK_TRACE_RDMA,
		      "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
		      rdma_param->initiator_depth, rdma_param->responder_resources);
	if (rdma_param->initiator_depth > 0) {
		max_rw_depth = spdk_min(max_rw_depth, rdma_param->initiator_depth);
	}

	/* Finally check for the host software requested values, which are
	 * optional. */
	if (rdma_param->private_data != NULL &&
	    rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) {
		SPDK_TRACELOG(SPDK_TRACE_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize);
		SPDK_TRACELOG(SPDK_TRACE_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize);
		max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1);
	}

	SPDK_TRACELOG(SPDK_TRACE_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
		      max_queue_depth, max_rw_depth);
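	/*
	 * Worked example with made-up numbers: if the target allows 128, the
	 * local device reports max_qp_wr = 4096 and max_qp_rd_atom = 16, and
	 * the host advertises initiator_depth = 16, hrqsize = 32 and
	 * hsqsize = 31, then max_queue_depth = min(128, 4096, 32, 31 + 1) = 32
	 * and max_rw_depth = min(128, 16, 16) = 16.
	 */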
	/* Init the NVMf rdma transport connection */
	rdma_conn = spdk_nvmf_rdma_conn_create(event->id, addr->comp_channel, max_queue_depth,
					       max_rw_depth, subsystem_id);
	if (rdma_conn == NULL) {
		SPDK_ERRLOG("Error on nvmf connection creation\n");
		goto err1;
	}

	accept_data.recfmt = 0;
	accept_data.crqsize = max_queue_depth;
	ctrlr_event_data = *rdma_param;
	ctrlr_event_data.private_data = &accept_data;
	ctrlr_event_data.private_data_len = sizeof(accept_data);
	if (event->id->ps == RDMA_PS_TCP) {
		ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */
		ctrlr_event_data.initiator_depth = max_rw_depth;
	}

	rc = rdma_accept(event->id, &ctrlr_event_data);
	if (rc) {
		SPDK_ERRLOG("Error on rdma_accept\n");
		goto err2;
	}
	SPDK_TRACELOG(SPDK_TRACE_RDMA, "Sent back the accept\n");

	/* Add this RDMA connection to the global list until a CONNECT capsule
	 * is received. */
	TAILQ_INSERT_TAIL(&g_pending_conns, rdma_conn, link);

	return 0;

err2:
	spdk_nvmf_rdma_conn_destroy(rdma_conn);

err1: {
		struct spdk_nvmf_rdma_reject_private_data rej_data;

		rej_data.status.sc = sts;
		rdma_reject(event->id, &ctrlr_event_data, sizeof(rej_data));
	}
err0:
	return -1;
}

static int
nvmf_rdma_disconnect(struct rdma_cm_event *evt)
{
	struct spdk_nvmf_conn *conn;
	struct spdk_nvmf_session *session;
	struct spdk_nvmf_subsystem *subsystem;
	struct spdk_nvmf_rdma_conn *rdma_conn;

	if (evt->id == NULL) {
		SPDK_ERRLOG("disconnect request: missing cm_id\n");
		return -1;
	}

	conn = evt->id->context;
	if (conn == NULL) {
		SPDK_ERRLOG("disconnect request: no active connection\n");
		return -1;
	}
	/* ack the disconnect event before rdma_destroy_id */
	rdma_ack_cm_event(evt);

	rdma_conn = get_rdma_conn(conn);

	session = conn->sess;
	if (session == NULL) {
		/* No session has been established yet. That means the conn
		 * must be in the pending connections list. Remove it. */
		TAILQ_REMOVE(&g_pending_conns, rdma_conn, link);
		spdk_nvmf_rdma_conn_destroy(rdma_conn);
		return 0;
	}

	subsystem = session->subsys;

	subsystem->disconnect_cb(subsystem->cb_ctx, conn);

	return 0;
}

#ifdef DEBUG
static const char *CM_EVENT_STR[] = {
	"RDMA_CM_EVENT_ADDR_RESOLVED",
	"RDMA_CM_EVENT_ADDR_ERROR",
	"RDMA_CM_EVENT_ROUTE_RESOLVED",
	"RDMA_CM_EVENT_ROUTE_ERROR",
	"RDMA_CM_EVENT_CONNECT_REQUEST",
	"RDMA_CM_EVENT_CONNECT_RESPONSE",
	"RDMA_CM_EVENT_CONNECT_ERROR",
	"RDMA_CM_EVENT_UNREACHABLE",
	"RDMA_CM_EVENT_REJECTED",
	"RDMA_CM_EVENT_ESTABLISHED",
	"RDMA_CM_EVENT_DISCONNECTED",
	"RDMA_CM_EVENT_DEVICE_REMOVAL",
	"RDMA_CM_EVENT_MULTICAST_JOIN",
	"RDMA_CM_EVENT_MULTICAST_ERROR",
	"RDMA_CM_EVENT_ADDR_CHANGE",
	"RDMA_CM_EVENT_TIMEWAIT_EXIT"
};
#endif /* DEBUG */

typedef enum _spdk_nvmf_request_prep_type {
	SPDK_NVMF_REQUEST_PREP_ERROR = -1,
	SPDK_NVMF_REQUEST_PREP_READY = 0,
	SPDK_NVMF_REQUEST_PREP_PENDING_BUFFER = 1,
	SPDK_NVMF_REQUEST_PREP_PENDING_DATA = 2,
} spdk_nvmf_request_prep_type;
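/* Return values of spdk_nvmf_request_prep_data() below, and what the caller
 * (process_incoming_queue()) does with each:
 *   PREP_READY          - data (if any) is already available; execute now.
 *   PREP_PENDING_BUFFER - no large buffer free; park the request on
 *                         pending_data_buf_queue.
 *   PREP_PENDING_DATA   - a buffer is assigned, but the payload still has to
 *                         be pulled from the host with an RDMA READ.
 *   PREP_ERROR          - malformed SGL or over-sized transfer; complete the
 *                         request immediately with the error status set.
 */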
static spdk_nvmf_request_prep_type
spdk_nvmf_request_prep_data(struct spdk_nvmf_request *req)
{
	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
	struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
	struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
	struct spdk_nvmf_rdma_session *rdma_sess;
	struct spdk_nvme_sgl_descriptor *sgl;

	req->length = 0;
	req->data = NULL;

	if (cmd->opc == SPDK_NVME_OPC_FABRIC) {
		req->xfer = spdk_nvme_opc_get_data_transfer(req->cmd->nvmf_cmd.fctype);
	} else {
		req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
		if ((req->conn->type == CONN_TYPE_AQ) &&
		    ((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) ||
		     (cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) {
			switch (cmd->cdw10 & 0xff) {
			case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
			case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
			case SPDK_NVME_FEAT_HOST_IDENTIFIER:
				break;
			default:
				req->xfer = SPDK_NVME_DATA_NONE;
				break;
			}
		}
	}

	if (req->xfer == SPDK_NVME_DATA_NONE) {
		return SPDK_NVMF_REQUEST_PREP_READY;
	}

	sgl = &cmd->dptr.sgl1;

	if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
	    (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
	     sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
		if (sgl->keyed.length > g_rdma.max_io_size) {
			SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
				    sgl->keyed.length, g_rdma.max_io_size);
			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
			return SPDK_NVMF_REQUEST_PREP_ERROR;
		}

		if (sgl->keyed.length == 0) {
			req->xfer = SPDK_NVME_DATA_NONE;
			return SPDK_NVMF_REQUEST_PREP_READY;
		}

		req->length = sgl->keyed.length;
		rdma_req->data.sgl[0].length = sgl->keyed.length;
		rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key;
		rdma_req->data.wr.wr.rdma.remote_addr = sgl->address;

		rdma_sess = get_rdma_sess(req->conn->sess);
		if (!rdma_sess) {
			/* The only time a connection won't have a session
			 * is when this is the CONNECT request.
			 */
			assert(cmd->opc == SPDK_NVME_OPC_FABRIC);
			assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
			assert(req->length <= g_rdma.in_capsule_data_size);

			/* Use the in capsule data buffer, even though this isn't in capsule data. */
			SPDK_TRACELOG(SPDK_TRACE_RDMA, "Request using in capsule buffer for non-capsule data\n");
			req->data = rdma_req->recv->buf;
			rdma_req->data.sgl[0].lkey = get_rdma_conn(req->conn)->bufs_mr->lkey;
			rdma_req->data_from_pool = false;
		} else {
			req->data = SLIST_FIRST(&rdma_sess->data_buf_pool);
			rdma_req->data.sgl[0].lkey = rdma_sess->buf_mr->lkey;
			rdma_req->data_from_pool = true;
			if (!req->data) {
				/* No available buffers. Queue this request up. */
				SPDK_TRACELOG(SPDK_TRACE_RDMA, "No available large data buffers. Queueing request %p\n", req);
				/* This will get assigned when we actually obtain a buffer */
				rdma_req->data.sgl[0].addr = (uintptr_t)NULL;
				return SPDK_NVMF_REQUEST_PREP_PENDING_BUFFER;
			}

			SPDK_TRACELOG(SPDK_TRACE_RDMA, "Request %p took buffer from central pool\n", req);
			SLIST_REMOVE_HEAD(&rdma_sess->data_buf_pool, link);
		}

		rdma_req->data.sgl[0].addr = (uintptr_t)req->data;

		if (req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
			return SPDK_NVMF_REQUEST_PREP_PENDING_DATA;
		} else {
			return SPDK_NVMF_REQUEST_PREP_READY;
		}
	} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
		   sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
		uint64_t offset = sgl->address;
		uint32_t max_len = g_rdma.in_capsule_data_size;

		SPDK_TRACELOG(SPDK_TRACE_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
			      offset, sgl->unkeyed.length);

		if (offset > max_len) {
			SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n",
				    offset, max_len);
			rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET;
			return SPDK_NVMF_REQUEST_PREP_ERROR;
		}
		max_len -= (uint32_t)offset;

		if (sgl->unkeyed.length > max_len) {
			SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
				    sgl->unkeyed.length, max_len);
			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
			return SPDK_NVMF_REQUEST_PREP_ERROR;
		}

		if (sgl->unkeyed.length == 0) {
			req->xfer = SPDK_NVME_DATA_NONE;
			return SPDK_NVMF_REQUEST_PREP_READY;
		}

		req->data = rdma_req->recv->buf + offset;
		rdma_req->data_from_pool = false;
		req->length = sgl->unkeyed.length;
		return SPDK_NVMF_REQUEST_PREP_READY;
	}

	SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n",
		    sgl->generic.type, sgl->generic.subtype);
	rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
	return SPDK_NVMF_REQUEST_PREP_ERROR;
}
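/* The keyed SGL branch above is what drives the RDMA data transfer: the host
 * describes its buffer as {address, length, key}, and those fields map
 * directly onto the work request that will move the data, i.e.
 *
 *	data.wr.wr.rdma.remote_addr = sgl->address;
 *	data.wr.wr.rdma.rkey        = sgl->keyed.key;
 *	data.sgl[0]                 = local buffer address, length and lkey;
 *
 * Only the local buffer (and which lkey covers it) remains to be chosen.
 */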
static int
spdk_nvmf_rdma_handle_pending_rdma_rw(struct spdk_nvmf_conn *conn)
{
	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
	struct spdk_nvmf_rdma_session *rdma_sess;
	struct spdk_nvmf_rdma_request *rdma_req, *tmp;
	int rc;
	int count = 0;

	/* First, try to assign free data buffers to requests that need one */
	if (conn->sess) {
		rdma_sess = get_rdma_sess(conn->sess);
		TAILQ_FOREACH_SAFE(rdma_req, &rdma_conn->pending_data_buf_queue, link, tmp) {
			assert(rdma_req->req.data == NULL);
			rdma_req->req.data = SLIST_FIRST(&rdma_sess->data_buf_pool);
			if (!rdma_req->req.data) {
				break;
			}
			SLIST_REMOVE_HEAD(&rdma_sess->data_buf_pool, link);
			rdma_req->data.sgl[0].addr = (uintptr_t)rdma_req->req.data;
			TAILQ_REMOVE(&rdma_conn->pending_data_buf_queue, rdma_req, link);
			if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
				TAILQ_INSERT_TAIL(&rdma_conn->pending_rdma_rw_queue, rdma_req, link);
			} else {
				rc = spdk_nvmf_request_exec(&rdma_req->req);
				if (rc < 0) {
					return -1;
				}
				count++;
			}
		}
	}

	/* Try to initiate RDMA Reads or Writes on requests that have data buffers */
	while (rdma_conn->cur_rdma_rw_depth < rdma_conn->max_rw_depth) {
		rdma_req = TAILQ_FIRST(&rdma_conn->pending_rdma_rw_queue);
		if (spdk_unlikely(!rdma_req)) {
			break;
		}

		TAILQ_REMOVE(&rdma_conn->pending_rdma_rw_queue, rdma_req, link);

		SPDK_TRACELOG(SPDK_TRACE_RDMA, "Submitting previously queued for RDMA R/W request %p\n", rdma_req);

		rc = spdk_nvmf_rdma_request_transfer_data(&rdma_req->req);
		if (rc) {
			return -1;
		}
	}

	return count;
}

/* Public API callbacks begin here */

static int
spdk_nvmf_rdma_init(uint16_t max_queue_depth, uint32_t max_io_size,
		    uint32_t in_capsule_data_size)
{
	int rc;

	SPDK_NOTICELOG("*** RDMA Transport Init ***\n");

	pthread_mutex_lock(&g_rdma.lock);
	g_rdma.max_queue_depth = max_queue_depth;
	g_rdma.max_io_size = max_io_size;
	g_rdma.in_capsule_data_size = in_capsule_data_size;

	g_rdma.event_channel = rdma_create_event_channel();
	if (g_rdma.event_channel == NULL) {
		SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", strerror(errno));
		pthread_mutex_unlock(&g_rdma.lock);
		return -1;
	}

	rc = fcntl(g_rdma.event_channel->fd, F_SETFL, O_NONBLOCK);
	if (rc < 0) {
		SPDK_ERRLOG("fcntl to set fd to non-blocking failed\n");
		pthread_mutex_unlock(&g_rdma.lock);
		return -1;
	}

	pthread_mutex_unlock(&g_rdma.lock);
	return 0;
}

static void
spdk_nvmf_rdma_listen_addr_free(struct spdk_nvmf_rdma_listen_addr *addr)
{
	if (!addr) {
		return;
	}

	free(addr->traddr);
	free(addr->trsvcid);
	free(addr);
}

static int
spdk_nvmf_rdma_fini(void)
{
	pthread_mutex_lock(&g_rdma.lock);

	assert(TAILQ_EMPTY(&g_rdma.listen_addrs));
	if (g_rdma.event_channel != NULL) {
		rdma_destroy_event_channel(g_rdma.event_channel);
	}
	pthread_mutex_unlock(&g_rdma.lock);

	return 0;
}
static int
spdk_nvmf_rdma_listen_remove(struct spdk_nvmf_listen_addr *listen_addr)
{
	struct spdk_nvmf_rdma_listen_addr *addr, *tmp;

	pthread_mutex_lock(&g_rdma.lock);
	TAILQ_FOREACH_SAFE(addr, &g_rdma.listen_addrs, link, tmp) {
		if ((!strcasecmp(addr->traddr, listen_addr->traddr)) &&
		    (!strcasecmp(addr->trsvcid, listen_addr->trsvcid))) {
			assert(addr->ref > 0);
			addr->ref--;
			if (!addr->ref) {
				TAILQ_REMOVE(&g_rdma.listen_addrs, addr, link);
				ibv_destroy_comp_channel(addr->comp_channel);
				rdma_destroy_id(addr->id);
				spdk_nvmf_rdma_listen_addr_free(addr);
			}
			break;
		}
	}

	pthread_mutex_unlock(&g_rdma.lock);
	return 0;
}

static int
spdk_nvmf_rdma_poll(struct spdk_nvmf_conn *conn);

static void
spdk_nvmf_rdma_addr_listen_init(struct spdk_nvmf_rdma_listen_addr *addr)
{
	int rc;

	rc = rdma_listen(addr->id, 10); /* 10 = backlog */
	if (rc < 0) {
		SPDK_ERRLOG("rdma_listen() failed\n");
		addr->ref--;
		assert(addr->ref == 0);
		TAILQ_REMOVE(&g_rdma.listen_addrs, addr, link);
		ibv_destroy_comp_channel(addr->comp_channel);
		rdma_destroy_id(addr->id);
		spdk_nvmf_rdma_listen_addr_free(addr);
		return;
	}

	addr->is_listened = true;

	SPDK_NOTICELOG("*** NVMf Target Listening on %s port %d ***\n",
		       addr->traddr, ntohs(rdma_get_src_port(addr->id)));
}
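/* Connection establishment follows the standard librdmacm passive-side
 * sequence, split across this file: rdma_create_id() + rdma_bind_addr() in
 * spdk_nvmf_rdma_listen(), rdma_listen() above, and then, once the acceptor
 * poll below observes RDMA_CM_EVENT_CONNECT_REQUEST, rdma_create_qp() +
 * rdma_accept() in nvmf_rdma_connect(). Disconnects also arrive as CM events
 * and are routed to nvmf_rdma_disconnect().
 */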
static void
spdk_nvmf_rdma_acceptor_poll(void)
{
	struct rdma_cm_event *event;
	int rc;
	struct spdk_nvmf_rdma_conn *rdma_conn, *tmp;
	struct spdk_nvmf_rdma_listen_addr *addr = NULL, *addr_tmp;

	if (g_rdma.event_channel == NULL) {
		return;
	}

	pthread_mutex_lock(&g_rdma.lock);
	TAILQ_FOREACH_SAFE(addr, &g_rdma.listen_addrs, link, addr_tmp) {
		if (!addr->is_listened) {
			spdk_nvmf_rdma_addr_listen_init(addr);
		}
	}
	pthread_mutex_unlock(&g_rdma.lock);

	/* Process pending connections for incoming capsules. The only capsule
	 * this should ever find is a CONNECT request. */
	TAILQ_FOREACH_SAFE(rdma_conn, &g_pending_conns, link, tmp) {
		rc = spdk_nvmf_rdma_poll(&rdma_conn->conn);
		if (rc < 0) {
			TAILQ_REMOVE(&g_pending_conns, rdma_conn, link);
			spdk_nvmf_rdma_conn_destroy(rdma_conn);
		} else if (rc > 0) {
			/* At least one request was processed which is assumed to be
			 * a CONNECT. Remove this connection from our list. */
			TAILQ_REMOVE(&g_pending_conns, rdma_conn, link);
		}
	}

	while (1) {
		rc = rdma_get_cm_event(g_rdma.event_channel, &event);
		if (rc == 0) {
			SPDK_TRACELOG(SPDK_TRACE_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]);

			switch (event->event) {
			case RDMA_CM_EVENT_CONNECT_REQUEST:
				rc = nvmf_rdma_connect(event);
				if (rc < 0) {
					SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc);
					break;
				}
				break;
			case RDMA_CM_EVENT_ESTABLISHED:
				break;
			case RDMA_CM_EVENT_ADDR_CHANGE:
			case RDMA_CM_EVENT_DISCONNECTED:
			case RDMA_CM_EVENT_DEVICE_REMOVAL:
			case RDMA_CM_EVENT_TIMEWAIT_EXIT:
				rc = nvmf_rdma_disconnect(event);
				if (rc < 0) {
					SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
					break;
				}
				continue;
			default:
				SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event);
				break;
			}

			rdma_ack_cm_event(event);
		} else {
			if (errno != EAGAIN && errno != EWOULDBLOCK) {
				SPDK_ERRLOG("Acceptor Event Error: %s\n", strerror(errno));
			}
			break;
		}
	}
}

static int
spdk_nvmf_rdma_listen(struct spdk_nvmf_listen_addr *listen_addr)
{
	struct spdk_nvmf_rdma_listen_addr *addr;
	struct sockaddr_in saddr;
	int rc;

	pthread_mutex_lock(&g_rdma.lock);
	assert(g_rdma.event_channel != NULL);
	TAILQ_FOREACH(addr, &g_rdma.listen_addrs, link) {
		if ((!strcasecmp(addr->traddr, listen_addr->traddr)) &&
		    (!strcasecmp(addr->trsvcid, listen_addr->trsvcid))) {
			addr->ref++;
			/* Already listening at this address */
			pthread_mutex_unlock(&g_rdma.lock);
			return 0;
		}
	}

	addr = calloc(1, sizeof(*addr));
	if (!addr) {
		pthread_mutex_unlock(&g_rdma.lock);
		return -1;
	}

	addr->traddr = strdup(listen_addr->traddr);
	if (!addr->traddr) {
		spdk_nvmf_rdma_listen_addr_free(addr);
		pthread_mutex_unlock(&g_rdma.lock);
		return -1;
	}

	addr->trsvcid = strdup(listen_addr->trsvcid);
	if (!addr->trsvcid) {
		spdk_nvmf_rdma_listen_addr_free(addr);
		pthread_mutex_unlock(&g_rdma.lock);
		return -1;
	}

	rc = rdma_create_id(g_rdma.event_channel, &addr->id, addr, RDMA_PS_TCP);
	if (rc < 0) {
		SPDK_ERRLOG("rdma_create_id() failed\n");
		spdk_nvmf_rdma_listen_addr_free(addr);
		pthread_mutex_unlock(&g_rdma.lock);
		return -1;
	}

	memset(&saddr, 0, sizeof(saddr));
	saddr.sin_family = AF_INET;
	saddr.sin_addr.s_addr = inet_addr(addr->traddr);
	saddr.sin_port = htons((uint16_t)strtoul(addr->trsvcid, NULL, 10));
	rc = rdma_bind_addr(addr->id, (struct sockaddr *)&saddr);
	if (rc < 0) {
		SPDK_ERRLOG("rdma_bind_addr() failed\n");
		rdma_destroy_id(addr->id);
		spdk_nvmf_rdma_listen_addr_free(addr);
		pthread_mutex_unlock(&g_rdma.lock);
		return -1;
	}

	rc = ibv_query_device(addr->id->verbs, &addr->attr);
	if (rc < 0) {
		SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
		rdma_destroy_id(addr->id);
		spdk_nvmf_rdma_listen_addr_free(addr);
		pthread_mutex_unlock(&g_rdma.lock);
		return -1;
	}

	addr->comp_channel = ibv_create_comp_channel(addr->id->verbs);
	if (!addr->comp_channel) {
		SPDK_ERRLOG("Failed to create completion channel\n");
		rdma_destroy_id(addr->id);
		spdk_nvmf_rdma_listen_addr_free(addr);
		pthread_mutex_unlock(&g_rdma.lock);
		return -1;
	}
	SPDK_TRACELOG(SPDK_TRACE_RDMA, "For listen id %p with context %p, created completion channel %p\n",
		      addr->id, addr->id->verbs, addr->comp_channel);

	rc = fcntl(addr->comp_channel->fd, F_SETFL, O_NONBLOCK);
	if (rc < 0) {
		SPDK_ERRLOG("fcntl to set comp channel to non-blocking failed\n");
		rdma_destroy_id(addr->id);
		ibv_destroy_comp_channel(addr->comp_channel);
		spdk_nvmf_rdma_listen_addr_free(addr);
		pthread_mutex_unlock(&g_rdma.lock);
		return -1;
	}

	addr->ref = 1;
	TAILQ_INSERT_TAIL(&g_rdma.listen_addrs, addr, link);
	pthread_mutex_unlock(&g_rdma.lock);

	return 0;
}
static void
spdk_nvmf_rdma_discover(struct spdk_nvmf_listen_addr *listen_addr,
			struct spdk_nvmf_discovery_log_page_entry *entry)
{
	entry->trtype = SPDK_NVMF_TRTYPE_RDMA;
	entry->adrfam = SPDK_NVMF_ADRFAM_IPV4;
	entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED;

	spdk_strcpy_pad(entry->trsvcid, listen_addr->trsvcid, sizeof(entry->trsvcid), ' ');
	spdk_strcpy_pad(entry->traddr, listen_addr->traddr, sizeof(entry->traddr), ' ');

	entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED;
	entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE;
	entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM;
}

static struct spdk_nvmf_session *
spdk_nvmf_rdma_session_init(void)
{
	struct spdk_nvmf_rdma_session *rdma_sess;
	int i;
	struct spdk_nvmf_rdma_buf *buf;

	rdma_sess = calloc(1, sizeof(*rdma_sess));
	if (!rdma_sess) {
		return NULL;
	}

	/* TODO: Make the number of elements in this pool configurable. For now, one full queue
	 * worth seems reasonable.
	 */
	rdma_sess->buf = spdk_zmalloc(g_rdma.max_queue_depth * g_rdma.max_io_size,
				      0x20000, NULL);
	if (!rdma_sess->buf) {
		SPDK_ERRLOG("Large buffer pool allocation failed (%d x %d)\n",
			    g_rdma.max_queue_depth, g_rdma.max_io_size);
		free(rdma_sess);
		return NULL;
	}

	SLIST_INIT(&rdma_sess->data_buf_pool);
	for (i = 0; i < g_rdma.max_queue_depth; i++) {
		buf = (struct spdk_nvmf_rdma_buf *)(rdma_sess->buf + (i * g_rdma.max_io_size));
		SLIST_INSERT_HEAD(&rdma_sess->data_buf_pool, buf, link);
	}

	rdma_sess->session.transport = &spdk_nvmf_transport_rdma;

	return &rdma_sess->session;
}
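/* The pool above is a single large allocation carved into max_queue_depth
 * slices of max_io_size bytes each. While a slice sits in the free pool its
 * first bytes are reused as the SLIST_ENTRY linkage (struct
 * spdk_nvmf_rdma_buf); once handed to a request the whole slice is payload.
 * As an example with made-up values, max_queue_depth = 128 and
 * max_io_size = 131072 would give one 16 MiB region, aligned to 0x20000.
 */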
static void
spdk_nvmf_rdma_session_fini(struct spdk_nvmf_session *session)
{
	struct spdk_nvmf_rdma_session *rdma_sess = get_rdma_sess(session);

	if (!rdma_sess) {
		return;
	}

	ibv_dereg_mr(rdma_sess->buf_mr);
	spdk_free(rdma_sess->buf);
	free(rdma_sess);
}

static int
spdk_nvmf_rdma_session_add_conn(struct spdk_nvmf_session *session,
				struct spdk_nvmf_conn *conn)
{
	struct spdk_nvmf_rdma_session *rdma_sess = get_rdma_sess(session);
	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);

	if (rdma_sess->verbs != NULL) {
		if (rdma_sess->verbs != rdma_conn->cm_id->verbs) {
			SPDK_ERRLOG("Two connections belonging to the same session cannot connect using different RDMA devices.\n");
			return -1;
		}

		/* Nothing else to do. */
		return 0;
	}

	rdma_sess->verbs = rdma_conn->cm_id->verbs;
	rdma_sess->buf_mr = ibv_reg_mr(rdma_conn->cm_id->pd, rdma_sess->buf,
				       g_rdma.max_queue_depth * g_rdma.max_io_size,
				       IBV_ACCESS_LOCAL_WRITE |
				       IBV_ACCESS_REMOTE_WRITE);
	if (!rdma_sess->buf_mr) {
		SPDK_ERRLOG("Large buffer pool registration failed (%d x %d)\n",
			    g_rdma.max_queue_depth, g_rdma.max_io_size);
		spdk_free(rdma_sess->buf);
		free(rdma_sess);
		return -1;
	}

	SPDK_TRACELOG(SPDK_TRACE_RDMA, "Session Shared Data Pool: %p Length: %x LKey: %x\n",
		      rdma_sess->buf, g_rdma.max_queue_depth * g_rdma.max_io_size, rdma_sess->buf_mr->lkey);

	return 0;
}

static int
spdk_nvmf_rdma_session_remove_conn(struct spdk_nvmf_session *session,
				   struct spdk_nvmf_conn *conn)
{
	return 0;
}

static int
spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req)
{
	struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
	int rc;

	if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
	    req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		rc = spdk_nvmf_rdma_request_transfer_data(req);
	} else {
		rc = request_transfer_out(req);
	}

	return rc;
}

static void
request_release_buffer(struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
	struct spdk_nvmf_conn *conn = req->conn;
	struct spdk_nvmf_rdma_session *rdma_sess;
	struct spdk_nvmf_rdma_buf *buf;

	if (rdma_req->data_from_pool) {
		/* Put the buffer back in the pool */
		rdma_sess = get_rdma_sess(conn->sess);
		buf = req->data;

		SLIST_INSERT_HEAD(&rdma_sess->data_buf_pool, buf, link);
		req->data = NULL;
		req->length = 0;
		rdma_req->data_from_pool = false;
	}
}

static void
spdk_nvmf_rdma_close_conn(struct spdk_nvmf_conn *conn)
{
	spdk_nvmf_rdma_conn_destroy(get_rdma_conn(conn));
}
static int
process_incoming_queue(struct spdk_nvmf_rdma_conn *rdma_conn)
{
	struct spdk_nvmf_rdma_recv *rdma_recv, *tmp;
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_request *req;
	int rc, count;
	bool error = false;

	count = 0;
	TAILQ_FOREACH_SAFE(rdma_recv, &rdma_conn->incoming_queue, link, tmp) {
		rdma_req = TAILQ_FIRST(&rdma_conn->free_queue);
		if (rdma_req == NULL) {
			/* Need to wait for more SEND completions */
			break;
		}
		TAILQ_REMOVE(&rdma_conn->free_queue, rdma_req, link);
		TAILQ_REMOVE(&rdma_conn->incoming_queue, rdma_recv, link);
		rdma_req->recv = rdma_recv;
		req = &rdma_req->req;

		/* The first element of the SGL is the NVMe command */
		req->cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr;

		spdk_trace_record(TRACE_NVMF_IO_START, 0, 0, (uint64_t)req, 0);

		memset(req->rsp, 0, sizeof(*req->rsp));
		rc = spdk_nvmf_request_prep_data(req);
		switch (rc) {
		case SPDK_NVMF_REQUEST_PREP_READY:
			SPDK_TRACELOG(SPDK_TRACE_RDMA, "Request %p is ready for execution\n", req);
			/* Data is immediately available */
			rc = spdk_nvmf_request_exec(req);
			if (rc < 0) {
				error = true;
				continue;
			}
			count++;
			break;
		case SPDK_NVMF_REQUEST_PREP_PENDING_BUFFER:
			SPDK_TRACELOG(SPDK_TRACE_RDMA, "Request %p needs data buffer\n", req);
			TAILQ_INSERT_TAIL(&rdma_conn->pending_data_buf_queue, rdma_req, link);
			break;
		case SPDK_NVMF_REQUEST_PREP_PENDING_DATA:
			SPDK_TRACELOG(SPDK_TRACE_RDMA, "Request %p needs data transfer\n", req);
			rc = spdk_nvmf_rdma_request_transfer_data(req);
			if (rc < 0) {
				error = true;
				continue;
			}
			break;
		case SPDK_NVMF_REQUEST_PREP_ERROR:
			spdk_nvmf_request_complete(req);
			break;
		}
	}

	if (error) {
		return -1;
	}

	return count;
}

static struct spdk_nvmf_rdma_request *
get_rdma_req_from_wc(struct spdk_nvmf_rdma_conn *rdma_conn,
		     struct ibv_wc *wc)
{
	struct spdk_nvmf_rdma_request *rdma_req;

	rdma_req = (struct spdk_nvmf_rdma_request *)wc->wr_id;
	assert(rdma_req != NULL);
	assert(rdma_req - rdma_conn->reqs >= 0);
	assert(rdma_req - rdma_conn->reqs < (ptrdiff_t)rdma_conn->max_queue_depth);

	return rdma_req;
}

static struct spdk_nvmf_rdma_recv *
get_rdma_recv_from_wc(struct spdk_nvmf_rdma_conn *rdma_conn,
		      struct ibv_wc *wc)
{
	struct spdk_nvmf_rdma_recv *rdma_recv;

	assert(wc->byte_len >= sizeof(struct spdk_nvmf_capsule_cmd));

	rdma_recv = (struct spdk_nvmf_rdma_recv *)wc->wr_id;
	assert(rdma_recv != NULL);
	assert(rdma_recv - rdma_conn->recvs >= 0);
	assert(rdma_recv - rdma_conn->recvs < (ptrdiff_t)rdma_conn->max_queue_depth);
#ifdef DEBUG
	assert(rdma_recv->in_use == false);
	rdma_recv->in_use = true;
#endif

	return rdma_recv;
}
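/* The poll routine below is the usual verbs completion loop: drain a batch of
 * work completions with ibv_poll_cq(), recover the owning object from
 * wc->wr_id via the helpers above, then dispatch on wc->opcode
 * (IBV_WC_SEND, IBV_WC_RDMA_WRITE, IBV_WC_RDMA_READ, IBV_WC_RECV). Any
 * completion with a non-success status marks the whole batch as failed and
 * the function returns -1 so the caller can tear the connection down.
 */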
/* Returns the number of times that spdk_nvmf_request_exec was called,
 * or -1 on error.
 */
static int
spdk_nvmf_rdma_poll(struct spdk_nvmf_conn *conn)
{
	struct ibv_wc wc[32];
	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
	struct spdk_nvmf_rdma_request *rdma_req;
	struct spdk_nvmf_rdma_recv *rdma_recv;
	struct spdk_nvmf_request *req;
	int reaped, i, rc;
	int count = 0;
	bool error = false;

	/* Poll for completing operations. */
	rc = ibv_poll_cq(rdma_conn->cq, 32, wc);
	if (rc < 0) {
		SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
			    errno, strerror(errno));
		return -1;
	}

	reaped = rc;
	for (i = 0; i < reaped; i++) {
		if (wc[i].status) {
			SPDK_ERRLOG("CQ error on Connection %p, Request 0x%lu (%d): %s\n",
				    conn, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status));
			error = true;
			continue;
		}

		switch (wc[i].opcode) {
		case IBV_WC_SEND:
			rdma_req = get_rdma_req_from_wc(rdma_conn, &wc[i]);
			req = &rdma_req->req;

			assert(rdma_conn->cur_queue_depth > 0);
			SPDK_TRACELOG(SPDK_TRACE_RDMA,
				      "RDMA SEND Complete. Request: %p Connection: %p Outstanding I/O: %d\n",
				      req, conn, rdma_conn->cur_queue_depth - 1);
			rdma_conn->cur_queue_depth--;

			/* The request may still own a data buffer. Release it */
			request_release_buffer(req);

			/* Put the request back on the free list */
			TAILQ_INSERT_TAIL(&rdma_conn->free_queue, rdma_req, link);

			/* Try to process queued incoming requests */
			rc = process_incoming_queue(rdma_conn);
			if (rc < 0) {
				error = true;
				continue;
			}
			count += rc;
			break;

		case IBV_WC_RDMA_WRITE:
			rdma_req = get_rdma_req_from_wc(rdma_conn, &wc[i]);
			req = &rdma_req->req;

			SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA WRITE Complete. Request: %p Connection: %p\n",
				      req, conn);
			spdk_trace_record(TRACE_RDMA_WRITE_COMPLETE, 0, 0, (uint64_t)req, 0);

			/* Now that the write has completed, the data buffer can be released */
			request_release_buffer(req);

			rdma_conn->cur_rdma_rw_depth--;

			/* Since an RDMA R/W operation completed, try to submit from the pending list. */
			rc = spdk_nvmf_rdma_handle_pending_rdma_rw(conn);
			if (rc < 0) {
				error = true;
				continue;
			}
			count += rc;
			break;

		case IBV_WC_RDMA_READ:
			rdma_req = get_rdma_req_from_wc(rdma_conn, &wc[i]);
			req = &rdma_req->req;

			SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA READ Complete. Request: %p Connection: %p\n",
				      req, conn);
			spdk_trace_record(TRACE_RDMA_READ_COMPLETE, 0, 0, (uint64_t)req, 0);
			rc = spdk_nvmf_request_exec(req);
			if (rc) {
				error = true;
				continue;
			}
			count++;

			/* Since an RDMA R/W operation completed, try to submit from the pending list. */
			rdma_conn->cur_rdma_rw_depth--;
			rc = spdk_nvmf_rdma_handle_pending_rdma_rw(conn);
			if (rc < 0) {
				error = true;
				continue;
			}
			count += rc;
			break;

		case IBV_WC_RECV:
			rdma_recv = get_rdma_recv_from_wc(rdma_conn, &wc[i]);

			rdma_conn->cur_queue_depth++;
			if (rdma_conn->cur_queue_depth > rdma_conn->max_queue_depth) {
				SPDK_TRACELOG(SPDK_TRACE_RDMA,
					      "Temporarily exceeded maximum queue depth (%u). Queueing.\n",
					      rdma_conn->cur_queue_depth);
			}
			SPDK_TRACELOG(SPDK_TRACE_RDMA,
				      "RDMA RECV Complete. Recv: %p Connection: %p Outstanding I/O: %d\n",
				      rdma_recv, conn, rdma_conn->cur_queue_depth);

			TAILQ_INSERT_TAIL(&rdma_conn->incoming_queue, rdma_recv, link);
			rc = process_incoming_queue(rdma_conn);
			if (rc < 0) {
				error = true;
				continue;
			}
			count += rc;
			break;

		default:
			SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode);
			error = true;
			continue;
		}
	}

	if (error == true) {
		return -1;
	}

	return count;
}

static bool
spdk_nvmf_rdma_conn_is_idle(struct spdk_nvmf_conn *conn)
{
	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);

	if (rdma_conn->cur_queue_depth == 0 && rdma_conn->cur_rdma_rw_depth == 0) {
		return true;
	}
	return false;
}

const struct spdk_nvmf_transport spdk_nvmf_transport_rdma = {
	.name = "rdma",
	.transport_init = spdk_nvmf_rdma_init,
	.transport_fini = spdk_nvmf_rdma_fini,

	.acceptor_poll = spdk_nvmf_rdma_acceptor_poll,

	.listen_addr_add = spdk_nvmf_rdma_listen,
	.listen_addr_remove = spdk_nvmf_rdma_listen_remove,
	.listen_addr_discover = spdk_nvmf_rdma_discover,

	.session_init = spdk_nvmf_rdma_session_init,
	.session_fini = spdk_nvmf_rdma_session_fini,
	.session_add_conn = spdk_nvmf_rdma_session_add_conn,
	.session_remove_conn = spdk_nvmf_rdma_session_remove_conn,

	.req_complete = spdk_nvmf_rdma_request_complete,

	.conn_fini = spdk_nvmf_rdma_close_conn,
	.conn_poll = spdk_nvmf_rdma_poll,
	.conn_is_idle = spdk_nvmf_rdma_conn_is_idle,
};

SPDK_LOG_REGISTER_TRACE_FLAG("rdma", SPDK_TRACE_RDMA)