1488570ebSJim Harris /* SPDX-License-Identifier: BSD-3-Clause 2a6dbe372Spaul luse * Copyright (C) 2016 Intel Corporation. All rights reserved. 34a2c27f7SAlexey Marchuk * Copyright (c) 2019-2021 Mellanox Technologies LTD. All rights reserved. 4a681f8d5SAlexey Marchuk * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 50f912a0eSDaniel Verkamp */ 60f912a0eSDaniel Verkamp 7b961d9ccSBen Walker #include "spdk/stdinc.h" 8b961d9ccSBen Walker 9c4fee1e9SPawel Wodkowski #include "spdk/config.h" 10a83f91c2SBen Walker #include "spdk/thread.h" 11cc353f0eSBen Walker #include "spdk/likely.h" 12cc353f0eSBen Walker #include "spdk/nvmf_transport.h" 1337402f49SDaniel Verkamp #include "spdk/string.h" 140f912a0eSDaniel Verkamp #include "spdk/trace.h" 1552f7aeb7SShuhei Matsumoto #include "spdk/tree.h" 1650947d55SDaniel Verkamp #include "spdk/util.h" 170f912a0eSDaniel Verkamp 18024127dcSyidong0635 #include "spdk_internal/assert.h" 194e8e97c8STomasz Zawadzki #include "spdk/log.h" 20cf151d60SAlexey Marchuk #include "spdk_internal/rdma_provider.h" 218a01b4d6SAlexey Marchuk #include "spdk_internal/rdma_utils.h" 22d27b24c9SDaniel Verkamp 23deec1fc7SShuhei Matsumoto #include "nvmf_internal.h" 240db0c443SChunsong Feng #include "transport.h" 25deec1fc7SShuhei Matsumoto 26c37e776eSKrzysztof Karas #include "spdk_internal/trace_defs.h" 27c37e776eSKrzysztof Karas 288e2f0cdbSzkhatami88 struct spdk_nvme_rdma_hooks g_nvmf_hooks = {}; 29f038354eSSeth Howell const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma; 308e2f0cdbSzkhatami88 3154b538d1SDaniel Verkamp /* 326fa48bbfSChen Wang RDMA Connection Resource Defaults 3354b538d1SDaniel Verkamp */ 34a3f09a8fSAlexey Marchuk #define NVMF_DEFAULT_MSDBD 16 351180bf83SSeth Howell #define NVMF_DEFAULT_TX_SGE SPDK_NVMF_MAX_SGL_ENTRIES 361180bf83SSeth Howell #define NVMF_DEFAULT_RSP_SGE 1 3754b538d1SDaniel Verkamp #define NVMF_DEFAULT_RX_SGE 2 3854b538d1SDaniel Verkamp 39e89ae156SAlexey Marchuk #define NVMF_RDMA_MAX_EVENTS_PER_POLL 32 40e89ae156SAlexey Marchuk 41a3f09a8fSAlexey Marchuk SPDK_STATIC_ASSERT(NVMF_DEFAULT_MSDBD <= SPDK_NVMF_MAX_SGL_ENTRIES, 42a3f09a8fSAlexey Marchuk "MSDBD must not exceed SPDK_NVMF_MAX_SGL_ENTRIES"); 43a3f09a8fSAlexey Marchuk 442a0772e3SBen Walker /* The RDMA completion queue size */ 45db5c3ce3SXiaodong Liu #define DEFAULT_NVMF_RDMA_CQ_SIZE 4096 46db5c3ce3SXiaodong Liu #define MAX_WR_PER_QP(queue_depth) (queue_depth * 3 + 2) 472a0772e3SBen Walker 483c423f40SBen Walker enum spdk_nvmf_rdma_request_state { 493c423f40SBen Walker /* The request is not currently in use */ 503c423f40SBen Walker RDMA_REQUEST_STATE_FREE = 0, 513c423f40SBen Walker 523c423f40SBen Walker /* Initial state when request first received */ 533c423f40SBen Walker RDMA_REQUEST_STATE_NEW, 543c423f40SBen Walker 553c423f40SBen Walker /* The request is queued until a data buffer is available. */ 563c423f40SBen Walker RDMA_REQUEST_STATE_NEED_BUFFER, 573c423f40SBen Walker 583c423f40SBen Walker /* The request is waiting on RDMA queue depth availability 591d0a8e1cSSeth Howell * to transfer data from the host to the controller. 603c423f40SBen Walker */ 611d0a8e1cSSeth Howell RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 623c423f40SBen Walker 633c423f40SBen Walker /* The request is currently transferring data from the host to the controller. 
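 * In practice this means one or more RDMA READ work requests posted by
 * request_transfer_in() are still outstanding; they are accounted for in the
 * qpair's current_read_depth and current_send_depth until their completions
 * are reaped.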
*/ 643c423f40SBen Walker RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 653c423f40SBen Walker 663c423f40SBen Walker /* The request is ready to execute at the block device */ 673c423f40SBen Walker RDMA_REQUEST_STATE_READY_TO_EXECUTE, 683c423f40SBen Walker 693c423f40SBen Walker /* The request is currently executing at the block device */ 703c423f40SBen Walker RDMA_REQUEST_STATE_EXECUTING, 713c423f40SBen Walker 723c423f40SBen Walker /* The request finished executing at the block device */ 733c423f40SBen Walker RDMA_REQUEST_STATE_EXECUTED, 743c423f40SBen Walker 751d0a8e1cSSeth Howell /* The request is waiting on RDMA queue depth availability 761d0a8e1cSSeth Howell * to transfer data from the controller to the host. 771d0a8e1cSSeth Howell */ 781d0a8e1cSSeth Howell RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 791d0a8e1cSSeth Howell 8004cd8e47SAlexey Marchuk /* The request is waiting on RDMA queue depth availability 8104cd8e47SAlexey Marchuk * to send response to the host. 8204cd8e47SAlexey Marchuk */ 8304cd8e47SAlexey Marchuk RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING, 8404cd8e47SAlexey Marchuk 853c423f40SBen Walker /* The request is ready to send a completion */ 863c423f40SBen Walker RDMA_REQUEST_STATE_READY_TO_COMPLETE, 873c423f40SBen Walker 88fdec444aSPhilipp Skadorov /* The request is currently transferring data from the controller to the host. */ 89fdec444aSPhilipp Skadorov RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 90fdec444aSPhilipp Skadorov 91fdec444aSPhilipp Skadorov /* The request currently has an outstanding completion without an 92fdec444aSPhilipp Skadorov * associated data transfer. 93fdec444aSPhilipp Skadorov */ 943c423f40SBen Walker RDMA_REQUEST_STATE_COMPLETING, 953c423f40SBen Walker 963c423f40SBen Walker /* The request completed and can be marked free. 
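 * Once processed in this state the request is returned to
 * resources->free_queue, the qpair's qd counter is decremented, and the
 * state goes back to RDMA_REQUEST_STATE_FREE.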
*/ 973c423f40SBen Walker RDMA_REQUEST_STATE_COMPLETED, 98fdec444aSPhilipp Skadorov 99fdec444aSPhilipp Skadorov /* Terminator */ 100fdec444aSPhilipp Skadorov RDMA_REQUEST_NUM_STATES, 1013c423f40SBen Walker }; 1023c423f40SBen Walker 1030eae0106SJim Harris static void 1040eae0106SJim Harris nvmf_trace(void) 1056a5ae72bSBen Walker { 1066a5ae72bSBen Walker spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r'); 10746d7b94fSAtul Malakar 10846d7b94fSAtul Malakar struct spdk_trace_tpoint_opts opts[] = { 10946d7b94fSAtul Malakar { 11046d7b94fSAtul Malakar "RDMA_REQ_NEW", TRACE_RDMA_REQUEST_STATE_NEW, 11126d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 1, 11246d7b94fSAtul Malakar { 11346d7b94fSAtul Malakar { "qpair", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 11446d7b94fSAtul Malakar { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 11546d7b94fSAtul Malakar } 11646d7b94fSAtul Malakar }, 11746d7b94fSAtul Malakar { 11846d7b94fSAtul Malakar "RDMA_REQ_COMPLETED", TRACE_RDMA_REQUEST_STATE_COMPLETED, 11946d7b94fSAtul Malakar OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 12046d7b94fSAtul Malakar { 12146d7b94fSAtul Malakar { "qpair", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 12246d7b94fSAtul Malakar { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 12346d7b94fSAtul Malakar } 12446d7b94fSAtul Malakar }, 12546d7b94fSAtul Malakar }; 12646d7b94fSAtul Malakar 12746d7b94fSAtul Malakar spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 128617184beSJim Harris spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 12926d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 130441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 131b6206d65SJim Harris spdk_trace_register_description("RDMA_REQ_TX_PENDING_C2H", 1321d0a8e1cSSeth Howell TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 13326d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 134441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 135b6206d65SJim Harris spdk_trace_register_description("RDMA_REQ_TX_PENDING_H2C", 1361d0a8e1cSSeth Howell TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 13726d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 138441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 139b6206d65SJim Harris spdk_trace_register_description("RDMA_REQ_TX_H2C", 1406a5ae72bSBen Walker TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 14126d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 142441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 143617184beSJim Harris spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE", 1446a5ae72bSBen Walker TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 14526d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 146441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 147617184beSJim Harris spdk_trace_register_description("RDMA_REQ_EXECUTING", 1486a5ae72bSBen Walker TRACE_RDMA_REQUEST_STATE_EXECUTING, 14926d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 150441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 151617184beSJim Harris spdk_trace_register_description("RDMA_REQ_EXECUTED", 1526a5ae72bSBen Walker TRACE_RDMA_REQUEST_STATE_EXECUTED, 15326d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 154441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 155dd1939d8SAlexey Marchuk spdk_trace_register_description("RDMA_REQ_RDY2COMPL_PEND", 15604cd8e47SAlexey Marchuk TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING, 15726d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 15804cd8e47SAlexey 
Marchuk SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 159b6206d65SJim Harris spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPL", 1606a5ae72bSBen Walker TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 16126d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 162441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 163b6206d65SJim Harris spdk_trace_register_description("RDMA_REQ_COMPLETING_C2H", 164fdec444aSPhilipp Skadorov TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 16526d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 166441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 167b6206d65SJim Harris spdk_trace_register_description("RDMA_REQ_COMPLETING", 1686a5ae72bSBen Walker TRACE_RDMA_REQUEST_STATE_COMPLETING, 16926d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0, 170441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_PTR, "qpair"); 171e8881867SJim Harris 172617184beSJim Harris spdk_trace_register_description("RDMA_QP_CREATE", TRACE_RDMA_QP_CREATE, 17326d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NONE, 0, 17440cf86f2SJim Harris SPDK_TRACE_ARG_TYPE_INT, ""); 175617184beSJim Harris spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", TRACE_RDMA_IBV_ASYNC_EVENT, 17626d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NONE, 0, 177441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_INT, "type"); 178617184beSJim Harris spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", TRACE_RDMA_CM_ASYNC_EVENT, 17926d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NONE, 0, 180441431d2SKonrad Sztyber SPDK_TRACE_ARG_TYPE_INT, "type"); 181617184beSJim Harris spdk_trace_register_description("RDMA_QP_DISCONNECT", TRACE_RDMA_QP_DISCONNECT, 18226d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NONE, 0, 18340cf86f2SJim Harris SPDK_TRACE_ARG_TYPE_INT, ""); 184617184beSJim Harris spdk_trace_register_description("RDMA_QP_DESTROY", TRACE_RDMA_QP_DESTROY, 18526d44a12SJim Harris OWNER_TYPE_NONE, OBJECT_NONE, 0, 18640cf86f2SJim Harris SPDK_TRACE_ARG_TYPE_INT, ""); 1879937c016Sxupeng-mingtu 1889937c016Sxupeng-mingtu spdk_trace_tpoint_register_relation(TRACE_BDEV_IO_START, OBJECT_NVMF_RDMA_IO, 1); 1899937c016Sxupeng-mingtu spdk_trace_tpoint_register_relation(TRACE_BDEV_IO_DONE, OBJECT_NVMF_RDMA_IO, 0); 1906a5ae72bSBen Walker } 1910eae0106SJim Harris SPDK_TRACE_REGISTER_FN(nvmf_trace, "nvmf_rdma", TRACE_GROUP_NVMF_RDMA) 1926a5ae72bSBen Walker 19350a438d3SBen Walker enum spdk_nvmf_rdma_wr_type { 19450a438d3SBen Walker RDMA_WR_TYPE_RECV, 19550a438d3SBen Walker RDMA_WR_TYPE_SEND, 19650a438d3SBen Walker RDMA_WR_TYPE_DATA, 19750a438d3SBen Walker }; 19850a438d3SBen Walker 19950a438d3SBen Walker struct spdk_nvmf_rdma_wr { 2008288fcf9SAlexey Marchuk /* Uses enum spdk_nvmf_rdma_wr_type */ 2018288fcf9SAlexey Marchuk uint8_t type; 20250a438d3SBen Walker }; 20350a438d3SBen Walker 2041db3a037SBen Walker /* This structure holds commands as they are received off the wire. 2051db3a037SBen Walker * It must be dynamically paired with a full request object 2061db3a037SBen Walker * (spdk_nvmf_rdma_request) to service a request. It is separate 2071db3a037SBen Walker * from the request because RDMA does not appear to order 2081db3a037SBen Walker * completions, so occasionally we'll get a new incoming 2091db3a037SBen Walker * command when there aren't any free request objects. 
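 * Such receives are parked on resources->incoming_queue until a request
 * object can be taken from resources->free_queue and paired with them.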
2101db3a037SBen Walker */ 2111db3a037SBen Walker struct spdk_nvmf_rdma_recv { 2121db3a037SBen Walker struct ibv_recv_wr wr; 2131db3a037SBen Walker struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE]; 2141db3a037SBen Walker 2152a0772e3SBen Walker struct spdk_nvmf_rdma_qpair *qpair; 2162a0772e3SBen Walker 2171db3a037SBen Walker /* In-capsule data buffer */ 2181db3a037SBen Walker uint8_t *buf; 2191db3a037SBen Walker 22050a438d3SBen Walker struct spdk_nvmf_rdma_wr rdma_wr; 221fbe8f804SEvgeniy Kochetov uint64_t receive_tsc; 22250a438d3SBen Walker 22380eecdd8SSeth Howell STAILQ_ENTRY(spdk_nvmf_rdma_recv) link; 2241db3a037SBen Walker }; 2251db3a037SBen Walker 226cf73fb2fSSeth Howell struct spdk_nvmf_rdma_request_data { 227cf73fb2fSSeth Howell struct ibv_send_wr wr; 228cf73fb2fSSeth Howell struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES]; 229cf73fb2fSSeth Howell }; 230cf73fb2fSSeth Howell 2315e152960SBen Walker struct spdk_nvmf_rdma_request { 2325e152960SBen Walker struct spdk_nvmf_request req; 2335e152960SBen Walker 2348288fcf9SAlexey Marchuk bool fused_failed; 2358288fcf9SAlexey Marchuk 2368288fcf9SAlexey Marchuk struct spdk_nvmf_rdma_wr data_wr; 2378288fcf9SAlexey Marchuk struct spdk_nvmf_rdma_wr rsp_wr; 2388288fcf9SAlexey Marchuk 2398288fcf9SAlexey Marchuk /* Uses enum spdk_nvmf_rdma_request_state */ 2408288fcf9SAlexey Marchuk uint8_t state; 2413c423f40SBen Walker 242019a5361SAlexey Marchuk /* Data offset in req.iov */ 243019a5361SAlexey Marchuk uint32_t offset; 244019a5361SAlexey Marchuk 2451db3a037SBen Walker struct spdk_nvmf_rdma_recv *recv; 2460239003aSZiye Yang 2470239003aSZiye Yang struct { 2484e742338SZiye Yang struct ibv_send_wr wr; 2491180bf83SSeth Howell struct ibv_sge sgl[NVMF_DEFAULT_RSP_SGE]; 2504e742338SZiye Yang } rsp; 2514e742338SZiye Yang 252ca59dd5dSAlexey Marchuk uint16_t iovpos; 253ca59dd5dSAlexey Marchuk uint16_t num_outstanding_data_wr; 254ca59dd5dSAlexey Marchuk /* Used to split Write IO with multi SGL payload */ 255ca59dd5dSAlexey Marchuk uint16_t num_remaining_data_wr; 256fbe8f804SEvgeniy Kochetov uint64_t receive_tsc; 257183c3485SJim Harris struct spdk_nvmf_rdma_request *fused_pair; 25804ebc6eaSSeth Howell STAILQ_ENTRY(spdk_nvmf_rdma_request) state_link; 259ca59dd5dSAlexey Marchuk struct ibv_send_wr *remaining_tranfer_in_wrs; 2605a6e7a41SAlexey Marchuk struct ibv_send_wr *transfer_wr; 2618288fcf9SAlexey Marchuk struct spdk_nvmf_rdma_request_data data; 2625e152960SBen Walker }; 2635e152960SBen Walker 2640d3fcd10SSeth Howell struct spdk_nvmf_rdma_resource_opts { 2650d3fcd10SSeth Howell struct spdk_nvmf_rdma_qpair *qpair; 2660d3fcd10SSeth Howell /* qp points either to an ibv_qp object or an ibv_srq object depending on the value of shared. */ 2670d3fcd10SSeth Howell void *qp; 2688a01b4d6SAlexey Marchuk struct spdk_rdma_utils_mem_map *map; 2690d3fcd10SSeth Howell uint32_t max_queue_depth; 2700d3fcd10SSeth Howell uint32_t in_capsule_data_size; 2710d3fcd10SSeth Howell bool shared; 2720d3fcd10SSeth Howell }; 2730d3fcd10SSeth Howell 274b25751d9SBen Walker struct spdk_nvmf_rdma_resources { 275b25751d9SBen Walker /* Array of size "max_queue_depth" containing RDMA requests. */ 276b25751d9SBen Walker struct spdk_nvmf_rdma_request *reqs; 277b25751d9SBen Walker 278b25751d9SBen Walker /* Array of size "max_queue_depth" containing RDMA recvs. */ 279b25751d9SBen Walker struct spdk_nvmf_rdma_recv *recvs; 280b25751d9SBen Walker 281b25751d9SBen Walker /* Array of size "max_queue_depth" containing 64 byte capsules 282b25751d9SBen Walker * used for receive. 
283b25751d9SBen Walker */ 284b25751d9SBen Walker union nvmf_h2c_msg *cmds; 285b25751d9SBen Walker 286b25751d9SBen Walker /* Array of size "max_queue_depth" containing 16 byte completions 287b25751d9SBen Walker * to be sent back to the user. 288b25751d9SBen Walker */ 289b25751d9SBen Walker union nvmf_c2h_msg *cpls; 290b25751d9SBen Walker 291b25751d9SBen Walker /* Array of size "max_queue_depth * InCapsuleDataSize" containing 292b25751d9SBen Walker * buffers to be used for in capsule data. 293b25751d9SBen Walker */ 294b25751d9SBen Walker void *bufs; 295b25751d9SBen Walker 296b25751d9SBen Walker /* Receives that are waiting for a request object */ 297b25751d9SBen Walker STAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue; 298b25751d9SBen Walker 299b25751d9SBen Walker /* Queue to track free requests */ 300b25751d9SBen Walker STAILQ_HEAD(, spdk_nvmf_rdma_request) free_queue; 301b25751d9SBen Walker }; 302b25751d9SBen Walker 303dc84fbaaSAlexey Marchuk typedef void (*spdk_nvmf_rdma_qpair_ibv_event)(struct spdk_nvmf_rdma_qpair *rqpair); 304dc84fbaaSAlexey Marchuk 3058ddc5cd4Ssijie.sun typedef void (*spdk_poller_destroy_cb)(void *ctx); 3068ddc5cd4Ssijie.sun 307dc84fbaaSAlexey Marchuk struct spdk_nvmf_rdma_ibv_event_ctx { 308dc84fbaaSAlexey Marchuk struct spdk_nvmf_rdma_qpair *rqpair; 309dc84fbaaSAlexey Marchuk }; 310dc84fbaaSAlexey Marchuk 3111d304bc5SBen Walker struct spdk_nvmf_rdma_qpair { 3121d304bc5SBen Walker struct spdk_nvmf_qpair qpair; 313dcc055e3SDaniel Verkamp 3148209c8cfSSeth Howell struct spdk_nvmf_rdma_device *device; 3158b79ef33SBen Walker struct spdk_nvmf_rdma_poller *poller; 3162b7b41eeSBen Walker 317cf151d60SAlexey Marchuk struct spdk_rdma_provider_qp *rdma_qp; 318dcc055e3SDaniel Verkamp struct rdma_cm_id *cm_id; 319cf151d60SAlexey Marchuk struct spdk_rdma_provider_srq *srq; 320311ce0e2SBen Walker struct rdma_cm_id *listen_id; 321dcc055e3SDaniel Verkamp 32252f7aeb7SShuhei Matsumoto /* Cache the QP number to improve QP search by RB tree. */ 32352f7aeb7SShuhei Matsumoto uint32_t qp_num; 32452f7aeb7SShuhei Matsumoto 325ca0c1338SBen Walker /* The maximum number of I/O outstanding on this connection at one time */ 326caf88609SBen Walker uint16_t max_queue_depth; 327ca0c1338SBen Walker 3287289d370SSeth Howell /* The maximum number of active RDMA READ and ATOMIC operations at one time */ 3297289d370SSeth Howell uint16_t max_read_depth; 330ca0c1338SBen Walker 3314e614b31SBen Walker /* The maximum number of RDMA SEND operations at one time */ 332158dc947SSeth Howell uint32_t max_send_depth; 333158dc947SSeth Howell 334158dc947SSeth Howell /* The current number of outstanding WRs from this qpair's 335158dc947SSeth Howell * recv queue. Should not exceed device->attr.max_queue_depth. 336158dc947SSeth Howell */ 337158dc947SSeth Howell uint16_t current_recv_depth; 338158dc947SSeth Howell 3399f7582c3SSeth Howell /* The current number of active RDMA READ operations */ 3409f7582c3SSeth Howell uint16_t current_read_depth; 3419f7582c3SSeth Howell 342158dc947SSeth Howell /* The current number of posted WRs from this qpair's 343158dc947SSeth Howell * send queue. Should not exceed max_send_depth. 
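 * As a rough accounting sketch: each request consumes one SEND work request
 * for its response plus num_outstanding_data_wr RDMA READ/WRITE work
 * requests, which is why nvmf_rdma_qpair_initialize() sizes cap.max_send_wr
 * as max_queue_depth * 2.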
344158dc947SSeth Howell */ 345158dc947SSeth Howell uint32_t current_send_depth; 346158dc947SSeth Howell 3471180bf83SSeth Howell /* The maximum number of SGEs per WR on the send queue */ 3481180bf83SSeth Howell uint32_t max_send_sge; 3491180bf83SSeth Howell 3501180bf83SSeth Howell /* The maximum number of SGEs per WR on the recv queue */ 3511180bf83SSeth Howell uint32_t max_recv_sge; 3521180bf83SSeth Howell 353b25751d9SBen Walker struct spdk_nvmf_rdma_resources *resources; 35404ebc6eaSSeth Howell 35504ebc6eaSSeth Howell STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_read_queue; 35604ebc6eaSSeth Howell 35704ebc6eaSSeth Howell STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_write_queue; 3581db3a037SBen Walker 35904cd8e47SAlexey Marchuk STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_send_queue; 36004cd8e47SAlexey Marchuk 361bfdc957cSSeth Howell /* Number of requests not in the free state */ 362bfdc957cSSeth Howell uint32_t qd; 363caf88609SBen Walker 364feeaa282SAlexey Marchuk bool ibv_in_error_state; 365feeaa282SAlexey Marchuk 36652f7aeb7SShuhei Matsumoto RB_ENTRY(spdk_nvmf_rdma_qpair) node; 3676f95c325SZiye Yang 36814777890SSeth Howell STAILQ_ENTRY(spdk_nvmf_rdma_qpair) recv_link; 36914777890SSeth Howell 370b4dc10fbSSeth Howell STAILQ_ENTRY(spdk_nvmf_rdma_qpair) send_link; 371b4dc10fbSSeth Howell 372183c3485SJim Harris /* Points to a request that has fuse bits set to 373183c3485SJim Harris * SPDK_NVME_CMD_FUSE_FIRST, when the qpair is waiting 374183c3485SJim Harris * for the request that has SPDK_NVME_CMD_FUSE_SECOND. 375183c3485SJim Harris */ 376183c3485SJim Harris struct spdk_nvmf_rdma_request *fused_first; 377183c3485SJim Harris 3783d1d4fcfSAlexey Marchuk /* 3793d1d4fcfSAlexey Marchuk * io_channel which is used to destroy the qpair when it is removed from the poll group 3803d1d4fcfSAlexey Marchuk */ 3813d1d4fcfSAlexey Marchuk struct spdk_io_channel *destruct_channel; 3823d1d4fcfSAlexey Marchuk 38343f6d338SJim Harris /* ctx for async processing of last_wqe_reached event */ 38443f6d338SJim Harris struct spdk_nvmf_rdma_ibv_event_ctx *last_wqe_reached_ctx; 385dc84fbaaSAlexey Marchuk 386a9fc7e1dSSeth Howell /* Lets us know that we have received the last_wqe event.
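 * This generally corresponds to the IBV_EVENT_QP_LAST_WQE_REACHED async event
 * raised for an SRQ-attached QP once it has entered the error state and will
 * consume no further SRQ receives, so the qpair can finish draining and be
 * destroyed safely.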
*/ 387a9fc7e1dSSeth Howell bool last_wqe_reached; 38885ff3fceSZiye Yang 38985ff3fceSZiye Yang /* Indicate that nvmf_rdma_close_qpair is called */ 39085ff3fceSZiye Yang bool to_close; 391dcc055e3SDaniel Verkamp }; 392dcc055e3SDaniel Verkamp 39338ab383aSEvgeniy Kochetov struct spdk_nvmf_rdma_poller_stat { 39438ab383aSEvgeniy Kochetov uint64_t completions; 39538ab383aSEvgeniy Kochetov uint64_t polls; 3963caf2e71SAlexey Marchuk uint64_t idle_polls; 397fbe8f804SEvgeniy Kochetov uint64_t requests; 398fbe8f804SEvgeniy Kochetov uint64_t request_latency; 399251db814SEvgeniy Kochetov uint64_t pending_free_request; 400251db814SEvgeniy Kochetov uint64_t pending_rdma_read; 401251db814SEvgeniy Kochetov uint64_t pending_rdma_write; 40204cd8e47SAlexey Marchuk uint64_t pending_rdma_send; 403cf151d60SAlexey Marchuk struct spdk_rdma_provider_qp_stats qp_stats; 40438ab383aSEvgeniy Kochetov }; 40538ab383aSEvgeniy Kochetov 4063ee93c32SBen Walker struct spdk_nvmf_rdma_poller { 4073ee93c32SBen Walker struct spdk_nvmf_rdma_device *device; 4083ee93c32SBen Walker struct spdk_nvmf_rdma_poll_group *group; 4093ee93c32SBen Walker 410db5c3ce3SXiaodong Liu int num_cqe; 411db5c3ce3SXiaodong Liu int required_num_wr; 4122a0772e3SBen Walker struct ibv_cq *cq; 4132a0772e3SBen Walker 414ed0b611fSEvgeniy Kochetov /* The maximum number of I/O outstanding on the shared receive queue at one time */ 415ed0b611fSEvgeniy Kochetov uint16_t max_srq_depth; 4168ddc5cd4Ssijie.sun bool need_destroy; 417ed0b611fSEvgeniy Kochetov 418ed0b611fSEvgeniy Kochetov /* Shared receive queue */ 419cf151d60SAlexey Marchuk struct spdk_rdma_provider_srq *srq; 420ed0b611fSEvgeniy Kochetov 421b25751d9SBen Walker struct spdk_nvmf_rdma_resources *resources; 42238ab383aSEvgeniy Kochetov struct spdk_nvmf_rdma_poller_stat stat; 423ed0b611fSEvgeniy Kochetov 4248ddc5cd4Ssijie.sun spdk_poller_destroy_cb destroy_cb; 4258ddc5cd4Ssijie.sun void *destroy_cb_ctx; 4268ddc5cd4Ssijie.sun 42752f7aeb7SShuhei Matsumoto RB_HEAD(qpairs_tree, spdk_nvmf_rdma_qpair) qpairs; 4283ee93c32SBen Walker 42914777890SSeth Howell STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_recv; 43014777890SSeth Howell 431b4dc10fbSSeth Howell STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_send; 432b4dc10fbSSeth Howell 4333ee93c32SBen Walker TAILQ_ENTRY(spdk_nvmf_rdma_poller) link; 4343ee93c32SBen Walker }; 4353ee93c32SBen Walker 436251db814SEvgeniy Kochetov struct spdk_nvmf_rdma_poll_group_stat { 437251db814SEvgeniy Kochetov uint64_t pending_data_buffer; 438251db814SEvgeniy Kochetov }; 439251db814SEvgeniy Kochetov 440baa936a1SBen Walker struct spdk_nvmf_rdma_poll_group { 441c1535ca0SBen Walker struct spdk_nvmf_transport_poll_group group; 442251db814SEvgeniy Kochetov struct spdk_nvmf_rdma_poll_group_stat stat; 443645d5944SAlexey Marchuk TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers; 444645d5944SAlexey Marchuk TAILQ_ENTRY(spdk_nvmf_rdma_poll_group) link; 445d7b8da3bSBen Walker }; 446d7b8da3bSBen Walker 447645d5944SAlexey Marchuk struct spdk_nvmf_rdma_conn_sched { 448645d5944SAlexey Marchuk struct spdk_nvmf_rdma_poll_group *next_admin_pg; 449645d5944SAlexey Marchuk struct spdk_nvmf_rdma_poll_group *next_io_pg; 450645d5944SAlexey Marchuk }; 451645d5944SAlexey Marchuk 452958c68f1SBen Walker /* Assuming rdma_cm uses just one protection domain per ibv_context. 
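 * Under that assumption a single ibv_pd and spdk_rdma_utils_mem_map are
 * cached here per device and shared by every listener and qpair using it.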
*/ 453958c68f1SBen Walker struct spdk_nvmf_rdma_device { 454958c68f1SBen Walker struct ibv_device_attr attr; 455958c68f1SBen Walker struct ibv_context *context; 456958c68f1SBen Walker 4578a01b4d6SAlexey Marchuk struct spdk_rdma_utils_mem_map *map; 458916d1f4fSBen Walker struct ibv_pd *pd; 459916d1f4fSBen Walker 46061948a1cSSeth Howell int num_srq; 4618ddc5cd4Ssijie.sun bool need_destroy; 4628ddc5cd4Ssijie.sun bool ready_to_destroy; 463549be9adSsijie.sun bool is_ready; 46461948a1cSSeth Howell 465958c68f1SBen Walker TAILQ_ENTRY(spdk_nvmf_rdma_device) link; 466958c68f1SBen Walker }; 467958c68f1SBen Walker 4681cbc2b16SBen Walker struct spdk_nvmf_rdma_port { 4696d8f1fc6SJacek Kalwas const struct spdk_nvme_transport_id *trid; 4702641c31aSChangpeng Liu struct rdma_cm_id *id; 471958c68f1SBen Walker struct spdk_nvmf_rdma_device *device; 4721cbc2b16SBen Walker TAILQ_ENTRY(spdk_nvmf_rdma_port) link; 4732641c31aSChangpeng Liu }; 4742641c31aSChangpeng Liu 475f766d1e4SDarek Stojaczyk struct rdma_transport_opts { 47697ef8701SMonica Kenguva int num_cqe; 477f766d1e4SDarek Stojaczyk uint32_t max_srq_depth; 478f766d1e4SDarek Stojaczyk bool no_srq; 479c818233bSIvan Betsis bool no_wr_batching; 480f766d1e4SDarek Stojaczyk int acceptor_backlog; 481f766d1e4SDarek Stojaczyk }; 482f766d1e4SDarek Stojaczyk 483ecc436fcSBen Walker struct spdk_nvmf_rdma_transport { 484ecc436fcSBen Walker struct spdk_nvmf_transport transport; 485f766d1e4SDarek Stojaczyk struct rdma_transport_opts rdma_opts; 486ecc436fcSBen Walker 487645d5944SAlexey Marchuk struct spdk_nvmf_rdma_conn_sched conn_sched; 488645d5944SAlexey Marchuk 489a0a92ff4SBen Walker struct rdma_event_channel *event_channel; 490a0a92ff4SBen Walker 491cf73fb2fSSeth Howell struct spdk_mempool *data_wr_pool; 492cf73fb2fSSeth Howell 49343022da3SJacek Kalwas struct spdk_poller *accept_poller; 494756df044SBen Walker 495b6f90c52SPhilipp Skadorov /* fields used to poll RDMA/IB events */ 496b6f90c52SPhilipp Skadorov nfds_t npoll_fds; 497b6f90c52SPhilipp Skadorov struct pollfd *poll_fds; 498b6f90c52SPhilipp Skadorov 499958c68f1SBen Walker TAILQ_HEAD(, spdk_nvmf_rdma_device) devices; 5001cbc2b16SBen Walker TAILQ_HEAD(, spdk_nvmf_rdma_port) ports; 501645d5944SAlexey Marchuk TAILQ_HEAD(, spdk_nvmf_rdma_poll_group) poll_groups; 502549be9adSsijie.sun 503549be9adSsijie.sun /* ports that are removed unexpectedly and need retry listen */ 504549be9adSsijie.sun TAILQ_HEAD(, spdk_nvmf_rdma_port) retry_ports; 5051b9cc2a9SBen Walker }; 5060f912a0eSDaniel Verkamp 5078ddc5cd4Ssijie.sun struct poller_manage_ctx { 5088ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_transport *rtransport; 5098ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_poll_group *rgroup; 5108ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_poller *rpoller; 5118ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_device *device; 5128ddc5cd4Ssijie.sun 5138ddc5cd4Ssijie.sun struct spdk_thread *thread; 5148ddc5cd4Ssijie.sun volatile int *inflight_op_counter; 5158ddc5cd4Ssijie.sun }; 5168ddc5cd4Ssijie.sun 517f766d1e4SDarek Stojaczyk static const struct spdk_json_object_decoder rdma_transport_opts_decoder[] = { 518f766d1e4SDarek Stojaczyk { 51997ef8701SMonica Kenguva "num_cqe", offsetof(struct rdma_transport_opts, num_cqe), 52097ef8701SMonica Kenguva spdk_json_decode_int32, true 52197ef8701SMonica Kenguva }, 52297ef8701SMonica Kenguva { 523f766d1e4SDarek Stojaczyk "max_srq_depth", offsetof(struct rdma_transport_opts, max_srq_depth), 524f766d1e4SDarek Stojaczyk spdk_json_decode_uint32, true 525f766d1e4SDarek Stojaczyk }, 526f766d1e4SDarek Stojaczyk { 527f766d1e4SDarek 
Stojaczyk "no_srq", offsetof(struct rdma_transport_opts, no_srq), 528f766d1e4SDarek Stojaczyk spdk_json_decode_bool, true 529f766d1e4SDarek Stojaczyk }, 530f766d1e4SDarek Stojaczyk { 531bd3840a7SIvan Betsis "no_wr_batching", offsetof(struct rdma_transport_opts, no_wr_batching), 532bd3840a7SIvan Betsis spdk_json_decode_bool, true 533bd3840a7SIvan Betsis }, 534bd3840a7SIvan Betsis { 535f766d1e4SDarek Stojaczyk "acceptor_backlog", offsetof(struct rdma_transport_opts, acceptor_backlog), 536f766d1e4SDarek Stojaczyk spdk_json_decode_int32, true 537f766d1e4SDarek Stojaczyk }, 538f766d1e4SDarek Stojaczyk }; 539f766d1e4SDarek Stojaczyk 54052f7aeb7SShuhei Matsumoto static int 54152f7aeb7SShuhei Matsumoto nvmf_rdma_qpair_compare(struct spdk_nvmf_rdma_qpair *rqpair1, struct spdk_nvmf_rdma_qpair *rqpair2) 54252f7aeb7SShuhei Matsumoto { 543d8a10574SShuhei Matsumoto return rqpair1->qp_num < rqpair2->qp_num ? -1 : rqpair1->qp_num > rqpair2->qp_num; 54452f7aeb7SShuhei Matsumoto } 54552f7aeb7SShuhei Matsumoto 54652f7aeb7SShuhei Matsumoto RB_GENERATE_STATIC(qpairs_tree, spdk_nvmf_rdma_qpair, node, nvmf_rdma_qpair_compare); 54752f7aeb7SShuhei Matsumoto 5488dd1cd21SBen Walker static bool nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, 549f8cbdf2cSAlexey Marchuk struct spdk_nvmf_rdma_request *rdma_req); 550f8cbdf2cSAlexey Marchuk 5518dd1cd21SBen Walker static void _poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport, 552c818233bSIvan Betsis struct spdk_nvmf_rdma_poller *rpoller); 553c818233bSIvan Betsis 5548dd1cd21SBen Walker static void _poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport, 555c818233bSIvan Betsis struct spdk_nvmf_rdma_poller *rpoller); 556c818233bSIvan Betsis 5578ddc5cd4Ssijie.sun static void _nvmf_rdma_remove_destroyed_device(void *c); 5588ddc5cd4Ssijie.sun 5597545e8c8SAlexey Marchuk static inline enum spdk_nvme_media_error_status_code 56055d8d943SSeth Howell nvmf_rdma_dif_error_to_compl_status(uint8_t err_type) { 5617545e8c8SAlexey Marchuk enum spdk_nvme_media_error_status_code result; 5627545e8c8SAlexey Marchuk switch (err_type) 5637545e8c8SAlexey Marchuk { 5647545e8c8SAlexey Marchuk case SPDK_DIF_REFTAG_ERROR: 5657545e8c8SAlexey Marchuk result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR; 5667545e8c8SAlexey Marchuk break; 5677545e8c8SAlexey Marchuk case SPDK_DIF_APPTAG_ERROR: 5687545e8c8SAlexey Marchuk result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR; 5697545e8c8SAlexey Marchuk break; 5707545e8c8SAlexey Marchuk case SPDK_DIF_GUARD_ERROR: 5717545e8c8SAlexey Marchuk result = SPDK_NVME_SC_GUARD_CHECK_ERROR; 5727545e8c8SAlexey Marchuk break; 5737545e8c8SAlexey Marchuk default: 5747545e8c8SAlexey Marchuk SPDK_UNREACHABLE(); 5757545e8c8SAlexey Marchuk } 5767545e8c8SAlexey Marchuk 5777545e8c8SAlexey Marchuk return result; 5787545e8c8SAlexey Marchuk } 5797545e8c8SAlexey Marchuk 58090d91cd3SAlexey Marchuk /* 58190d91cd3SAlexey Marchuk * Return data_wrs to pool starting from \b data_wr 58290d91cd3SAlexey Marchuk * Request's own response and data WR are excluded 58390d91cd3SAlexey Marchuk */ 5841cfff49fSBen Walker static void 58590d91cd3SAlexey Marchuk _nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req, 5865a6e7a41SAlexey Marchuk struct ibv_send_wr *data_wr, 58790d91cd3SAlexey Marchuk struct spdk_mempool *pool) 58862700dacSSeth Howell { 5896375b60cSAlexey Marchuk struct spdk_nvmf_rdma_request_data *work_requests[SPDK_NVMF_MAX_SGL_ENTRIES]; 5905a6e7a41SAlexey Marchuk struct spdk_nvmf_rdma_request_data *nvmf_data; 59101887d3cSEvgeniy 
Kochetov struct ibv_send_wr *next_send_wr; 592ca59dd5dSAlexey Marchuk uint64_t req_wrid = (uint64_t)&rdma_req->data_wr; 5936375b60cSAlexey Marchuk uint32_t num_wrs = 0; 59462700dacSSeth Howell 5955a6e7a41SAlexey Marchuk while (data_wr && data_wr->wr_id == req_wrid) { 5965a6e7a41SAlexey Marchuk nvmf_data = SPDK_CONTAINEROF(data_wr, struct spdk_nvmf_rdma_request_data, wr); 5975a6e7a41SAlexey Marchuk memset(nvmf_data->sgl, 0, sizeof(data_wr->sg_list[0]) * data_wr->num_sge); 5985a6e7a41SAlexey Marchuk data_wr->num_sge = 0; 5995a6e7a41SAlexey Marchuk next_send_wr = data_wr->next; 6005a6e7a41SAlexey Marchuk if (data_wr != &rdma_req->data.wr) { 6015a6e7a41SAlexey Marchuk data_wr->next = NULL; 6026375b60cSAlexey Marchuk assert(num_wrs < SPDK_NVMF_MAX_SGL_ENTRIES); 6035a6e7a41SAlexey Marchuk work_requests[num_wrs] = nvmf_data; 6046375b60cSAlexey Marchuk num_wrs++; 60562700dacSSeth Howell } 6065a6e7a41SAlexey Marchuk data_wr = (!next_send_wr || next_send_wr == &rdma_req->rsp.wr) ? NULL : next_send_wr; 60762700dacSSeth Howell } 6086375b60cSAlexey Marchuk 6096375b60cSAlexey Marchuk if (num_wrs) { 6106375b60cSAlexey Marchuk spdk_mempool_put_bulk(pool, (void **) work_requests, num_wrs); 6116375b60cSAlexey Marchuk } 61290d91cd3SAlexey Marchuk } 61390d91cd3SAlexey Marchuk 61490d91cd3SAlexey Marchuk static void 61590d91cd3SAlexey Marchuk nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req, 61690d91cd3SAlexey Marchuk struct spdk_nvmf_rdma_transport *rtransport) 61790d91cd3SAlexey Marchuk { 61890d91cd3SAlexey Marchuk rdma_req->num_outstanding_data_wr = 0; 61990d91cd3SAlexey Marchuk 6205a6e7a41SAlexey Marchuk _nvmf_rdma_request_free_data(rdma_req, rdma_req->transfer_wr, rtransport->data_wr_pool); 62190d91cd3SAlexey Marchuk 62270683284SAlexey Marchuk if (rdma_req->remaining_tranfer_in_wrs) { 62370683284SAlexey Marchuk _nvmf_rdma_request_free_data(rdma_req, rdma_req->remaining_tranfer_in_wrs, 62470683284SAlexey Marchuk rtransport->data_wr_pool); 62570683284SAlexey Marchuk rdma_req->remaining_tranfer_in_wrs = NULL; 62670683284SAlexey Marchuk } 62770683284SAlexey Marchuk 6287fbda6d9SAlexey Marchuk rdma_req->data.wr.next = NULL; 6297fbda6d9SAlexey Marchuk rdma_req->rsp.wr.next = NULL; 63062700dacSSeth Howell } 63162700dacSSeth Howell 63262700dacSSeth Howell static void 633fa757dc9SSeth Howell nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req) 634fa757dc9SSeth Howell { 635005b053aSShuhei Matsumoto SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", req->req.data_from_pool); 6369d838d24Syidong0635 if (req->req.cmd) { 637fa757dc9SSeth Howell SPDK_ERRLOG("\t\tRequest opcode: %d\n", req->req.cmd->nvmf_cmd.opcode); 6389d838d24Syidong0635 } 6399d838d24Syidong0635 if (req->recv) { 640fa757dc9SSeth Howell SPDK_ERRLOG("\t\tRequest recv wr_id%lu\n", req->recv->wr.wr_id); 641fa757dc9SSeth Howell } 6429d838d24Syidong0635 } 643fa757dc9SSeth Howell 644fa757dc9SSeth Howell static void 645fa757dc9SSeth Howell nvmf_rdma_dump_qpair_contents(struct spdk_nvmf_rdma_qpair *rqpair) 646fa757dc9SSeth Howell { 647fa757dc9SSeth Howell int i; 64804ebc6eaSSeth Howell 649fa757dc9SSeth Howell SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", rqpair->qpair.qid); 65004ebc6eaSSeth Howell for (i = 0; i < rqpair->max_queue_depth; i++) { 651b25751d9SBen Walker if (rqpair->resources->reqs[i].state != RDMA_REQUEST_STATE_FREE) { 652b25751d9SBen Walker nvmf_rdma_dump_request(&rqpair->resources->reqs[i]); 653fa757dc9SSeth Howell } 654fa757dc9SSeth Howell } 655fa757dc9SSeth Howell } 656fa757dc9SSeth Howell 657fa757dc9SSeth Howell 
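/*
 * Illustrative sketch (not part of the original code) of the send WR chain
 * that _nvmf_rdma_request_free_data() above walks. A typical chain looks like:
 *
 *   rdma_req->data.wr -> pool WR -> pool WR -> ... -> rdma_req->rsp.wr (or NULL)
 *
 * Each data WR carries wr_id == (uint64_t)&rdma_req->data_wr, so the loop stops
 * at the response WR (or at the end of the chain), skips the request's embedded
 * data.wr, and returns only the pool-allocated spdk_nvmf_rdma_request_data
 * entries in one spdk_mempool_put_bulk() call.
 */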
static void 658353fbcdaSBen Walker nvmf_rdma_resources_destroy(struct spdk_nvmf_rdma_resources *resources) 659353fbcdaSBen Walker { 66096ec8bffSDarek Stojaczyk spdk_free(resources->cmds); 66196ec8bffSDarek Stojaczyk spdk_free(resources->cpls); 66296ec8bffSDarek Stojaczyk spdk_free(resources->bufs); 663bfcfdb79SOr Gerlitz spdk_free(resources->reqs); 664bfcfdb79SOr Gerlitz spdk_free(resources->recvs); 665353fbcdaSBen Walker free(resources); 666353fbcdaSBen Walker } 667353fbcdaSBen Walker 6680d3fcd10SSeth Howell 6690d3fcd10SSeth Howell static struct spdk_nvmf_rdma_resources * 6700d3fcd10SSeth Howell nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts) 6710d3fcd10SSeth Howell { 6720d3fcd10SSeth Howell struct spdk_nvmf_rdma_resources *resources; 6730d3fcd10SSeth Howell struct spdk_nvmf_rdma_request *rdma_req; 6740d3fcd10SSeth Howell struct spdk_nvmf_rdma_recv *rdma_recv; 675cf151d60SAlexey Marchuk struct spdk_rdma_provider_qp *qp = NULL; 676cf151d60SAlexey Marchuk struct spdk_rdma_provider_srq *srq = NULL; 677696e8580SAlexey Marchuk struct ibv_recv_wr *bad_wr = NULL; 6788a01b4d6SAlexey Marchuk struct spdk_rdma_utils_memory_translation translation; 6790d3fcd10SSeth Howell uint32_t i; 680696e8580SAlexey Marchuk int rc = 0; 6810d3fcd10SSeth Howell 6820d3fcd10SSeth Howell resources = calloc(1, sizeof(struct spdk_nvmf_rdma_resources)); 6830d3fcd10SSeth Howell if (!resources) { 6840d3fcd10SSeth Howell SPDK_ERRLOG("Unable to allocate resources for receive queue.\n"); 6850d3fcd10SSeth Howell return NULL; 6860d3fcd10SSeth Howell } 6870d3fcd10SSeth Howell 688bfcfdb79SOr Gerlitz resources->reqs = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->reqs), 689bfcfdb79SOr Gerlitz 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 690bfcfdb79SOr Gerlitz resources->recvs = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->recvs), 691bfcfdb79SOr Gerlitz 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 69296ec8bffSDarek Stojaczyk resources->cmds = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cmds), 69396ec8bffSDarek Stojaczyk 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 69496ec8bffSDarek Stojaczyk resources->cpls = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cpls), 69596ec8bffSDarek Stojaczyk 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 6960d3fcd10SSeth Howell 6970d3fcd10SSeth Howell if (opts->in_capsule_data_size > 0) { 69896ec8bffSDarek Stojaczyk resources->bufs = spdk_zmalloc(opts->max_queue_depth * opts->in_capsule_data_size, 69996ec8bffSDarek Stojaczyk 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, 70096ec8bffSDarek Stojaczyk SPDK_MALLOC_DMA); 7010d3fcd10SSeth Howell } 7020d3fcd10SSeth Howell 7030d3fcd10SSeth Howell if (!resources->reqs || !resources->recvs || !resources->cmds || 7040d3fcd10SSeth Howell !resources->cpls || (opts->in_capsule_data_size && !resources->bufs)) { 7050d3fcd10SSeth Howell SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n"); 7060d3fcd10SSeth Howell goto cleanup; 7070d3fcd10SSeth Howell } 7080d3fcd10SSeth Howell 709bf41b46cSAleksey Marchuk SPDK_DEBUGLOG(rdma, "Command Array: %p Length: %lx\n", 710bf41b46cSAleksey Marchuk resources->cmds, opts->max_queue_depth * sizeof(*resources->cmds)); 711bf41b46cSAleksey Marchuk SPDK_DEBUGLOG(rdma, "Completion Array: %p Length: %lx\n", 712bf41b46cSAleksey Marchuk resources->cpls, opts->max_queue_depth * sizeof(*resources->cpls)); 713bf41b46cSAleksey Marchuk if (resources->bufs) { 714bf41b46cSAleksey Marchuk SPDK_DEBUGLOG(rdma, "In Capsule Data Array: %p Length: 
%x\n", 7150d3fcd10SSeth Howell resources->bufs, opts->max_queue_depth * 716bf41b46cSAleksey Marchuk opts->in_capsule_data_size); 7170d3fcd10SSeth Howell } 7180d3fcd10SSeth Howell 7190d3fcd10SSeth Howell /* Initialize queues */ 7200d3fcd10SSeth Howell STAILQ_INIT(&resources->incoming_queue); 7210d3fcd10SSeth Howell STAILQ_INIT(&resources->free_queue); 7220d3fcd10SSeth Howell 723696e8580SAlexey Marchuk if (opts->shared) { 724cf151d60SAlexey Marchuk srq = (struct spdk_rdma_provider_srq *)opts->qp; 725696e8580SAlexey Marchuk } else { 726cf151d60SAlexey Marchuk qp = (struct spdk_rdma_provider_qp *)opts->qp; 727696e8580SAlexey Marchuk } 7280d3fcd10SSeth Howell 729696e8580SAlexey Marchuk for (i = 0; i < opts->max_queue_depth; i++) { 7300d3fcd10SSeth Howell rdma_recv = &resources->recvs[i]; 7310d3fcd10SSeth Howell rdma_recv->qpair = opts->qpair; 7320d3fcd10SSeth Howell 7330d3fcd10SSeth Howell /* Set up memory to receive commands */ 7340d3fcd10SSeth Howell if (resources->bufs) { 7350d3fcd10SSeth Howell rdma_recv->buf = (void *)((uintptr_t)resources->bufs + (i * 7360d3fcd10SSeth Howell opts->in_capsule_data_size)); 7370d3fcd10SSeth Howell } 7380d3fcd10SSeth Howell 7390d3fcd10SSeth Howell rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV; 7400d3fcd10SSeth Howell 7410d3fcd10SSeth Howell rdma_recv->sgl[0].addr = (uintptr_t)&resources->cmds[i]; 7420d3fcd10SSeth Howell rdma_recv->sgl[0].length = sizeof(resources->cmds[i]); 7438a01b4d6SAlexey Marchuk rc = spdk_rdma_utils_get_translation(opts->map, &resources->cmds[i], sizeof(resources->cmds[i]), 744bf41b46cSAleksey Marchuk &translation); 745bf41b46cSAleksey Marchuk if (rc) { 746bf41b46cSAleksey Marchuk goto cleanup; 747bf41b46cSAleksey Marchuk } 7488a01b4d6SAlexey Marchuk rdma_recv->sgl[0].lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation); 7490d3fcd10SSeth Howell rdma_recv->wr.num_sge = 1; 7500d3fcd10SSeth Howell 751bf41b46cSAleksey Marchuk if (rdma_recv->buf) { 7520d3fcd10SSeth Howell rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf; 7530d3fcd10SSeth Howell rdma_recv->sgl[1].length = opts->in_capsule_data_size; 7548a01b4d6SAlexey Marchuk rc = spdk_rdma_utils_get_translation(opts->map, rdma_recv->buf, opts->in_capsule_data_size, 7558a01b4d6SAlexey Marchuk &translation); 756bf41b46cSAleksey Marchuk if (rc) { 757bf41b46cSAleksey Marchuk goto cleanup; 758bf41b46cSAleksey Marchuk } 7598a01b4d6SAlexey Marchuk rdma_recv->sgl[1].lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation); 7600d3fcd10SSeth Howell rdma_recv->wr.num_sge++; 7610d3fcd10SSeth Howell } 7620d3fcd10SSeth Howell 7630d3fcd10SSeth Howell rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr; 7640d3fcd10SSeth Howell rdma_recv->wr.sg_list = rdma_recv->sgl; 765d9ff7d09SAlexey Marchuk if (srq) { 766cf151d60SAlexey Marchuk spdk_rdma_provider_srq_queue_recv_wrs(srq, &rdma_recv->wr); 7670d3fcd10SSeth Howell } else { 768cf151d60SAlexey Marchuk spdk_rdma_provider_qp_queue_recv_wrs(qp, &rdma_recv->wr); 7690d3fcd10SSeth Howell } 7700d3fcd10SSeth Howell } 7710d3fcd10SSeth Howell 7720d3fcd10SSeth Howell for (i = 0; i < opts->max_queue_depth; i++) { 7730d3fcd10SSeth Howell rdma_req = &resources->reqs[i]; 7740d3fcd10SSeth Howell 7750d3fcd10SSeth Howell if (opts->qpair != NULL) { 7760d3fcd10SSeth Howell rdma_req->req.qpair = &opts->qpair->qpair; 7770d3fcd10SSeth Howell } else { 7780d3fcd10SSeth Howell rdma_req->req.qpair = NULL; 7790d3fcd10SSeth Howell } 7800d3fcd10SSeth Howell rdma_req->req.cmd = NULL; 7810db0c443SChunsong Feng rdma_req->req.iovcnt = 0; 7820db0c443SChunsong Feng 
rdma_req->req.stripped_data = NULL; 7830d3fcd10SSeth Howell 7840d3fcd10SSeth Howell /* Set up memory to send responses */ 7850d3fcd10SSeth Howell rdma_req->req.rsp = &resources->cpls[i]; 7860d3fcd10SSeth Howell 7870d3fcd10SSeth Howell rdma_req->rsp.sgl[0].addr = (uintptr_t)&resources->cpls[i]; 7880d3fcd10SSeth Howell rdma_req->rsp.sgl[0].length = sizeof(resources->cpls[i]); 7898a01b4d6SAlexey Marchuk rc = spdk_rdma_utils_get_translation(opts->map, &resources->cpls[i], sizeof(resources->cpls[i]), 790bf41b46cSAleksey Marchuk &translation); 791bf41b46cSAleksey Marchuk if (rc) { 792bf41b46cSAleksey Marchuk goto cleanup; 793bf41b46cSAleksey Marchuk } 7948a01b4d6SAlexey Marchuk rdma_req->rsp.sgl[0].lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation); 7950d3fcd10SSeth Howell 7968288fcf9SAlexey Marchuk rdma_req->rsp_wr.type = RDMA_WR_TYPE_SEND; 7978288fcf9SAlexey Marchuk rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp_wr; 7980d3fcd10SSeth Howell rdma_req->rsp.wr.next = NULL; 7990d3fcd10SSeth Howell rdma_req->rsp.wr.opcode = IBV_WR_SEND; 8000d3fcd10SSeth Howell rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED; 8010d3fcd10SSeth Howell rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl; 8020d3fcd10SSeth Howell rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl); 8030d3fcd10SSeth Howell 8040d3fcd10SSeth Howell /* Set up memory for data buffers */ 8058288fcf9SAlexey Marchuk rdma_req->data_wr.type = RDMA_WR_TYPE_DATA; 8068288fcf9SAlexey Marchuk rdma_req->data.wr.wr_id = (uintptr_t)&rdma_req->data_wr; 8070d3fcd10SSeth Howell rdma_req->data.wr.next = NULL; 8080d3fcd10SSeth Howell rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED; 8090d3fcd10SSeth Howell rdma_req->data.wr.sg_list = rdma_req->data.sgl; 8100d3fcd10SSeth Howell rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl); 8110d3fcd10SSeth Howell 8120d3fcd10SSeth Howell /* Initialize request state to FREE */ 8130d3fcd10SSeth Howell rdma_req->state = RDMA_REQUEST_STATE_FREE; 8140d3fcd10SSeth Howell STAILQ_INSERT_TAIL(&resources->free_queue, rdma_req, state_link); 8150d3fcd10SSeth Howell } 8160d3fcd10SSeth Howell 817d9ff7d09SAlexey Marchuk if (srq) { 818cf151d60SAlexey Marchuk rc = spdk_rdma_provider_srq_flush_recv_wrs(srq, &bad_wr); 819d9ff7d09SAlexey Marchuk } else { 820cf151d60SAlexey Marchuk rc = spdk_rdma_provider_qp_flush_recv_wrs(qp, &bad_wr); 821d9ff7d09SAlexey Marchuk } 822d9ff7d09SAlexey Marchuk 823696e8580SAlexey Marchuk if (rc) { 824696e8580SAlexey Marchuk goto cleanup; 825696e8580SAlexey Marchuk } 826696e8580SAlexey Marchuk 8270d3fcd10SSeth Howell return resources; 8280d3fcd10SSeth Howell 8290d3fcd10SSeth Howell cleanup: 8300d3fcd10SSeth Howell nvmf_rdma_resources_destroy(resources); 8310d3fcd10SSeth Howell return NULL; 8320d3fcd10SSeth Howell } 8330d3fcd10SSeth Howell 834353fbcdaSBen Walker static void 83555d8d943SSeth Howell nvmf_rdma_qpair_clean_ibv_events(struct spdk_nvmf_rdma_qpair *rqpair) 836dc84fbaaSAlexey Marchuk { 83743f6d338SJim Harris struct spdk_nvmf_rdma_ibv_event_ctx *ctx; 83843f6d338SJim Harris 83943f6d338SJim Harris ctx = rqpair->last_wqe_reached_ctx; 84043f6d338SJim Harris if (ctx) { 841dc84fbaaSAlexey Marchuk ctx->rqpair = NULL; 8429645421cSJim Harris /* Memory allocated for ctx is freed in nvmf_rdma_qpair_process_last_wqe_event */ 8435e156a6eSJim Harris rqpair->last_wqe_reached_ctx = NULL; 844dc84fbaaSAlexey Marchuk } 845dc84fbaaSAlexey Marchuk } 846dc84fbaaSAlexey Marchuk 8478ddc5cd4Ssijie.sun static void nvmf_rdma_poller_destroy(struct spdk_nvmf_rdma_poller *poller); 8488ddc5cd4Ssijie.sun 
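/*
 * Illustrative usage sketch (not from the original source): the resource set
 * built by nvmf_rdma_resources_create() above is allocated either per qpair or
 * per poller when a shared receive queue (SRQ) is used. A shared setup could
 * look roughly like this, assuming an existing poller, device and transport:
 *
 *   struct spdk_nvmf_rdma_resource_opts opts = {
 *           .qpair = NULL,
 *           .shared = true,
 *           .qp = poller->srq,
 *           .map = device->map,
 *           .max_queue_depth = poller->max_srq_depth,
 *           .in_capsule_data_size = transport->opts.in_capsule_data_size,
 *   };
 *   poller->resources = nvmf_rdma_resources_create(&opts);
 *
 * The per-qpair (non-SRQ) variant in nvmf_rdma_qpair_initialize() below passes
 * the qpair's rdma_qp and shared = false instead.
 */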
849dc84fbaaSAlexey Marchuk static void 85055d8d943SSeth Howell nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair) 8515e152960SBen Walker { 852ed0b611fSEvgeniy Kochetov struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp; 853ed0b611fSEvgeniy Kochetov struct ibv_recv_wr *bad_recv_wr = NULL; 854ed0b611fSEvgeniy Kochetov int rc; 855ed0b611fSEvgeniy Kochetov 856c556b6b8SKonrad Sztyber spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair); 85790b4bd6cSEvgeniy Kochetov 858bfdc957cSSeth Howell if (rqpair->qd != 0) { 859f8cbdf2cSAlexey Marchuk struct spdk_nvmf_qpair *qpair = &rqpair->qpair; 860f8cbdf2cSAlexey Marchuk struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(qpair->transport, 861f8cbdf2cSAlexey Marchuk struct spdk_nvmf_rdma_transport, transport); 862f8cbdf2cSAlexey Marchuk struct spdk_nvmf_rdma_request *req; 863f8cbdf2cSAlexey Marchuk uint32_t i, max_req_count = 0; 864f8cbdf2cSAlexey Marchuk 865f8cbdf2cSAlexey Marchuk SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", rqpair->qd); 866f8cbdf2cSAlexey Marchuk 867fa79f64aSSeth Howell if (rqpair->srq == NULL) { 868fa757dc9SSeth Howell nvmf_rdma_dump_qpair_contents(rqpair); 869f8cbdf2cSAlexey Marchuk max_req_count = rqpair->max_queue_depth; 870f8cbdf2cSAlexey Marchuk } else if (rqpair->poller && rqpair->resources) { 871f8cbdf2cSAlexey Marchuk max_req_count = rqpair->poller->max_srq_depth; 87201201d3eSSeth Howell } 873f8cbdf2cSAlexey Marchuk 8742172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Release incomplete requests\n"); 875f8cbdf2cSAlexey Marchuk for (i = 0; i < max_req_count; i++) { 876f8cbdf2cSAlexey Marchuk req = &rqpair->resources->reqs[i]; 877f8cbdf2cSAlexey Marchuk if (req->req.qpair == qpair && req->state != RDMA_REQUEST_STATE_FREE) { 87855d8d943SSeth Howell /* nvmf_rdma_request_process checks qpair ibv and internal state 879f8cbdf2cSAlexey Marchuk * and completes a request */ 88055d8d943SSeth Howell nvmf_rdma_request_process(rtransport, req); 881f8cbdf2cSAlexey Marchuk } 882f8cbdf2cSAlexey Marchuk } 883f8cbdf2cSAlexey Marchuk assert(rqpair->qd == 0); 8849b47c7e7SBen Walker } 8859b47c7e7SBen Walker 8868b79ef33SBen Walker if (rqpair->poller) { 88752f7aeb7SShuhei Matsumoto RB_REMOVE(qpairs_tree, &rqpair->poller->qpairs, rqpair); 8888b79ef33SBen Walker 889dd90ff7aSJinYu if (rqpair->srq != NULL && rqpair->resources != NULL) { 890ed0b611fSEvgeniy Kochetov /* Drop all received but unprocessed commands for this queue and return them to SRQ */ 891b25751d9SBen Walker STAILQ_FOREACH_SAFE(rdma_recv, &rqpair->resources->incoming_queue, link, recv_tmp) { 892ed0b611fSEvgeniy Kochetov if (rqpair == rdma_recv->qpair) { 8934af2b9bfSAlexey Marchuk STAILQ_REMOVE(&rqpair->resources->incoming_queue, rdma_recv, spdk_nvmf_rdma_recv, link); 894cf151d60SAlexey Marchuk spdk_rdma_provider_srq_queue_recv_wrs(rqpair->srq, &rdma_recv->wr); 895cf151d60SAlexey Marchuk rc = spdk_rdma_provider_srq_flush_recv_wrs(rqpair->srq, &bad_recv_wr); 896ed0b611fSEvgeniy Kochetov if (rc) { 897ed0b611fSEvgeniy Kochetov SPDK_ERRLOG("Unable to re-post rx descriptor\n"); 898ed0b611fSEvgeniy Kochetov } 899ed0b611fSEvgeniy Kochetov } 900ed0b611fSEvgeniy Kochetov } 90101201d3eSSeth Howell } 90201201d3eSSeth Howell } 9035e152960SBen Walker 90455a624edSBen Walker if (rqpair->cm_id) { 905ea7a4f3cSAlexey Marchuk if (rqpair->rdma_qp != NULL) { 906cf151d60SAlexey Marchuk spdk_rdma_provider_qp_destroy(rqpair->rdma_qp); 907ea7a4f3cSAlexey Marchuk rqpair->rdma_qp = NULL; 908dd90ff7aSJinYu } 909db5c3ce3SXiaodong Liu 9107dd3cf44SSeth Howell if (rqpair->poller != 
NULL && rqpair->srq == NULL) { 911db5c3ce3SXiaodong Liu rqpair->poller->required_num_wr -= MAX_WR_PER_QP(rqpair->max_queue_depth); 912db5c3ce3SXiaodong Liu } 9135e152960SBen Walker } 9145e152960SBen Walker 915dd90ff7aSJinYu if (rqpair->srq == NULL && rqpair->resources != NULL) { 916353fbcdaSBen Walker nvmf_rdma_resources_destroy(rqpair->resources); 91701201d3eSSeth Howell } 918353fbcdaSBen Walker 91955d8d943SSeth Howell nvmf_rdma_qpair_clean_ibv_events(rqpair); 920dc84fbaaSAlexey Marchuk 9213d1d4fcfSAlexey Marchuk if (rqpair->destruct_channel) { 9223d1d4fcfSAlexey Marchuk spdk_put_io_channel(rqpair->destruct_channel); 9233d1d4fcfSAlexey Marchuk rqpair->destruct_channel = NULL; 9243d1d4fcfSAlexey Marchuk } 9253d1d4fcfSAlexey Marchuk 9268ddc5cd4Ssijie.sun if (rqpair->poller && rqpair->poller->need_destroy && RB_EMPTY(&rqpair->poller->qpairs)) { 9278ddc5cd4Ssijie.sun nvmf_rdma_poller_destroy(rqpair->poller); 9288ddc5cd4Ssijie.sun } 929549be9adSsijie.sun 930549be9adSsijie.sun /* destroy cm_id last so cma device will not be freed before we destroy the cq. */ 931549be9adSsijie.sun if (rqpair->cm_id) { 932549be9adSsijie.sun rdma_destroy_id(rqpair->cm_id); 933549be9adSsijie.sun } 934549be9adSsijie.sun 93555a624edSBen Walker free(rqpair); 9365e152960SBen Walker } 9379d9dc845SBen Walker 938ee691fefSBen Walker static int 93997a43680SSeth Howell nvmf_rdma_resize_cq(struct spdk_nvmf_rdma_qpair *rqpair, struct spdk_nvmf_rdma_device *device) 94097a43680SSeth Howell { 94197a43680SSeth Howell struct spdk_nvmf_rdma_poller *rpoller; 94297a43680SSeth Howell int rc, num_cqe, required_num_wr; 94397a43680SSeth Howell 94497a43680SSeth Howell /* Enlarge CQ size dynamically */ 94597a43680SSeth Howell rpoller = rqpair->poller; 94697a43680SSeth Howell required_num_wr = rpoller->required_num_wr + MAX_WR_PER_QP(rqpair->max_queue_depth); 94797a43680SSeth Howell num_cqe = rpoller->num_cqe; 94897a43680SSeth Howell if (num_cqe < required_num_wr) { 94997a43680SSeth Howell num_cqe = spdk_max(num_cqe * 2, required_num_wr); 95097a43680SSeth Howell num_cqe = spdk_min(num_cqe, device->attr.max_cqe); 95197a43680SSeth Howell } 95297a43680SSeth Howell 95397a43680SSeth Howell if (rpoller->num_cqe != num_cqe) { 9544a2c27f7SAlexey Marchuk if (device->context->device->transport_type == IBV_TRANSPORT_IWARP) { 9554a2c27f7SAlexey Marchuk SPDK_ERRLOG("iWARP doesn't support CQ resize. 
Current capacity %u, required %u\n" 9564a2c27f7SAlexey Marchuk "Using CQ of insufficient size may lead to CQ overrun\n", rpoller->num_cqe, num_cqe); 9574a2c27f7SAlexey Marchuk return -1; 9584a2c27f7SAlexey Marchuk } 95997a43680SSeth Howell if (required_num_wr > device->attr.max_cqe) { 96097a43680SSeth Howell SPDK_ERRLOG("RDMA CQE requirement (%d) exceeds device max_cqe limitation (%d)\n", 96197a43680SSeth Howell required_num_wr, device->attr.max_cqe); 96297a43680SSeth Howell return -1; 96397a43680SSeth Howell } 96497a43680SSeth Howell 9652172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Resize RDMA CQ from %d to %d\n", rpoller->num_cqe, num_cqe); 96697a43680SSeth Howell rc = ibv_resize_cq(rpoller->cq, num_cqe); 96797a43680SSeth Howell if (rc) { 96897a43680SSeth Howell SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno)); 96997a43680SSeth Howell return -1; 97097a43680SSeth Howell } 97197a43680SSeth Howell 97297a43680SSeth Howell rpoller->num_cqe = num_cqe; 97397a43680SSeth Howell } 97497a43680SSeth Howell 97597a43680SSeth Howell rpoller->required_num_wr = required_num_wr; 97697a43680SSeth Howell return 0; 97797a43680SSeth Howell } 97897a43680SSeth Howell 97997a43680SSeth Howell static int 98055d8d943SSeth Howell nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair) 9811ade6e1dSDaniel Verkamp { 98255a624edSBen Walker struct spdk_nvmf_rdma_qpair *rqpair; 983ed0b611fSEvgeniy Kochetov struct spdk_nvmf_rdma_transport *rtransport; 9848e808490SJohn Barnard struct spdk_nvmf_transport *transport; 9850d3fcd10SSeth Howell struct spdk_nvmf_rdma_resource_opts opts; 9861180bf83SSeth Howell struct spdk_nvmf_rdma_device *device; 987cf151d60SAlexey Marchuk struct spdk_rdma_provider_qp_init_attr qp_init_attr = {}; 9881ade6e1dSDaniel Verkamp 989ee691fefSBen Walker rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 9908209c8cfSSeth Howell device = rqpair->device; 991ecc436fcSBen Walker 992ea7a4f3cSAlexey Marchuk qp_init_attr.qp_context = rqpair; 993ea7a4f3cSAlexey Marchuk qp_init_attr.pd = device->pd; 994ea7a4f3cSAlexey Marchuk qp_init_attr.send_cq = rqpair->poller->cq; 995ea7a4f3cSAlexey Marchuk qp_init_attr.recv_cq = rqpair->poller->cq; 99601201d3eSSeth Howell 997fa79f64aSSeth Howell if (rqpair->srq) { 998696e8580SAlexey Marchuk qp_init_attr.srq = rqpair->srq->srq; 99901201d3eSSeth Howell } else { 1000ea7a4f3cSAlexey Marchuk qp_init_attr.cap.max_recv_wr = rqpair->max_queue_depth; 100101201d3eSSeth Howell } 100201201d3eSSeth Howell 1003ea7a4f3cSAlexey Marchuk /* SEND, READ, and WRITE operations */ 1004ea7a4f3cSAlexey Marchuk qp_init_attr.cap.max_send_wr = (uint32_t)rqpair->max_queue_depth * 2; 1005ea7a4f3cSAlexey Marchuk qp_init_attr.cap.max_send_sge = spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_TX_SGE); 1006ea7a4f3cSAlexey Marchuk qp_init_attr.cap.max_recv_sge = spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_RX_SGE); 100736ac75b9SAlexey Marchuk qp_init_attr.stats = &rqpair->poller->stat.qp_stats; 10081b17e4eeSBen Walker 10097dd3cf44SSeth Howell if (rqpair->srq == NULL && nvmf_rdma_resize_cq(rqpair, device) < 0) { 101097a43680SSeth Howell SPDK_ERRLOG("Failed to resize the completion queue. 
Cannot initialize qpair.\n"); 1011a5972c62SSeth Howell goto error; 1012db5c3ce3SXiaodong Liu } 1013db5c3ce3SXiaodong Liu 1014cf151d60SAlexey Marchuk rqpair->rdma_qp = spdk_rdma_provider_qp_create(rqpair->cm_id, &qp_init_attr); 1015ea7a4f3cSAlexey Marchuk if (!rqpair->rdma_qp) { 1016a5972c62SSeth Howell goto error; 10171b17e4eeSBen Walker } 10187e23841dSBen Walker 101952f7aeb7SShuhei Matsumoto rqpair->qp_num = rqpair->rdma_qp->qp->qp_num; 102052f7aeb7SShuhei Matsumoto 10211f626649SAlexey Marchuk rqpair->max_send_depth = spdk_min((uint32_t)(rqpair->max_queue_depth * 2), 1022ea7a4f3cSAlexey Marchuk qp_init_attr.cap.max_send_wr); 1023ea7a4f3cSAlexey Marchuk rqpair->max_send_sge = spdk_min(NVMF_DEFAULT_TX_SGE, qp_init_attr.cap.max_send_sge); 1024ea7a4f3cSAlexey Marchuk rqpair->max_recv_sge = spdk_min(NVMF_DEFAULT_RX_SGE, qp_init_attr.cap.max_recv_sge); 1025c556b6b8SKonrad Sztyber spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair); 10262172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "New RDMA Connection: %p\n", qpair); 1027130fec66SBen Walker 102801201d3eSSeth Howell if (rqpair->poller->srq == NULL) { 1029ed0b611fSEvgeniy Kochetov rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 1030ed0b611fSEvgeniy Kochetov transport = &rtransport->transport; 1031ed0b611fSEvgeniy Kochetov 1032d9ff7d09SAlexey Marchuk opts.qp = rqpair->rdma_qp; 1033bf41b46cSAleksey Marchuk opts.map = device->map; 10340d3fcd10SSeth Howell opts.qpair = rqpair; 10350d3fcd10SSeth Howell opts.shared = false; 10360d3fcd10SSeth Howell opts.max_queue_depth = rqpair->max_queue_depth; 10370d3fcd10SSeth Howell opts.in_capsule_data_size = transport->opts.in_capsule_data_size; 10386138d3bcSSenthil Kumar V 10390d3fcd10SSeth Howell rqpair->resources = nvmf_rdma_resources_create(&opts); 10408e808490SJohn Barnard 10410d3fcd10SSeth Howell if (!rqpair->resources) { 10420d3fcd10SSeth Howell SPDK_ERRLOG("Unable to allocate resources for receive queue.\n"); 10438a14af68SJacek Kalwas rdma_destroy_qp(rqpair->cm_id); 1044a5972c62SSeth Howell goto error; 10459d9dc845SBen Walker } 104601201d3eSSeth Howell } else { 10470d3fcd10SSeth Howell rqpair->resources = rqpair->poller->resources; 104801201d3eSSeth Howell } 10493d52e57cSBen Walker 10500d3fcd10SSeth Howell rqpair->current_recv_depth = 0; 105104ebc6eaSSeth Howell STAILQ_INIT(&rqpair->pending_rdma_read_queue); 105204ebc6eaSSeth Howell STAILQ_INIT(&rqpair->pending_rdma_write_queue); 105304cd8e47SAlexey Marchuk STAILQ_INIT(&rqpair->pending_rdma_send_queue); 105446d7b94fSAtul Malakar rqpair->qpair.queue_depth = 0; 105504ebc6eaSSeth Howell 1056ee691fefSBen Walker return 0; 1057a5972c62SSeth Howell 1058a5972c62SSeth Howell error: 1059a5972c62SSeth Howell rdma_destroy_id(rqpair->cm_id); 1060a5972c62SSeth Howell rqpair->cm_id = NULL; 1061a5972c62SSeth Howell return -1; 1062989859bbSBen Walker } 1063989859bbSBen Walker 1064c3884f94SSeth Howell /* Append the given recv wr structure to the resource structs outstanding recvs list. */ 1065c3884f94SSeth Howell /* This function accepts either a single wr or the first wr in a linked list. 
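 * A minimal, hypothetical caller could chain two receives before queueing:
 *
 *   recv_a->wr.next = &recv_b->wr;
 *   recv_b->wr.next = NULL;
 *   nvmf_rdma_qpair_queue_recv_wrs(rqpair, &recv_a->wr);
 *
 * The queued chain is flushed to the SRQ or QP by _poller_submit_recvs(), or
 * immediately when rdma_opts.no_wr_batching is set.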
*/ 1066c3884f94SSeth Howell static void 1067c3884f94SSeth Howell nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first) 1068c3884f94SSeth Howell { 1069c818233bSIvan Betsis struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, 1070c818233bSIvan Betsis struct spdk_nvmf_rdma_transport, transport); 1071c3884f94SSeth Howell 1072696e8580SAlexey Marchuk if (rqpair->srq != NULL) { 1073cf151d60SAlexey Marchuk spdk_rdma_provider_srq_queue_recv_wrs(rqpair->srq, first); 1074c3884f94SSeth Howell } else { 1075cf151d60SAlexey Marchuk if (spdk_rdma_provider_qp_queue_recv_wrs(rqpair->rdma_qp, first)) { 1076d9ff7d09SAlexey Marchuk STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_recv, rqpair, recv_link); 1077d9ff7d09SAlexey Marchuk } 1078c3884f94SSeth Howell } 1079c818233bSIvan Betsis 1080c818233bSIvan Betsis if (rtransport->rdma_opts.no_wr_batching) { 1081c818233bSIvan Betsis _poller_submit_recvs(rtransport, rqpair->poller); 1082c818233bSIvan Betsis } 1083c3884f94SSeth Howell } 10849d63933bSSeth Howell 10858b9c92d3SAlexey Marchuk static inline void 1086cc294653SBen Walker request_transfer_in(struct spdk_nvmf_request *req) 10872e550d51SDaniel Verkamp { 10886fb90732SBen Walker struct spdk_nvmf_rdma_request *rdma_req; 10896fb90732SBen Walker struct spdk_nvmf_qpair *qpair; 109055a624edSBen Walker struct spdk_nvmf_rdma_qpair *rqpair; 1091c818233bSIvan Betsis struct spdk_nvmf_rdma_transport *rtransport; 1092caf88609SBen Walker 10936fb90732SBen Walker qpair = req->qpair; 10946fb90732SBen Walker rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 109555a624edSBen Walker rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 1096c818233bSIvan Betsis rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, 1097c818233bSIvan Betsis struct spdk_nvmf_rdma_transport, transport); 10986fb90732SBen Walker 1099cc294653SBen Walker assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER); 1100158dc947SSeth Howell assert(rdma_req != NULL); 1101caf88609SBen Walker 1102cf151d60SAlexey Marchuk if (spdk_rdma_provider_qp_queue_send_wrs(rqpair->rdma_qp, rdma_req->transfer_wr)) { 1103bbb493ceSAlexey Marchuk STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link); 1104bbb493ceSAlexey Marchuk } 1105c818233bSIvan Betsis if (rtransport->rdma_opts.no_wr_batching) { 1106c818233bSIvan Betsis _poller_submit_sends(rtransport, rqpair->poller); 1107c818233bSIvan Betsis } 1108bbb493ceSAlexey Marchuk 1109a681f8d5SAlexey Marchuk assert(rqpair->current_read_depth + rdma_req->num_outstanding_data_wr <= rqpair->max_read_depth); 1110dfdd76cfSSeth Howell rqpair->current_read_depth += rdma_req->num_outstanding_data_wr; 1111a681f8d5SAlexey Marchuk assert(rqpair->current_send_depth + rdma_req->num_outstanding_data_wr <= rqpair->max_send_depth); 1112158dc947SSeth Howell rqpair->current_send_depth += rdma_req->num_outstanding_data_wr; 11132e550d51SDaniel Verkamp } 11142e550d51SDaniel Verkamp 11158307ab43SAlexey Marchuk static inline void 1116ca59dd5dSAlexey Marchuk nvmf_rdma_request_reset_transfer_in(struct spdk_nvmf_rdma_request *rdma_req, 1117ca59dd5dSAlexey Marchuk struct spdk_nvmf_rdma_transport *rtransport) 1118ca59dd5dSAlexey Marchuk { 1119ca59dd5dSAlexey Marchuk /* Put completed WRs back to pool and move transfer_wr pointer */ 1120ca59dd5dSAlexey Marchuk _nvmf_rdma_request_free_data(rdma_req, rdma_req->transfer_wr, rtransport->data_wr_pool); 1121ca59dd5dSAlexey Marchuk rdma_req->transfer_wr = rdma_req->remaining_tranfer_in_wrs; 
1122ca59dd5dSAlexey Marchuk rdma_req->remaining_tranfer_in_wrs = NULL; 1123ca59dd5dSAlexey Marchuk rdma_req->num_outstanding_data_wr = rdma_req->num_remaining_data_wr; 1124ca59dd5dSAlexey Marchuk rdma_req->num_remaining_data_wr = 0; 1125ca59dd5dSAlexey Marchuk } 1126ca59dd5dSAlexey Marchuk 1127ca59dd5dSAlexey Marchuk static inline int 1128ca59dd5dSAlexey Marchuk request_prepare_transfer_in_part(struct spdk_nvmf_request *req, uint32_t num_reads_available) 1129ca59dd5dSAlexey Marchuk { 1130ca59dd5dSAlexey Marchuk struct spdk_nvmf_rdma_request *rdma_req; 1131ca59dd5dSAlexey Marchuk struct ibv_send_wr *wr; 1132ca59dd5dSAlexey Marchuk uint32_t i; 1133ca59dd5dSAlexey Marchuk 1134ca59dd5dSAlexey Marchuk rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 1135ca59dd5dSAlexey Marchuk 1136ca59dd5dSAlexey Marchuk assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER); 1137ca59dd5dSAlexey Marchuk assert(rdma_req != NULL); 1138ca59dd5dSAlexey Marchuk assert(num_reads_available > 0); 1139ca59dd5dSAlexey Marchuk assert(rdma_req->num_outstanding_data_wr > num_reads_available); 1140ca59dd5dSAlexey Marchuk wr = rdma_req->transfer_wr; 1141ca59dd5dSAlexey Marchuk 1142ca59dd5dSAlexey Marchuk for (i = 0; i < num_reads_available - 1; i++) { 1143ca59dd5dSAlexey Marchuk wr = wr->next; 1144ca59dd5dSAlexey Marchuk } 1145ca59dd5dSAlexey Marchuk 1146ca59dd5dSAlexey Marchuk rdma_req->remaining_tranfer_in_wrs = wr->next; 1147ca59dd5dSAlexey Marchuk rdma_req->num_remaining_data_wr = rdma_req->num_outstanding_data_wr - num_reads_available; 1148ca59dd5dSAlexey Marchuk rdma_req->num_outstanding_data_wr = num_reads_available; 1149ca59dd5dSAlexey Marchuk /* Break chain of WRs to send only part. Once this portion completes, we continue sending RDMA_READs */ 1150ca59dd5dSAlexey Marchuk wr->next = NULL; 1151ca59dd5dSAlexey Marchuk 1152ca59dd5dSAlexey Marchuk return 0; 1153ca59dd5dSAlexey Marchuk } 1154ca59dd5dSAlexey Marchuk 1155411df9adSDaniel Verkamp static int 1156fdec444aSPhilipp Skadorov request_transfer_out(struct spdk_nvmf_request *req, int *data_posted) 1157eee64c69SBen Walker { 1158c7395a11SJinYu int num_outstanding_data_wr = 0; 11596fb90732SBen Walker struct spdk_nvmf_rdma_request *rdma_req; 11606fb90732SBen Walker struct spdk_nvmf_qpair *qpair; 116155a624edSBen Walker struct spdk_nvmf_rdma_qpair *rqpair; 11626fb90732SBen Walker struct spdk_nvme_cpl *rsp; 11639d63933bSSeth Howell struct ibv_send_wr *first = NULL; 1164c818233bSIvan Betsis struct spdk_nvmf_rdma_transport *rtransport; 1165eee64c69SBen Walker 1166fdec444aSPhilipp Skadorov *data_posted = 0; 11676fb90732SBen Walker qpair = req->qpair; 11686fb90732SBen Walker rsp = &req->rsp->nvme_cpl; 11696fb90732SBen Walker rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 117055a624edSBen Walker rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 1171c818233bSIvan Betsis rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, 1172c818233bSIvan Betsis struct spdk_nvmf_rdma_transport, transport); 11736fb90732SBen Walker 1174eee64c69SBen Walker /* Advance our sq_head pointer */ 11751d304bc5SBen Walker if (qpair->sq_head == qpair->sq_head_max) { 11761d304bc5SBen Walker qpair->sq_head = 0; 117718498460SDaniel Verkamp } else { 11781d304bc5SBen Walker qpair->sq_head++; 1179eee64c69SBen Walker } 11801d304bc5SBen Walker rsp->sqhd = qpair->sq_head; 1181eee64c69SBen Walker 1182c3884f94SSeth Howell /* queue the capsule for the recv buffer */ 11831db3a037SBen Walker assert(rdma_req->recv != NULL); 118401201d3eSSeth Howell 
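/* Re-post the command capsule's receive WR (to the shared SRQ or to this qpair's receive queue) so the slot can accept a new command. */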
1185c3884f94SSeth Howell nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr); 1186c3884f94SSeth Howell 11871db3a037SBen Walker rdma_req->recv = NULL; 1188e1dd85a5SBen Walker assert(rqpair->current_recv_depth > 0); 1189158dc947SSeth Howell rqpair->current_recv_depth--; 1190eee64c69SBen Walker 119162700dacSSeth Howell /* Build the response which consists of optional 119262700dacSSeth Howell * RDMA WRITEs to transfer data, plus an RDMA SEND 11931f382439SBen Walker * containing the response. 11941f382439SBen Walker */ 11959d63933bSSeth Howell first = &rdma_req->rsp.wr; 11961f382439SBen Walker 1197e718d8caSAlexey Marchuk if (spdk_unlikely(rsp->status.sc != SPDK_NVME_SC_SUCCESS)) { 1198e0cd084bSShuhei Matsumoto /* On failure, data was not read from the controller. So clear the 1199e0cd084bSShuhei Matsumoto * number of outstanding data WRs to zero. 1200e0cd084bSShuhei Matsumoto */ 1201e0cd084bSShuhei Matsumoto rdma_req->num_outstanding_data_wr = 0; 1202e0cd084bSShuhei Matsumoto } else if (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 12035a6e7a41SAlexey Marchuk first = rdma_req->transfer_wr; 12045301be93SSeth Howell *data_posted = 1; 1205c7395a11SJinYu num_outstanding_data_wr = rdma_req->num_outstanding_data_wr; 1206cc294653SBen Walker } 1207cf151d60SAlexey Marchuk if (spdk_rdma_provider_qp_queue_send_wrs(rqpair->rdma_qp, first)) { 1208bbb493ceSAlexey Marchuk STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link); 1209bbb493ceSAlexey Marchuk } 1210c818233bSIvan Betsis if (rtransport->rdma_opts.no_wr_batching) { 1211c818233bSIvan Betsis _poller_submit_sends(rtransport, rqpair->poller); 1212c818233bSIvan Betsis } 1213bbb493ceSAlexey Marchuk 1214158dc947SSeth Howell /* +1 for the rsp wr */ 1215a681f8d5SAlexey Marchuk assert(rqpair->current_send_depth + num_outstanding_data_wr + 1 <= rqpair->max_send_depth); 1216c7395a11SJinYu rqpair->current_send_depth += num_outstanding_data_wr + 1; 1217eee64c69SBen Walker 1218dfdd76cfSSeth Howell return 0; 1219eee64c69SBen Walker } 1220eee64c69SBen Walker 1221eee64c69SBen Walker static int 122255d8d943SSeth Howell nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair) 1223ba3d96e8SBen Walker { 1224ba3d96e8SBen Walker struct spdk_nvmf_rdma_accept_private_data accept_data; 1225ba3d96e8SBen Walker struct rdma_conn_param ctrlr_event_data = {}; 1226ba3d96e8SBen Walker int rc; 1227ba3d96e8SBen Walker 1228ba3d96e8SBen Walker accept_data.recfmt = 0; 1229ba3d96e8SBen Walker accept_data.crqsize = rqpair->max_queue_depth; 1230ba3d96e8SBen Walker 1231ba3d96e8SBen Walker ctrlr_event_data.private_data = &accept_data; 1232ba3d96e8SBen Walker ctrlr_event_data.private_data_len = sizeof(accept_data); 1233ba3d96e8SBen Walker if (id->ps == RDMA_PS_TCP) { 1234ba3d96e8SBen Walker ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */ 12357289d370SSeth Howell ctrlr_event_data.initiator_depth = rqpair->max_read_depth; 1236ba3d96e8SBen Walker } 1237ba3d96e8SBen Walker 123889d2efe0SSeth Howell /* Configure infinite retries for the initiator side qpair. 123957dc541cSChunsong Feng * We need to pass this value to the initiator to prevent the 124089d2efe0SSeth Howell * initiator side NIC from completing SEND requests back to the 124189d2efe0SSeth Howell * initiator with status rnr_retry_count_exceeded. 
*/ 124289d2efe0SSeth Howell ctrlr_event_data.rnr_retry_count = 0x7; 124389d2efe0SSeth Howell 1244ea7a4f3cSAlexey Marchuk /* When qpair is created without use of rdma cm API, an additional 1245ea7a4f3cSAlexey Marchuk * information must be provided to initiator in the connection response: 1246ea7a4f3cSAlexey Marchuk * whether qpair is using SRQ and its qp_num 1247ea7a4f3cSAlexey Marchuk * Fields below are ignored by rdma cm if qpair has been 1248ea7a4f3cSAlexey Marchuk * created using rdma cm API. */ 1249ea7a4f3cSAlexey Marchuk ctrlr_event_data.srq = rqpair->srq ? 1 : 0; 125052f7aeb7SShuhei Matsumoto ctrlr_event_data.qp_num = rqpair->qp_num; 1251ea7a4f3cSAlexey Marchuk 1252cf151d60SAlexey Marchuk rc = spdk_rdma_provider_qp_accept(rqpair->rdma_qp, &ctrlr_event_data); 1253ba3d96e8SBen Walker if (rc) { 1254cf151d60SAlexey Marchuk SPDK_ERRLOG("Error %d on spdk_rdma_provider_qp_accept\n", errno); 1255ba3d96e8SBen Walker } else { 12562172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Sent back the accept\n"); 1257ba3d96e8SBen Walker } 1258ba3d96e8SBen Walker 1259ba3d96e8SBen Walker return rc; 1260ba3d96e8SBen Walker } 1261ba3d96e8SBen Walker 1262ba3d96e8SBen Walker static void 126355d8d943SSeth Howell nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error) 1264ba3d96e8SBen Walker { 1265ba3d96e8SBen Walker struct spdk_nvmf_rdma_reject_private_data rej_data; 1266ba3d96e8SBen Walker 1267ba3d96e8SBen Walker rej_data.recfmt = 0; 1268ba3d96e8SBen Walker rej_data.sts = error; 1269ba3d96e8SBen Walker 1270ba3d96e8SBen Walker rdma_reject(id, &rej_data, sizeof(rej_data)); 1271ba3d96e8SBen Walker } 1272ba3d96e8SBen Walker 1273ba3d96e8SBen Walker static int 12745584232cSBen Walker nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event) 12750f912a0eSDaniel Verkamp { 1276ecc436fcSBen Walker struct spdk_nvmf_rdma_transport *rtransport; 127755a624edSBen Walker struct spdk_nvmf_rdma_qpair *rqpair = NULL; 12781cbc2b16SBen Walker struct spdk_nvmf_rdma_port *port; 1279a9f5ffbdSBen Walker struct rdma_conn_param *rdma_param = NULL; 1280a9f5ffbdSBen Walker const struct spdk_nvmf_rdma_request_private_data *private_data = NULL; 1281caf88609SBen Walker uint16_t max_queue_depth; 12827289d370SSeth Howell uint16_t max_read_depth; 12830f912a0eSDaniel Verkamp 1284ecc436fcSBen Walker rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 1285ecc436fcSBen Walker 1286ba3d96e8SBen Walker assert(event->id != NULL); /* Impossible. Can't even reject the connection. */ 1287ba3d96e8SBen Walker assert(event->id->verbs != NULL); /* Impossible. No way to handle this. 
*/ 1288765173a7SBen Walker 1289765173a7SBen Walker rdma_param = &event->param.conn; 1290765173a7SBen Walker if (rdma_param->private_data == NULL || 1291765173a7SBen Walker rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) { 1292765173a7SBen Walker SPDK_ERRLOG("connect request: no private data provided\n"); 129355d8d943SSeth Howell nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH); 1294ba3d96e8SBen Walker return -1; 1295765173a7SBen Walker } 1296ba3d96e8SBen Walker 1297765173a7SBen Walker private_data = rdma_param->private_data; 1298ba3d96e8SBen Walker if (private_data->recfmt != 0) { 1299ba3d96e8SBen Walker SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n"); 130055d8d943SSeth Howell nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT); 1301ba3d96e8SBen Walker return -1; 1302ba3d96e8SBen Walker } 1303765173a7SBen Walker 13042172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Connect Recv on fabric intf name %s, dev_name %s\n", 13056a61126fSBen Walker event->id->verbs->device->name, event->id->verbs->device->dev_name); 13060f912a0eSDaniel Verkamp 13071cbc2b16SBen Walker port = event->listen_id->context; 13082172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Listen Id was %p with verbs %p. ListenAddr: %p\n", 13091cbc2b16SBen Walker event->listen_id, event->listen_id->verbs, port); 1310a0a92ff4SBen Walker 1311a9f5ffbdSBen Walker /* Figure out the supported queue depth. This is a multi-step process 1312a9f5ffbdSBen Walker * that takes into account hardware maximums, host provided values, 1313a9f5ffbdSBen Walker * and our target's internal memory limits */ 13140f912a0eSDaniel Verkamp 13152172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Calculating Queue Depth\n"); 1316a9f5ffbdSBen Walker 1317a9f5ffbdSBen Walker /* Start with the maximum queue depth allowed by the target */ 13188e808490SJohn Barnard max_queue_depth = rtransport->transport.opts.max_queue_depth; 13197289d370SSeth Howell max_read_depth = rtransport->transport.opts.max_queue_depth; 13202172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Target Max Queue Depth: %d\n", 13218e808490SJohn Barnard rtransport->transport.opts.max_queue_depth); 1322a9f5ffbdSBen Walker 1323a9f5ffbdSBen Walker /* Next check the local NIC's hardware limitations */ 13242172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, 132535bc1e93SBen Walker "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n", 13261cbc2b16SBen Walker port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom); 13271cbc2b16SBen Walker max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr); 132841cd5ff4SSeth Howell max_read_depth = spdk_min(max_read_depth, port->device->attr.max_qp_init_rd_atom); 1329a9f5ffbdSBen Walker 1330a9f5ffbdSBen Walker /* Next check the remote NIC's hardware limitations */ 13312172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, 1332b2a86421SBen Walker "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n", 1333ca0c1338SBen Walker rdma_param->initiator_depth, rdma_param->responder_resources); 13346bc8d265SAlexey Marchuk /* from man3 rdma_get_cm_event 13356bc8d265SAlexey Marchuk * responder_resources - Specifies the number of responder resources that is requested by the recipient. 13366bc8d265SAlexey Marchuk * The responder_resources field must match the initiator depth specified by the remote node when running 13376bc8d265SAlexey Marchuk * the rdma_connect and rdma_accept functions. 
*/ 13386bc8d265SAlexey Marchuk if (rdma_param->responder_resources != 0) { 1339c8b9bbafSAlexey Marchuk if (private_data->qid) { 1340c8b9bbafSAlexey Marchuk SPDK_DEBUGLOG(rdma, "Host (Initiator) is not allowed to use RDMA operations," 1341c8b9bbafSAlexey Marchuk " responder_resources must be 0 but set to %u\n", 13426bc8d265SAlexey Marchuk rdma_param->responder_resources); 1343c8b9bbafSAlexey Marchuk } else { 1344c8b9bbafSAlexey Marchuk SPDK_WARNLOG("Host (Initiator) is not allowed to use RDMA operations," 1345c8b9bbafSAlexey Marchuk " responder_resources must be 0 but set to %u\n", 1346c8b9bbafSAlexey Marchuk rdma_param->responder_resources); 1347c8b9bbafSAlexey Marchuk } 1348f64690d4SBen Walker } 13496bc8d265SAlexey Marchuk /* from man3 rdma_get_cm_event 13506bc8d265SAlexey Marchuk * initiator_depth - Specifies the maximum number of outstanding RDMA read operations that the recipient holds. 13516bc8d265SAlexey Marchuk * The initiator_depth field must match the responder resources specified by the remote node when running 13526bc8d265SAlexey Marchuk * the rdma_connect and rdma_accept functions. */ 13536bc8d265SAlexey Marchuk if (rdma_param->initiator_depth == 0) { 13546bc8d265SAlexey Marchuk SPDK_ERRLOG("Host (Initiator) doesn't support RDMA_READ or atomic operations\n"); 13556bc8d265SAlexey Marchuk nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_IRD); 13566bc8d265SAlexey Marchuk return -1; 13576bc8d265SAlexey Marchuk } 13586bc8d265SAlexey Marchuk max_read_depth = spdk_min(max_read_depth, rdma_param->initiator_depth); 1359a9f5ffbdSBen Walker 13602172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Host Receive Queue Size: %d\n", private_data->hrqsize); 13612172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Host Send Queue Size: %d\n", private_data->hsqsize); 136284d90484SDaniel Verkamp max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize); 1363b4ed77efSBen Walker max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1); 1364a9f5ffbdSBen Walker 13652172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Final Negotiated Queue Depth: %d R/W Depth: %d\n", 13667289d370SSeth Howell max_queue_depth, max_read_depth); 1367ca0c1338SBen Walker 1368ee691fefSBen Walker rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair)); 136955a624edSBen Walker if (rqpair == NULL) { 1370ee691fefSBen Walker SPDK_ERRLOG("Could not allocate new connection.\n"); 137155d8d943SSeth Howell nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 1372ba3d96e8SBen Walker return -1; 13730f912a0eSDaniel Verkamp } 13740f912a0eSDaniel Verkamp 13758209c8cfSSeth Howell rqpair->device = port->device; 1376ee691fefSBen Walker rqpair->max_queue_depth = max_queue_depth; 13777289d370SSeth Howell rqpair->max_read_depth = max_read_depth; 1378ee691fefSBen Walker rqpair->cm_id = event->id; 1379311ce0e2SBen Walker rqpair->listen_id = event->listen_id; 1380ee691fefSBen Walker rqpair->qpair.transport = transport; 138173e87ed2SAlexey Marchuk /* use qid from the private data to determine the qpair type 138273e87ed2SAlexey Marchuk qid will be set to the appropriate value when the controller is created */ 138373e87ed2SAlexey Marchuk rqpair->qpair.qid = private_data->qid; 138445f2e732SJim Harris rqpair->qpair.numa.id_valid = 1; 138545f2e732SJim Harris rqpair->qpair.numa.id = spdk_rdma_cm_id_get_numa_id(rqpair->cm_id); 1386b25751d9SBen Walker 1387ee691fefSBen Walker event->id->context = &rqpair->qpair; 1388ee691fefSBen Walker 13895584232cSBen Walker spdk_nvmf_tgt_new_qpair(transport->tgt, &rqpair->qpair); 
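/* The RDMA QP and receive resources for this qpair are not created here; nvmf_rdma_qpair_initialize() sets them up once the target assigns the new qpair to a poll group. */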
13906f95c325SZiye Yang 13910f912a0eSDaniel Verkamp return 0; 13920f912a0eSDaniel Verkamp } 13930f912a0eSDaniel Verkamp 1394568f4d2bSAlexey Marchuk static inline void 1395568f4d2bSAlexey Marchuk nvmf_rdma_setup_wr(struct ibv_send_wr *wr, struct ibv_send_wr *next, 1396568f4d2bSAlexey Marchuk enum spdk_nvme_data_transfer xfer) 1397568f4d2bSAlexey Marchuk { 1398568f4d2bSAlexey Marchuk if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 1399568f4d2bSAlexey Marchuk wr->opcode = IBV_WR_RDMA_WRITE; 1400568f4d2bSAlexey Marchuk wr->send_flags = 0; 1401568f4d2bSAlexey Marchuk wr->next = next; 1402568f4d2bSAlexey Marchuk } else if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { 1403568f4d2bSAlexey Marchuk wr->opcode = IBV_WR_RDMA_READ; 1404568f4d2bSAlexey Marchuk wr->send_flags = IBV_SEND_SIGNALED; 1405568f4d2bSAlexey Marchuk wr->next = NULL; 1406568f4d2bSAlexey Marchuk } else { 1407568f4d2bSAlexey Marchuk assert(0); 1408568f4d2bSAlexey Marchuk } 1409568f4d2bSAlexey Marchuk } 1410568f4d2bSAlexey Marchuk 14111ff5f4abSBen Walker static int 141262700dacSSeth Howell nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport, 141362700dacSSeth Howell struct spdk_nvmf_rdma_request *rdma_req, 141462700dacSSeth Howell uint32_t num_sgl_descriptors) 141562700dacSSeth Howell { 141662700dacSSeth Howell struct spdk_nvmf_rdma_request_data *work_requests[SPDK_NVMF_MAX_SGL_ENTRIES]; 141762700dacSSeth Howell struct spdk_nvmf_rdma_request_data *current_data_wr; 141862700dacSSeth Howell uint32_t i; 141962700dacSSeth Howell 1420e718d8caSAlexey Marchuk if (spdk_unlikely(num_sgl_descriptors > SPDK_NVMF_MAX_SGL_ENTRIES)) { 14216a77723eSAlexey Marchuk SPDK_ERRLOG("Requested too much entries (%u), the limit is %u\n", 14226a77723eSAlexey Marchuk num_sgl_descriptors, SPDK_NVMF_MAX_SGL_ENTRIES); 14236a77723eSAlexey Marchuk return -EINVAL; 14246a77723eSAlexey Marchuk } 14256a77723eSAlexey Marchuk 1426e718d8caSAlexey Marchuk if (spdk_unlikely(spdk_mempool_get_bulk(rtransport->data_wr_pool, (void **)work_requests, 1427e718d8caSAlexey Marchuk num_sgl_descriptors))) { 142862700dacSSeth Howell return -ENOMEM; 142962700dacSSeth Howell } 143062700dacSSeth Howell 143162700dacSSeth Howell current_data_wr = &rdma_req->data; 143262700dacSSeth Howell 143362700dacSSeth Howell for (i = 0; i < num_sgl_descriptors; i++) { 1434568f4d2bSAlexey Marchuk nvmf_rdma_setup_wr(¤t_data_wr->wr, &work_requests[i]->wr, rdma_req->req.xfer); 143562700dacSSeth Howell current_data_wr->wr.next = &work_requests[i]->wr; 143662700dacSSeth Howell current_data_wr = work_requests[i]; 1437568f4d2bSAlexey Marchuk current_data_wr->wr.sg_list = current_data_wr->sgl; 1438568f4d2bSAlexey Marchuk current_data_wr->wr.wr_id = rdma_req->data.wr.wr_id; 143962700dacSSeth Howell } 144062700dacSSeth Howell 1441568f4d2bSAlexey Marchuk nvmf_rdma_setup_wr(¤t_data_wr->wr, &rdma_req->rsp.wr, rdma_req->req.xfer); 1442568f4d2bSAlexey Marchuk 144362700dacSSeth Howell return 0; 144462700dacSSeth Howell } 144562700dacSSeth Howell 1446a335a524SAlexey Marchuk static inline void 1447a335a524SAlexey Marchuk nvmf_rdma_setup_request(struct spdk_nvmf_rdma_request *rdma_req) 1448a335a524SAlexey Marchuk { 1449a335a524SAlexey Marchuk struct ibv_send_wr *wr = &rdma_req->data.wr; 1450a335a524SAlexey Marchuk struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1; 1451a335a524SAlexey Marchuk 1452a335a524SAlexey Marchuk wr->wr.rdma.rkey = sgl->keyed.key; 1453a335a524SAlexey Marchuk wr->wr.rdma.remote_addr = sgl->address; 1454568f4d2bSAlexey Marchuk nvmf_rdma_setup_wr(wr, 
&rdma_req->rsp.wr, rdma_req->req.xfer); 1455a335a524SAlexey Marchuk } 1456a335a524SAlexey Marchuk 14576a77723eSAlexey Marchuk static inline void 14586a77723eSAlexey Marchuk nvmf_rdma_update_remote_addr(struct spdk_nvmf_rdma_request *rdma_req, uint32_t num_wrs) 14596a77723eSAlexey Marchuk { 14606a77723eSAlexey Marchuk struct ibv_send_wr *wr = &rdma_req->data.wr; 14616a77723eSAlexey Marchuk struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1; 14626a77723eSAlexey Marchuk uint32_t i; 14636a77723eSAlexey Marchuk int j; 14646a77723eSAlexey Marchuk uint64_t remote_addr_offset = 0; 14656a77723eSAlexey Marchuk 14666a77723eSAlexey Marchuk for (i = 0; i < num_wrs; ++i) { 14676a77723eSAlexey Marchuk wr->wr.rdma.rkey = sgl->keyed.key; 14686a77723eSAlexey Marchuk wr->wr.rdma.remote_addr = sgl->address + remote_addr_offset; 14696a77723eSAlexey Marchuk for (j = 0; j < wr->num_sge; ++j) { 14706a77723eSAlexey Marchuk remote_addr_offset += wr->sg_list[j].length; 14716a77723eSAlexey Marchuk } 14726a77723eSAlexey Marchuk wr = wr->next; 14736a77723eSAlexey Marchuk } 14746a77723eSAlexey Marchuk } 14756a77723eSAlexey Marchuk 147689a28bfdSShuhei Matsumoto static int 1477ad0221afSShuhei Matsumoto nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_device *device, 14785593b61fSShuhei Matsumoto struct spdk_nvmf_rdma_request *rdma_req, 147989a28bfdSShuhei Matsumoto struct ibv_send_wr *wr, 14809db2571dSShuhei Matsumoto uint32_t total_length) 148189a28bfdSShuhei Matsumoto { 14828a01b4d6SAlexey Marchuk struct spdk_rdma_utils_memory_translation mem_translation; 14834642d7b2SAlexey Marchuk struct ibv_sge *sg_ele; 14844642d7b2SAlexey Marchuk struct iovec *iov; 1485019a5361SAlexey Marchuk uint32_t lkey, remaining; 14864642d7b2SAlexey Marchuk int rc; 148716365fd8SShuhei Matsumoto 148889a28bfdSShuhei Matsumoto wr->num_sge = 0; 148916365fd8SShuhei Matsumoto 14909db2571dSShuhei Matsumoto while (total_length && wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES) { 1491019a5361SAlexey Marchuk iov = &rdma_req->req.iov[rdma_req->iovpos]; 14928a01b4d6SAlexey Marchuk rc = spdk_rdma_utils_get_translation(device->map, iov->iov_base, iov->iov_len, &mem_translation); 14934642d7b2SAlexey Marchuk if (spdk_unlikely(rc)) { 14949db2571dSShuhei Matsumoto return rc; 14955593b61fSShuhei Matsumoto } 14965593b61fSShuhei Matsumoto 14978a01b4d6SAlexey Marchuk lkey = spdk_rdma_utils_memory_translation_get_lkey(&mem_translation); 14984642d7b2SAlexey Marchuk sg_ele = &wr->sg_list[wr->num_sge]; 1499019a5361SAlexey Marchuk remaining = spdk_min((uint32_t)iov->iov_len - rdma_req->offset, total_length); 15004642d7b2SAlexey Marchuk 15014642d7b2SAlexey Marchuk sg_ele->lkey = lkey; 1502019a5361SAlexey Marchuk sg_ele->addr = (uintptr_t)iov->iov_base + rdma_req->offset; 1503019a5361SAlexey Marchuk sg_ele->length = remaining; 1504019a5361SAlexey Marchuk SPDK_DEBUGLOG(rdma, "sge[%d] %p addr 0x%"PRIx64", len %u\n", wr->num_sge, sg_ele, sg_ele->addr, 1505019a5361SAlexey Marchuk sg_ele->length); 1506019a5361SAlexey Marchuk rdma_req->offset += sg_ele->length; 1507019a5361SAlexey Marchuk total_length -= sg_ele->length; 15084642d7b2SAlexey Marchuk wr->num_sge++; 1509019a5361SAlexey Marchuk 1510019a5361SAlexey Marchuk if (rdma_req->offset == iov->iov_len) { 1511019a5361SAlexey Marchuk rdma_req->offset = 0; 1512019a5361SAlexey Marchuk rdma_req->iovpos++; 1513019a5361SAlexey Marchuk } 15149db2571dSShuhei Matsumoto } 15159db2571dSShuhei Matsumoto 1516e718d8caSAlexey Marchuk if (spdk_unlikely(total_length)) { 15179db2571dSShuhei Matsumoto SPDK_ERRLOG("Not enough SG 
entries to hold data buffer\n"); 15189db2571dSShuhei Matsumoto return -EINVAL; 15199db2571dSShuhei Matsumoto } 15209db2571dSShuhei Matsumoto 15219db2571dSShuhei Matsumoto return 0; 15229db2571dSShuhei Matsumoto } 15239db2571dSShuhei Matsumoto 15249db2571dSShuhei Matsumoto static int 1525ad0221afSShuhei Matsumoto nvmf_rdma_fill_wr_sgl_with_dif(struct spdk_nvmf_rdma_device *device, 15269db2571dSShuhei Matsumoto struct spdk_nvmf_rdma_request *rdma_req, 15279db2571dSShuhei Matsumoto struct ibv_send_wr *wr, 15289db2571dSShuhei Matsumoto uint32_t total_length, 15299db2571dSShuhei Matsumoto uint32_t num_extra_wrs) 15309db2571dSShuhei Matsumoto { 15318a01b4d6SAlexey Marchuk struct spdk_rdma_utils_memory_translation mem_translation; 15329db2571dSShuhei Matsumoto struct spdk_dif_ctx *dif_ctx = &rdma_req->req.dif.dif_ctx; 15339db2571dSShuhei Matsumoto struct ibv_sge *sg_ele; 15349db2571dSShuhei Matsumoto struct iovec *iov; 15350db0c443SChunsong Feng struct iovec *rdma_iov; 15369db2571dSShuhei Matsumoto uint32_t lkey, remaining; 15379db2571dSShuhei Matsumoto uint32_t remaining_data_block, data_block_size, md_size; 15384642d7b2SAlexey Marchuk uint32_t sge_len; 15399db2571dSShuhei Matsumoto int rc; 15409db2571dSShuhei Matsumoto 15419db2571dSShuhei Matsumoto data_block_size = dif_ctx->block_size - dif_ctx->md_size; 15420db0c443SChunsong Feng 15430db0c443SChunsong Feng if (spdk_likely(!rdma_req->req.stripped_data)) { 15440db0c443SChunsong Feng rdma_iov = rdma_req->req.iov; 15459db2571dSShuhei Matsumoto remaining_data_block = data_block_size; 15460db0c443SChunsong Feng md_size = dif_ctx->md_size; 15470db0c443SChunsong Feng } else { 15480db0c443SChunsong Feng rdma_iov = rdma_req->req.stripped_data->iov; 15490db0c443SChunsong Feng total_length = total_length / dif_ctx->block_size * data_block_size; 15500db0c443SChunsong Feng remaining_data_block = total_length; 15510db0c443SChunsong Feng md_size = 0; 15520db0c443SChunsong Feng } 15539db2571dSShuhei Matsumoto 15549db2571dSShuhei Matsumoto wr->num_sge = 0; 15559db2571dSShuhei Matsumoto 15569db2571dSShuhei Matsumoto while (total_length && (num_extra_wrs || wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES)) { 15570db0c443SChunsong Feng iov = rdma_iov + rdma_req->iovpos; 15588a01b4d6SAlexey Marchuk rc = spdk_rdma_utils_get_translation(device->map, iov->iov_base, iov->iov_len, &mem_translation); 15599db2571dSShuhei Matsumoto if (spdk_unlikely(rc)) { 15609db2571dSShuhei Matsumoto return rc; 15619db2571dSShuhei Matsumoto } 15629db2571dSShuhei Matsumoto 15638a01b4d6SAlexey Marchuk lkey = spdk_rdma_utils_memory_translation_get_lkey(&mem_translation); 15649db2571dSShuhei Matsumoto sg_ele = &wr->sg_list[wr->num_sge]; 15659db2571dSShuhei Matsumoto remaining = spdk_min((uint32_t)iov->iov_len - rdma_req->offset, total_length); 15664642d7b2SAlexey Marchuk 15674642d7b2SAlexey Marchuk while (remaining) { 15684642d7b2SAlexey Marchuk if (wr->num_sge >= SPDK_NVMF_MAX_SGL_ENTRIES) { 15694642d7b2SAlexey Marchuk if (num_extra_wrs > 0 && wr->next) { 15704642d7b2SAlexey Marchuk wr = wr->next; 15714642d7b2SAlexey Marchuk wr->num_sge = 0; 15724642d7b2SAlexey Marchuk sg_ele = &wr->sg_list[wr->num_sge]; 15734642d7b2SAlexey Marchuk num_extra_wrs--; 15744642d7b2SAlexey Marchuk } else { 15754642d7b2SAlexey Marchuk break; 15764642d7b2SAlexey Marchuk } 15774642d7b2SAlexey Marchuk } 15784642d7b2SAlexey Marchuk sg_ele->lkey = lkey; 1579019a5361SAlexey Marchuk sg_ele->addr = (uintptr_t)((char *)iov->iov_base + rdma_req->offset); 15804642d7b2SAlexey Marchuk sge_len = spdk_min(remaining, 
remaining_data_block); 15814642d7b2SAlexey Marchuk sg_ele->length = sge_len; 15829db2571dSShuhei Matsumoto SPDK_DEBUGLOG(rdma, "sge[%d] %p addr 0x%"PRIx64", len %u\n", wr->num_sge, sg_ele, 15839db2571dSShuhei Matsumoto sg_ele->addr, sg_ele->length); 15844642d7b2SAlexey Marchuk remaining -= sge_len; 15854642d7b2SAlexey Marchuk remaining_data_block -= sge_len; 1586019a5361SAlexey Marchuk rdma_req->offset += sge_len; 1587019a5361SAlexey Marchuk total_length -= sge_len; 15884642d7b2SAlexey Marchuk 15894642d7b2SAlexey Marchuk sg_ele++; 15904642d7b2SAlexey Marchuk wr->num_sge++; 15914642d7b2SAlexey Marchuk 15924642d7b2SAlexey Marchuk if (remaining_data_block == 0) { 15934642d7b2SAlexey Marchuk /* skip metadata */ 1594019a5361SAlexey Marchuk rdma_req->offset += md_size; 1595019a5361SAlexey Marchuk total_length -= md_size; 15964642d7b2SAlexey Marchuk /* Metadata that do not fit this IO buffer will be included in the next IO buffer */ 15974642d7b2SAlexey Marchuk remaining -= spdk_min(remaining, md_size); 15984642d7b2SAlexey Marchuk remaining_data_block = data_block_size; 15994642d7b2SAlexey Marchuk } 16004642d7b2SAlexey Marchuk 16014642d7b2SAlexey Marchuk if (remaining == 0) { 16024642d7b2SAlexey Marchuk /* By subtracting the size of the last IOV from the offset, we ensure that we skip 16034642d7b2SAlexey Marchuk the remaining metadata bits at the beginning of the next buffer */ 1604019a5361SAlexey Marchuk rdma_req->offset -= spdk_min(iov->iov_len, rdma_req->offset); 16055593b61fSShuhei Matsumoto rdma_req->iovpos++; 16065593b61fSShuhei Matsumoto } 1607019a5361SAlexey Marchuk } 1608019a5361SAlexey Marchuk } 16095593b61fSShuhei Matsumoto 1610e718d8caSAlexey Marchuk if (spdk_unlikely(total_length)) { 161116365fd8SShuhei Matsumoto SPDK_ERRLOG("Not enough SG entries to hold data buffer\n"); 161216365fd8SShuhei Matsumoto return -EINVAL; 161316365fd8SShuhei Matsumoto } 161416365fd8SShuhei Matsumoto 16155593b61fSShuhei Matsumoto return 0; 16165593b61fSShuhei Matsumoto } 16175593b61fSShuhei Matsumoto 1618653496d2SAlexey Marchuk static inline uint32_t 1619653496d2SAlexey Marchuk nvmf_rdma_calc_num_wrs(uint32_t length, uint32_t io_unit_size, uint32_t block_size) 1620653496d2SAlexey Marchuk { 1621653496d2SAlexey Marchuk /* estimate the number of SG entries and WRs needed to process the request */ 1622653496d2SAlexey Marchuk uint32_t num_sge = 0; 1623653496d2SAlexey Marchuk uint32_t i; 1624653496d2SAlexey Marchuk uint32_t num_buffers = SPDK_CEIL_DIV(length, io_unit_size); 1625653496d2SAlexey Marchuk 1626653496d2SAlexey Marchuk for (i = 0; i < num_buffers && length > 0; i++) { 1627653496d2SAlexey Marchuk uint32_t buffer_len = spdk_min(length, io_unit_size); 1628653496d2SAlexey Marchuk uint32_t num_sge_in_block = SPDK_CEIL_DIV(buffer_len, block_size); 1629653496d2SAlexey Marchuk 1630653496d2SAlexey Marchuk if (num_sge_in_block * block_size > buffer_len) { 1631653496d2SAlexey Marchuk ++num_sge_in_block; 1632653496d2SAlexey Marchuk } 1633653496d2SAlexey Marchuk num_sge += num_sge_in_block; 1634653496d2SAlexey Marchuk length -= buffer_len; 1635653496d2SAlexey Marchuk } 1636653496d2SAlexey Marchuk return SPDK_CEIL_DIV(num_sge, SPDK_NVMF_MAX_SGL_ENTRIES); 1637653496d2SAlexey Marchuk } 1638653496d2SAlexey Marchuk 1639e70a7594SSeth Howell static int 164055d8d943SSeth Howell nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport, 1641e70a7594SSeth Howell struct spdk_nvmf_rdma_device *device, 1642e48475b7SShuhei Matsumoto struct spdk_nvmf_rdma_request *rdma_req) 1643e70a7594SSeth Howell { 
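/* Reserve data buffers from the transport pool for the full request length and map them into the send WR scatter-gather list; when DIF is enabled the length is extended to cover interleaved metadata and extra WRs may be allocated. */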
1644e70a7594SSeth Howell struct spdk_nvmf_rdma_qpair *rqpair; 1645e70a7594SSeth Howell struct spdk_nvmf_rdma_poll_group *rgroup; 16460b068f85SShuhei Matsumoto struct spdk_nvmf_request *req = &rdma_req->req; 1647d409da0cSShuhei Matsumoto struct ibv_send_wr *wr = &rdma_req->data.wr; 1648fda0e558SShuhei Matsumoto int rc; 1649653496d2SAlexey Marchuk uint32_t num_wrs = 1; 1650e48475b7SShuhei Matsumoto uint32_t length; 1651e70a7594SSeth Howell 16520b068f85SShuhei Matsumoto rqpair = SPDK_CONTAINEROF(req->qpair, struct spdk_nvmf_rdma_qpair, qpair); 1653e70a7594SSeth Howell rgroup = rqpair->poller->group; 1654e70a7594SSeth Howell 1655838c45c8SAlexey Marchuk /* rdma wr specifics */ 1656838c45c8SAlexey Marchuk nvmf_rdma_setup_request(rdma_req); 1657838c45c8SAlexey Marchuk 1658e48475b7SShuhei Matsumoto length = req->length; 1659e48475b7SShuhei Matsumoto if (spdk_unlikely(req->dif_enabled)) { 1660e48475b7SShuhei Matsumoto req->dif.orig_length = length; 1661e48475b7SShuhei Matsumoto length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx); 1662e48475b7SShuhei Matsumoto req->dif.elba_length = length; 1663e48475b7SShuhei Matsumoto } 1664e48475b7SShuhei Matsumoto 1665fda0e558SShuhei Matsumoto rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport, 1666fda0e558SShuhei Matsumoto length); 1667e718d8caSAlexey Marchuk if (spdk_unlikely(rc != 0)) { 1668fda0e558SShuhei Matsumoto return rc; 16696812b63cSSeth Howell } 16706812b63cSSeth Howell 1671c0ee8ef7SShuhei Matsumoto assert(req->iovcnt <= rqpair->max_send_sge); 16726812b63cSSeth Howell 16730db0c443SChunsong Feng /* When dif_insert_or_strip is true and the I/O data length is greater than one block, 16740db0c443SChunsong Feng * the stripped_buffers are got for DIF stripping. */ 16750db0c443SChunsong Feng if (spdk_unlikely(req->dif_enabled && (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) 16760db0c443SChunsong Feng && (req->dif.elba_length > req->dif.dif_ctx.block_size))) { 16770db0c443SChunsong Feng rc = nvmf_request_get_stripped_buffers(req, &rgroup->group, 16780db0c443SChunsong Feng &rtransport->transport, req->dif.orig_length); 16790db0c443SChunsong Feng if (rc != 0) { 16800db0c443SChunsong Feng SPDK_INFOLOG(rdma, "Get stripped buffers fail %d, fallback to req.iov.\n", rc); 16810db0c443SChunsong Feng } 16820db0c443SChunsong Feng } 16830db0c443SChunsong Feng 1684c0ee8ef7SShuhei Matsumoto rdma_req->iovpos = 0; 1685b48a97d4SShuhei Matsumoto 168615ae31fbSBen Walker if (spdk_unlikely(req->dif_enabled)) { 1687653496d2SAlexey Marchuk num_wrs = nvmf_rdma_calc_num_wrs(length, rtransport->transport.opts.io_unit_size, 1688653496d2SAlexey Marchuk req->dif.dif_ctx.block_size); 1689653496d2SAlexey Marchuk if (num_wrs > 1) { 1690653496d2SAlexey Marchuk rc = nvmf_request_alloc_wrs(rtransport, rdma_req, num_wrs - 1); 1691e718d8caSAlexey Marchuk if (spdk_unlikely(rc != 0)) { 1692e70a7594SSeth Howell goto err_exit; 1693e70a7594SSeth Howell } 1694653496d2SAlexey Marchuk } 1695653496d2SAlexey Marchuk 1696ad0221afSShuhei Matsumoto rc = nvmf_rdma_fill_wr_sgl_with_dif(device, rdma_req, wr, length, num_wrs - 1); 1697653496d2SAlexey Marchuk if (spdk_unlikely(rc != 0)) { 1698653496d2SAlexey Marchuk goto err_exit; 1699653496d2SAlexey Marchuk } 1700e70a7594SSeth Howell 17019db2571dSShuhei Matsumoto if (num_wrs > 1) { 17026a77723eSAlexey Marchuk nvmf_rdma_update_remote_addr(rdma_req, num_wrs); 17036a77723eSAlexey Marchuk } 17049db2571dSShuhei Matsumoto } else { 1705ad0221afSShuhei Matsumoto rc = nvmf_rdma_fill_wr_sgl(device, rdma_req, wr, length); 
17069db2571dSShuhei Matsumoto if (spdk_unlikely(rc != 0)) { 17079db2571dSShuhei Matsumoto goto err_exit; 17089db2571dSShuhei Matsumoto } 17099db2571dSShuhei Matsumoto } 17106a77723eSAlexey Marchuk 1711838c45c8SAlexey Marchuk /* set the number of outstanding data WRs for this request. */ 1712653496d2SAlexey Marchuk rdma_req->num_outstanding_data_wr = num_wrs; 1713838c45c8SAlexey Marchuk 1714a451c838SSeth Howell return rc; 17158580daa1SSrikanth kaligotla 1716a451c838SSeth Howell err_exit: 171779945ef0SShuhei Matsumoto spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport); 1718653496d2SAlexey Marchuk nvmf_rdma_request_free_data(rdma_req, rtransport); 17190b068f85SShuhei Matsumoto req->iovcnt = 0; 1720a451c838SSeth Howell return rc; 17218580daa1SSrikanth kaligotla } 17228580daa1SSrikanth kaligotla 17238580daa1SSrikanth kaligotla static int 172462700dacSSeth Howell nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtransport, 172562700dacSSeth Howell struct spdk_nvmf_rdma_device *device, 172662700dacSSeth Howell struct spdk_nvmf_rdma_request *rdma_req) 172762700dacSSeth Howell { 172862700dacSSeth Howell struct spdk_nvmf_rdma_qpair *rqpair; 172962700dacSSeth Howell struct spdk_nvmf_rdma_poll_group *rgroup; 173062700dacSSeth Howell struct ibv_send_wr *current_wr; 173162700dacSSeth Howell struct spdk_nvmf_request *req = &rdma_req->req; 173262700dacSSeth Howell struct spdk_nvme_sgl_descriptor *inline_segment, *desc; 173362700dacSSeth Howell uint32_t num_sgl_descriptors; 1734019a5361SAlexey Marchuk uint32_t lengths[SPDK_NVMF_MAX_SGL_ENTRIES], total_length = 0; 173562700dacSSeth Howell uint32_t i; 173662700dacSSeth Howell int rc; 173762700dacSSeth Howell 173862700dacSSeth Howell rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 173962700dacSSeth Howell rgroup = rqpair->poller->group; 174062700dacSSeth Howell 174162700dacSSeth Howell inline_segment = &req->cmd->nvme_cmd.dptr.sgl1; 174262700dacSSeth Howell assert(inline_segment->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT); 174362700dacSSeth Howell assert(inline_segment->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET); 174462700dacSSeth Howell 174562700dacSSeth Howell num_sgl_descriptors = inline_segment->unkeyed.length / sizeof(struct spdk_nvme_sgl_descriptor); 174662700dacSSeth Howell assert(num_sgl_descriptors <= SPDK_NVMF_MAX_SGL_ENTRIES); 1747410455e4SShuhei Matsumoto 174804621576SShuhei Matsumoto desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address; 174904621576SShuhei Matsumoto for (i = 0; i < num_sgl_descriptors; i++) { 175015ae31fbSBen Walker if (spdk_likely(!req->dif_enabled)) { 175104621576SShuhei Matsumoto lengths[i] = desc->keyed.length; 17526ec974edSAlexey Marchuk } else { 17536ec974edSAlexey Marchuk req->dif.orig_length += desc->keyed.length; 17546ec974edSAlexey Marchuk lengths[i] = spdk_dif_get_length_with_md(desc->keyed.length, &req->dif.dif_ctx); 17556ec974edSAlexey Marchuk req->dif.elba_length += lengths[i]; 17566ec974edSAlexey Marchuk } 1757019a5361SAlexey Marchuk total_length += lengths[i]; 175804621576SShuhei Matsumoto desc++; 175904621576SShuhei Matsumoto } 176004621576SShuhei Matsumoto 1761e718d8caSAlexey Marchuk if (spdk_unlikely(total_length > rtransport->transport.opts.max_io_size)) { 1762019a5361SAlexey Marchuk SPDK_ERRLOG("Multi SGL length 0x%x exceeds max io size 0x%x\n", 1763019a5361SAlexey Marchuk total_length, rtransport->transport.opts.max_io_size); 1764019a5361SAlexey Marchuk 
req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 1765019a5361SAlexey Marchuk return -EINVAL; 1766019a5361SAlexey Marchuk } 1767019a5361SAlexey Marchuk 1768e718d8caSAlexey Marchuk rc = nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1); 1769e718d8caSAlexey Marchuk if (spdk_unlikely(rc != 0)) { 1770019a5361SAlexey Marchuk return -ENOMEM; 1771019a5361SAlexey Marchuk } 1772019a5361SAlexey Marchuk 1773019a5361SAlexey Marchuk rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport, total_length); 1774e718d8caSAlexey Marchuk if (spdk_unlikely(rc != 0)) { 177504621576SShuhei Matsumoto nvmf_rdma_request_free_data(rdma_req, rtransport); 1776fda0e558SShuhei Matsumoto return rc; 177704621576SShuhei Matsumoto } 177804621576SShuhei Matsumoto 17790db0c443SChunsong Feng /* When dif_insert_or_strip is true and the I/O data length is greater than one block, 17800db0c443SChunsong Feng * the stripped_buffers are got for DIF stripping. */ 17810db0c443SChunsong Feng if (spdk_unlikely(req->dif_enabled && (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) 17820db0c443SChunsong Feng && (req->dif.elba_length > req->dif.dif_ctx.block_size))) { 17830db0c443SChunsong Feng rc = nvmf_request_get_stripped_buffers(req, &rgroup->group, 17840db0c443SChunsong Feng &rtransport->transport, req->dif.orig_length); 1785e718d8caSAlexey Marchuk if (spdk_unlikely(rc != 0)) { 17860db0c443SChunsong Feng SPDK_INFOLOG(rdma, "Get stripped buffers fail %d, fallback to req.iov.\n", rc); 17870db0c443SChunsong Feng } 17880db0c443SChunsong Feng } 17890db0c443SChunsong Feng 179062700dacSSeth Howell /* The first WR must always be the embedded data WR. This is how we unwind them later. */ 179162700dacSSeth Howell current_wr = &rdma_req->data.wr; 179273a171a0SHailiang Wang assert(current_wr != NULL); 179362700dacSSeth Howell 1794f0c21261SShuhei Matsumoto req->length = 0; 17955593b61fSShuhei Matsumoto rdma_req->iovpos = 0; 17966812b63cSSeth Howell desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address; 179762700dacSSeth Howell for (i = 0; i < num_sgl_descriptors; i++) { 179862700dacSSeth Howell /* The descriptors must be keyed data block descriptors with an address, not an offset. 
*/ 179962700dacSSeth Howell if (spdk_unlikely(desc->generic.type != SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK || 180062700dacSSeth Howell desc->keyed.subtype != SPDK_NVME_SGL_SUBTYPE_ADDRESS)) { 180162700dacSSeth Howell rc = -EINVAL; 180262700dacSSeth Howell goto err_exit; 180362700dacSSeth Howell } 180462700dacSSeth Howell 18059db2571dSShuhei Matsumoto if (spdk_likely(!req->dif_enabled)) { 1806ad0221afSShuhei Matsumoto rc = nvmf_rdma_fill_wr_sgl(device, rdma_req, current_wr, lengths[i]); 18079db2571dSShuhei Matsumoto } else { 1808ad0221afSShuhei Matsumoto rc = nvmf_rdma_fill_wr_sgl_with_dif(device, rdma_req, current_wr, 18099db2571dSShuhei Matsumoto lengths[i], 0); 18109db2571dSShuhei Matsumoto } 1811e718d8caSAlexey Marchuk if (spdk_unlikely(rc != 0)) { 181262700dacSSeth Howell rc = -ENOMEM; 181362700dacSSeth Howell goto err_exit; 181462700dacSSeth Howell } 181562700dacSSeth Howell 1816f0c21261SShuhei Matsumoto req->length += desc->keyed.length; 181762700dacSSeth Howell current_wr->wr.rdma.rkey = desc->keyed.key; 181862700dacSSeth Howell current_wr->wr.rdma.remote_addr = desc->address; 181962700dacSSeth Howell current_wr = current_wr->next; 182062700dacSSeth Howell desc++; 182162700dacSSeth Howell } 182262700dacSSeth Howell 182362700dacSSeth Howell #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 18247d7b44f2SSeth Howell /* Go back to the last descriptor in the list. */ 18257d7b44f2SSeth Howell desc--; 182662700dacSSeth Howell if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { 182762700dacSSeth Howell if (desc->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { 182862700dacSSeth Howell rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; 182962700dacSSeth Howell rdma_req->rsp.wr.imm_data = desc->keyed.key; 183062700dacSSeth Howell } 183162700dacSSeth Howell } 183262700dacSSeth Howell #endif 183362700dacSSeth Howell 183462700dacSSeth Howell rdma_req->num_outstanding_data_wr = num_sgl_descriptors; 183562700dacSSeth Howell 183662700dacSSeth Howell return 0; 183762700dacSSeth Howell 183862700dacSSeth Howell err_exit: 183979945ef0SShuhei Matsumoto spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport); 184062700dacSSeth Howell nvmf_rdma_request_free_data(rdma_req, rtransport); 184162700dacSSeth Howell return rc; 184262700dacSSeth Howell } 184362700dacSSeth Howell 184462700dacSSeth Howell static int 184555d8d943SSeth Howell nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport, 18461ff5f4abSBen Walker struct spdk_nvmf_rdma_device *device, 18471ff5f4abSBen Walker struct spdk_nvmf_rdma_request *rdma_req) 18482625cf42SBen Walker { 184991f9c6f3SShuhei Matsumoto struct spdk_nvmf_request *req = &rdma_req->req; 18506fb90732SBen Walker struct spdk_nvme_cpl *rsp; 1851f1a584a9SBen Walker struct spdk_nvme_sgl_descriptor *sgl; 1852a8169c37SSeth Howell int rc; 18531bc5710aSAlexey Marchuk uint32_t length; 18542625cf42SBen Walker 185591f9c6f3SShuhei Matsumoto rsp = &req->rsp->nvme_cpl; 1856a335a524SAlexey Marchuk sgl = &req->cmd->nvme_cmd.dptr.sgl1; 18572625cf42SBen Walker 18582625cf42SBen Walker if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK && 18592625cf42SBen Walker (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS || 18602625cf42SBen Walker sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) { 18611bc5710aSAlexey Marchuk 18621bc5710aSAlexey Marchuk length = sgl->keyed.length; 1863e718d8caSAlexey Marchuk if (spdk_unlikely(length > rtransport->transport.opts.max_io_size)) { 18648a701c3fSBen Walker SPDK_ERRLOG("SGL length 0x%x 
exceeds max io size 0x%x\n", 18651bc5710aSAlexey Marchuk length, rtransport->transport.opts.max_io_size); 18662625cf42SBen Walker rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 18671ff5f4abSBen Walker return -1; 18682625cf42SBen Walker } 1869b4de8e11SSeth Howell #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 1870efe4c272SBen Walker if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { 1871b4de8e11SSeth Howell if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { 1872b4de8e11SSeth Howell rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; 1873b4de8e11SSeth Howell rdma_req->rsp.wr.imm_data = sgl->keyed.key; 1874b4de8e11SSeth Howell } 18751570c87fSSeth Howell } 1876b4de8e11SSeth Howell #endif 18772625cf42SBen Walker 18788580daa1SSrikanth kaligotla /* fill request length and populate iovs */ 187991f9c6f3SShuhei Matsumoto req->length = length; 18808580daa1SSrikanth kaligotla 1881e48475b7SShuhei Matsumoto rc = nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req); 188206481fc2SAlexey Marchuk if (spdk_unlikely(rc < 0)) { 188306481fc2SAlexey Marchuk if (rc == -EINVAL) { 188406481fc2SAlexey Marchuk SPDK_ERRLOG("SGL length exceeds the max I/O size\n"); 1885f2065513SJacek Kalwas rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 188606481fc2SAlexey Marchuk return -1; 188706481fc2SAlexey Marchuk } 1888f1a584a9SBen Walker /* No available buffers. Queue this request up. */ 18892172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "No available large data buffers. Queueing request %p\n", rdma_req); 18901ff5f4abSBen Walker return 0; 1891f1a584a9SBen Walker } 18928580daa1SSrikanth kaligotla 18932172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Request %p took %d buffer/s from central pool\n", rdma_req, 189491f9c6f3SShuhei Matsumoto req->iovcnt); 1895a4a3b5e7SBen Walker 18961ff5f4abSBen Walker return 0; 18972625cf42SBen Walker } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && 18982625cf42SBen Walker sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { 18992625cf42SBen Walker uint64_t offset = sgl->address; 19008e808490SJohn Barnard uint32_t max_len = rtransport->transport.opts.in_capsule_data_size; 19012625cf42SBen Walker 19022172c432STomasz Zawadzki SPDK_DEBUGLOG(nvmf, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", 19032625cf42SBen Walker offset, sgl->unkeyed.length); 19042625cf42SBen Walker 1905e718d8caSAlexey Marchuk if (spdk_unlikely(offset > max_len)) { 19062625cf42SBen Walker SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", 19072625cf42SBen Walker offset, max_len); 19082625cf42SBen Walker rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; 19091ff5f4abSBen Walker return -1; 19102625cf42SBen Walker } 19112625cf42SBen Walker max_len -= (uint32_t)offset; 19122625cf42SBen Walker 1913e718d8caSAlexey Marchuk if (spdk_unlikely(sgl->unkeyed.length > max_len)) { 19142625cf42SBen Walker SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", 19152625cf42SBen Walker sgl->unkeyed.length, max_len); 19162625cf42SBen Walker rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; 19171ff5f4abSBen Walker return -1; 1918f1a584a9SBen Walker } 1919f1a584a9SBen Walker 19201f9ac117SSeth Howell rdma_req->num_outstanding_data_wr = 0; 192191f9c6f3SShuhei Matsumoto req->data_from_pool = false; 192291f9c6f3SShuhei Matsumoto req->length = sgl->unkeyed.length; 19238580daa1SSrikanth kaligotla 1924e1413e91SJohn Levon req->iov[0].iov_base = rdma_req->recv->buf + offset; 192591f9c6f3SShuhei Matsumoto req->iov[0].iov_len = req->length; 
192691f9c6f3SShuhei Matsumoto req->iovcnt = 1;
19278580daa1SSrikanth kaligotla
19281ff5f4abSBen Walker return 0;
192962700dacSSeth Howell } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT &&
193062700dacSSeth Howell sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
1931a8169c37SSeth Howell
1932a8169c37SSeth Howell rc = nvmf_rdma_request_fill_iovs_multi_sgl(rtransport, device, rdma_req);
1933e718d8caSAlexey Marchuk if (spdk_unlikely(rc == -ENOMEM)) {
19342172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "No available large data buffers. Queueing request %p\n", rdma_req);
193562700dacSSeth Howell return 0;
1936e718d8caSAlexey Marchuk } else if (spdk_unlikely(rc == -EINVAL)) {
1937a8169c37SSeth Howell SPDK_ERRLOG("Multi SGL element request length exceeds the max I/O size\n");
1938f2065513SJacek Kalwas rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
1939a8169c37SSeth Howell return -1;
194062700dacSSeth Howell }
194162700dacSSeth Howell
19422172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Request %p took %d buffer/s from central pool\n", rdma_req,
194391f9c6f3SShuhei Matsumoto req->iovcnt);
194462700dacSSeth Howell
194562700dacSSeth Howell return 0;
1946f1a584a9SBen Walker }
1947f1a584a9SBen Walker
19482625cf42SBen Walker SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n",
19492625cf42SBen Walker sgl->generic.type, sgl->generic.subtype);
19502625cf42SBen Walker rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
19511ff5f4abSBen Walker return -1;
19521ff5f4abSBen Walker }
19531ff5f4abSBen Walker
19542b787d48SZiye Yang static void
195555d8d943SSeth Howell _nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
19562b787d48SZiye Yang struct spdk_nvmf_rdma_transport *rtransport)
19572b787d48SZiye Yang {
1958e6ddb7dfSSeth Howell struct spdk_nvmf_rdma_qpair *rqpair;
1959e6ddb7dfSSeth Howell struct spdk_nvmf_rdma_poll_group *rgroup;
1960e6ddb7dfSSeth Howell
1961e6ddb7dfSSeth Howell rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
1962005b053aSShuhei Matsumoto if (rdma_req->req.data_from_pool) {
1963e6ddb7dfSSeth Howell rgroup = rqpair->poller->group;
1964e6ddb7dfSSeth Howell
196579945ef0SShuhei Matsumoto spdk_nvmf_request_free_buffers(&rdma_req->req, &rgroup->group, &rtransport->transport);
19662b787d48SZiye Yang }
19670db0c443SChunsong Feng if (rdma_req->req.stripped_data) {
19680db0c443SChunsong Feng nvmf_request_free_stripped_buffers(&rdma_req->req,
19690db0c443SChunsong Feng &rqpair->poller->group->group,
19700db0c443SChunsong Feng &rtransport->transport);
19710db0c443SChunsong Feng }
197262700dacSSeth Howell nvmf_rdma_request_free_data(rdma_req, rtransport);
19732b787d48SZiye Yang rdma_req->req.length = 0;
19742b787d48SZiye Yang rdma_req->req.iovcnt = 0;
1975019a5361SAlexey Marchuk rdma_req->offset = 0;
19764f36a2a6SChunsong Feng rdma_req->req.dif_enabled = false;
1977183c3485SJim Harris rdma_req->fused_failed = false;
197831beb3edSKonrad Sztyber rdma_req->transfer_wr = NULL;
1979183c3485SJim Harris if (rdma_req->fused_pair) {
1980183c3485SJim Harris /* This req was part of a valid fused pair, but failed before it got to
1981183c3485SJim Harris * READY_TO_EXECUTE state. This means we need to fail the other request
1982183c3485SJim Harris * in the pair, because it is no longer part of a valid pair. If the pair
1983183c3485SJim Harris * already reached READY_TO_EXECUTE state, we need to kick it.
1984183c3485SJim Harris */
1985183c3485SJim Harris rdma_req->fused_pair->fused_failed = true;
1986183c3485SJim Harris if (rdma_req->fused_pair->state == RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
1987183c3485SJim Harris nvmf_rdma_request_process(rtransport, rdma_req->fused_pair);
1988183c3485SJim Harris }
1989183c3485SJim Harris rdma_req->fused_pair = NULL;
1990183c3485SJim Harris }
1991e1101529SAlexey Marchuk memset(&rdma_req->req.dif, 0, sizeof(rdma_req->req.dif));
1992bfdc957cSSeth Howell rqpair->qd--;
1993b25751d9SBen Walker
1994b25751d9SBen Walker STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link);
199546d7b94fSAtul Malakar rqpair->qpair.queue_depth--;
1996bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_FREE;
19972b787d48SZiye Yang }
19982b787d48SZiye Yang
1999183c3485SJim Harris static void
2000183c3485SJim Harris nvmf_rdma_check_fused_ordering(struct spdk_nvmf_rdma_transport *rtransport,
2001183c3485SJim Harris struct spdk_nvmf_rdma_qpair *rqpair,
2002183c3485SJim Harris struct spdk_nvmf_rdma_request *rdma_req)
2003183c3485SJim Harris {
2004183c3485SJim Harris enum spdk_nvme_cmd_fuse last, next;
2005183c3485SJim Harris
2006183c3485SJim Harris last = rqpair->fused_first ? rqpair->fused_first->req.cmd->nvme_cmd.fuse : SPDK_NVME_CMD_FUSE_NONE;
2007183c3485SJim Harris next = rdma_req->req.cmd->nvme_cmd.fuse;
2008183c3485SJim Harris
2009183c3485SJim Harris assert(last != SPDK_NVME_CMD_FUSE_SECOND);
2010183c3485SJim Harris
2011183c3485SJim Harris if (spdk_likely(last == SPDK_NVME_CMD_FUSE_NONE && next == SPDK_NVME_CMD_FUSE_NONE)) {
2012183c3485SJim Harris return;
2013183c3485SJim Harris }
2014183c3485SJim Harris
2015183c3485SJim Harris if (last == SPDK_NVME_CMD_FUSE_FIRST) {
2016183c3485SJim Harris if (next == SPDK_NVME_CMD_FUSE_SECOND) {
2017183c3485SJim Harris /* This is a valid pair of fused commands. Point them at each other
2018183c3485SJim Harris * so they can be submitted consecutively once ready to be executed.
2019183c3485SJim Harris */
2020183c3485SJim Harris rqpair->fused_first->fused_pair = rdma_req;
2021183c3485SJim Harris rdma_req->fused_pair = rqpair->fused_first;
2022183c3485SJim Harris rqpair->fused_first = NULL;
2023183c3485SJim Harris return;
2024183c3485SJim Harris } else {
2025183c3485SJim Harris /* Mark the last req as failed since it wasn't followed by a SECOND. */
2026183c3485SJim Harris rqpair->fused_first->fused_failed = true;
2027183c3485SJim Harris
2028183c3485SJim Harris /* If the last req is in READY_TO_EXECUTE state, then call
2029183c3485SJim Harris * nvmf_rdma_request_process(), otherwise nothing else will kick it.
2030183c3485SJim Harris */
2031183c3485SJim Harris if (rqpair->fused_first->state == RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
2032183c3485SJim Harris nvmf_rdma_request_process(rtransport, rqpair->fused_first);
2033183c3485SJim Harris }
2034183c3485SJim Harris
2035183c3485SJim Harris rqpair->fused_first = NULL;
2036183c3485SJim Harris }
2037183c3485SJim Harris }
2038183c3485SJim Harris
2039183c3485SJim Harris if (next == SPDK_NVME_CMD_FUSE_FIRST) {
2040183c3485SJim Harris /* Set rqpair->fused_first here so that we know to check that the next request
2041183c3485SJim Harris * is a SECOND (and to fail this one if it isn't).
2042183c3485SJim Harris */
2043183c3485SJim Harris rqpair->fused_first = rdma_req;
2044183c3485SJim Harris } else if (next == SPDK_NVME_CMD_FUSE_SECOND) {
2045183c3485SJim Harris /* Mark this req failed since it is a SECOND and the last one was not a FIRST.
*/ 2046183c3485SJim Harris rdma_req->fused_failed = true; 2047183c3485SJim Harris } 2048183c3485SJim Harris } 2049183c3485SJim Harris 20501d36ed84SJim Harris static void 20511d36ed84SJim Harris nvmf_rdma_poll_group_insert_need_buffer_req(struct spdk_nvmf_rdma_poll_group *rgroup, 20521d36ed84SJim Harris struct spdk_nvmf_rdma_request *rdma_req) 20531d36ed84SJim Harris { 20547251e4c2SJim Harris struct spdk_nvmf_request *r; 20557251e4c2SJim Harris 20567251e4c2SJim Harris /* CONNECT commands have a timeout, so we need to avoid a CONNECT command 20577251e4c2SJim Harris * from getting buried behind a long list of other non-FABRIC requests 20587251e4c2SJim Harris * waiting for a buffer. Note that even though the CONNECT command's data is 20597251e4c2SJim Harris * in-capsule, the request still goes to this STAILQ. 20607251e4c2SJim Harris */ 20617251e4c2SJim Harris if (spdk_likely(rdma_req->req.cmd->nvme_cmd.opc != SPDK_NVME_OPC_FABRIC)) { 20627251e4c2SJim Harris /* This is the most likely case. */ 20631d36ed84SJim Harris STAILQ_INSERT_TAIL(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link); 20647251e4c2SJim Harris return; 20657251e4c2SJim Harris } else { 20667251e4c2SJim Harris /* STAILQ doesn't have INSERT_BEFORE, so we need to either INSERT_HEAD 20677251e4c2SJim Harris * or INSERT_AFTER. Put it after any other FABRIC commands that are 20687251e4c2SJim Harris * already in the queue. 20697251e4c2SJim Harris */ 20707251e4c2SJim Harris r = STAILQ_FIRST(&rgroup->group.pending_buf_queue); 20717251e4c2SJim Harris if (r == NULL || r->cmd->nvme_cmd.opc != SPDK_NVME_OPC_FABRIC) { 20727251e4c2SJim Harris STAILQ_INSERT_HEAD(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link); 20737251e4c2SJim Harris return; 20747251e4c2SJim Harris } 20757251e4c2SJim Harris while (true) { 20767251e4c2SJim Harris struct spdk_nvmf_request *next; 20777251e4c2SJim Harris 20787251e4c2SJim Harris next = STAILQ_NEXT(r, buf_link); 20797251e4c2SJim Harris if (next == NULL || next->cmd->nvme_cmd.opc != SPDK_NVME_OPC_FABRIC) { 20807251e4c2SJim Harris STAILQ_INSERT_AFTER(&rgroup->group.pending_buf_queue, r, &rdma_req->req, buf_link); 20817251e4c2SJim Harris return; 20827251e4c2SJim Harris } 20837251e4c2SJim Harris r = next; 20847251e4c2SJim Harris } 20857251e4c2SJim Harris } 20861d36ed84SJim Harris } 20871d36ed84SJim Harris 2088f8cbdf2cSAlexey Marchuk bool 208955d8d943SSeth Howell nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, 20903c423f40SBen Walker struct spdk_nvmf_rdma_request *rdma_req) 20911ff5f4abSBen Walker { 20923c423f40SBen Walker struct spdk_nvmf_rdma_qpair *rqpair; 20931ff5f4abSBen Walker struct spdk_nvmf_rdma_device *device; 2094608d80a0SBen Walker struct spdk_nvmf_rdma_poll_group *rgroup; 20953c423f40SBen Walker struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; 20961ff5f4abSBen Walker int rc; 20973c423f40SBen Walker struct spdk_nvmf_rdma_recv *rdma_recv; 20983c423f40SBen Walker enum spdk_nvmf_rdma_request_state prev_state; 20993c423f40SBen Walker bool progress = false; 2100fdec444aSPhilipp Skadorov int data_posted; 2101ca59dd5dSAlexey Marchuk uint32_t num_blocks, num_rdma_reads_available, qdepth; 21021ff5f4abSBen Walker 21033c423f40SBen Walker rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 21048209c8cfSSeth Howell device = rqpair->device; 2105608d80a0SBen Walker rgroup = rqpair->poller->group; 21061ff5f4abSBen Walker 21073c423f40SBen Walker assert(rdma_req->state != RDMA_REQUEST_STATE_FREE); 21081ff5f4abSBen Walker 2109745a54e4SBen Walker 
/* If the queue pair is in an error state, force the request to the completed state 2110745a54e4SBen Walker * to release resources. */ 21113caf2080SKonrad Sztyber if (spdk_unlikely(rqpair->ibv_in_error_state || !spdk_nvmf_qpair_is_active(&rqpair->qpair))) { 211204cd8e47SAlexey Marchuk switch (rdma_req->state) { 211304cd8e47SAlexey Marchuk case RDMA_REQUEST_STATE_NEED_BUFFER: 211497967681SShuhei Matsumoto STAILQ_REMOVE(&rgroup->group.pending_buf_queue, &rdma_req->req, spdk_nvmf_request, buf_link); 211504cd8e47SAlexey Marchuk break; 211604cd8e47SAlexey Marchuk case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: 2117e59ac513SSeth Howell STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 211804cd8e47SAlexey Marchuk break; 21194e45c563SAlexey Marchuk case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 21204e45c563SAlexey Marchuk if (rdma_req->num_remaining_data_wr) { 21214e45c563SAlexey Marchuk /* Partially sent request is still in the pending_rdma_read_queue, 21224e45c563SAlexey Marchuk * remove it before completing */ 21234e45c563SAlexey Marchuk rdma_req->num_remaining_data_wr = 0; 21244e45c563SAlexey Marchuk STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 21254e45c563SAlexey Marchuk } 21264e45c563SAlexey Marchuk break; 212704cd8e47SAlexey Marchuk case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: 2128e59ac513SSeth Howell STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 212904cd8e47SAlexey Marchuk break; 213004cd8e47SAlexey Marchuk case RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING: 213104cd8e47SAlexey Marchuk STAILQ_REMOVE(&rqpair->pending_rdma_send_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 213204cd8e47SAlexey Marchuk break; 213304cd8e47SAlexey Marchuk default: 213404cd8e47SAlexey Marchuk break; 2135e6b2caeeSBen Walker } 2136bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2137745a54e4SBen Walker } 2138745a54e4SBen Walker 21393c423f40SBen Walker /* The loop here is to allow for several back-to-back state changes. */ 21403c423f40SBen Walker do { 21413c423f40SBen Walker prev_state = rdma_req->state; 21423c423f40SBen Walker 21432172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Request %p entering state %d\n", rdma_req, prev_state); 21443c423f40SBen Walker 21453c423f40SBen Walker switch (rdma_req->state) { 21463c423f40SBen Walker case RDMA_REQUEST_STATE_FREE: 21473c423f40SBen Walker /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW 21483c423f40SBen Walker * to escape this state. 
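 * (Typically a request leaves FREE when a newly received command capsule claims it from the
 * qpair's free_queue; _nvmf_rdma_request_free() above is the path that returns it there.)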
*/ 21493c423f40SBen Walker break; 21503c423f40SBen Walker case RDMA_REQUEST_STATE_NEW: 21518bcbe397SJim Harris spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0, 215246d7b94fSAtul Malakar (uintptr_t)rdma_req, (uintptr_t)rqpair, rqpair->qpair.queue_depth); 21533c423f40SBen Walker rdma_recv = rdma_req->recv; 21543c423f40SBen Walker 21553c423f40SBen Walker /* The first element of the SGL is the NVMe command */ 21563c423f40SBen Walker rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr; 21573c423f40SBen Walker memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp)); 21585a6e7a41SAlexey Marchuk rdma_req->transfer_wr = &rdma_req->data.wr; 21593c423f40SBen Walker 21603caf2080SKonrad Sztyber if (spdk_unlikely(rqpair->ibv_in_error_state || !spdk_nvmf_qpair_is_active(&rqpair->qpair))) { 2161bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2162531fd76dSBen Walker break; 2163531fd76dSBen Walker } 2164531fd76dSBen Walker 2165e1101529SAlexey Marchuk if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&rdma_req->req, &rdma_req->req.dif.dif_ctx))) { 216615ae31fbSBen Walker rdma_req->req.dif_enabled = true; 21671bc5710aSAlexey Marchuk } 21681bc5710aSAlexey Marchuk 2169183c3485SJim Harris nvmf_rdma_check_fused_ordering(rtransport, rqpair, rdma_req); 2170183c3485SJim Harris 2171bc13d022SChangpeng Liu #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 2172bc13d022SChangpeng Liu rdma_req->rsp.wr.opcode = IBV_WR_SEND; 2173bc13d022SChangpeng Liu rdma_req->rsp.wr.imm_data = 0; 2174bc13d022SChangpeng Liu #endif 2175bc13d022SChangpeng Liu 21763c423f40SBen Walker /* The next state transition depends on the data transfer needs of this request. */ 2177bc13d022SChangpeng Liu rdma_req->req.xfer = spdk_nvmf_req_get_xfer(&rdma_req->req); 21783c423f40SBen Walker 2179864d93c0SAlexey Marchuk if (spdk_unlikely(rdma_req->req.xfer == SPDK_NVME_DATA_BIDIRECTIONAL)) { 2180864d93c0SAlexey Marchuk rsp->status.sct = SPDK_NVME_SCT_GENERIC; 2181864d93c0SAlexey Marchuk rsp->status.sc = SPDK_NVME_SC_INVALID_OPCODE; 218204cd8e47SAlexey Marchuk STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link); 218304cd8e47SAlexey Marchuk rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING; 21842172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Request %p: invalid xfer type (BIDIRECTIONAL)\n", rdma_req); 2185864d93c0SAlexey Marchuk break; 2186864d93c0SAlexey Marchuk } 2187864d93c0SAlexey Marchuk 21883c423f40SBen Walker /* If no data to transfer, ready to execute. 
*/ 21893c423f40SBen Walker if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { 2190bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 21913c423f40SBen Walker break; 21921ff5f4abSBen Walker } 21931ff5f4abSBen Walker 2194bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER; 21951d36ed84SJim Harris nvmf_rdma_poll_group_insert_need_buffer_req(rgroup, rdma_req); 21963c423f40SBen Walker break; 21973c423f40SBen Walker case RDMA_REQUEST_STATE_NEED_BUFFER: 21988bcbe397SJim Harris spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0, 219962aa8bd8SKonrad Sztyber (uintptr_t)rdma_req, (uintptr_t)rqpair); 22006a5ae72bSBen Walker 22013c423f40SBen Walker assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE); 22021ff5f4abSBen Walker 220397967681SShuhei Matsumoto if (&rdma_req->req != STAILQ_FIRST(&rgroup->group.pending_buf_queue)) { 22043c423f40SBen Walker /* This request needs to wait in line to obtain a buffer */ 22053c423f40SBen Walker break; 22063c423f40SBen Walker } 22073c423f40SBen Walker 22083c423f40SBen Walker /* Try to get a data buffer */ 220955d8d943SSeth Howell rc = nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req); 2210e718d8caSAlexey Marchuk if (spdk_unlikely(rc < 0)) { 221197967681SShuhei Matsumoto STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link); 221204cd8e47SAlexey Marchuk STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link); 221304cd8e47SAlexey Marchuk rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING; 22143c423f40SBen Walker break; 22152625cf42SBen Walker } 22162625cf42SBen Walker 2217e1413e91SJohn Levon if (rdma_req->req.iovcnt == 0) { 2218847c1c3aSZiye Yang /* No buffers available. */ 2219251db814SEvgeniy Kochetov rgroup->stat.pending_data_buffer++; 22203c423f40SBen Walker break; 22211ff5f4abSBen Walker } 22221ff5f4abSBen Walker 222397967681SShuhei Matsumoto STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link); 2224847c1c3aSZiye Yang 22251ff5f4abSBen Walker /* If data is transferring from host to controller and the data didn't 22261ff5f4abSBen Walker * arrive using in capsule data, we need to do a transfer from the host. 
22271ff5f4abSBen Walker */ 2228005b053aSShuhei Matsumoto if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && 2229005b053aSShuhei Matsumoto rdma_req->req.data_from_pool) { 223004ebc6eaSSeth Howell STAILQ_INSERT_TAIL(&rqpair->pending_rdma_read_queue, rdma_req, state_link); 2231bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING; 2232349295caSBen Walker break; 2233349295caSBen Walker } 2234349295caSBen Walker 2235bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 22363c423f40SBen Walker break; 22371d0a8e1cSSeth Howell case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: 22381d0a8e1cSSeth Howell spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 0, 0, 223962aa8bd8SKonrad Sztyber (uintptr_t)rdma_req, (uintptr_t)rqpair); 22406a5ae72bSBen Walker 224104ebc6eaSSeth Howell if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_read_queue)) { 22423c423f40SBen Walker /* This request needs to wait in line to perform RDMA */ 22433c423f40SBen Walker break; 22443c423f40SBen Walker } 2245a681f8d5SAlexey Marchuk assert(rqpair->max_send_depth >= rqpair->current_send_depth); 2246ca59dd5dSAlexey Marchuk qdepth = rqpair->max_send_depth - rqpair->current_send_depth; 2247a681f8d5SAlexey Marchuk assert(rqpair->max_read_depth >= rqpair->current_read_depth); 2248ca59dd5dSAlexey Marchuk num_rdma_reads_available = rqpair->max_read_depth - rqpair->current_read_depth; 2249ca59dd5dSAlexey Marchuk if (rdma_req->num_outstanding_data_wr > qdepth || 2250ca59dd5dSAlexey Marchuk rdma_req->num_outstanding_data_wr > num_rdma_reads_available) { 2251ca59dd5dSAlexey Marchuk if (num_rdma_reads_available && qdepth) { 2252ca59dd5dSAlexey Marchuk /* Send as much as we can */ 2253ca59dd5dSAlexey Marchuk request_prepare_transfer_in_part(&rdma_req->req, spdk_min(num_rdma_reads_available, qdepth)); 2254ca59dd5dSAlexey Marchuk } else { 2255158dc947SSeth Howell /* We can only have so many WRs outstanding. we have to wait until some finish. */ 2256251db814SEvgeniy Kochetov rqpair->poller->stat.pending_rdma_read++; 22577289d370SSeth Howell break; 22587289d370SSeth Howell } 2259ca59dd5dSAlexey Marchuk } 226004ebc6eaSSeth Howell 226104ebc6eaSSeth Howell /* We have already verified that this request is the head of the queue. */ 2262e53bd98aSxupeng-mingtu if (rdma_req->num_remaining_data_wr == 0) { 226304ebc6eaSSeth Howell STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link); 2264e53bd98aSxupeng-mingtu } 226504ebc6eaSSeth Howell 22668b9c92d3SAlexey Marchuk request_transfer_in(&rdma_req->req); 2267bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER; 22688b9c92d3SAlexey Marchuk 22693c423f40SBen Walker break; 22703c423f40SBen Walker case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 22716a5ae72bSBen Walker spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, 227262aa8bd8SKonrad Sztyber (uintptr_t)rdma_req, (uintptr_t)rqpair); 22733c423f40SBen Walker /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE 22743c423f40SBen Walker * to escape this state. 
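 * (That transition is driven by completion of the RDMA READ work requests posted by
 * request_transfer_in() above.)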
*/ 22753c423f40SBen Walker break; 22763c423f40SBen Walker case RDMA_REQUEST_STATE_READY_TO_EXECUTE: 22778bcbe397SJim Harris spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, 227862aa8bd8SKonrad Sztyber (uintptr_t)rdma_req, (uintptr_t)rqpair); 22791bc5710aSAlexey Marchuk 228015ae31fbSBen Walker if (spdk_unlikely(rdma_req->req.dif_enabled)) { 22811bc5710aSAlexey Marchuk if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { 22821bc5710aSAlexey Marchuk /* generate DIF for write operation */ 2283e1101529SAlexey Marchuk num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size); 22841bc5710aSAlexey Marchuk assert(num_blocks > 0); 22851bc5710aSAlexey Marchuk 22861bc5710aSAlexey Marchuk rc = spdk_dif_generate(rdma_req->req.iov, rdma_req->req.iovcnt, 2287e1101529SAlexey Marchuk num_blocks, &rdma_req->req.dif.dif_ctx); 22881bc5710aSAlexey Marchuk if (rc != 0) { 22891bc5710aSAlexey Marchuk SPDK_ERRLOG("DIF generation failed\n"); 22901bc5710aSAlexey Marchuk rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2291608b54a2SKonrad Sztyber spdk_nvmf_qpair_disconnect(&rqpair->qpair); 22921bc5710aSAlexey Marchuk break; 22931bc5710aSAlexey Marchuk } 22941bc5710aSAlexey Marchuk } 22951bc5710aSAlexey Marchuk 2296e1101529SAlexey Marchuk assert(rdma_req->req.dif.elba_length >= rdma_req->req.length); 22971bc5710aSAlexey Marchuk /* set extended length before IO operation */ 2298e1101529SAlexey Marchuk rdma_req->req.length = rdma_req->req.dif.elba_length; 22991bc5710aSAlexey Marchuk } 23001bc5710aSAlexey Marchuk 2301183c3485SJim Harris if (rdma_req->req.cmd->nvme_cmd.fuse != SPDK_NVME_CMD_FUSE_NONE) { 2302183c3485SJim Harris if (rdma_req->fused_failed) { 2303183c3485SJim Harris /* This request failed FUSED semantics. Fail it immediately, without 2304183c3485SJim Harris * even sending it to the target layer. 2305183c3485SJim Harris */ 2306183c3485SJim Harris rsp->status.sct = SPDK_NVME_SCT_GENERIC; 2307183c3485SJim Harris rsp->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED; 230804cd8e47SAlexey Marchuk STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link); 230904cd8e47SAlexey Marchuk rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING; 2310183c3485SJim Harris break; 2311183c3485SJim Harris } 2312183c3485SJim Harris 2313183c3485SJim Harris if (rdma_req->fused_pair == NULL || 2314183c3485SJim Harris rdma_req->fused_pair->state != RDMA_REQUEST_STATE_READY_TO_EXECUTE) { 2315183c3485SJim Harris /* This request is ready to execute, but either we don't know yet if it's 2316183c3485SJim Harris * valid - i.e. this is a FIRST but we haven't received the next 2317183c3485SJim Harris * request yet or the other request of this fused pair isn't ready to 2318183c3485SJim Harris * execute. So break here and this request will get processed later either 2319183c3485SJim Harris * when the other request is ready or we find that this request isn't valid. 2320183c3485SJim Harris */ 2321183c3485SJim Harris break; 2322183c3485SJim Harris } 2323183c3485SJim Harris } 2324183c3485SJim Harris 2325183c3485SJim Harris /* If we get to this point, and this request is a fused command, we know that 2326183c3485SJim Harris * it is part of valid sequence (FIRST followed by a SECOND) and that both 2327183c3485SJim Harris * requests are READY_TO_EXECUTE. So call spdk_nvmf_request_exec() both on this 2328183c3485SJim Harris * request, and the other request of the fused pair, in the correct order. 
2329183c3485SJim Harris * Also clear the ->fused_pair pointers on both requests, since after this point 2330183c3485SJim Harris * we no longer need to maintain the relationship between these two requests. 2331183c3485SJim Harris */ 2332183c3485SJim Harris if (rdma_req->req.cmd->nvme_cmd.fuse == SPDK_NVME_CMD_FUSE_SECOND) { 2333183c3485SJim Harris assert(rdma_req->fused_pair != NULL); 2334183c3485SJim Harris assert(rdma_req->fused_pair->fused_pair != NULL); 2335183c3485SJim Harris rdma_req->fused_pair->state = RDMA_REQUEST_STATE_EXECUTING; 2336183c3485SJim Harris spdk_nvmf_request_exec(&rdma_req->fused_pair->req); 2337183c3485SJim Harris rdma_req->fused_pair->fused_pair = NULL; 2338183c3485SJim Harris rdma_req->fused_pair = NULL; 2339183c3485SJim Harris } 2340bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_EXECUTING; 23413c423f40SBen Walker spdk_nvmf_request_exec(&rdma_req->req); 2342183c3485SJim Harris if (rdma_req->req.cmd->nvme_cmd.fuse == SPDK_NVME_CMD_FUSE_FIRST) { 2343183c3485SJim Harris assert(rdma_req->fused_pair != NULL); 2344183c3485SJim Harris assert(rdma_req->fused_pair->fused_pair != NULL); 2345183c3485SJim Harris rdma_req->fused_pair->state = RDMA_REQUEST_STATE_EXECUTING; 2346183c3485SJim Harris spdk_nvmf_request_exec(&rdma_req->fused_pair->req); 2347183c3485SJim Harris rdma_req->fused_pair->fused_pair = NULL; 2348183c3485SJim Harris rdma_req->fused_pair = NULL; 2349183c3485SJim Harris } 23503c423f40SBen Walker break; 23513c423f40SBen Walker case RDMA_REQUEST_STATE_EXECUTING: 23528bcbe397SJim Harris spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0, 235362aa8bd8SKonrad Sztyber (uintptr_t)rdma_req, (uintptr_t)rqpair); 23543c423f40SBen Walker /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED 23553c423f40SBen Walker * to escape this state. 
*/ 23563c423f40SBen Walker break; 23573c423f40SBen Walker case RDMA_REQUEST_STATE_EXECUTED: 23588bcbe397SJim Harris spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0, 235962aa8bd8SKonrad Sztyber (uintptr_t)rdma_req, (uintptr_t)rqpair); 2360af61ab90SShuhei Matsumoto if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && 2361af61ab90SShuhei Matsumoto rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 236204ebc6eaSSeth Howell STAILQ_INSERT_TAIL(&rqpair->pending_rdma_write_queue, rdma_req, state_link); 2363bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING; 23643c423f40SBen Walker } else { 236504cd8e47SAlexey Marchuk STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link); 236604cd8e47SAlexey Marchuk rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING; 23673c423f40SBen Walker } 236815ae31fbSBen Walker if (spdk_unlikely(rdma_req->req.dif_enabled)) { 23691bc5710aSAlexey Marchuk /* restore the original length */ 2370e1101529SAlexey Marchuk rdma_req->req.length = rdma_req->req.dif.orig_length; 23717545e8c8SAlexey Marchuk 23727545e8c8SAlexey Marchuk if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 23737545e8c8SAlexey Marchuk struct spdk_dif_error error_blk; 23747545e8c8SAlexey Marchuk 2375e1101529SAlexey Marchuk num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size); 23760db0c443SChunsong Feng if (!rdma_req->req.stripped_data) { 2377e1101529SAlexey Marchuk rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks, 2378e1101529SAlexey Marchuk &rdma_req->req.dif.dif_ctx, &error_blk); 23790db0c443SChunsong Feng } else { 23800db0c443SChunsong Feng rc = spdk_dif_verify_copy(rdma_req->req.stripped_data->iov, 23810db0c443SChunsong Feng rdma_req->req.stripped_data->iovcnt, 23820db0c443SChunsong Feng rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks, 23830db0c443SChunsong Feng &rdma_req->req.dif.dif_ctx, &error_blk); 23840db0c443SChunsong Feng } 23857545e8c8SAlexey Marchuk if (rc) { 23867545e8c8SAlexey Marchuk struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; 23877545e8c8SAlexey Marchuk 23887545e8c8SAlexey Marchuk SPDK_ERRLOG("DIF error detected. 
type=%d, offset=%" PRIu32 "\n", error_blk.err_type, 23897545e8c8SAlexey Marchuk error_blk.err_offset); 23907545e8c8SAlexey Marchuk rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR; 239155d8d943SSeth Howell rsp->status.sc = nvmf_rdma_dif_error_to_compl_status(error_blk.err_type); 23927545e8c8SAlexey Marchuk STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 239304cd8e47SAlexey Marchuk STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link); 239404cd8e47SAlexey Marchuk rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING; 23957545e8c8SAlexey Marchuk } 23967545e8c8SAlexey Marchuk } 23971bc5710aSAlexey Marchuk } 23983c423f40SBen Walker break; 23991d0a8e1cSSeth Howell case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: 24001d0a8e1cSSeth Howell spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0, 240162aa8bd8SKonrad Sztyber (uintptr_t)rdma_req, (uintptr_t)rqpair); 24021d0a8e1cSSeth Howell 240304ebc6eaSSeth Howell if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_write_queue)) { 24041d0a8e1cSSeth Howell /* This request needs to wait in line to perform RDMA */ 24051d0a8e1cSSeth Howell break; 24061d0a8e1cSSeth Howell } 24071d0a8e1cSSeth Howell if ((rqpair->current_send_depth + rdma_req->num_outstanding_data_wr + 1) > 24081d0a8e1cSSeth Howell rqpair->max_send_depth) { 24091d0a8e1cSSeth Howell /* We can only have so many WRs outstanding. we have to wait until some finish. 24101d0a8e1cSSeth Howell * +1 since each request has an additional wr in the resp. */ 2411251db814SEvgeniy Kochetov rqpair->poller->stat.pending_rdma_write++; 24121d0a8e1cSSeth Howell break; 24131d0a8e1cSSeth Howell } 241404ebc6eaSSeth Howell 241504ebc6eaSSeth Howell /* We have already verified that this request is the head of the queue. */ 241604ebc6eaSSeth Howell STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_write_queue, state_link); 241704ebc6eaSSeth Howell 24181d0a8e1cSSeth Howell /* The data transfer will be kicked off from 24191d0a8e1cSSeth Howell * RDMA_REQUEST_STATE_READY_TO_COMPLETE state. 242004cd8e47SAlexey Marchuk * We verified that data + response fit into send queue, so we can go to the next state directly 242104cd8e47SAlexey Marchuk */ 242204cd8e47SAlexey Marchuk rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 242304cd8e47SAlexey Marchuk break; 242404cd8e47SAlexey Marchuk case RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING: 242504cd8e47SAlexey Marchuk spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING, 0, 0, 242604cd8e47SAlexey Marchuk (uintptr_t)rdma_req, (uintptr_t)rqpair); 242704cd8e47SAlexey Marchuk 242804cd8e47SAlexey Marchuk if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_send_queue)) { 242904cd8e47SAlexey Marchuk /* This request needs to wait in line to send the completion */ 243004cd8e47SAlexey Marchuk break; 243104cd8e47SAlexey Marchuk } 243204cd8e47SAlexey Marchuk 2433a681f8d5SAlexey Marchuk assert(rqpair->current_send_depth <= rqpair->max_send_depth); 243404cd8e47SAlexey Marchuk if (rqpair->current_send_depth == rqpair->max_send_depth) { 243504cd8e47SAlexey Marchuk /* We can only have so many WRs outstanding. we have to wait until some finish */ 243604cd8e47SAlexey Marchuk rqpair->poller->stat.pending_rdma_send++; 243704cd8e47SAlexey Marchuk break; 243804cd8e47SAlexey Marchuk } 243904cd8e47SAlexey Marchuk 244004cd8e47SAlexey Marchuk /* We have already verified that this request is the head of the queue. 
*/ 244104cd8e47SAlexey Marchuk STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_send_queue, state_link); 244204cd8e47SAlexey Marchuk 244304cd8e47SAlexey Marchuk /* The response sending will be kicked off from 244404cd8e47SAlexey Marchuk * RDMA_REQUEST_STATE_READY_TO_COMPLETE state. 24451d0a8e1cSSeth Howell */ 2446bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; 24471d0a8e1cSSeth Howell break; 24483c423f40SBen Walker case RDMA_REQUEST_STATE_READY_TO_COMPLETE: 24498bcbe397SJim Harris spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, 245062aa8bd8SKonrad Sztyber (uintptr_t)rdma_req, (uintptr_t)rqpair); 2451fdec444aSPhilipp Skadorov rc = request_transfer_out(&rdma_req->req, &data_posted); 24523c423f40SBen Walker assert(rc == 0); /* No good way to handle this currently */ 2453e718d8caSAlexey Marchuk if (spdk_unlikely(rc)) { 2454bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 2455e7988759SBen Walker } else { 2456bfdc957cSSeth Howell rdma_req->state = data_posted ? RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST : 2457bfdc957cSSeth Howell RDMA_REQUEST_STATE_COMPLETING; 2458e7988759SBen Walker } 2459fdec444aSPhilipp Skadorov break; 2460fdec444aSPhilipp Skadorov case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: 2461fdec444aSPhilipp Skadorov spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, 246262aa8bd8SKonrad Sztyber (uintptr_t)rdma_req, (uintptr_t)rqpair); 2463fdec444aSPhilipp Skadorov /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 2464fdec444aSPhilipp Skadorov * to escape this state. */ 24653c423f40SBen Walker break; 24663c423f40SBen Walker case RDMA_REQUEST_STATE_COMPLETING: 24678bcbe397SJim Harris spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, 246862aa8bd8SKonrad Sztyber (uintptr_t)rdma_req, (uintptr_t)rqpair); 24693c423f40SBen Walker /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED 24703c423f40SBen Walker * to escape this state. 
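 * (Completion of the response SEND posted by request_transfer_out() drives that transition;
 * COMPLETING is used when data_posted is false, i.e. no data transfer accompanied the response.)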
*/ 24713c423f40SBen Walker break; 24723c423f40SBen Walker case RDMA_REQUEST_STATE_COMPLETED: 24738bcbe397SJim Harris spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, 247446d7b94fSAtul Malakar (uintptr_t)rdma_req, (uintptr_t)rqpair, rqpair->qpair.queue_depth); 24753c423f40SBen Walker 2476fbe8f804SEvgeniy Kochetov rqpair->poller->stat.request_latency += spdk_get_ticks() - rdma_req->receive_tsc; 247755d8d943SSeth Howell _nvmf_rdma_request_free(rdma_req, rtransport); 2478fdec444aSPhilipp Skadorov break; 2479fdec444aSPhilipp Skadorov case RDMA_REQUEST_NUM_STATES: 2480fdec444aSPhilipp Skadorov default: 2481fdec444aSPhilipp Skadorov assert(0); 24823c423f40SBen Walker break; 24833c423f40SBen Walker } 24843c423f40SBen Walker 24853c423f40SBen Walker if (rdma_req->state != prev_state) { 24863c423f40SBen Walker progress = true; 24873c423f40SBen Walker } 24883c423f40SBen Walker } while (rdma_req->state != prev_state); 24893c423f40SBen Walker 24903c423f40SBen Walker return progress; 2491349295caSBen Walker } 2492349295caSBen Walker 2493349295caSBen Walker /* Public API callbacks begin here */ 2494349295caSBen Walker 2495183d81d0SJohn Barnard #define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128 2496183d81d0SJohn Barnard #define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128 2497ed0b611fSEvgeniy Kochetov #define SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH 4096 2498bf647c16SJim Harris #define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 128 2499183d81d0SJohn Barnard #define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 2500183d81d0SJohn Barnard #define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072 2501b6b0a0baSSeth Howell #define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES) 250233f60621SSeth Howell #define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4095 25033b138377SJim Harris #define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE UINT32_MAX 25041bc5710aSAlexey Marchuk #define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false 25051bc5710aSAlexey Marchuk #define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false 25063b830202SSeth Howell #define SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG 100 250726e0ef9aSShuhei Matsumoto #define SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC 1 2508c818233bSIvan Betsis #define SPDK_NVMF_RDMA_DEFAULT_NO_WR_BATCHING false 25097dab13c0SAlexey Marchuk #define SPDK_NVMF_RDMA_DEFAULT_DATA_WR_POOL_SIZE 4095 2510183d81d0SJohn Barnard 2511183d81d0SJohn Barnard static void 251255d8d943SSeth Howell nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) 2513183d81d0SJohn Barnard { 2514183d81d0SJohn Barnard opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH; 2515183d81d0SJohn Barnard opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR; 2516183d81d0SJohn Barnard opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE; 2517183d81d0SJohn Barnard opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE; 2518b6b0a0baSSeth Howell opts->io_unit_size = SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE; 2519183d81d0SJohn Barnard opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH; 252058f16244SZiye Yang opts->num_shared_buffers = SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS; 2521e816c8fdSSeth Howell opts->buf_cache_size = SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE; 25221bc5710aSAlexey Marchuk opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP; 252326e0ef9aSShuhei Matsumoto opts->abort_timeout_sec = SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC; 2524f766d1e4SDarek Stojaczyk opts->transport_specific = NULL; 25257dab13c0SAlexey Marchuk opts->data_wr_pool_size = 
SPDK_NVMF_RDMA_DEFAULT_DATA_WR_POOL_SIZE; 2526183d81d0SJohn Barnard } 2527183d81d0SJohn Barnard 25280d98a949SNaresh Gottumukkala static int nvmf_rdma_destroy(struct spdk_nvmf_transport *transport, 25290d98a949SNaresh Gottumukkala spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg); 25305518a327SDaniel Verkamp 2531efb6081cSAlexey Marchuk static inline bool 2532efb6081cSAlexey Marchuk nvmf_rdma_is_rxe_device(struct spdk_nvmf_rdma_device *device) 2533efb6081cSAlexey Marchuk { 25349cea3232SAlexey Marchuk return device->attr.vendor_id == SPDK_RDMA_RXE_VENDOR_ID_OLD || 25359cea3232SAlexey Marchuk device->attr.vendor_id == SPDK_RDMA_RXE_VENDOR_ID_NEW; 2536efb6081cSAlexey Marchuk } 2537efb6081cSAlexey Marchuk 25388dd1cd21SBen Walker static int nvmf_rdma_accept(void *ctx); 2539549be9adSsijie.sun static bool nvmf_rdma_retry_listen_port(struct spdk_nvmf_rdma_transport *rtransport); 2540549be9adSsijie.sun static void destroy_ib_device(struct spdk_nvmf_rdma_transport *rtransport, 2541549be9adSsijie.sun struct spdk_nvmf_rdma_device *device); 2542549be9adSsijie.sun 2543a5283034Ssijie.sun static int 2544a5283034Ssijie.sun create_ib_device(struct spdk_nvmf_rdma_transport *rtransport, struct ibv_context *context, 2545a5283034Ssijie.sun struct spdk_nvmf_rdma_device **new_device) 2546a5283034Ssijie.sun { 2547a5283034Ssijie.sun struct spdk_nvmf_rdma_device *device; 2548a5283034Ssijie.sun int flag = 0; 2549a5283034Ssijie.sun int rc = 0; 2550a5283034Ssijie.sun 2551a5283034Ssijie.sun device = calloc(1, sizeof(*device)); 2552a5283034Ssijie.sun if (!device) { 2553a5283034Ssijie.sun SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); 2554a5283034Ssijie.sun return -ENOMEM; 2555a5283034Ssijie.sun } 2556a5283034Ssijie.sun device->context = context; 2557a5283034Ssijie.sun rc = ibv_query_device(device->context, &device->attr); 2558a5283034Ssijie.sun if (rc < 0) { 2559a5283034Ssijie.sun SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); 2560a5283034Ssijie.sun free(device); 2561a5283034Ssijie.sun return rc; 2562a5283034Ssijie.sun } 2563a5283034Ssijie.sun 2564a5283034Ssijie.sun #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL 2565a5283034Ssijie.sun if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) { 2566a5283034Ssijie.sun SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,"); 2567a5283034Ssijie.sun SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id); 2568a5283034Ssijie.sun } 2569a5283034Ssijie.sun 2570a5283034Ssijie.sun /** 2571a5283034Ssijie.sun * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE. 2572a5283034Ssijie.sun * The Soft-RoCE RXE driver does not currently support send with invalidate, 2573a5283034Ssijie.sun * but incorrectly reports that it does. There are changes making their way 2574a5283034Ssijie.sun * through the kernel now that will enable this feature. When they are merged, 2575a5283034Ssijie.sun * we can conditionally enable this feature. 2576a5283034Ssijie.sun * 2577a5283034Ssijie.sun * TODO: enable this for versions of the kernel rxe driver that support it. 
2578a5283034Ssijie.sun */ 2579a5283034Ssijie.sun if (nvmf_rdma_is_rxe_device(device)) { 2580a5283034Ssijie.sun device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS); 2581a5283034Ssijie.sun } 2582a5283034Ssijie.sun #endif 2583a5283034Ssijie.sun 2584a5283034Ssijie.sun /* set up device context async ev fd as NON_BLOCKING */ 2585a5283034Ssijie.sun flag = fcntl(device->context->async_fd, F_GETFL); 2586a5283034Ssijie.sun rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); 2587a5283034Ssijie.sun if (rc < 0) { 2588a5283034Ssijie.sun SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); 2589a5283034Ssijie.sun free(device); 2590a5283034Ssijie.sun return rc; 2591a5283034Ssijie.sun } 2592a5283034Ssijie.sun 2593a5283034Ssijie.sun TAILQ_INSERT_TAIL(&rtransport->devices, device, link); 259434edd9f1SKamil Godzwon SPDK_DEBUGLOG(rdma, "New device %p is added to RDMA transport\n", device); 2595a5283034Ssijie.sun 2596a5283034Ssijie.sun if (g_nvmf_hooks.get_ibv_pd) { 2597a5283034Ssijie.sun device->pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context); 2598a5283034Ssijie.sun } else { 2599a5283034Ssijie.sun device->pd = ibv_alloc_pd(device->context); 2600a5283034Ssijie.sun } 2601a5283034Ssijie.sun 2602a5283034Ssijie.sun if (!device->pd) { 2603a5283034Ssijie.sun SPDK_ERRLOG("Unable to allocate protection domain.\n"); 2604549be9adSsijie.sun destroy_ib_device(rtransport, device); 2605a5283034Ssijie.sun return -ENOMEM; 2606a5283034Ssijie.sun } 2607a5283034Ssijie.sun 2608a5283034Ssijie.sun assert(device->map == NULL); 2609a5283034Ssijie.sun 26108ffb2c09SAlexey Marchuk device->map = spdk_rdma_utils_create_mem_map(device->pd, &g_nvmf_hooks, IBV_ACCESS_LOCAL_WRITE); 2611a5283034Ssijie.sun if (!device->map) { 2612a5283034Ssijie.sun SPDK_ERRLOG("Unable to allocate memory map for listen address\n"); 2613549be9adSsijie.sun destroy_ib_device(rtransport, device); 2614a5283034Ssijie.sun return -ENOMEM; 2615a5283034Ssijie.sun } 2616a5283034Ssijie.sun 2617a5283034Ssijie.sun assert(device->map != NULL); 2618a5283034Ssijie.sun assert(device->pd != NULL); 2619a5283034Ssijie.sun 2620a5283034Ssijie.sun if (new_device) { 2621a5283034Ssijie.sun *new_device = device; 2622a5283034Ssijie.sun } 2623549be9adSsijie.sun SPDK_NOTICELOG("Create IB device %s(%p/%p) succeed.\n", ibv_get_device_name(context->device), 2624549be9adSsijie.sun device, context); 2625549be9adSsijie.sun 2626a5283034Ssijie.sun return 0; 2627a5283034Ssijie.sun } 2628a5283034Ssijie.sun 2629a5283034Ssijie.sun static void 2630a5283034Ssijie.sun free_poll_fds(struct spdk_nvmf_rdma_transport *rtransport) 2631a5283034Ssijie.sun { 2632a5283034Ssijie.sun if (rtransport->poll_fds) { 2633a5283034Ssijie.sun free(rtransport->poll_fds); 2634a5283034Ssijie.sun rtransport->poll_fds = NULL; 2635a5283034Ssijie.sun } 2636a5283034Ssijie.sun rtransport->npoll_fds = 0; 2637a5283034Ssijie.sun } 2638a5283034Ssijie.sun 2639a5283034Ssijie.sun static int 2640a5283034Ssijie.sun generate_poll_fds(struct spdk_nvmf_rdma_transport *rtransport) 2641a5283034Ssijie.sun { 2642a5283034Ssijie.sun /* Set up poll descriptor array to monitor events from RDMA and IB 2643a5283034Ssijie.sun * in a single poll syscall 2644a5283034Ssijie.sun */ 2645a5283034Ssijie.sun int device_count = 0; 2646a5283034Ssijie.sun int i = 0; 2647a5283034Ssijie.sun struct spdk_nvmf_rdma_device *device, *tmp; 2648a5283034Ssijie.sun 2649a5283034Ssijie.sun TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 2650a5283034Ssijie.sun device_count++; 2651a5283034Ssijie.sun } 2652a5283034Ssijie.sun 
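	/* One pollfd is reserved for the RDMA CM event channel; the remaining entries monitor each
	 * IB device's async event fd, matching the loop below. */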
2653a5283034Ssijie.sun rtransport->npoll_fds = device_count + 1; 2654a5283034Ssijie.sun 2655a5283034Ssijie.sun rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); 2656a5283034Ssijie.sun if (rtransport->poll_fds == NULL) { 2657a5283034Ssijie.sun SPDK_ERRLOG("poll_fds allocation failed\n"); 2658a5283034Ssijie.sun return -ENOMEM; 2659a5283034Ssijie.sun } 2660a5283034Ssijie.sun 2661a5283034Ssijie.sun rtransport->poll_fds[i].fd = rtransport->event_channel->fd; 2662a5283034Ssijie.sun rtransport->poll_fds[i++].events = POLLIN; 2663a5283034Ssijie.sun 2664a5283034Ssijie.sun TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 2665a5283034Ssijie.sun rtransport->poll_fds[i].fd = device->context->async_fd; 2666a5283034Ssijie.sun rtransport->poll_fds[i++].events = POLLIN; 2667a5283034Ssijie.sun } 2668a5283034Ssijie.sun 2669a5283034Ssijie.sun return 0; 2670a5283034Ssijie.sun } 267143022da3SJacek Kalwas 267231d033f9SBen Walker static struct spdk_nvmf_transport * 267355d8d943SSeth Howell nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) 2674349295caSBen Walker { 2675349295caSBen Walker int rc; 2676ecc436fcSBen Walker struct spdk_nvmf_rdma_transport *rtransport; 2677a5283034Ssijie.sun struct spdk_nvmf_rdma_device *device; 2678958c68f1SBen Walker struct ibv_context **contexts; 26797dab13c0SAlexey Marchuk size_t data_wr_pool_size; 2680958c68f1SBen Walker uint32_t i; 2681161a3002STomasz Zawadzki int flag; 26828580daa1SSrikanth kaligotla uint32_t sge_count; 2683e816c8fdSSeth Howell uint32_t min_shared_buffers; 2684ed1a6c7dSAlexey Marchuk uint32_t min_in_capsule_data_size; 2685b6b0a0baSSeth Howell int max_device_sge = SPDK_NVMF_MAX_SGL_ENTRIES; 268631d033f9SBen Walker 2687ecc436fcSBen Walker rtransport = calloc(1, sizeof(*rtransport)); 2688ecc436fcSBen Walker if (!rtransport) { 268931d033f9SBen Walker return NULL; 269031d033f9SBen Walker } 269131d033f9SBen Walker 2692958c68f1SBen Walker TAILQ_INIT(&rtransport->devices); 26931cbc2b16SBen Walker TAILQ_INIT(&rtransport->ports); 2694645d5944SAlexey Marchuk TAILQ_INIT(&rtransport->poll_groups); 2695549be9adSsijie.sun TAILQ_INIT(&rtransport->retry_ports); 2696ecc436fcSBen Walker 2697ecc436fcSBen Walker rtransport->transport.ops = &spdk_nvmf_transport_rdma; 269897ef8701SMonica Kenguva rtransport->rdma_opts.num_cqe = DEFAULT_NVMF_RDMA_CQ_SIZE; 2699f766d1e4SDarek Stojaczyk rtransport->rdma_opts.max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH; 2700f766d1e4SDarek Stojaczyk rtransport->rdma_opts.no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ; 2701f766d1e4SDarek Stojaczyk rtransport->rdma_opts.acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG; 2702c818233bSIvan Betsis rtransport->rdma_opts.no_wr_batching = SPDK_NVMF_RDMA_DEFAULT_NO_WR_BATCHING; 2703f766d1e4SDarek Stojaczyk if (opts->transport_specific != NULL && 2704f766d1e4SDarek Stojaczyk spdk_json_decode_object_relaxed(opts->transport_specific, rdma_transport_opts_decoder, 2705f766d1e4SDarek Stojaczyk SPDK_COUNTOF(rdma_transport_opts_decoder), 2706f766d1e4SDarek Stojaczyk &rtransport->rdma_opts)) { 2707f766d1e4SDarek Stojaczyk SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 27080d98a949SNaresh Gottumukkala nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 2709f766d1e4SDarek Stojaczyk return NULL; 2710f766d1e4SDarek Stojaczyk } 2711349295caSBen Walker 27122172c432STomasz Zawadzki SPDK_INFOLOG(rdma, "*** RDMA Transport Init ***\n" 27138e808490SJohn Barnard " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" 27141551197dSAlexey Marchuk " max_io_qpairs_per_ctrlr=%d, 
io_unit_size=%d,\n" 2715ed0b611fSEvgeniy Kochetov " in_capsule_data_size=%d, max_aq_depth=%d,\n" 271697ef8701SMonica Kenguva " num_shared_buffers=%d, num_cqe=%d, max_srq_depth=%d, no_srq=%d," 2717c818233bSIvan Betsis " acceptor_backlog=%d, no_wr_batching=%d abort_timeout_sec=%d\n", 27188e808490SJohn Barnard opts->max_queue_depth, 27198e808490SJohn Barnard opts->max_io_size, 27201551197dSAlexey Marchuk opts->max_qpairs_per_ctrlr - 1, 27218e808490SJohn Barnard opts->io_unit_size, 27228e808490SJohn Barnard opts->in_capsule_data_size, 272358f16244SZiye Yang opts->max_aq_depth, 2724ed0b611fSEvgeniy Kochetov opts->num_shared_buffers, 272597ef8701SMonica Kenguva rtransport->rdma_opts.num_cqe, 2726f766d1e4SDarek Stojaczyk rtransport->rdma_opts.max_srq_depth, 2727f766d1e4SDarek Stojaczyk rtransport->rdma_opts.no_srq, 2728f766d1e4SDarek Stojaczyk rtransport->rdma_opts.acceptor_backlog, 2729c818233bSIvan Betsis rtransport->rdma_opts.no_wr_batching, 273026e0ef9aSShuhei Matsumoto opts->abort_timeout_sec); 2731349295caSBen Walker 27328580daa1SSrikanth kaligotla /* I/O unit size cannot be larger than max I/O size */ 27338e808490SJohn Barnard if (opts->io_unit_size > opts->max_io_size) { 27348e808490SJohn Barnard opts->io_unit_size = opts->max_io_size; 27358580daa1SSrikanth kaligotla } 27368580daa1SSrikanth kaligotla 2737f766d1e4SDarek Stojaczyk if (rtransport->rdma_opts.acceptor_backlog <= 0) { 27383b830202SSeth Howell SPDK_ERRLOG("The acceptor backlog cannot be less than 1, setting to the default value of (%d).\n", 27393b830202SSeth Howell SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG); 2740f766d1e4SDarek Stojaczyk rtransport->rdma_opts.acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG; 27413b830202SSeth Howell } 27423b830202SSeth Howell 27430b20f2e5SZiye Yang if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) { 27440b20f2e5SZiye Yang SPDK_ERRLOG("The number of shared data buffers (%d) is less than" 27450b20f2e5SZiye Yang "the minimum number required to guarantee that forward progress can be made (%d)\n", 27460b20f2e5SZiye Yang opts->num_shared_buffers, (SPDK_NVMF_MAX_SGL_ENTRIES * 2)); 27470d98a949SNaresh Gottumukkala nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 27480b20f2e5SZiye Yang return NULL; 27490b20f2e5SZiye Yang } 27500b20f2e5SZiye Yang 27513b138377SJim Harris /* If buf_cache_size == UINT32_MAX, we will dynamically pick a cache size later that we know will fit. */ 27523b138377SJim Harris if (opts->buf_cache_size < UINT32_MAX) { 2753e9b9510aSAlexey Marchuk min_shared_buffers = spdk_env_get_core_count() * opts->buf_cache_size; 2754e816c8fdSSeth Howell if (min_shared_buffers > opts->num_shared_buffers) { 2755e816c8fdSSeth Howell SPDK_ERRLOG("There are not enough buffers to satisfy" 2756e816c8fdSSeth Howell "per-poll group caches for each thread. (%" PRIu32 ")" 2757e816c8fdSSeth Howell "supplied. 
(%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers); 2758e816c8fdSSeth Howell SPDK_ERRLOG("Please specify a larger number of shared buffers\n"); 27590d98a949SNaresh Gottumukkala nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 2760e816c8fdSSeth Howell return NULL; 2761e816c8fdSSeth Howell } 27623b138377SJim Harris } 2763e816c8fdSSeth Howell 27648e808490SJohn Barnard sge_count = opts->max_io_size / opts->io_unit_size; 27651180bf83SSeth Howell if (sge_count > NVMF_DEFAULT_TX_SGE) { 27668e808490SJohn Barnard SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); 27670d98a949SNaresh Gottumukkala nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 27688580daa1SSrikanth kaligotla return NULL; 27698580daa1SSrikanth kaligotla } 27708580daa1SSrikanth kaligotla 2771ed1a6c7dSAlexey Marchuk min_in_capsule_data_size = sizeof(struct spdk_nvme_sgl_descriptor) * SPDK_NVMF_MAX_SGL_ENTRIES; 277295d710ddSAlexey Marchuk if (opts->in_capsule_data_size < min_in_capsule_data_size) { 2773ed1a6c7dSAlexey Marchuk SPDK_WARNLOG("In capsule data size is set to %u, this is minimum size required to support msdbd=16\n", 2774ed1a6c7dSAlexey Marchuk min_in_capsule_data_size); 2775ed1a6c7dSAlexey Marchuk opts->in_capsule_data_size = min_in_capsule_data_size; 2776ed1a6c7dSAlexey Marchuk } 2777ed1a6c7dSAlexey Marchuk 2778ecc436fcSBen Walker rtransport->event_channel = rdma_create_event_channel(); 2779ecc436fcSBen Walker if (rtransport->event_channel == NULL) { 2780891c12a6SPawel Wodkowski SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno)); 27810d98a949SNaresh Gottumukkala nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 278231d033f9SBen Walker return NULL; 2783349295caSBen Walker } 2784349295caSBen Walker 2785161a3002STomasz Zawadzki flag = fcntl(rtransport->event_channel->fd, F_GETFL); 2786161a3002STomasz Zawadzki if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { 2787161a3002STomasz Zawadzki SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", 2788891c12a6SPawel Wodkowski rtransport->event_channel->fd, spdk_strerror(errno)); 27890d98a949SNaresh Gottumukkala nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 279031d033f9SBen Walker return NULL; 2791349295caSBen Walker } 2792349295caSBen Walker 27937dab13c0SAlexey Marchuk data_wr_pool_size = opts->data_wr_pool_size; 27947dab13c0SAlexey Marchuk if (data_wr_pool_size < SPDK_NVMF_MAX_SGL_ENTRIES * 2 * spdk_env_get_core_count()) { 27957dab13c0SAlexey Marchuk data_wr_pool_size = SPDK_NVMF_MAX_SGL_ENTRIES * 2 * spdk_env_get_core_count(); 27967dab13c0SAlexey Marchuk SPDK_NOTICELOG("data_wr_pool_size is changed to %zu to guarantee enough cache for handling " 27977dab13c0SAlexey Marchuk "at least one IO in each core\n", data_wr_pool_size); 27987dab13c0SAlexey Marchuk } 27997dab13c0SAlexey Marchuk rtransport->data_wr_pool = spdk_mempool_create("spdk_nvmf_rdma_wr_data", data_wr_pool_size, 28007dab13c0SAlexey Marchuk sizeof(struct spdk_nvmf_rdma_request_data), SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, 2801186b109dSJim Harris SPDK_ENV_NUMA_ID_ANY); 2802cf73fb2fSSeth Howell if (!rtransport->data_wr_pool) { 2803475b86aaSKonrad Sztyber if (spdk_mempool_lookup("spdk_nvmf_rdma_wr_data") != NULL) { 2804475b86aaSKonrad Sztyber SPDK_ERRLOG("Unable to allocate work request pool for poll group: already exists\n"); 2805475b86aaSKonrad Sztyber SPDK_ERRLOG("Probably running in multiprocess environment, which is " 2806475b86aaSKonrad Sztyber "unsupported by the nvmf 
library\n"); 2807475b86aaSKonrad Sztyber } else { 2808cf73fb2fSSeth Howell SPDK_ERRLOG("Unable to allocate work request pool for poll group\n"); 2809475b86aaSKonrad Sztyber } 28100d98a949SNaresh Gottumukkala nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 2811cf73fb2fSSeth Howell return NULL; 2812cf73fb2fSSeth Howell } 2813cf73fb2fSSeth Howell 2814958c68f1SBen Walker contexts = rdma_get_devices(NULL); 2815043e5edbSDaniel Verkamp if (contexts == NULL) { 2816043e5edbSDaniel Verkamp SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); 28170d98a949SNaresh Gottumukkala nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 2818043e5edbSDaniel Verkamp return NULL; 2819043e5edbSDaniel Verkamp } 2820043e5edbSDaniel Verkamp 2821958c68f1SBen Walker i = 0; 2822958c68f1SBen Walker rc = 0; 2823958c68f1SBen Walker while (contexts[i] != NULL) { 2824a5283034Ssijie.sun rc = create_ib_device(rtransport, contexts[i], &device); 2825958c68f1SBen Walker if (rc < 0) { 2826b6f90c52SPhilipp Skadorov break; 2827b6f90c52SPhilipp Skadorov } 2828958c68f1SBen Walker i++; 2829a5283034Ssijie.sun max_device_sge = spdk_min(max_device_sge, device->attr.max_sge); 2830549be9adSsijie.sun device->is_ready = true; 2831958c68f1SBen Walker } 28325518a327SDaniel Verkamp rdma_free_devices(contexts); 2833958c68f1SBen Walker 2834b6b0a0baSSeth Howell if (opts->io_unit_size * max_device_sge < opts->max_io_size) { 2835b6b0a0baSSeth Howell /* divide and round up. */ 2836b6b0a0baSSeth Howell opts->io_unit_size = (opts->max_io_size + max_device_sge - 1) / max_device_sge; 2837b6b0a0baSSeth Howell 2838b6b0a0baSSeth Howell /* round up to the nearest 4k. */ 2839b6b0a0baSSeth Howell opts->io_unit_size = (opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT - 1) & ~NVMF_DATA_BUFFER_MASK; 2840b6b0a0baSSeth Howell 2841b6b0a0baSSeth Howell opts->io_unit_size = spdk_max(opts->io_unit_size, SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE); 2842b6b0a0baSSeth Howell SPDK_NOTICELOG("Adjusting the io unit size to fit the device's maximum I/O size. 
New I/O unit size %u\n", 2843b6b0a0baSSeth Howell opts->io_unit_size); 2844b6b0a0baSSeth Howell } 2845b6b0a0baSSeth Howell 2846958c68f1SBen Walker if (rc < 0) { 28470d98a949SNaresh Gottumukkala nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 2848958c68f1SBen Walker return NULL; 28495518a327SDaniel Verkamp } 28505518a327SDaniel Verkamp 2851a5283034Ssijie.sun rc = generate_poll_fds(rtransport); 2852a5283034Ssijie.sun if (rc < 0) { 28530d98a949SNaresh Gottumukkala nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 28545518a327SDaniel Verkamp return NULL; 28555518a327SDaniel Verkamp } 28565518a327SDaniel Verkamp 285743022da3SJacek Kalwas rtransport->accept_poller = SPDK_POLLER_REGISTER(nvmf_rdma_accept, &rtransport->transport, 2858355806b5SAlexey Marchuk opts->acceptor_poll_rate); 285943022da3SJacek Kalwas if (!rtransport->accept_poller) { 286043022da3SJacek Kalwas nvmf_rdma_destroy(&rtransport->transport, NULL, NULL); 286143022da3SJacek Kalwas return NULL; 286243022da3SJacek Kalwas } 286343022da3SJacek Kalwas 2864ecc436fcSBen Walker return &rtransport->transport; 2865349295caSBen Walker } 2866349295caSBen Walker 2867f766d1e4SDarek Stojaczyk static void 2868a5283034Ssijie.sun destroy_ib_device(struct spdk_nvmf_rdma_transport *rtransport, 2869a5283034Ssijie.sun struct spdk_nvmf_rdma_device *device) 2870a5283034Ssijie.sun { 2871a5283034Ssijie.sun TAILQ_REMOVE(&rtransport->devices, device, link); 28728a01b4d6SAlexey Marchuk spdk_rdma_utils_free_mem_map(&device->map); 2873a5283034Ssijie.sun if (device->pd) { 2874a5283034Ssijie.sun if (!g_nvmf_hooks.get_ibv_pd) { 2875a5283034Ssijie.sun ibv_dealloc_pd(device->pd); 2876a5283034Ssijie.sun } 2877a5283034Ssijie.sun } 2878549be9adSsijie.sun SPDK_DEBUGLOG(rdma, "IB device [%p] is destroyed.\n", device); 2879a5283034Ssijie.sun free(device); 2880a5283034Ssijie.sun } 2881a5283034Ssijie.sun 2882a5283034Ssijie.sun static void 2883f766d1e4SDarek Stojaczyk nvmf_rdma_dump_opts(struct spdk_nvmf_transport *transport, struct spdk_json_write_ctx *w) 2884f766d1e4SDarek Stojaczyk { 2885f766d1e4SDarek Stojaczyk struct spdk_nvmf_rdma_transport *rtransport; 2886f766d1e4SDarek Stojaczyk assert(w != NULL); 2887f766d1e4SDarek Stojaczyk 2888f766d1e4SDarek Stojaczyk rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2889f766d1e4SDarek Stojaczyk spdk_json_write_named_uint32(w, "max_srq_depth", rtransport->rdma_opts.max_srq_depth); 2890f766d1e4SDarek Stojaczyk spdk_json_write_named_bool(w, "no_srq", rtransport->rdma_opts.no_srq); 289197ef8701SMonica Kenguva if (rtransport->rdma_opts.no_srq == true) { 289297ef8701SMonica Kenguva spdk_json_write_named_int32(w, "num_cqe", rtransport->rdma_opts.num_cqe); 289397ef8701SMonica Kenguva } 2894f766d1e4SDarek Stojaczyk spdk_json_write_named_int32(w, "acceptor_backlog", rtransport->rdma_opts.acceptor_backlog); 2895bd3840a7SIvan Betsis spdk_json_write_named_bool(w, "no_wr_batching", rtransport->rdma_opts.no_wr_batching); 2896f766d1e4SDarek Stojaczyk } 2897f766d1e4SDarek Stojaczyk 2898349295caSBen Walker static int 28990d98a949SNaresh Gottumukkala nvmf_rdma_destroy(struct spdk_nvmf_transport *transport, 29000d98a949SNaresh Gottumukkala spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 2901349295caSBen Walker { 2902ecc436fcSBen Walker struct spdk_nvmf_rdma_transport *rtransport; 29036428de9eSBen Walker struct spdk_nvmf_rdma_port *port, *port_tmp; 2904958c68f1SBen Walker struct spdk_nvmf_rdma_device *device, *device_tmp; 290538980dedSZiye Yang 2906ecc436fcSBen Walker rtransport = 
SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 2907ecc436fcSBen Walker 2908549be9adSsijie.sun TAILQ_FOREACH_SAFE(port, &rtransport->retry_ports, link, port_tmp) { 2909549be9adSsijie.sun TAILQ_REMOVE(&rtransport->retry_ports, port, link); 2910549be9adSsijie.sun free(port); 2911549be9adSsijie.sun } 2912549be9adSsijie.sun 29136428de9eSBen Walker TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { 29146428de9eSBen Walker TAILQ_REMOVE(&rtransport->ports, port, link); 29156428de9eSBen Walker rdma_destroy_id(port->id); 29166428de9eSBen Walker free(port); 29176428de9eSBen Walker } 29186428de9eSBen Walker 2919a5283034Ssijie.sun free_poll_fds(rtransport); 2920b6f90c52SPhilipp Skadorov 2921ecc436fcSBen Walker if (rtransport->event_channel != NULL) { 2922ecc436fcSBen Walker rdma_destroy_event_channel(rtransport->event_channel); 29231290f02fSDaniel Verkamp } 2924349295caSBen Walker 2925958c68f1SBen Walker TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { 2926a5283034Ssijie.sun destroy_ib_device(rtransport, device); 2927958c68f1SBen Walker } 2928958c68f1SBen Walker 2929cf73fb2fSSeth Howell if (rtransport->data_wr_pool != NULL) { 2930ab2395bbSAlexey Marchuk if (spdk_mempool_count(rtransport->data_wr_pool) != transport->opts.data_wr_pool_size) { 2931cf73fb2fSSeth Howell SPDK_ERRLOG("transport wr pool count is %zu but should be %u\n", 2932cf73fb2fSSeth Howell spdk_mempool_count(rtransport->data_wr_pool), 2933cf73fb2fSSeth Howell transport->opts.data_wr_pool_size); 2934cf73fb2fSSeth Howell } 2935cf73fb2fSSeth Howell } 2936cf73fb2fSSeth Howell 2937cf73fb2fSSeth Howell spdk_mempool_free(rtransport->data_wr_pool); 2938645d5944SAlexey Marchuk 293943022da3SJacek Kalwas spdk_poller_unregister(&rtransport->accept_poller); 2940ecc436fcSBen Walker free(rtransport); 294131d033f9SBen Walker 29420d98a949SNaresh Gottumukkala if (cb_fn) { 29430d98a949SNaresh Gottumukkala cb_fn(cb_arg); 29440d98a949SNaresh Gottumukkala } 2945349295caSBen Walker return 0; 2946349295caSBen Walker } 2947349295caSBen Walker 29488dd1cd21SBen Walker static int nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 29498e2f0cdbSzkhatami88 struct spdk_nvme_transport_id *trid, 29508e2f0cdbSzkhatami88 bool peer); 29518e2f0cdbSzkhatami88 2952549be9adSsijie.sun static bool nvmf_rdma_rescan_devices(struct spdk_nvmf_rdma_transport *rtransport); 2953549be9adSsijie.sun 29548e2f0cdbSzkhatami88 static int 295587a062e6SJacek Kalwas nvmf_rdma_listen(struct spdk_nvmf_transport *transport, const struct spdk_nvme_transport_id *trid, 295687a062e6SJacek Kalwas struct spdk_nvmf_listen_opts *listen_opts) 29577e3b9f25SBen Walker { 2958ecc436fcSBen Walker struct spdk_nvmf_rdma_transport *rtransport; 2959958c68f1SBen Walker struct spdk_nvmf_rdma_device *device; 2960549be9adSsijie.sun struct spdk_nvmf_rdma_port *port, *tmp_port; 2961c7b8b414SDaniel Verkamp struct addrinfo *res; 2962c7b8b414SDaniel Verkamp struct addrinfo hints; 2963c7b8b414SDaniel Verkamp int family; 29647e3b9f25SBen Walker int rc; 2965c3d90406SJim Harris long int port_val; 2966549be9adSsijie.sun bool is_retry = false; 29677e3b9f25SBen Walker 29689a1cf1c5SJacek Kalwas if (!strlen(trid->trsvcid)) { 29699a1cf1c5SJacek Kalwas SPDK_ERRLOG("Service id is required\n"); 29709a1cf1c5SJacek Kalwas return -EINVAL; 29719a1cf1c5SJacek Kalwas } 29729a1cf1c5SJacek Kalwas 2973ecc436fcSBen Walker rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 29747cd56fb3SJacek Kalwas assert(rtransport->event_channel !=
NULL); 29757cd56fb3SJacek Kalwas 29761cbc2b16SBen Walker port = calloc(1, sizeof(*port)); 29771cbc2b16SBen Walker if (!port) { 29787cd56fb3SJacek Kalwas SPDK_ERRLOG("Port allocation failed\n"); 29797e3b9f25SBen Walker return -ENOMEM; 29807e3b9f25SBen Walker } 29817e3b9f25SBen Walker 29826d8f1fc6SJacek Kalwas port->trid = trid; 29837e3b9f25SBen Walker 29846d8f1fc6SJacek Kalwas switch (trid->adrfam) { 2985c7b8b414SDaniel Verkamp case SPDK_NVMF_ADRFAM_IPV4: 2986c7b8b414SDaniel Verkamp family = AF_INET; 2987c7b8b414SDaniel Verkamp break; 2988c7b8b414SDaniel Verkamp case SPDK_NVMF_ADRFAM_IPV6: 2989c7b8b414SDaniel Verkamp family = AF_INET6; 2990c7b8b414SDaniel Verkamp break; 2991c7b8b414SDaniel Verkamp default: 29926d8f1fc6SJacek Kalwas SPDK_ERRLOG("Unhandled ADRFAM %d\n", trid->adrfam); 2993c7b8b414SDaniel Verkamp free(port); 2994c7b8b414SDaniel Verkamp return -EINVAL; 2995c7b8b414SDaniel Verkamp } 2996c7b8b414SDaniel Verkamp 2997c7b8b414SDaniel Verkamp memset(&hints, 0, sizeof(hints)); 2998c7b8b414SDaniel Verkamp hints.ai_family = family; 29991c34d1a4SBen Walker hints.ai_flags = AI_NUMERICSERV; 3000c7b8b414SDaniel Verkamp hints.ai_socktype = SOCK_STREAM; 3001c7b8b414SDaniel Verkamp hints.ai_protocol = 0; 3002c7b8b414SDaniel Verkamp 3003c3d90406SJim Harris /* Range check the trsvcid. Fail in 3 cases: 3004c3d90406SJim Harris * < 0: means that spdk_strtol hit an error 3005c3d90406SJim Harris * 0: this results in ephemeral port which we don't want 3006c3d90406SJim Harris * > 65535: port too high 3007c3d90406SJim Harris */ 3008c3d90406SJim Harris port_val = spdk_strtol(trid->trsvcid, 10); 3009c3d90406SJim Harris if (port_val <= 0 || port_val > 65535) { 3010c3d90406SJim Harris SPDK_ERRLOG("invalid trsvcid %s\n", trid->trsvcid); 3011c3d90406SJim Harris free(port); 3012c3d90406SJim Harris return -EINVAL; 3013c3d90406SJim Harris } 3014c3d90406SJim Harris 30156d8f1fc6SJacek Kalwas rc = getaddrinfo(trid->traddr, trid->trsvcid, &hints, &res); 3016c7b8b414SDaniel Verkamp if (rc) { 3017c7b8b414SDaniel Verkamp SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); 3018c7b8b414SDaniel Verkamp free(port); 3019f6866117STomasz Zawadzki return -(abs(rc)); 3020e95e4028SJacek Kalwas } 3021e95e4028SJacek Kalwas 3022e95e4028SJacek Kalwas rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); 3023e95e4028SJacek Kalwas if (rc < 0) { 3024e95e4028SJacek Kalwas SPDK_ERRLOG("rdma_create_id() failed\n"); 3025e95e4028SJacek Kalwas freeaddrinfo(res); 3026e95e4028SJacek Kalwas free(port); 3027e95e4028SJacek Kalwas return rc; 3028e95e4028SJacek Kalwas } 3029e95e4028SJacek Kalwas 3030c7b8b414SDaniel Verkamp rc = rdma_bind_addr(port->id, res->ai_addr); 3031c7b8b414SDaniel Verkamp freeaddrinfo(res); 3032c7b8b414SDaniel Verkamp 30337e3b9f25SBen Walker if (rc < 0) { 3034549be9adSsijie.sun TAILQ_FOREACH(tmp_port, &rtransport->retry_ports, link) { 3035549be9adSsijie.sun if (spdk_nvme_transport_id_compare(tmp_port->trid, trid) == 0) { 3036549be9adSsijie.sun is_retry = true; 3037549be9adSsijie.sun break; 3038549be9adSsijie.sun } 3039549be9adSsijie.sun } 3040549be9adSsijie.sun if (!is_retry) { 30417e3b9f25SBen Walker SPDK_ERRLOG("rdma_bind_addr() failed\n"); 3042549be9adSsijie.sun } 30431cbc2b16SBen Walker rdma_destroy_id(port->id); 30441cbc2b16SBen Walker free(port); 30457e3b9f25SBen Walker return rc; 30467e3b9f25SBen Walker } 30477e3b9f25SBen Walker 3048a0246f65Sshahar salzman if (!port->id->verbs) { 3049a0246f65Sshahar salzman SPDK_ERRLOG("ibv_context is null\n"); 3050a0246f65Sshahar salzman 
rdma_destroy_id(port->id); 3051a0246f65Sshahar salzman free(port); 3052a0246f65Sshahar salzman return -1; 3053a0246f65Sshahar salzman } 3054a0246f65Sshahar salzman 3055f766d1e4SDarek Stojaczyk rc = rdma_listen(port->id, rtransport->rdma_opts.acceptor_backlog); 30567e3b9f25SBen Walker if (rc < 0) { 30577e3b9f25SBen Walker SPDK_ERRLOG("rdma_listen() failed\n"); 30581cbc2b16SBen Walker rdma_destroy_id(port->id); 30591cbc2b16SBen Walker free(port); 30607e3b9f25SBen Walker return rc; 30617e3b9f25SBen Walker } 30627e3b9f25SBen Walker 3063958c68f1SBen Walker TAILQ_FOREACH(device, &rtransport->devices, link) { 3064549be9adSsijie.sun if (device->context == port->id->verbs && device->is_ready) { 30651cbc2b16SBen Walker port->device = device; 3066958c68f1SBen Walker break; 3067958c68f1SBen Walker } 3068958c68f1SBen Walker } 30691cbc2b16SBen Walker if (!port->device) { 3070958c68f1SBen Walker SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", 30711cbc2b16SBen Walker port->id->verbs); 30721cbc2b16SBen Walker rdma_destroy_id(port->id); 30731cbc2b16SBen Walker free(port); 30742470b995Ssijie.sun nvmf_rdma_rescan_devices(rtransport); 3075958c68f1SBen Walker return -EINVAL; 3076958c68f1SBen Walker } 3077958c68f1SBen Walker 30787cd56fb3SJacek Kalwas SPDK_NOTICELOG("*** NVMe/RDMA Target Listening on %s port %s ***\n", 30797cd56fb3SJacek Kalwas trid->traddr, trid->trsvcid); 30807e3b9f25SBen Walker 30811cbc2b16SBen Walker TAILQ_INSERT_TAIL(&rtransport->ports, port, link); 30827e3b9f25SBen Walker return 0; 30837e3b9f25SBen Walker } 30847e3b9f25SBen Walker 30856d8f1fc6SJacek Kalwas static void 3086549be9adSsijie.sun nvmf_rdma_stop_listen_ex(struct spdk_nvmf_transport *transport, 3087549be9adSsijie.sun const struct spdk_nvme_transport_id *trid, bool need_retry) 30884440cd8dSZiye Yang { 3089ecc436fcSBen Walker struct spdk_nvmf_rdma_transport *rtransport; 30901cbc2b16SBen Walker struct spdk_nvmf_rdma_port *port, *tmp; 30917e3b9f25SBen Walker 3092ecc436fcSBen Walker rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3093ecc436fcSBen Walker 3094549be9adSsijie.sun if (!need_retry) { 3095549be9adSsijie.sun TAILQ_FOREACH_SAFE(port, &rtransport->retry_ports, link, tmp) { 3096549be9adSsijie.sun if (spdk_nvme_transport_id_compare(port->trid, trid) == 0) { 3097549be9adSsijie.sun TAILQ_REMOVE(&rtransport->retry_ports, port, link); 3098549be9adSsijie.sun free(port); 3099549be9adSsijie.sun } 3100549be9adSsijie.sun } 3101549be9adSsijie.sun } 3102549be9adSsijie.sun 31031cbc2b16SBen Walker TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { 31046d8f1fc6SJacek Kalwas if (spdk_nvme_transport_id_compare(port->trid, trid) == 0) { 3105549be9adSsijie.sun SPDK_DEBUGLOG(rdma, "Port %s:%s removed. 
need retry: %d\n", 3106549be9adSsijie.sun port->trid->traddr, port->trid->trsvcid, need_retry); 31071cbc2b16SBen Walker TAILQ_REMOVE(&rtransport->ports, port, link); 31081cbc2b16SBen Walker rdma_destroy_id(port->id); 3109549be9adSsijie.sun port->id = NULL; 3110549be9adSsijie.sun port->device = NULL; 3111549be9adSsijie.sun if (need_retry) { 3112549be9adSsijie.sun TAILQ_INSERT_TAIL(&rtransport->retry_ports, port, link); 3113549be9adSsijie.sun } else { 31141cbc2b16SBen Walker free(port); 3115549be9adSsijie.sun } 31164440cd8dSZiye Yang break; 31174440cd8dSZiye Yang } 31184440cd8dSZiye Yang } 31194440cd8dSZiye Yang } 31204440cd8dSZiye Yang 3121549be9adSsijie.sun static void 3122549be9adSsijie.sun nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, 3123549be9adSsijie.sun const struct spdk_nvme_transport_id *trid) 3124549be9adSsijie.sun { 3125549be9adSsijie.sun nvmf_rdma_stop_listen_ex(transport, trid, false); 3126549be9adSsijie.sun } 3127549be9adSsijie.sun 3128549be9adSsijie.sun static void _nvmf_rdma_register_poller_in_group(void *c); 31298ddc5cd4Ssijie.sun static void _nvmf_rdma_remove_poller_in_group(void *c); 31308ddc5cd4Ssijie.sun 31318ddc5cd4Ssijie.sun static bool 3132549be9adSsijie.sun nvmf_rdma_all_pollers_management_done(void *c) 31338ddc5cd4Ssijie.sun { 31348ddc5cd4Ssijie.sun struct poller_manage_ctx *ctx = c; 31358ddc5cd4Ssijie.sun int counter; 31368ddc5cd4Ssijie.sun 31378ddc5cd4Ssijie.sun counter = __atomic_sub_fetch(ctx->inflight_op_counter, 1, __ATOMIC_SEQ_CST); 3138549be9adSsijie.sun SPDK_DEBUGLOG(rdma, "nvmf_rdma_all_pollers_management_done called. counter: %d, poller: %p\n", 31398ddc5cd4Ssijie.sun counter, ctx->rpoller); 31408ddc5cd4Ssijie.sun 31418ddc5cd4Ssijie.sun if (counter == 0) { 31428ddc5cd4Ssijie.sun free((void *)ctx->inflight_op_counter); 31438ddc5cd4Ssijie.sun } 31448ddc5cd4Ssijie.sun free(ctx); 31458ddc5cd4Ssijie.sun 31468ddc5cd4Ssijie.sun return counter == 0; 31478ddc5cd4Ssijie.sun } 31488ddc5cd4Ssijie.sun 31498ddc5cd4Ssijie.sun static int 3150549be9adSsijie.sun nvmf_rdma_manage_poller(struct spdk_nvmf_rdma_transport *rtransport, 3151549be9adSsijie.sun struct spdk_nvmf_rdma_device *device, bool *has_inflight, bool is_add) 31528ddc5cd4Ssijie.sun { 31538ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_poll_group *rgroup; 31548ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_poller *rpoller; 31558ddc5cd4Ssijie.sun struct spdk_nvmf_poll_group *poll_group; 31568ddc5cd4Ssijie.sun struct poller_manage_ctx *ctx; 31578ddc5cd4Ssijie.sun bool found; 31588ddc5cd4Ssijie.sun int *inflight_counter; 31598ddc5cd4Ssijie.sun spdk_msg_fn do_fn; 31608ddc5cd4Ssijie.sun 31618ddc5cd4Ssijie.sun *has_inflight = false; 3162549be9adSsijie.sun do_fn = is_add ? 
_nvmf_rdma_register_poller_in_group : _nvmf_rdma_remove_poller_in_group; 31638ddc5cd4Ssijie.sun inflight_counter = calloc(1, sizeof(int)); 31648ddc5cd4Ssijie.sun if (!inflight_counter) { 31658ddc5cd4Ssijie.sun SPDK_ERRLOG("Failed to allocate inflight counter when removing pollers\n"); 31668ddc5cd4Ssijie.sun return -ENOMEM; 31678ddc5cd4Ssijie.sun } 31688ddc5cd4Ssijie.sun 31698ddc5cd4Ssijie.sun TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) { 31708ddc5cd4Ssijie.sun (*inflight_counter)++; 31718ddc5cd4Ssijie.sun } 31728ddc5cd4Ssijie.sun 31738ddc5cd4Ssijie.sun TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) { 31748ddc5cd4Ssijie.sun found = false; 31758ddc5cd4Ssijie.sun TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 31768ddc5cd4Ssijie.sun if (rpoller->device == device) { 31778ddc5cd4Ssijie.sun found = true; 31788ddc5cd4Ssijie.sun break; 31798ddc5cd4Ssijie.sun } 31808ddc5cd4Ssijie.sun } 3181549be9adSsijie.sun if (found == is_add) { 31828ddc5cd4Ssijie.sun __atomic_fetch_sub(inflight_counter, 1, __ATOMIC_SEQ_CST); 31838ddc5cd4Ssijie.sun continue; 31848ddc5cd4Ssijie.sun } 31858ddc5cd4Ssijie.sun 31868ddc5cd4Ssijie.sun ctx = calloc(1, sizeof(struct poller_manage_ctx)); 31878ddc5cd4Ssijie.sun if (!ctx) { 31888ddc5cd4Ssijie.sun SPDK_ERRLOG("Failed to allocate poller_manage_ctx when removing pollers\n"); 31898ddc5cd4Ssijie.sun if (!*has_inflight) { 31908ddc5cd4Ssijie.sun free(inflight_counter); 31918ddc5cd4Ssijie.sun } 31928ddc5cd4Ssijie.sun return -ENOMEM; 31938ddc5cd4Ssijie.sun } 31948ddc5cd4Ssijie.sun 31958ddc5cd4Ssijie.sun ctx->rtransport = rtransport; 31968ddc5cd4Ssijie.sun ctx->rgroup = rgroup; 31978ddc5cd4Ssijie.sun ctx->rpoller = rpoller; 31988ddc5cd4Ssijie.sun ctx->device = device; 31998ddc5cd4Ssijie.sun ctx->thread = spdk_get_thread(); 32008ddc5cd4Ssijie.sun ctx->inflight_op_counter = inflight_counter; 32018ddc5cd4Ssijie.sun *has_inflight = true; 32028ddc5cd4Ssijie.sun 32038ddc5cd4Ssijie.sun poll_group = rgroup->group.group; 32048ddc5cd4Ssijie.sun if (poll_group->thread != spdk_get_thread()) { 32058ddc5cd4Ssijie.sun spdk_thread_send_msg(poll_group->thread, do_fn, ctx); 32068ddc5cd4Ssijie.sun } else { 32078ddc5cd4Ssijie.sun do_fn(ctx); 32088ddc5cd4Ssijie.sun } 32098ddc5cd4Ssijie.sun } 32108ddc5cd4Ssijie.sun 32118ddc5cd4Ssijie.sun if (!*has_inflight) { 32128ddc5cd4Ssijie.sun free(inflight_counter); 32138ddc5cd4Ssijie.sun } 32148ddc5cd4Ssijie.sun 32158ddc5cd4Ssijie.sun return 0; 32168ddc5cd4Ssijie.sun } 32178ddc5cd4Ssijie.sun 3218549be9adSsijie.sun static void nvmf_rdma_handle_device_removal(struct spdk_nvmf_rdma_transport *rtransport, 3219549be9adSsijie.sun struct spdk_nvmf_rdma_device *device); 3220549be9adSsijie.sun 3221549be9adSsijie.sun static struct spdk_nvmf_rdma_device * 3222549be9adSsijie.sun nvmf_rdma_find_ib_device(struct spdk_nvmf_rdma_transport *rtransport, 3223549be9adSsijie.sun struct ibv_context *context) 3224549be9adSsijie.sun { 3225549be9adSsijie.sun struct spdk_nvmf_rdma_device *device, *tmp_device; 3226549be9adSsijie.sun 3227549be9adSsijie.sun TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp_device) { 3228549be9adSsijie.sun if (device->need_destroy) { 3229549be9adSsijie.sun continue; 3230549be9adSsijie.sun } 3231549be9adSsijie.sun 3232549be9adSsijie.sun if (strcmp(device->context->device->dev_name, context->device->dev_name) == 0) { 3233549be9adSsijie.sun return device; 3234549be9adSsijie.sun } 3235549be9adSsijie.sun } 3236549be9adSsijie.sun 3237549be9adSsijie.sun return NULL; 3238549be9adSsijie.sun } 3239549be9adSsijie.sun 3240549be9adSsijie.sun static 
bool 3241549be9adSsijie.sun nvmf_rdma_check_devices_context(struct spdk_nvmf_rdma_transport *rtransport, 3242549be9adSsijie.sun struct ibv_context *context) 3243549be9adSsijie.sun { 3244549be9adSsijie.sun struct spdk_nvmf_rdma_device *old_device, *new_device; 3245549be9adSsijie.sun int rc = 0; 3246549be9adSsijie.sun bool has_inflight; 3247549be9adSsijie.sun 3248549be9adSsijie.sun old_device = nvmf_rdma_find_ib_device(rtransport, context); 3249549be9adSsijie.sun 3250549be9adSsijie.sun if (old_device) { 3251549be9adSsijie.sun if (old_device->context != context && !old_device->need_destroy && old_device->is_ready) { 3252549be9adSsijie.sun /* context may not have time to be cleaned when rescan. exactly one context 3253549be9adSsijie.sun * is valid for a device so this context must be invalid and just remove it. */ 3254549be9adSsijie.sun SPDK_WARNLOG("Device %p has a invalid context %p\n", old_device, old_device->context); 3255549be9adSsijie.sun old_device->need_destroy = true; 3256549be9adSsijie.sun nvmf_rdma_handle_device_removal(rtransport, old_device); 3257549be9adSsijie.sun } 3258549be9adSsijie.sun return false; 3259549be9adSsijie.sun } 3260549be9adSsijie.sun 3261549be9adSsijie.sun rc = create_ib_device(rtransport, context, &new_device); 3262549be9adSsijie.sun /* TODO: update transport opts. */ 3263549be9adSsijie.sun if (rc < 0) { 3264549be9adSsijie.sun SPDK_ERRLOG("Failed to create ib device for context: %s(%p)\n", 3265549be9adSsijie.sun ibv_get_device_name(context->device), context); 3266549be9adSsijie.sun return false; 3267549be9adSsijie.sun } 3268549be9adSsijie.sun 3269549be9adSsijie.sun rc = nvmf_rdma_manage_poller(rtransport, new_device, &has_inflight, true); 3270549be9adSsijie.sun if (rc < 0) { 3271549be9adSsijie.sun SPDK_ERRLOG("Failed to add poller for device context: %s(%p)\n", 3272549be9adSsijie.sun ibv_get_device_name(context->device), context); 3273549be9adSsijie.sun return false; 3274549be9adSsijie.sun } 3275549be9adSsijie.sun 3276549be9adSsijie.sun if (has_inflight) { 3277549be9adSsijie.sun new_device->is_ready = true; 3278549be9adSsijie.sun } 3279549be9adSsijie.sun 3280549be9adSsijie.sun return true; 3281549be9adSsijie.sun } 3282549be9adSsijie.sun 3283549be9adSsijie.sun static bool 3284549be9adSsijie.sun nvmf_rdma_rescan_devices(struct spdk_nvmf_rdma_transport *rtransport) 3285549be9adSsijie.sun { 3286549be9adSsijie.sun struct spdk_nvmf_rdma_device *device; 3287549be9adSsijie.sun struct ibv_device **ibv_device_list = NULL; 3288549be9adSsijie.sun struct ibv_context **contexts = NULL; 3289549be9adSsijie.sun int i = 0; 3290549be9adSsijie.sun int num_dev = 0; 3291549be9adSsijie.sun bool new_create = false, has_new_device = false; 3292549be9adSsijie.sun struct ibv_context *tmp_verbs = NULL; 3293549be9adSsijie.sun 3294549be9adSsijie.sun /* do not rescan when any device is destroying, or context may be freed when 3295549be9adSsijie.sun * regenerating the poll fds. 3296549be9adSsijie.sun */ 3297549be9adSsijie.sun TAILQ_FOREACH(device, &rtransport->devices, link) { 3298549be9adSsijie.sun if (device->need_destroy) { 3299549be9adSsijie.sun return false; 3300549be9adSsijie.sun } 3301549be9adSsijie.sun } 3302549be9adSsijie.sun 3303549be9adSsijie.sun ibv_device_list = ibv_get_device_list(&num_dev); 3304549be9adSsijie.sun 3305549be9adSsijie.sun /* There is a bug in librdmacm. If verbs init failed in rdma_get_devices, it'll be 3306549be9adSsijie.sun * marked as dead verbs and never be init again. 
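 * A context that fails to initialize once would then be skipped by every later rescan.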
So we need to make sure the 3307549be9adSsijie.sun * verbs is available before we call rdma_get_devices. */ 3308549be9adSsijie.sun if (num_dev >= 0) { 3309549be9adSsijie.sun for (i = 0; i < num_dev; i++) { 3310549be9adSsijie.sun tmp_verbs = ibv_open_device(ibv_device_list[i]); 3311549be9adSsijie.sun if (!tmp_verbs) { 3312549be9adSsijie.sun SPDK_WARNLOG("Failed to init ibv device %p, err %d. Skip rescan.\n", ibv_device_list[i], errno); 3313549be9adSsijie.sun break; 3314549be9adSsijie.sun } 3315549be9adSsijie.sun if (nvmf_rdma_find_ib_device(rtransport, tmp_verbs) == NULL) { 3316549be9adSsijie.sun SPDK_DEBUGLOG(rdma, "Find new verbs init ibv device %p(%s).\n", ibv_device_list[i], 3317549be9adSsijie.sun tmp_verbs->device->dev_name); 3318549be9adSsijie.sun has_new_device = true; 3319549be9adSsijie.sun } 3320549be9adSsijie.sun ibv_close_device(tmp_verbs); 3321549be9adSsijie.sun } 3322549be9adSsijie.sun ibv_free_device_list(ibv_device_list); 3323549be9adSsijie.sun if (!tmp_verbs || !has_new_device) { 3324549be9adSsijie.sun return false; 3325549be9adSsijie.sun } 3326549be9adSsijie.sun } 3327549be9adSsijie.sun 3328549be9adSsijie.sun contexts = rdma_get_devices(NULL); 3329549be9adSsijie.sun 3330549be9adSsijie.sun for (i = 0; contexts && contexts[i] != NULL; i++) { 3331549be9adSsijie.sun new_create |= nvmf_rdma_check_devices_context(rtransport, contexts[i]); 3332549be9adSsijie.sun } 3333549be9adSsijie.sun 3334549be9adSsijie.sun if (new_create) { 3335549be9adSsijie.sun free_poll_fds(rtransport); 3336549be9adSsijie.sun generate_poll_fds(rtransport); 3337549be9adSsijie.sun } 3338549be9adSsijie.sun 3339549be9adSsijie.sun if (contexts) { 3340549be9adSsijie.sun rdma_free_devices(contexts); 3341549be9adSsijie.sun } 3342549be9adSsijie.sun 3343549be9adSsijie.sun return new_create; 3344549be9adSsijie.sun } 3345549be9adSsijie.sun 3346549be9adSsijie.sun static bool 3347549be9adSsijie.sun nvmf_rdma_retry_listen_port(struct spdk_nvmf_rdma_transport *rtransport) 3348549be9adSsijie.sun { 3349549be9adSsijie.sun struct spdk_nvmf_rdma_port *port, *tmp_port; 3350549be9adSsijie.sun int rc = 0; 3351549be9adSsijie.sun bool new_create = false; 3352549be9adSsijie.sun 3353549be9adSsijie.sun if (TAILQ_EMPTY(&rtransport->retry_ports)) { 3354549be9adSsijie.sun return false; 3355549be9adSsijie.sun } 3356549be9adSsijie.sun 3357549be9adSsijie.sun new_create = nvmf_rdma_rescan_devices(rtransport); 3358549be9adSsijie.sun 3359549be9adSsijie.sun TAILQ_FOREACH_SAFE(port, &rtransport->retry_ports, link, tmp_port) { 3360549be9adSsijie.sun rc = nvmf_rdma_listen(&rtransport->transport, port->trid, NULL); 3361549be9adSsijie.sun 3362549be9adSsijie.sun TAILQ_REMOVE(&rtransport->retry_ports, port, link); 3363549be9adSsijie.sun if (rc) { 3364549be9adSsijie.sun if (new_create) { 3365549be9adSsijie.sun SPDK_ERRLOG("Found new IB device but port %s:%s is still failed(%d) to listen.\n", 3366549be9adSsijie.sun port->trid->traddr, port->trid->trsvcid, rc); 3367549be9adSsijie.sun } 3368549be9adSsijie.sun TAILQ_INSERT_TAIL(&rtransport->retry_ports, port, link); 3369549be9adSsijie.sun break; 3370549be9adSsijie.sun } else { 3371549be9adSsijie.sun SPDK_NOTICELOG("Port %s:%s come back\n", port->trid->traddr, port->trid->trsvcid); 3372549be9adSsijie.sun free(port); 3373549be9adSsijie.sun } 3374549be9adSsijie.sun } 3375549be9adSsijie.sun 3376549be9adSsijie.sun return true; 3377549be9adSsijie.sun } 3378549be9adSsijie.sun 3379349295caSBen Walker static void 338055d8d943SSeth Howell nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, 
3381e0280b11SSeth Howell struct spdk_nvmf_rdma_qpair *rqpair, bool drain) 33824bfb557dSPhilipp Skadorov { 338397967681SShuhei Matsumoto struct spdk_nvmf_request *req, *tmp; 33844bfb557dSPhilipp Skadorov struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; 3385b25751d9SBen Walker struct spdk_nvmf_rdma_resources *resources; 33864bfb557dSPhilipp Skadorov 338704cd8e47SAlexey Marchuk /* First process requests which are waiting for response to be sent */ 338804cd8e47SAlexey Marchuk STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_send_queue, state_link, req_tmp) { 338904cd8e47SAlexey Marchuk if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 339004cd8e47SAlexey Marchuk break; 339104cd8e47SAlexey Marchuk } 339204cd8e47SAlexey Marchuk } 339304cd8e47SAlexey Marchuk 339404cd8e47SAlexey Marchuk /* We process I/O in the data transfer pending queue at the highest priority. */ 339504ebc6eaSSeth Howell STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_read_queue, state_link, req_tmp) { 33964e45c563SAlexey Marchuk if (rdma_req->state != RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING) { 33974e45c563SAlexey Marchuk /* Requests in this queue might be in state RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 33984e45c563SAlexey Marchuk * they are transmitting data over network but we keep them in the list to guarantee 33994e45c563SAlexey Marchuk * fair processing. */ 34004e45c563SAlexey Marchuk continue; 34014e45c563SAlexey Marchuk } 340255d8d943SSeth Howell if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 34031d0a8e1cSSeth Howell break; 34041d0a8e1cSSeth Howell } 34051d0a8e1cSSeth Howell } 34061d0a8e1cSSeth Howell 34071d0a8e1cSSeth Howell /* Then RDMA writes since reads have stronger restrictions than writes */ 340804ebc6eaSSeth Howell STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_write_queue, state_link, req_tmp) { 340955d8d943SSeth Howell if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 34104bfb557dSPhilipp Skadorov break; 34114bfb557dSPhilipp Skadorov } 34124bfb557dSPhilipp Skadorov } 34134bfb557dSPhilipp Skadorov 3414ac74de2fSZiye Yang /* Then we handle request waiting on memory buffers. 
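 * These are requests parked on the poll group's pending_buf_queue because no
 * data buffer was available earlier; they resume once nvmf_rdma_request_process()
 * can obtain buffers again.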
*/ 341597967681SShuhei Matsumoto STAILQ_FOREACH_SAFE(req, &rqpair->poller->group->group.pending_buf_queue, buf_link, tmp) { 341697967681SShuhei Matsumoto rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 341755d8d943SSeth Howell if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { 34184bfb557dSPhilipp Skadorov break; 34194bfb557dSPhilipp Skadorov } 34204bfb557dSPhilipp Skadorov } 34214bfb557dSPhilipp Skadorov 3422b25751d9SBen Walker resources = rqpair->resources; 3423b25751d9SBen Walker while (!STAILQ_EMPTY(&resources->free_queue) && !STAILQ_EMPTY(&resources->incoming_queue)) { 3424b25751d9SBen Walker rdma_req = STAILQ_FIRST(&resources->free_queue); 3425b25751d9SBen Walker STAILQ_REMOVE_HEAD(&resources->free_queue, state_link); 3426b25751d9SBen Walker rdma_req->recv = STAILQ_FIRST(&resources->incoming_queue); 3427b25751d9SBen Walker STAILQ_REMOVE_HEAD(&resources->incoming_queue, link); 342801201d3eSSeth Howell 3429fa79f64aSSeth Howell if (rqpair->srq != NULL) { 3430ed0b611fSEvgeniy Kochetov rdma_req->req.qpair = &rdma_req->recv->qpair->qpair; 3431b25751d9SBen Walker rdma_req->recv->qpair->qd++; 343201201d3eSSeth Howell } else { 3433bfdc957cSSeth Howell rqpair->qd++; 343401201d3eSSeth Howell } 343501201d3eSSeth Howell 3436fbe8f804SEvgeniy Kochetov rdma_req->receive_tsc = rdma_req->recv->receive_tsc; 3437bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_NEW; 343855d8d943SSeth Howell if (nvmf_rdma_request_process(rtransport, rdma_req) == false) { 34394bfb557dSPhilipp Skadorov break; 34404bfb557dSPhilipp Skadorov } 34414bfb557dSPhilipp Skadorov } 3442251db814SEvgeniy Kochetov if (!STAILQ_EMPTY(&resources->incoming_queue) && STAILQ_EMPTY(&resources->free_queue)) { 3443251db814SEvgeniy Kochetov rqpair->poller->stat.pending_free_request++; 3444251db814SEvgeniy Kochetov } 34454bfb557dSPhilipp Skadorov } 34464bfb557dSPhilipp Skadorov 34478e8f0434SAlexey Marchuk static void 34488e8f0434SAlexey Marchuk nvmf_rdma_poller_process_pending_buf_queue(struct spdk_nvmf_rdma_transport *rtransport, 34498e8f0434SAlexey Marchuk struct spdk_nvmf_rdma_poller *rpoller) 34508e8f0434SAlexey Marchuk { 34518e8f0434SAlexey Marchuk struct spdk_nvmf_request *req, *tmp; 34528e8f0434SAlexey Marchuk struct spdk_nvmf_rdma_request *rdma_req; 34538e8f0434SAlexey Marchuk 34548e8f0434SAlexey Marchuk STAILQ_FOREACH_SAFE(req, &rpoller->group->group.pending_buf_queue, buf_link, tmp) { 34558e8f0434SAlexey Marchuk rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 34568e8f0434SAlexey Marchuk if (nvmf_rdma_request_process(rtransport, rdma_req) == false) { 34578e8f0434SAlexey Marchuk break; 34588e8f0434SAlexey Marchuk } 34598e8f0434SAlexey Marchuk } 34608e8f0434SAlexey Marchuk } 34618e8f0434SAlexey Marchuk 34624ede9053SAlexey Marchuk static inline bool 3463e655d178SJim Harris nvmf_rdma_device_supports_last_wqe_reached(struct spdk_nvmf_rdma_device *device) 34644ede9053SAlexey Marchuk { 34654ede9053SAlexey Marchuk /* iWARP transport and SoftRoCE driver don't support LAST_WQE_REACHED ibv async event */ 3466e655d178SJim Harris return !nvmf_rdma_is_rxe_device(device) && 3467e655d178SJim Harris device->context->device->transport_type != IBV_TRANSPORT_IWARP; 34684ede9053SAlexey Marchuk } 34694ede9053SAlexey Marchuk 347085ff3fceSZiye Yang static void 347185ff3fceSZiye Yang nvmf_rdma_destroy_drained_qpair(struct spdk_nvmf_rdma_qpair *rqpair) 3472bb3e4413SSeth Howell { 347333668b22SSeth Howell struct spdk_nvmf_rdma_transport *rtransport = 
SPDK_CONTAINEROF(rqpair->qpair.transport, 347433668b22SSeth Howell struct spdk_nvmf_rdma_transport, transport); 347533668b22SSeth Howell 347685ff3fceSZiye Yang nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); 347785ff3fceSZiye Yang 3478bf41b46cSAleksey Marchuk /* nvmf_rdma_close_qpair is not called */ 347985ff3fceSZiye Yang if (!rqpair->to_close) { 348085ff3fceSZiye Yang return; 348185ff3fceSZiye Yang } 348285ff3fceSZiye Yang 34838ddc5cd4Ssijie.sun /* device is already destroyed and we should force destroy this qpair. */ 34848ddc5cd4Ssijie.sun if (rqpair->poller && rqpair->poller->need_destroy) { 34858ddc5cd4Ssijie.sun nvmf_rdma_qpair_destroy(rqpair); 34868ddc5cd4Ssijie.sun return; 34878ddc5cd4Ssijie.sun } 34888ddc5cd4Ssijie.sun 3489a9fc7e1dSSeth Howell /* In non SRQ path, we will reach rqpair->max_queue_depth. In SRQ path, we will get the last_wqe event. */ 3490a9fc7e1dSSeth Howell if (rqpair->current_send_depth != 0) { 3491a9fc7e1dSSeth Howell return; 3492a9fc7e1dSSeth Howell } 3493a9fc7e1dSSeth Howell 3494a9fc7e1dSSeth Howell if (rqpair->srq == NULL && rqpair->current_recv_depth != rqpair->max_queue_depth) { 3495a9fc7e1dSSeth Howell return; 3496a9fc7e1dSSeth Howell } 3497a9fc7e1dSSeth Howell 3498e655d178SJim Harris /* For devices that support LAST_WQE_REACHED with srq, we need to 3499e655d178SJim Harris * wait to destroy the qpair until that event has been received. 3500e655d178SJim Harris */ 3501efb6081cSAlexey Marchuk if (rqpair->srq != NULL && rqpair->last_wqe_reached == false && 3502e655d178SJim Harris nvmf_rdma_device_supports_last_wqe_reached(rqpair->device)) { 3503a9fc7e1dSSeth Howell return; 3504a9fc7e1dSSeth Howell } 3505a9fc7e1dSSeth Howell 3506*5469bd2dSAlexey Marchuk assert(rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED || 3507*5469bd2dSAlexey Marchuk rqpair->qpair.state == SPDK_NVMF_QPAIR_ERROR); 3508b4d30668Slorneli 350955d8d943SSeth Howell nvmf_rdma_qpair_destroy(rqpair); 3510bb3e4413SSeth Howell } 3511bb3e4413SSeth Howell 3512d3fa0181SBen Walker static int 35132470b995Ssijie.sun nvmf_rdma_disconnect(struct rdma_cm_event *evt, bool *event_acked) 3514d3fa0181SBen Walker { 3515d3fa0181SBen Walker struct spdk_nvmf_qpair *qpair; 3516d3fa0181SBen Walker struct spdk_nvmf_rdma_qpair *rqpair; 3517d3fa0181SBen Walker 3518d3fa0181SBen Walker if (evt->id == NULL) { 3519d3fa0181SBen Walker SPDK_ERRLOG("disconnect request: missing cm_id\n"); 3520d3fa0181SBen Walker return -1; 3521d3fa0181SBen Walker } 3522d3fa0181SBen Walker 3523d3fa0181SBen Walker qpair = evt->id->context; 3524d3fa0181SBen Walker if (qpair == NULL) { 3525d3fa0181SBen Walker SPDK_ERRLOG("disconnect request: no active connection\n"); 3526d3fa0181SBen Walker return -1; 3527d3fa0181SBen Walker } 3528d3fa0181SBen Walker 35292470b995Ssijie.sun rdma_ack_cm_event(evt); 35302470b995Ssijie.sun *event_acked = true; 35312470b995Ssijie.sun 3532d3fa0181SBen Walker rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 3533d3fa0181SBen Walker 3534c556b6b8SKonrad Sztyber spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair); 3535d3fa0181SBen Walker 3536608b54a2SKonrad Sztyber spdk_nvmf_qpair_disconnect(&rqpair->qpair); 3537d3fa0181SBen Walker 3538d3fa0181SBen Walker return 0; 3539d3fa0181SBen Walker } 3540d3fa0181SBen Walker 3541d3fa0181SBen Walker #ifdef DEBUG 3542d3fa0181SBen Walker static const char *CM_EVENT_STR[] = { 3543d3fa0181SBen Walker "RDMA_CM_EVENT_ADDR_RESOLVED", 3544d3fa0181SBen Walker "RDMA_CM_EVENT_ADDR_ERROR", 3545d3fa0181SBen Walker "RDMA_CM_EVENT_ROUTE_RESOLVED", 
3546d3fa0181SBen Walker "RDMA_CM_EVENT_ROUTE_ERROR", 3547d3fa0181SBen Walker "RDMA_CM_EVENT_CONNECT_REQUEST", 3548d3fa0181SBen Walker "RDMA_CM_EVENT_CONNECT_RESPONSE", 3549d3fa0181SBen Walker "RDMA_CM_EVENT_CONNECT_ERROR", 3550d3fa0181SBen Walker "RDMA_CM_EVENT_UNREACHABLE", 3551d3fa0181SBen Walker "RDMA_CM_EVENT_REJECTED", 3552d3fa0181SBen Walker "RDMA_CM_EVENT_ESTABLISHED", 3553d3fa0181SBen Walker "RDMA_CM_EVENT_DISCONNECTED", 3554d3fa0181SBen Walker "RDMA_CM_EVENT_DEVICE_REMOVAL", 3555d3fa0181SBen Walker "RDMA_CM_EVENT_MULTICAST_JOIN", 3556d3fa0181SBen Walker "RDMA_CM_EVENT_MULTICAST_ERROR", 3557d3fa0181SBen Walker "RDMA_CM_EVENT_ADDR_CHANGE", 3558d3fa0181SBen Walker "RDMA_CM_EVENT_TIMEWAIT_EXIT" 3559d3fa0181SBen Walker }; 3560d3fa0181SBen Walker #endif /* DEBUG */ 3561d3fa0181SBen Walker 3562804b0669SAlexey Marchuk static void 3563804b0669SAlexey Marchuk nvmf_rdma_disconnect_qpairs_on_port(struct spdk_nvmf_rdma_transport *rtransport, 3564804b0669SAlexey Marchuk struct spdk_nvmf_rdma_port *port) 3565804b0669SAlexey Marchuk { 3566804b0669SAlexey Marchuk struct spdk_nvmf_rdma_poll_group *rgroup; 3567804b0669SAlexey Marchuk struct spdk_nvmf_rdma_poller *rpoller; 3568804b0669SAlexey Marchuk struct spdk_nvmf_rdma_qpair *rqpair; 3569804b0669SAlexey Marchuk 3570804b0669SAlexey Marchuk TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) { 3571804b0669SAlexey Marchuk TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 357252f7aeb7SShuhei Matsumoto RB_FOREACH(rqpair, qpairs_tree, &rpoller->qpairs) { 3573804b0669SAlexey Marchuk if (rqpair->listen_id == port->id) { 3574608b54a2SKonrad Sztyber spdk_nvmf_qpair_disconnect(&rqpair->qpair); 3575804b0669SAlexey Marchuk } 3576804b0669SAlexey Marchuk } 3577804b0669SAlexey Marchuk } 3578804b0669SAlexey Marchuk } 3579804b0669SAlexey Marchuk } 3580804b0669SAlexey Marchuk 358150cb6a04SSeth Howell static bool 358250cb6a04SSeth Howell nvmf_rdma_handle_cm_event_addr_change(struct spdk_nvmf_transport *transport, 358350cb6a04SSeth Howell struct rdma_cm_event *event) 358450cb6a04SSeth Howell { 35856d8f1fc6SJacek Kalwas const struct spdk_nvme_transport_id *trid; 358650cb6a04SSeth Howell struct spdk_nvmf_rdma_port *port; 358750cb6a04SSeth Howell struct spdk_nvmf_rdma_transport *rtransport; 358850cb6a04SSeth Howell bool event_acked = false; 358950cb6a04SSeth Howell 359050cb6a04SSeth Howell rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 359150cb6a04SSeth Howell TAILQ_FOREACH(port, &rtransport->ports, link) { 359250cb6a04SSeth Howell if (port->id == event->id) { 35936d8f1fc6SJacek Kalwas SPDK_ERRLOG("ADDR_CHANGE: IP %s:%s migrated\n", port->trid->traddr, port->trid->trsvcid); 359450cb6a04SSeth Howell rdma_ack_cm_event(event); 359550cb6a04SSeth Howell event_acked = true; 359650cb6a04SSeth Howell trid = port->trid; 359750cb6a04SSeth Howell break; 359850cb6a04SSeth Howell } 359950cb6a04SSeth Howell } 36006d8f1fc6SJacek Kalwas 360150cb6a04SSeth Howell if (event_acked) { 3602804b0669SAlexey Marchuk nvmf_rdma_disconnect_qpairs_on_port(rtransport, port); 360350cb6a04SSeth Howell 360455d8d943SSeth Howell nvmf_rdma_stop_listen(transport, trid); 360587a062e6SJacek Kalwas nvmf_rdma_listen(transport, trid, NULL); 360650cb6a04SSeth Howell } 36076d8f1fc6SJacek Kalwas 360850cb6a04SSeth Howell return event_acked; 360950cb6a04SSeth Howell } 361050cb6a04SSeth Howell 3611d3fa0181SBen Walker static void 36128ddc5cd4Ssijie.sun nvmf_rdma_handle_device_removal(struct spdk_nvmf_rdma_transport *rtransport, 36138ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_device 
*device) 36148ddc5cd4Ssijie.sun { 36158ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_port *port, *port_tmp; 36168ddc5cd4Ssijie.sun int rc; 36178ddc5cd4Ssijie.sun bool has_inflight; 36188ddc5cd4Ssijie.sun 3619549be9adSsijie.sun rc = nvmf_rdma_manage_poller(rtransport, device, &has_inflight, false); 36208ddc5cd4Ssijie.sun if (rc) { 36218ddc5cd4Ssijie.sun SPDK_ERRLOG("Failed to handle device removal, rc %d\n", rc); 36228ddc5cd4Ssijie.sun return; 36238ddc5cd4Ssijie.sun } 36248ddc5cd4Ssijie.sun 36258ddc5cd4Ssijie.sun if (!has_inflight) { 36268ddc5cd4Ssijie.sun /* no pollers, destroy the device */ 36278ddc5cd4Ssijie.sun device->ready_to_destroy = true; 36288ddc5cd4Ssijie.sun spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_remove_destroyed_device, rtransport); 36298ddc5cd4Ssijie.sun } 36308ddc5cd4Ssijie.sun 36318ddc5cd4Ssijie.sun TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { 36328ddc5cd4Ssijie.sun if (port->device == device) { 36338ddc5cd4Ssijie.sun SPDK_NOTICELOG("Port %s:%s on device %s is being removed.\n", 36348ddc5cd4Ssijie.sun port->trid->traddr, 36358ddc5cd4Ssijie.sun port->trid->trsvcid, 36368ddc5cd4Ssijie.sun ibv_get_device_name(port->device->context->device)); 36378ddc5cd4Ssijie.sun 36388ddc5cd4Ssijie.sun /* keep NVMF listener and only destroy structures of the 36398ddc5cd4Ssijie.sun * RDMA transport. when the device comes back we can retry listening 36408ddc5cd4Ssijie.sun * and the application's workflow will not be interrupted. 36418ddc5cd4Ssijie.sun */ 3642549be9adSsijie.sun nvmf_rdma_stop_listen_ex(&rtransport->transport, port->trid, true); 36438ddc5cd4Ssijie.sun } 36448ddc5cd4Ssijie.sun } 36458ddc5cd4Ssijie.sun } 36468ddc5cd4Ssijie.sun 36478ddc5cd4Ssijie.sun static void 3648804b0669SAlexey Marchuk nvmf_rdma_handle_cm_event_port_removal(struct spdk_nvmf_transport *transport, 3649804b0669SAlexey Marchuk struct rdma_cm_event *event) 3650804b0669SAlexey Marchuk { 3651549be9adSsijie.sun struct spdk_nvmf_rdma_port *port, *tmp_port; 3652804b0669SAlexey Marchuk struct spdk_nvmf_rdma_transport *rtransport; 3653804b0669SAlexey Marchuk 3654804b0669SAlexey Marchuk port = event->id->context; 3655804b0669SAlexey Marchuk rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3656804b0669SAlexey Marchuk 3657804b0669SAlexey Marchuk rdma_ack_cm_event(event); 3658804b0669SAlexey Marchuk 3659549be9adSsijie.sun /* if device removal happens during ctrl qpair disconnecting, it's possible that we receive 3660549be9adSsijie.sun * an DEVICE_REMOVAL event on qpair but the id->qp is just NULL. So we should make sure that 3661549be9adSsijie.sun * we are handling a port event here. 
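 * In other words, only flag the device for destruction when the cm_id from the
 * event matches one of the transport's listening ports.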
3662549be9adSsijie.sun */ 3663549be9adSsijie.sun TAILQ_FOREACH(tmp_port, &rtransport->ports, link) { 3664549be9adSsijie.sun if (port == tmp_port && port->device && !port->device->need_destroy) { 36658ddc5cd4Ssijie.sun port->device->need_destroy = true; 36668ddc5cd4Ssijie.sun nvmf_rdma_handle_device_removal(rtransport, port->device); 3667804b0669SAlexey Marchuk } 3668804b0669SAlexey Marchuk } 3669549be9adSsijie.sun } 3670804b0669SAlexey Marchuk 3671804b0669SAlexey Marchuk static void 3672596f8a4aSAlexey Marchuk nvmf_process_cm_events(struct spdk_nvmf_transport *transport, uint32_t max_events) 3673d3fa0181SBen Walker { 3674d3fa0181SBen Walker struct spdk_nvmf_rdma_transport *rtransport; 3675d3fa0181SBen Walker struct rdma_cm_event *event; 3676e89ae156SAlexey Marchuk uint32_t i; 3677d3fa0181SBen Walker int rc; 367850cb6a04SSeth Howell bool event_acked; 3679d3fa0181SBen Walker 3680d3fa0181SBen Walker rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3681d3fa0181SBen Walker 3682d3fa0181SBen Walker if (rtransport->event_channel == NULL) { 3683d3fa0181SBen Walker return; 3684d3fa0181SBen Walker } 3685d3fa0181SBen Walker 3686596f8a4aSAlexey Marchuk for (i = 0; i < max_events; i++) { 368750cb6a04SSeth Howell event_acked = false; 3688d3fa0181SBen Walker rc = rdma_get_cm_event(rtransport->event_channel, &event); 3689804b0669SAlexey Marchuk if (rc) { 3690804b0669SAlexey Marchuk if (errno != EAGAIN && errno != EWOULDBLOCK) { 3691804b0669SAlexey Marchuk SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); 3692804b0669SAlexey Marchuk } 3693804b0669SAlexey Marchuk break; 3694804b0669SAlexey Marchuk } 3695804b0669SAlexey Marchuk 36962172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); 3697d3fa0181SBen Walker 3698d3fa0181SBen Walker spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event); 3699d3fa0181SBen Walker 3700d3fa0181SBen Walker switch (event->event) { 3701d3fa0181SBen Walker case RDMA_CM_EVENT_ADDR_RESOLVED: 3702d3fa0181SBen Walker case RDMA_CM_EVENT_ADDR_ERROR: 3703d3fa0181SBen Walker case RDMA_CM_EVENT_ROUTE_RESOLVED: 3704d3fa0181SBen Walker case RDMA_CM_EVENT_ROUTE_ERROR: 3705d3fa0181SBen Walker /* No action required. The target never attempts to resolve routes. */ 3706d3fa0181SBen Walker break; 3707d3fa0181SBen Walker case RDMA_CM_EVENT_CONNECT_REQUEST: 37085584232cSBen Walker rc = nvmf_rdma_connect(transport, event); 3709d3fa0181SBen Walker if (rc < 0) { 3710d3fa0181SBen Walker SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); 3711d3fa0181SBen Walker break; 3712d3fa0181SBen Walker } 3713d3fa0181SBen Walker break; 3714d3fa0181SBen Walker case RDMA_CM_EVENT_CONNECT_RESPONSE: 3715d3fa0181SBen Walker /* The target never initiates a new connection. So this will not occur. */ 3716d3fa0181SBen Walker break; 3717d3fa0181SBen Walker case RDMA_CM_EVENT_CONNECT_ERROR: 3718d3fa0181SBen Walker /* Can this happen? The docs say it can, but not sure what causes it. */ 3719d3fa0181SBen Walker break; 3720d3fa0181SBen Walker case RDMA_CM_EVENT_UNREACHABLE: 3721d3fa0181SBen Walker case RDMA_CM_EVENT_REJECTED: 3722d3fa0181SBen Walker /* These only occur on the client side. */ 3723d3fa0181SBen Walker break; 3724d3fa0181SBen Walker case RDMA_CM_EVENT_ESTABLISHED: 3725d3fa0181SBen Walker /* TODO: Should we be waiting for this event anywhere? 
*/ 3726d3fa0181SBen Walker break; 3727d3fa0181SBen Walker case RDMA_CM_EVENT_DISCONNECTED: 37282470b995Ssijie.sun rc = nvmf_rdma_disconnect(event, &event_acked); 3729d3fa0181SBen Walker if (rc < 0) { 3730d3fa0181SBen Walker SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); 3731d3fa0181SBen Walker break; 3732d3fa0181SBen Walker } 3733d3fa0181SBen Walker break; 3734804b0669SAlexey Marchuk case RDMA_CM_EVENT_DEVICE_REMOVAL: 3735804b0669SAlexey Marchuk /* In case of device removal, kernel IB part triggers IBV_EVENT_DEVICE_FATAL 3736804b0669SAlexey Marchuk * which triggers RDMA_CM_EVENT_DEVICE_REMOVAL on all cma_id’s. 3737804b0669SAlexey Marchuk * Once these events are sent to SPDK, we should release all IB resources and 3738804b0669SAlexey Marchuk * don't make attempts to call any ibv_query/modify/create functions. We can only call 3739cc6920a4SJosh Soref * ibv_destroy* functions to release user space memory allocated by IB. All kernel 3740804b0669SAlexey Marchuk * resources are already cleaned. */ 3741549be9adSsijie.sun if (event->id->qp) { 3742804b0669SAlexey Marchuk /* If rdma_cm event has a valid `qp` pointer then the event refers to the 3743549be9adSsijie.sun * corresponding qpair. Otherwise the event refers to a listening device. */ 37442470b995Ssijie.sun rc = nvmf_rdma_disconnect(event, &event_acked); 3745549be9adSsijie.sun if (rc < 0) { 3746549be9adSsijie.sun SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); 3747549be9adSsijie.sun break; 3748549be9adSsijie.sun } 3749549be9adSsijie.sun } else { 3750804b0669SAlexey Marchuk nvmf_rdma_handle_cm_event_port_removal(transport, event); 3751804b0669SAlexey Marchuk event_acked = true; 3752804b0669SAlexey Marchuk } 3753804b0669SAlexey Marchuk break; 3754d3fa0181SBen Walker case RDMA_CM_EVENT_MULTICAST_JOIN: 3755d3fa0181SBen Walker case RDMA_CM_EVENT_MULTICAST_ERROR: 3756d3fa0181SBen Walker /* Multicast is not used */ 3757d3fa0181SBen Walker break; 3758d3fa0181SBen Walker case RDMA_CM_EVENT_ADDR_CHANGE: 375950cb6a04SSeth Howell event_acked = nvmf_rdma_handle_cm_event_addr_change(transport, event); 3760d3fa0181SBen Walker break; 3761d3fa0181SBen Walker case RDMA_CM_EVENT_TIMEWAIT_EXIT: 3762d3fa0181SBen Walker /* For now, do nothing. The target never re-uses queue pairs. 
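 * (TIMEWAIT_EXIT just signals that the QP of a closed connection has left the
 * timewait state and could be reused.)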
*/ 3763d3fa0181SBen Walker break; 3764d3fa0181SBen Walker default: 3765d3fa0181SBen Walker SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); 3766d3fa0181SBen Walker break; 3767d3fa0181SBen Walker } 376850cb6a04SSeth Howell if (!event_acked) { 3769d3fa0181SBen Walker rdma_ack_cm_event(event); 377050cb6a04SSeth Howell } 3771d3fa0181SBen Walker } 3772d3fa0181SBen Walker } 3773d3fa0181SBen Walker 3774d3fa0181SBen Walker static void 3775dc84fbaaSAlexey Marchuk nvmf_rdma_handle_last_wqe_reached(struct spdk_nvmf_rdma_qpair *rqpair) 3776dc84fbaaSAlexey Marchuk { 3777dc84fbaaSAlexey Marchuk rqpair->last_wqe_reached = true; 3778dc84fbaaSAlexey Marchuk nvmf_rdma_destroy_drained_qpair(rqpair); 3779dc84fbaaSAlexey Marchuk } 3780dc84fbaaSAlexey Marchuk 3781dc84fbaaSAlexey Marchuk static void 37829645421cSJim Harris nvmf_rdma_qpair_process_last_wqe_event(void *ctx) 3783dc84fbaaSAlexey Marchuk { 3784dc84fbaaSAlexey Marchuk struct spdk_nvmf_rdma_ibv_event_ctx *event_ctx = ctx; 378543f6d338SJim Harris struct spdk_nvmf_rdma_qpair *rqpair; 3786dc84fbaaSAlexey Marchuk 378743f6d338SJim Harris rqpair = event_ctx->rqpair; 378843f6d338SJim Harris 378943f6d338SJim Harris if (rqpair) { 379043f6d338SJim Harris assert(event_ctx == rqpair->last_wqe_reached_ctx); 379143f6d338SJim Harris rqpair->last_wqe_reached_ctx = NULL; 37925e156a6eSJim Harris nvmf_rdma_handle_last_wqe_reached(rqpair); 3793dc84fbaaSAlexey Marchuk } 3794dc84fbaaSAlexey Marchuk free(event_ctx); 3795dc84fbaaSAlexey Marchuk } 3796dc84fbaaSAlexey Marchuk 3797dc84fbaaSAlexey Marchuk static int 3798e6da32eeSJim Harris nvmf_rdma_send_qpair_last_wqe_event(struct spdk_nvmf_rdma_qpair *rqpair) 3799dc84fbaaSAlexey Marchuk { 3800dc84fbaaSAlexey Marchuk struct spdk_nvmf_rdma_ibv_event_ctx *ctx; 38013d1d4fcfSAlexey Marchuk struct spdk_thread *thr = NULL; 38023d1d4fcfSAlexey Marchuk int rc; 3803dc84fbaaSAlexey Marchuk 38043d1d4fcfSAlexey Marchuk if (rqpair->qpair.group) { 38053d1d4fcfSAlexey Marchuk thr = rqpair->qpair.group->thread; 38063d1d4fcfSAlexey Marchuk } else if (rqpair->destruct_channel) { 38073d1d4fcfSAlexey Marchuk thr = spdk_io_channel_get_thread(rqpair->destruct_channel); 38083d1d4fcfSAlexey Marchuk } 38093d1d4fcfSAlexey Marchuk 38103d1d4fcfSAlexey Marchuk if (!thr) { 38112172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "rqpair %p has no thread\n", rqpair); 38123d1d4fcfSAlexey Marchuk return -EINVAL; 3813dc84fbaaSAlexey Marchuk } 3814dc84fbaaSAlexey Marchuk 381543f6d338SJim Harris if (rqpair->last_wqe_reached || rqpair->last_wqe_reached_ctx != NULL) { 381643f6d338SJim Harris SPDK_ERRLOG("LAST_WQE_REACHED already received for rqpair %p\n", rqpair); 381743f6d338SJim Harris return -EALREADY; 381843f6d338SJim Harris } 381943f6d338SJim Harris 3820dc84fbaaSAlexey Marchuk ctx = calloc(1, sizeof(*ctx)); 3821dc84fbaaSAlexey Marchuk if (!ctx) { 38223d1d4fcfSAlexey Marchuk return -ENOMEM; 3823dc84fbaaSAlexey Marchuk } 3824dc84fbaaSAlexey Marchuk 3825dc84fbaaSAlexey Marchuk ctx->rqpair = rqpair; 382643f6d338SJim Harris rqpair->last_wqe_reached_ctx = ctx; 3827dc84fbaaSAlexey Marchuk 38289645421cSJim Harris rc = spdk_thread_send_msg(thr, nvmf_rdma_qpair_process_last_wqe_event, ctx); 38293d1d4fcfSAlexey Marchuk if (rc) { 383043f6d338SJim Harris rqpair->last_wqe_reached_ctx = NULL; 38313d1d4fcfSAlexey Marchuk free(ctx); 38323d1d4fcfSAlexey Marchuk } 38333d1d4fcfSAlexey Marchuk 38343d1d4fcfSAlexey Marchuk return rc; 3835dc84fbaaSAlexey Marchuk } 3836dc84fbaaSAlexey Marchuk 383758f43df1SAlexey Marchuk static int 383855d8d943SSeth Howell 
nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) 3839b6f90c52SPhilipp Skadorov { 3840b6f90c52SPhilipp Skadorov int rc; 3841a9fc7e1dSSeth Howell struct spdk_nvmf_rdma_qpair *rqpair = NULL; 3842b6f90c52SPhilipp Skadorov struct ibv_async_event event; 3843b6f90c52SPhilipp Skadorov 3844b6f90c52SPhilipp Skadorov rc = ibv_get_async_event(device->context, &event); 3845b6f90c52SPhilipp Skadorov 3846b6f90c52SPhilipp Skadorov if (rc) { 384758f43df1SAlexey Marchuk /* In non-blocking mode -1 means there are no events available */ 384858f43df1SAlexey Marchuk return rc; 3849b6f90c52SPhilipp Skadorov } 3850b6f90c52SPhilipp Skadorov 38514bfb557dSPhilipp Skadorov switch (event.event_type) { 38524bfb557dSPhilipp Skadorov case IBV_EVENT_QP_FATAL: 3853b3e1db32SShuhei Matsumoto case IBV_EVENT_QP_LAST_WQE_REACHED: 3854b3e1db32SShuhei Matsumoto case IBV_EVENT_QP_REQ_ERR: 3855b3e1db32SShuhei Matsumoto case IBV_EVENT_QP_ACCESS_ERR: 3856b3e1db32SShuhei Matsumoto case IBV_EVENT_COMM_EST: 3857b3e1db32SShuhei Matsumoto case IBV_EVENT_PATH_MIG: 3858b3e1db32SShuhei Matsumoto case IBV_EVENT_PATH_MIG_ERR: 385944ab0033SMaciej Szwed rqpair = event.element.qp->qp_context; 3860b3e1db32SShuhei Matsumoto if (!rqpair) { 3861b3e1db32SShuhei Matsumoto /* Any QP event for NVMe-RDMA initiator may be returned. */ 3862b3e1db32SShuhei Matsumoto SPDK_NOTICELOG("Async QP event for unknown QP: %s\n", 3863b3e1db32SShuhei Matsumoto ibv_event_type_str(event.event_type)); 3864b3e1db32SShuhei Matsumoto break; 3865b3e1db32SShuhei Matsumoto } 3866b3e1db32SShuhei Matsumoto 3867b3e1db32SShuhei Matsumoto switch (event.event_type) { 3868b3e1db32SShuhei Matsumoto case IBV_EVENT_QP_FATAL: 3869d05c5538SSeth Howell SPDK_ERRLOG("Fatal event received for rqpair %p\n", rqpair); 3870e8881867SJim Harris spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 387162aa8bd8SKonrad Sztyber (uintptr_t)rqpair, event.event_type); 3872feeaa282SAlexey Marchuk rqpair->ibv_in_error_state = true; 3873608b54a2SKonrad Sztyber spdk_nvmf_qpair_disconnect(&rqpair->qpair); 38744bfb557dSPhilipp Skadorov break; 38759f6d509bSBen Walker case IBV_EVENT_QP_LAST_WQE_REACHED: 3876a9fc7e1dSSeth Howell /* This event only occurs for shared receive queues. */ 38772172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Last WQE reached event received for rqpair %p\n", rqpair); 3878e6da32eeSJim Harris rc = nvmf_rdma_send_qpair_last_wqe_event(rqpair); 38793d1d4fcfSAlexey Marchuk if (rc) { 38803d1d4fcfSAlexey Marchuk SPDK_WARNLOG("Failed to send LAST_WQE_REACHED event. 
rqpair %p, err %d\n", rqpair, rc); 3881f0b7a6e7SAlexey Marchuk rqpair->last_wqe_reached = true; 3882a9fc7e1dSSeth Howell } 38839f6d509bSBen Walker break; 38844bfb557dSPhilipp Skadorov case IBV_EVENT_QP_REQ_ERR: 38854bfb557dSPhilipp Skadorov case IBV_EVENT_QP_ACCESS_ERR: 38864bfb557dSPhilipp Skadorov case IBV_EVENT_COMM_EST: 38874bfb557dSPhilipp Skadorov case IBV_EVENT_PATH_MIG: 38884bfb557dSPhilipp Skadorov case IBV_EVENT_PATH_MIG_ERR: 3889b3e1db32SShuhei Matsumoto SPDK_NOTICELOG("Async QP event: %s\n", 3890d05c5538SSeth Howell ibv_event_type_str(event.event_type)); 3891e8881867SJim Harris spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 389262aa8bd8SKonrad Sztyber (uintptr_t)rqpair, event.event_type); 3893feeaa282SAlexey Marchuk rqpair->ibv_in_error_state = true; 3894242201d2SMaciej Szwed break; 3895b3e1db32SShuhei Matsumoto default: 3896b3e1db32SShuhei Matsumoto break; 3897b3e1db32SShuhei Matsumoto } 3898b3e1db32SShuhei Matsumoto break; 38994bfb557dSPhilipp Skadorov case IBV_EVENT_DEVICE_FATAL: 39008ddc5cd4Ssijie.sun SPDK_ERRLOG("Device Fatal event[%s] received on %s. device: %p\n", 39018ddc5cd4Ssijie.sun ibv_event_type_str(event.event_type), ibv_get_device_name(device->context->device), device); 39028ddc5cd4Ssijie.sun device->need_destroy = true; 39038ddc5cd4Ssijie.sun break; 39048ddc5cd4Ssijie.sun case IBV_EVENT_CQ_ERR: 39054bfb557dSPhilipp Skadorov case IBV_EVENT_PORT_ACTIVE: 39064bfb557dSPhilipp Skadorov case IBV_EVENT_PORT_ERR: 39074bfb557dSPhilipp Skadorov case IBV_EVENT_LID_CHANGE: 39084bfb557dSPhilipp Skadorov case IBV_EVENT_PKEY_CHANGE: 39094bfb557dSPhilipp Skadorov case IBV_EVENT_SM_CHANGE: 39104bfb557dSPhilipp Skadorov case IBV_EVENT_SRQ_ERR: 39114bfb557dSPhilipp Skadorov case IBV_EVENT_SRQ_LIMIT_REACHED: 39124bfb557dSPhilipp Skadorov case IBV_EVENT_CLIENT_REREGISTER: 39134bfb557dSPhilipp Skadorov case IBV_EVENT_GID_CHANGE: 3914feeaa282SAlexey Marchuk case IBV_EVENT_SQ_DRAINED: 39154bfb557dSPhilipp Skadorov default: 3916d05c5538SSeth Howell SPDK_NOTICELOG("Async event: %s\n", 3917d05c5538SSeth Howell ibv_event_type_str(event.event_type)); 3918e8881867SJim Harris spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type); 39194bfb557dSPhilipp Skadorov break; 39204bfb557dSPhilipp Skadorov } 3921b6f90c52SPhilipp Skadorov ibv_ack_async_event(&event); 392258f43df1SAlexey Marchuk 392358f43df1SAlexey Marchuk return 0; 392458f43df1SAlexey Marchuk } 392558f43df1SAlexey Marchuk 392658f43df1SAlexey Marchuk static void 392758f43df1SAlexey Marchuk nvmf_process_ib_events(struct spdk_nvmf_rdma_device *device, uint32_t max_events) 392858f43df1SAlexey Marchuk { 392958f43df1SAlexey Marchuk int rc = 0; 393058f43df1SAlexey Marchuk uint32_t i = 0; 393158f43df1SAlexey Marchuk 393258f43df1SAlexey Marchuk for (i = 0; i < max_events; i++) { 393358f43df1SAlexey Marchuk rc = nvmf_process_ib_event(device); 393458f43df1SAlexey Marchuk if (rc) { 393558f43df1SAlexey Marchuk break; 393658f43df1SAlexey Marchuk } 393758f43df1SAlexey Marchuk } 393858f43df1SAlexey Marchuk 39392172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Device %s: %u events processed\n", device->context->device->name, i); 3940b6f90c52SPhilipp Skadorov } 3941b6f90c52SPhilipp Skadorov 394243022da3SJacek Kalwas static int 394343022da3SJacek Kalwas nvmf_rdma_accept(void *ctx) 3944b6f90c52SPhilipp Skadorov { 3945b6f90c52SPhilipp Skadorov int nfds, i = 0; 394643022da3SJacek Kalwas struct spdk_nvmf_transport *transport = ctx; 3947b6f90c52SPhilipp Skadorov struct spdk_nvmf_rdma_transport *rtransport; 3948b6f90c52SPhilipp Skadorov 
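/* nvmf_rdma_accept() runs as the transport's accept_poller (the poller that
 * nvmf_rdma_destroy() unregisters above). A minimal sketch of how such a poller
 * might be registered at transport-create time, assuming the poll period comes
 * from the generic transport opts:
 *
 *     rtransport->accept_poller = SPDK_POLLER_REGISTER(nvmf_rdma_accept,
 *                                                      &rtransport->transport,
 *                                                      opts->acceptor_poll_rate);
 *
 * Each call first retries listeners whose device went away, then drains the RDMA
 * CM event channel (poll_fds[0]) and every device's IB async-event fd.
 */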
struct spdk_nvmf_rdma_device *device, *tmp; 3949e7e10859SMaciej Szwed uint32_t count; 39508ddc5cd4Ssijie.sun short revents; 3951549be9adSsijie.sun bool do_retry; 3952b6f90c52SPhilipp Skadorov 3953b6f90c52SPhilipp Skadorov rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 3954549be9adSsijie.sun do_retry = nvmf_rdma_retry_listen_port(rtransport); 3955549be9adSsijie.sun 3956e7e10859SMaciej Szwed count = nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); 3957b6f90c52SPhilipp Skadorov 3958b6f90c52SPhilipp Skadorov if (nfds <= 0) { 3959549be9adSsijie.sun return do_retry ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 3960b6f90c52SPhilipp Skadorov } 3961b6f90c52SPhilipp Skadorov 3962b6f90c52SPhilipp Skadorov /* The first poll descriptor is RDMA CM event */ 3963b6f90c52SPhilipp Skadorov if (rtransport->poll_fds[i++].revents & POLLIN) { 3964596f8a4aSAlexey Marchuk nvmf_process_cm_events(transport, NVMF_RDMA_MAX_EVENTS_PER_POLL); 3965b6f90c52SPhilipp Skadorov nfds--; 3966b6f90c52SPhilipp Skadorov } 3967b6f90c52SPhilipp Skadorov 3968b6f90c52SPhilipp Skadorov if (nfds == 0) { 396943022da3SJacek Kalwas return SPDK_POLLER_BUSY; 3970b6f90c52SPhilipp Skadorov } 3971b6f90c52SPhilipp Skadorov 3972b6f90c52SPhilipp Skadorov /* Second and subsequent poll descriptors are IB async events */ 3973b6f90c52SPhilipp Skadorov TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { 39748ddc5cd4Ssijie.sun revents = rtransport->poll_fds[i++].revents; 39758ddc5cd4Ssijie.sun if (revents & POLLIN) { 39768ddc5cd4Ssijie.sun if (spdk_likely(!device->need_destroy)) { 3977e89ae156SAlexey Marchuk nvmf_process_ib_events(device, NVMF_RDMA_MAX_EVENTS_PER_POLL); 39788ddc5cd4Ssijie.sun if (spdk_unlikely(device->need_destroy)) { 39798ddc5cd4Ssijie.sun nvmf_rdma_handle_device_removal(rtransport, device); 39808ddc5cd4Ssijie.sun } 39818ddc5cd4Ssijie.sun } 39828ddc5cd4Ssijie.sun nfds--; 39838ddc5cd4Ssijie.sun } else if (revents & POLLNVAL || revents & POLLHUP) { 39848ddc5cd4Ssijie.sun SPDK_ERRLOG("Receive unknown revent %x on device %p\n", (int)revents, device); 3985b6f90c52SPhilipp Skadorov nfds--; 3986b6f90c52SPhilipp Skadorov } 3987b6f90c52SPhilipp Skadorov } 3988b6f90c52SPhilipp Skadorov /* check all flagged fd's have been served */ 3989b6f90c52SPhilipp Skadorov assert(nfds == 0); 3990e7e10859SMaciej Szwed 399143022da3SJacek Kalwas return count > 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 3992b6f90c52SPhilipp Skadorov } 3993b6f90c52SPhilipp Skadorov 3994b6f90c52SPhilipp Skadorov static void 3995000e6f5bSJacek Kalwas nvmf_rdma_cdata_init(struct spdk_nvmf_transport *transport, struct spdk_nvmf_subsystem *subsystem, 3996000e6f5bSJacek Kalwas struct spdk_nvmf_ctrlr_data *cdata) 3997000e6f5bSJacek Kalwas { 3998a3f09a8fSAlexey Marchuk cdata->nvmf_specific.msdbd = NVMF_DEFAULT_MSDBD; 3999000e6f5bSJacek Kalwas 4000000e6f5bSJacek Kalwas /* Disable in-capsule data transfer for RDMA controller when dif_insert_or_strip is enabled 4001000e6f5bSJacek Kalwas since in-capsule data only works with NVME drives that support SGL memory layout */ 4002000e6f5bSJacek Kalwas if (transport->opts.dif_insert_or_strip) { 4003000e6f5bSJacek Kalwas cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16; 4004000e6f5bSJacek Kalwas } 400534392f23SBen Walker 400634392f23SBen Walker if (cdata->nvmf_specific.ioccsz > ((sizeof(struct spdk_nvme_cmd) + 0x1000) / 16)) { 400734392f23SBen Walker SPDK_WARNLOG("RDMA is configured to support up to 16 SGL entries while in capsule" 400834392f23SBen Walker " data is greater than 4KiB.\n"); 400934392f23SBen Walker SPDK_WARNLOG("When used in conjunction with the NVMe-oF initiator from the Linux " 401034392f23SBen Walker "kernel between versions 5.4 and 5.12 data corruption may occur for " 401134392f23SBen Walker "writes that are not a multiple of 4KiB in size.\n"); 401234392f23SBen Walker } 4013000e6f5bSJacek Kalwas } 4014000e6f5bSJacek Kalwas 4015000e6f5bSJacek Kalwas static void 401655d8d943SSeth Howell nvmf_rdma_discover(struct spdk_nvmf_transport *transport, 40176428de9eSBen Walker struct spdk_nvme_transport_id *trid, 4018349295caSBen Walker struct spdk_nvmf_discovery_log_page_entry *entry) 4019349295caSBen Walker { 4020349295caSBen Walker entry->trtype = SPDK_NVMF_TRTYPE_RDMA; 40216428de9eSBen Walker entry->adrfam = trid->adrfam; 402262615117SMichal Ben Haim entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED; 4023349295caSBen Walker 40246428de9eSBen Walker spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); 40256428de9eSBen Walker spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); 4026349295caSBen Walker 4027349295caSBen Walker entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; 4028349295caSBen Walker entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; 4029349295caSBen Walker entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; 4030349295caSBen Walker } 4031349295caSBen Walker 4032a5283034Ssijie.sun static int 4033a5283034Ssijie.sun nvmf_rdma_poller_create(struct spdk_nvmf_rdma_transport *rtransport, 4034a5283034Ssijie.sun struct spdk_nvmf_rdma_poll_group *rgroup, struct spdk_nvmf_rdma_device *device, 4035a5283034Ssijie.sun struct spdk_nvmf_rdma_poller **out_poller) 4036d7b8da3bSBen Walker { 4037ed0b611fSEvgeniy Kochetov struct spdk_nvmf_rdma_poller *poller; 4038cf151d60SAlexey Marchuk struct spdk_rdma_provider_srq_init_attr srq_init_attr; 40390d3fcd10SSeth Howell struct spdk_nvmf_rdma_resource_opts opts; 40407dd3cf44SSeth Howell int num_cqe; 40410d3fcd10SSeth Howell 40423ee93c32SBen Walker poller = calloc(1, sizeof(*poller)); 40433ee93c32SBen Walker if (!poller) { 40443ee93c32SBen Walker SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); 4045a5283034Ssijie.sun return -1; 40463ee93c32SBen Walker } 40473ee93c32SBen Walker 40483ee93c32SBen Walker poller->device = device; 40493ee93c32SBen Walker poller->group = 
rgroup; 4050a5283034Ssijie.sun *out_poller = poller; 40513ee93c32SBen Walker 405252f7aeb7SShuhei Matsumoto RB_INIT(&poller->qpairs); 4053b4dc10fbSSeth Howell STAILQ_INIT(&poller->qpairs_pending_send); 405414777890SSeth Howell STAILQ_INIT(&poller->qpairs_pending_recv); 40553ee93c32SBen Walker 40563ee93c32SBen Walker TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); 4057a5283034Ssijie.sun SPDK_DEBUGLOG(rdma, "Create poller %p on device %p in poll group %p.\n", poller, device, rgroup); 4058f766d1e4SDarek Stojaczyk if (rtransport->rdma_opts.no_srq == false && device->num_srq < device->attr.max_srq) { 40593838d4d2SAlexey Marchuk if ((int)rtransport->rdma_opts.max_srq_depth > device->attr.max_srq_wr) { 40603838d4d2SAlexey Marchuk SPDK_WARNLOG("Requested SRQ depth %u, max supported by dev %s is %d\n", 40613838d4d2SAlexey Marchuk rtransport->rdma_opts.max_srq_depth, device->context->device->name, device->attr.max_srq_wr); 40623838d4d2SAlexey Marchuk } 40633838d4d2SAlexey Marchuk poller->max_srq_depth = spdk_min((int)rtransport->rdma_opts.max_srq_depth, device->attr.max_srq_wr); 4064ed0b611fSEvgeniy Kochetov 406561948a1cSSeth Howell device->num_srq++; 4066696e8580SAlexey Marchuk memset(&srq_init_attr, 0, sizeof(srq_init_attr)); 4067696e8580SAlexey Marchuk srq_init_attr.pd = device->pd; 406836ac75b9SAlexey Marchuk srq_init_attr.stats = &poller->stat.qp_stats.recv; 4069696e8580SAlexey Marchuk srq_init_attr.srq_init_attr.attr.max_wr = poller->max_srq_depth; 4070696e8580SAlexey Marchuk srq_init_attr.srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE); 4071cf151d60SAlexey Marchuk poller->srq = spdk_rdma_provider_srq_create(&srq_init_attr); 4072ed0b611fSEvgeniy Kochetov if (!poller->srq) { 4073ed0b611fSEvgeniy Kochetov SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno); 4074a5283034Ssijie.sun return -1; 4075ed0b611fSEvgeniy Kochetov } 4076b25751d9SBen Walker 40770d3fcd10SSeth Howell opts.qp = poller->srq; 4078bf41b46cSAleksey Marchuk opts.map = device->map; 40790d3fcd10SSeth Howell opts.qpair = NULL; 40800d3fcd10SSeth Howell opts.shared = true; 40810d3fcd10SSeth Howell opts.max_queue_depth = poller->max_srq_depth; 4082a5283034Ssijie.sun opts.in_capsule_data_size = rtransport->transport.opts.in_capsule_data_size; 40830d3fcd10SSeth Howell 40840d3fcd10SSeth Howell poller->resources = nvmf_rdma_resources_create(&opts); 4085b25751d9SBen Walker if (!poller->resources) { 4086b25751d9SBen Walker SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n"); 4087a5283034Ssijie.sun return -1; 4088b25751d9SBen Walker } 408901201d3eSSeth Howell } 40907dd3cf44SSeth Howell 40917dd3cf44SSeth Howell /* 40927dd3cf44SSeth Howell * When using an srq, we can limit the completion queue at startup. 40937dd3cf44SSeth Howell * The following formula represents the calculation: 40947dd3cf44SSeth Howell * num_cqe = num_recv + num_data_wr + num_send_wr. 
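 * (each SRQ slot can contribute at most one receive, one RDMA data transfer and
 * one send completion, so this reduces to num_cqe = 3 * max_srq_depth)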
40957dd3cf44SSeth Howell * where num_recv=num_data_wr=and num_send_wr=poller->max_srq_depth 40967dd3cf44SSeth Howell */ 40977dd3cf44SSeth Howell if (poller->srq) { 40987dd3cf44SSeth Howell num_cqe = poller->max_srq_depth * 3; 40997dd3cf44SSeth Howell } else { 410097ef8701SMonica Kenguva num_cqe = rtransport->rdma_opts.num_cqe; 41017dd3cf44SSeth Howell } 41027dd3cf44SSeth Howell 41037dd3cf44SSeth Howell poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0); 41047dd3cf44SSeth Howell if (!poller->cq) { 41057dd3cf44SSeth Howell SPDK_ERRLOG("Unable to create completion queue\n"); 4106a5283034Ssijie.sun return -1; 4107a5283034Ssijie.sun } 4108a5283034Ssijie.sun poller->num_cqe = num_cqe; 4109a5283034Ssijie.sun return 0; 4110a5283034Ssijie.sun } 4111a5283034Ssijie.sun 4112549be9adSsijie.sun static void 4113549be9adSsijie.sun _nvmf_rdma_register_poller_in_group(void *c) 4114549be9adSsijie.sun { 4115549be9adSsijie.sun struct spdk_nvmf_rdma_poller *poller; 4116549be9adSsijie.sun struct poller_manage_ctx *ctx = c; 4117549be9adSsijie.sun struct spdk_nvmf_rdma_device *device; 4118549be9adSsijie.sun int rc; 4119549be9adSsijie.sun 4120549be9adSsijie.sun rc = nvmf_rdma_poller_create(ctx->rtransport, ctx->rgroup, ctx->device, &poller); 4121549be9adSsijie.sun if (rc < 0 && poller) { 4122549be9adSsijie.sun nvmf_rdma_poller_destroy(poller); 4123549be9adSsijie.sun } 4124549be9adSsijie.sun 4125549be9adSsijie.sun device = ctx->device; 4126549be9adSsijie.sun if (nvmf_rdma_all_pollers_management_done(ctx)) { 4127549be9adSsijie.sun device->is_ready = true; 4128549be9adSsijie.sun } 4129549be9adSsijie.sun } 4130549be9adSsijie.sun 4131a5283034Ssijie.sun static void nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group); 4132a5283034Ssijie.sun 4133a5283034Ssijie.sun static struct spdk_nvmf_transport_poll_group * 4134a5283034Ssijie.sun nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport, 4135a5283034Ssijie.sun struct spdk_nvmf_poll_group *group) 4136a5283034Ssijie.sun { 4137a5283034Ssijie.sun struct spdk_nvmf_rdma_transport *rtransport; 4138a5283034Ssijie.sun struct spdk_nvmf_rdma_poll_group *rgroup; 4139a5283034Ssijie.sun struct spdk_nvmf_rdma_poller *poller; 4140a5283034Ssijie.sun struct spdk_nvmf_rdma_device *device; 4141a5283034Ssijie.sun int rc; 4142a5283034Ssijie.sun 414354e1a03bSKrzysztof Goreczny if (spdk_interrupt_mode_is_enabled()) { 414454e1a03bSKrzysztof Goreczny SPDK_ERRLOG("RDMA transport does not support interrupt mode\n"); 414554e1a03bSKrzysztof Goreczny return NULL; 414654e1a03bSKrzysztof Goreczny } 414754e1a03bSKrzysztof Goreczny 4148a5283034Ssijie.sun rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); 4149a5283034Ssijie.sun 4150a5283034Ssijie.sun rgroup = calloc(1, sizeof(*rgroup)); 4151a5283034Ssijie.sun if (!rgroup) { 4152a5283034Ssijie.sun return NULL; 4153a5283034Ssijie.sun } 4154a5283034Ssijie.sun 4155a5283034Ssijie.sun TAILQ_INIT(&rgroup->pollers); 4156a5283034Ssijie.sun 4157a5283034Ssijie.sun TAILQ_FOREACH(device, &rtransport->devices, link) { 4158a5283034Ssijie.sun rc = nvmf_rdma_poller_create(rtransport, rgroup, device, &poller); 4159a5283034Ssijie.sun if (rc < 0) { 416055d8d943SSeth Howell nvmf_rdma_poll_group_destroy(&rgroup->group); 41617dd3cf44SSeth Howell return NULL; 41627dd3cf44SSeth Howell } 41633ee93c32SBen Walker } 41643ee93c32SBen Walker 4165645d5944SAlexey Marchuk TAILQ_INSERT_TAIL(&rtransport->poll_groups, rgroup, link); 4166645d5944SAlexey Marchuk if (rtransport->conn_sched.next_admin_pg == 
NULL) { 4167645d5944SAlexey Marchuk rtransport->conn_sched.next_admin_pg = rgroup; 4168645d5944SAlexey Marchuk rtransport->conn_sched.next_io_pg = rgroup; 4169645d5944SAlexey Marchuk } 4170645d5944SAlexey Marchuk 4171baa936a1SBen Walker return &rgroup->group; 4172d7b8da3bSBen Walker } 4173d7b8da3bSBen Walker 417430c8b17fSJim Harris static uint32_t 417530c8b17fSJim Harris nvmf_poll_group_get_io_qpair_count(struct spdk_nvmf_poll_group *pg) 417630c8b17fSJim Harris { 417730c8b17fSJim Harris uint32_t count; 417830c8b17fSJim Harris 417930c8b17fSJim Harris /* Just assume that unassociated qpairs will eventually be io 418030c8b17fSJim Harris * qpairs. This is close enough for the use cases for this 418130c8b17fSJim Harris * function. 418230c8b17fSJim Harris */ 418330c8b17fSJim Harris pthread_mutex_lock(&pg->mutex); 418430c8b17fSJim Harris count = pg->stat.current_io_qpairs + pg->current_unassociated_qpairs; 418530c8b17fSJim Harris pthread_mutex_unlock(&pg->mutex); 418630c8b17fSJim Harris 418730c8b17fSJim Harris return count; 418830c8b17fSJim Harris } 418930c8b17fSJim Harris 419073e87ed2SAlexey Marchuk static struct spdk_nvmf_transport_poll_group * 419155d8d943SSeth Howell nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 419273e87ed2SAlexey Marchuk { 419373e87ed2SAlexey Marchuk struct spdk_nvmf_rdma_transport *rtransport; 419473e87ed2SAlexey Marchuk struct spdk_nvmf_rdma_poll_group **pg; 419573e87ed2SAlexey Marchuk struct spdk_nvmf_transport_poll_group *result; 419630020c2fSJim Harris uint32_t count; 419773e87ed2SAlexey Marchuk 419873e87ed2SAlexey Marchuk rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 419973e87ed2SAlexey Marchuk 420073e87ed2SAlexey Marchuk if (TAILQ_EMPTY(&rtransport->poll_groups)) { 420173e87ed2SAlexey Marchuk return NULL; 420273e87ed2SAlexey Marchuk } 420373e87ed2SAlexey Marchuk 420473e87ed2SAlexey Marchuk if (qpair->qid == 0) { 420573e87ed2SAlexey Marchuk pg = &rtransport->conn_sched.next_admin_pg; 420673e87ed2SAlexey Marchuk } else { 4207cd1b7ab0Sliuqinfei struct spdk_nvmf_rdma_poll_group *pg_min, *pg_start, *pg_current; 4208cd1b7ab0Sliuqinfei uint32_t min_value; 4209cd1b7ab0Sliuqinfei 421073e87ed2SAlexey Marchuk pg = &rtransport->conn_sched.next_io_pg; 4211cd1b7ab0Sliuqinfei pg_min = *pg; 4212cd1b7ab0Sliuqinfei pg_start = *pg; 4213cd1b7ab0Sliuqinfei pg_current = *pg; 421430c8b17fSJim Harris min_value = nvmf_poll_group_get_io_qpair_count(pg_current->group.group); 4215cd1b7ab0Sliuqinfei 4216dbadf72eSIgorVechriko while (1) { 4217dbadf72eSIgorVechriko count = nvmf_poll_group_get_io_qpair_count(pg_current->group.group); 4218dbadf72eSIgorVechriko 421930020c2fSJim Harris if (count < min_value) { 422030020c2fSJim Harris min_value = count; 4221cd1b7ab0Sliuqinfei pg_min = pg_current; 4222cd1b7ab0Sliuqinfei } 4223cd1b7ab0Sliuqinfei 42240441dce4SMichael Haeuptle pg_current = TAILQ_NEXT(pg_current, link); 42250441dce4SMichael Haeuptle if (pg_current == NULL) { 42260441dce4SMichael Haeuptle pg_current = TAILQ_FIRST(&rtransport->poll_groups); 42270441dce4SMichael Haeuptle } 42280441dce4SMichael Haeuptle 4229dbadf72eSIgorVechriko if (pg_current == pg_start || min_value == 0) { 4230cd1b7ab0Sliuqinfei break; 4231cd1b7ab0Sliuqinfei } 4232cd1b7ab0Sliuqinfei } 4233cd1b7ab0Sliuqinfei *pg = pg_min; 423473e87ed2SAlexey Marchuk } 423573e87ed2SAlexey Marchuk 423673e87ed2SAlexey Marchuk assert(*pg != NULL); 423773e87ed2SAlexey Marchuk 423873e87ed2SAlexey Marchuk result = &(*pg)->group; 423973e87ed2SAlexey Marchuk 424073e87ed2SAlexey Marchuk 
*pg = TAILQ_NEXT(*pg, link); 424173e87ed2SAlexey Marchuk if (*pg == NULL) { 424273e87ed2SAlexey Marchuk *pg = TAILQ_FIRST(&rtransport->poll_groups); 424373e87ed2SAlexey Marchuk } 424473e87ed2SAlexey Marchuk 424573e87ed2SAlexey Marchuk return result; 424673e87ed2SAlexey Marchuk } 424773e87ed2SAlexey Marchuk 4248d7b8da3bSBen Walker static void 4249a5283034Ssijie.sun nvmf_rdma_poller_destroy(struct spdk_nvmf_rdma_poller *poller) 4250d7b8da3bSBen Walker { 425154c394c4SSeth Howell struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; 42528ddc5cd4Ssijie.sun int rc; 42538ddc5cd4Ssijie.sun 42548ddc5cd4Ssijie.sun TAILQ_REMOVE(&poller->group->pollers, poller, link); 425552f7aeb7SShuhei Matsumoto RB_FOREACH_SAFE(qpair, qpairs_tree, &poller->qpairs, tmp_qpair) { 425655d8d943SSeth Howell nvmf_rdma_qpair_destroy(qpair); 4257fc43fbbaSyidong0635 } 4258fc43fbbaSyidong0635 4259ed0b611fSEvgeniy Kochetov if (poller->srq) { 42609d93c082Syidong0635 if (poller->resources) { 426101201d3eSSeth Howell nvmf_rdma_resources_destroy(poller->resources); 42629d93c082Syidong0635 } 4263cf151d60SAlexey Marchuk spdk_rdma_provider_srq_destroy(poller->srq); 42642172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, "Destroyed RDMA shared queue %p\n", poller->srq); 4265ed0b611fSEvgeniy Kochetov } 4266ed0b611fSEvgeniy Kochetov 42672a0772e3SBen Walker if (poller->cq) { 42688ddc5cd4Ssijie.sun rc = ibv_destroy_cq(poller->cq); 42698ddc5cd4Ssijie.sun if (rc != 0) { 42708ddc5cd4Ssijie.sun SPDK_ERRLOG("Destroy cq return %d, error: %s\n", rc, strerror(errno)); 42718ddc5cd4Ssijie.sun } 42728ddc5cd4Ssijie.sun } 42738ddc5cd4Ssijie.sun 42748ddc5cd4Ssijie.sun if (poller->destroy_cb) { 42758ddc5cd4Ssijie.sun poller->destroy_cb(poller->destroy_cb_ctx); 42768ddc5cd4Ssijie.sun poller->destroy_cb = NULL; 42772a0772e3SBen Walker } 42782a0772e3SBen Walker 42793ee93c32SBen Walker free(poller); 42803ee93c32SBen Walker } 42813ee93c32SBen Walker 4282a5283034Ssijie.sun static void 4283a5283034Ssijie.sun nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4284a5283034Ssijie.sun { 4285a5283034Ssijie.sun struct spdk_nvmf_rdma_poll_group *rgroup, *next_rgroup; 4286a5283034Ssijie.sun struct spdk_nvmf_rdma_poller *poller, *tmp; 4287a5283034Ssijie.sun struct spdk_nvmf_rdma_transport *rtransport; 4288a5283034Ssijie.sun 4289a5283034Ssijie.sun rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 4290a5283034Ssijie.sun if (!rgroup) { 4291a5283034Ssijie.sun return; 4292a5283034Ssijie.sun } 4293a5283034Ssijie.sun 4294a5283034Ssijie.sun TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { 4295a5283034Ssijie.sun nvmf_rdma_poller_destroy(poller); 4296a5283034Ssijie.sun } 4297a5283034Ssijie.sun 42984ea996ceSTomasz Zawadzki if (rgroup->group.transport == NULL) { 429955d8d943SSeth Howell /* Transport can be NULL when nvmf_rdma_poll_group_create() 43004ea996ceSTomasz Zawadzki * calls this function directly in a failure path. 
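 * In that case the group was never linked into rtransport->poll_groups, so there is no
 * connection-scheduler state to fix up; the pollers were already torn down above and only
 * the rgroup allocation itself remains to be freed.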
*/ 43014ea996ceSTomasz Zawadzki free(rgroup); 43024ea996ceSTomasz Zawadzki return; 43034ea996ceSTomasz Zawadzki } 43044ea996ceSTomasz Zawadzki 43054ea996ceSTomasz Zawadzki rtransport = SPDK_CONTAINEROF(rgroup->group.transport, struct spdk_nvmf_rdma_transport, transport); 43064ea996ceSTomasz Zawadzki 4307645d5944SAlexey Marchuk next_rgroup = TAILQ_NEXT(rgroup, link); 4308645d5944SAlexey Marchuk TAILQ_REMOVE(&rtransport->poll_groups, rgroup, link); 4309645d5944SAlexey Marchuk if (next_rgroup == NULL) { 4310645d5944SAlexey Marchuk next_rgroup = TAILQ_FIRST(&rtransport->poll_groups); 4311645d5944SAlexey Marchuk } 4312645d5944SAlexey Marchuk if (rtransport->conn_sched.next_admin_pg == rgroup) { 4313645d5944SAlexey Marchuk rtransport->conn_sched.next_admin_pg = next_rgroup; 4314645d5944SAlexey Marchuk } 4315645d5944SAlexey Marchuk if (rtransport->conn_sched.next_io_pg == rgroup) { 4316645d5944SAlexey Marchuk rtransport->conn_sched.next_io_pg = next_rgroup; 4317645d5944SAlexey Marchuk } 4318645d5944SAlexey Marchuk 4319baa936a1SBen Walker free(rgroup); 4320d7b8da3bSBen Walker } 4321d7b8da3bSBen Walker 4322b9526681SSeth Howell static void 432355d8d943SSeth Howell nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair) 4324b9526681SSeth Howell { 4325b70e6984Sjiaqizho if (rqpair->cm_id != NULL) { 432655d8d943SSeth Howell nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); 4327b70e6984Sjiaqizho } 4328b9526681SSeth Howell } 4329b9526681SSeth Howell 433021c450e1SDaniel Verkamp static int 433155d8d943SSeth Howell nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 43321d304bc5SBen Walker struct spdk_nvmf_qpair *qpair) 43338b487155SBen Walker { 4334baa936a1SBen Walker struct spdk_nvmf_rdma_poll_group *rgroup; 43353ee93c32SBen Walker struct spdk_nvmf_rdma_qpair *rqpair; 4336958c68f1SBen Walker struct spdk_nvmf_rdma_device *device; 43373ee93c32SBen Walker struct spdk_nvmf_rdma_poller *poller; 43388b79ef33SBen Walker int rc; 43396fb90732SBen Walker 4340baa936a1SBen Walker rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 43413ee93c32SBen Walker rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 43428b487155SBen Walker 43438209c8cfSSeth Howell device = rqpair->device; 43448b487155SBen Walker 43453ee93c32SBen Walker TAILQ_FOREACH(poller, &rgroup->pollers, link) { 43463ee93c32SBen Walker if (poller->device == device) { 4347958c68f1SBen Walker break; 4348958c68f1SBen Walker } 4349958c68f1SBen Walker } 43503ee93c32SBen Walker 43513ee93c32SBen Walker if (!poller) { 43523ee93c32SBen Walker SPDK_ERRLOG("No poller found for device.\n"); 43533ee93c32SBen Walker return -1; 4354958c68f1SBen Walker } 4355958c68f1SBen Walker 4356549be9adSsijie.sun if (poller->need_destroy) { 4357549be9adSsijie.sun SPDK_ERRLOG("Poller is destroying.\n"); 4358549be9adSsijie.sun return -1; 4359549be9adSsijie.sun } 4360549be9adSsijie.sun 43618b79ef33SBen Walker rqpair->poller = poller; 4362fa79f64aSSeth Howell rqpair->srq = rqpair->poller->srq; 43638b79ef33SBen Walker 436455d8d943SSeth Howell rc = nvmf_rdma_qpair_initialize(qpair); 4365678fe328SZiye Yang if (rc < 0) { 4366678fe328SZiye Yang SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); 43677613e3feSShuhei Matsumoto rqpair->poller = NULL; 43687613e3feSShuhei Matsumoto rqpair->srq = NULL; 4369678fe328SZiye Yang return -1; 4370678fe328SZiye Yang } 43718b79ef33SBen Walker 437252f7aeb7SShuhei Matsumoto RB_INSERT(qpairs_tree, &poller->qpairs, rqpair); 43737613e3feSShuhei 
Matsumoto 437455d8d943SSeth Howell rc = nvmf_rdma_event_accept(rqpair->cm_id, rqpair); 43758b79ef33SBen Walker if (rc) { 43768b79ef33SBen Walker /* Try to reject, but we probably can't */ 437755d8d943SSeth Howell nvmf_rdma_qpair_reject_connection(rqpair); 43788b79ef33SBen Walker return -1; 43798b79ef33SBen Walker } 43808b487155SBen Walker 43818b487155SBen Walker return 0; 43828b487155SBen Walker } 43838b487155SBen Walker 43848b487155SBen Walker static int 43853d1d4fcfSAlexey Marchuk nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 43863d1d4fcfSAlexey Marchuk struct spdk_nvmf_qpair *qpair) 43873d1d4fcfSAlexey Marchuk { 43883d1d4fcfSAlexey Marchuk struct spdk_nvmf_rdma_qpair *rqpair; 43893d1d4fcfSAlexey Marchuk 43903d1d4fcfSAlexey Marchuk rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 43913d1d4fcfSAlexey Marchuk assert(group->transport->tgt != NULL); 43923d1d4fcfSAlexey Marchuk 43933d1d4fcfSAlexey Marchuk rqpair->destruct_channel = spdk_get_io_channel(group->transport->tgt); 43943d1d4fcfSAlexey Marchuk 43953d1d4fcfSAlexey Marchuk if (!rqpair->destruct_channel) { 43963d1d4fcfSAlexey Marchuk SPDK_WARNLOG("failed to get io_channel, qpair %p\n", qpair); 43973d1d4fcfSAlexey Marchuk return 0; 43983d1d4fcfSAlexey Marchuk } 43993d1d4fcfSAlexey Marchuk 44003d1d4fcfSAlexey Marchuk /* Sanity check that we get io_channel on the correct thread */ 44013d1d4fcfSAlexey Marchuk if (qpair->group) { 44023d1d4fcfSAlexey Marchuk assert(qpair->group->thread == spdk_io_channel_get_thread(rqpair->destruct_channel)); 44033d1d4fcfSAlexey Marchuk } 44043d1d4fcfSAlexey Marchuk 44053d1d4fcfSAlexey Marchuk return 0; 44063d1d4fcfSAlexey Marchuk } 44073d1d4fcfSAlexey Marchuk 44083d1d4fcfSAlexey Marchuk static int 440955d8d943SSeth Howell nvmf_rdma_request_free(struct spdk_nvmf_request *req) 4410388e3101SSeth Howell { 4411388e3101SSeth Howell struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); 4412388e3101SSeth Howell struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 4413388e3101SSeth Howell struct spdk_nvmf_rdma_transport, transport); 44146967fec6SAlexey Marchuk struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, 44156967fec6SAlexey Marchuk struct spdk_nvmf_rdma_qpair, qpair); 44166967fec6SAlexey Marchuk 44176967fec6SAlexey Marchuk /* 44186967fec6SAlexey Marchuk * AER requests are freed when a qpair is destroyed. The recv corresponding to that request 44196967fec6SAlexey Marchuk * needs to be returned to the shared receive queue or the poll group will eventually be 44206967fec6SAlexey Marchuk * starved of RECV structures. 
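 * With an SRQ the receive buffers are shared, poller-owned resources, so a request that is
 * freed without completing (e.g. an outstanding AER at qpair teardown) must hand its recv
 * back to the SRQ explicitly here.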
44216967fec6SAlexey Marchuk */ 44226967fec6SAlexey Marchuk if (rqpair->srq && rdma_req->recv) { 44236967fec6SAlexey Marchuk int rc; 44246967fec6SAlexey Marchuk struct ibv_recv_wr *bad_recv_wr; 44256967fec6SAlexey Marchuk 4426cf151d60SAlexey Marchuk spdk_rdma_provider_srq_queue_recv_wrs(rqpair->srq, &rdma_req->recv->wr); 4427cf151d60SAlexey Marchuk rc = spdk_rdma_provider_srq_flush_recv_wrs(rqpair->srq, &bad_recv_wr); 44286967fec6SAlexey Marchuk if (rc) { 44296967fec6SAlexey Marchuk SPDK_ERRLOG("Unable to re-post rx descriptor\n"); 44306967fec6SAlexey Marchuk } 44316967fec6SAlexey Marchuk } 4432388e3101SSeth Howell 443355d8d943SSeth Howell _nvmf_rdma_request_free(rdma_req, rtransport); 4434388e3101SSeth Howell return 0; 4435388e3101SSeth Howell } 4436388e3101SSeth Howell 4437388e3101SSeth Howell static int 443855d8d943SSeth Howell nvmf_rdma_request_complete(struct spdk_nvmf_request *req) 44390f912a0eSDaniel Verkamp { 44403c423f40SBen Walker struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, 44413c423f40SBen Walker struct spdk_nvmf_rdma_transport, transport); 44424bfb557dSPhilipp Skadorov struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, 44434bfb557dSPhilipp Skadorov struct spdk_nvmf_rdma_request, req); 44444bfb557dSPhilipp Skadorov struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, 44454bfb557dSPhilipp Skadorov struct spdk_nvmf_rdma_qpair, qpair); 4446cfafcc3eSBen Walker 4447feeaa282SAlexey Marchuk if (spdk_unlikely(rqpair->ibv_in_error_state)) { 4448531fd76dSBen Walker /* The connection is dead. Move the request directly to the completed state. */ 4449bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 4450feeaa282SAlexey Marchuk } else { 4451feeaa282SAlexey Marchuk /* The connection is alive, so process the request as normal */ 4452feeaa282SAlexey Marchuk rdma_req->state = RDMA_REQUEST_STATE_EXECUTED; 4453531fd76dSBen Walker } 4454531fd76dSBen Walker 445555d8d943SSeth Howell nvmf_rdma_request_process(rtransport, rdma_req); 4456cfafcc3eSBen Walker 44573c423f40SBen Walker return 0; 4458cc294653SBen Walker } 4459cc294653SBen Walker 4460cc294653SBen Walker static void 4461ccd96eadSNaresh Gottumukkala nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair, 4462ccd96eadSNaresh Gottumukkala spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 44635ade1c40SBen Walker { 4464e03aca3cSSeth Howell struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 4465e03aca3cSSeth Howell 446685ff3fceSZiye Yang rqpair->to_close = true; 446785ff3fceSZiye Yang 4468b9526681SSeth Howell if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) { 446955d8d943SSeth Howell nvmf_rdma_qpair_reject_connection(rqpair); 4470b9526681SSeth Howell } 44718421f839SAlexey Marchuk if (rqpair->rdma_qp) { 4472cf151d60SAlexey Marchuk spdk_rdma_provider_qp_disconnect(rqpair->rdma_qp); 447390b4bd6cSEvgeniy Kochetov } 447490b4bd6cSEvgeniy Kochetov 447585ff3fceSZiye Yang nvmf_rdma_destroy_drained_qpair(rqpair); 4476ccd96eadSNaresh Gottumukkala 4477ccd96eadSNaresh Gottumukkala if (cb_fn) { 4478ccd96eadSNaresh Gottumukkala cb_fn(cb_arg); 4479ccd96eadSNaresh Gottumukkala } 44805ade1c40SBen Walker } 44815ade1c40SBen Walker 4482ed0b611fSEvgeniy Kochetov static struct spdk_nvmf_rdma_qpair * 4483ed0b611fSEvgeniy Kochetov get_rdma_qpair_from_wc(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_wc *wc) 4484ed0b611fSEvgeniy Kochetov { 448552f7aeb7SShuhei Matsumoto struct spdk_nvmf_rdma_qpair find; 448652f7aeb7SShuhei 
Matsumoto 448752f7aeb7SShuhei Matsumoto find.qp_num = wc->qp_num; 448852f7aeb7SShuhei Matsumoto 448952f7aeb7SShuhei Matsumoto return RB_FIND(qpairs_tree, &rpoller->qpairs, &find); 4490ed0b611fSEvgeniy Kochetov } 4491ed0b611fSEvgeniy Kochetov 4492fdec444aSPhilipp Skadorov #ifdef DEBUG 4493fdec444aSPhilipp Skadorov static int 449455d8d943SSeth Howell nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req) 4495fdec444aSPhilipp Skadorov { 4496fdec444aSPhilipp Skadorov return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST || 4497fdec444aSPhilipp Skadorov rdma_req->state == RDMA_REQUEST_STATE_COMPLETING; 4498fdec444aSPhilipp Skadorov } 4499fdec444aSPhilipp Skadorov #endif 4500fdec444aSPhilipp Skadorov 45019d63933bSSeth Howell static void 4502c3884f94SSeth Howell _poller_reset_failed_recvs(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_recv_wr *bad_recv_wr, 4503c3884f94SSeth Howell int rc) 4504c3884f94SSeth Howell { 4505c3884f94SSeth Howell struct spdk_nvmf_rdma_recv *rdma_recv; 4506c3884f94SSeth Howell struct spdk_nvmf_rdma_wr *bad_rdma_wr; 4507c3884f94SSeth Howell 4508c3884f94SSeth Howell SPDK_ERRLOG("Failed to post a recv for the poller %p with errno %d\n", rpoller, -rc); 4509c3884f94SSeth Howell while (bad_recv_wr != NULL) { 4510c3884f94SSeth Howell bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_recv_wr->wr_id; 4511c3884f94SSeth Howell rdma_recv = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 4512c3884f94SSeth Howell 4513c3884f94SSeth Howell rdma_recv->qpair->current_recv_depth++; 4514c3884f94SSeth Howell bad_recv_wr = bad_recv_wr->next; 4515c3884f94SSeth Howell SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rdma_recv->qpair, -rc); 4516608b54a2SKonrad Sztyber spdk_nvmf_qpair_disconnect(&rdma_recv->qpair->qpair); 4517c3884f94SSeth Howell } 4518c3884f94SSeth Howell } 4519c3884f94SSeth Howell 4520c3884f94SSeth Howell static void 4521c3884f94SSeth Howell _qp_reset_failed_recvs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *bad_recv_wr, int rc) 4522c3884f94SSeth Howell { 4523c3884f94SSeth Howell SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rqpair, -rc); 4524c3884f94SSeth Howell while (bad_recv_wr != NULL) { 4525c3884f94SSeth Howell bad_recv_wr = bad_recv_wr->next; 4526c3884f94SSeth Howell rqpair->current_recv_depth++; 4527c3884f94SSeth Howell } 4528608b54a2SKonrad Sztyber spdk_nvmf_qpair_disconnect(&rqpair->qpair); 4529c3884f94SSeth Howell } 4530c3884f94SSeth Howell 4531c3884f94SSeth Howell static void 4532c3884f94SSeth Howell _poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport, 4533c3884f94SSeth Howell struct spdk_nvmf_rdma_poller *rpoller) 4534c3884f94SSeth Howell { 4535c3884f94SSeth Howell struct spdk_nvmf_rdma_qpair *rqpair; 4536c3884f94SSeth Howell struct ibv_recv_wr *bad_recv_wr; 4537c3884f94SSeth Howell int rc; 4538c3884f94SSeth Howell 4539c3884f94SSeth Howell if (rpoller->srq) { 4540cf151d60SAlexey Marchuk rc = spdk_rdma_provider_srq_flush_recv_wrs(rpoller->srq, &bad_recv_wr); 4541e718d8caSAlexey Marchuk if (spdk_unlikely(rc)) { 4542c3884f94SSeth Howell _poller_reset_failed_recvs(rpoller, bad_recv_wr, rc); 4543c3884f94SSeth Howell } 4544c3884f94SSeth Howell } else { 454514777890SSeth Howell while (!STAILQ_EMPTY(&rpoller->qpairs_pending_recv)) { 454614777890SSeth Howell rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_recv); 4547cf151d60SAlexey Marchuk rc = spdk_rdma_provider_qp_flush_recv_wrs(rqpair->rdma_qp, &bad_recv_wr); 4548e718d8caSAlexey Marchuk if 
(spdk_unlikely(rc)) { 4549c3884f94SSeth Howell _qp_reset_failed_recvs(rqpair, bad_recv_wr, rc); 4550c3884f94SSeth Howell } 455114777890SSeth Howell STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_recv, recv_link); 4552c3884f94SSeth Howell } 4553c3884f94SSeth Howell } 4554c3884f94SSeth Howell } 4555c3884f94SSeth Howell 4556c3884f94SSeth Howell static void 45579d63933bSSeth Howell _qp_reset_failed_sends(struct spdk_nvmf_rdma_transport *rtransport, 45589d63933bSSeth Howell struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *bad_wr, int rc) 45599d63933bSSeth Howell { 45609d63933bSSeth Howell struct spdk_nvmf_rdma_wr *bad_rdma_wr; 45619d63933bSSeth Howell struct spdk_nvmf_rdma_request *prev_rdma_req = NULL, *cur_rdma_req = NULL; 45629d63933bSSeth Howell 45639d63933bSSeth Howell SPDK_ERRLOG("Failed to post a send for the qpair %p with errno %d\n", rqpair, -rc); 45649d63933bSSeth Howell for (; bad_wr != NULL; bad_wr = bad_wr->next) { 45659d63933bSSeth Howell bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_wr->wr_id; 45669d63933bSSeth Howell assert(rqpair->current_send_depth > 0); 45679d63933bSSeth Howell rqpair->current_send_depth--; 45689d63933bSSeth Howell switch (bad_rdma_wr->type) { 45699d63933bSSeth Howell case RDMA_WR_TYPE_DATA: 45708288fcf9SAlexey Marchuk cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, data_wr); 45719d63933bSSeth Howell if (bad_wr->opcode == IBV_WR_RDMA_READ) { 45729d63933bSSeth Howell assert(rqpair->current_read_depth > 0); 45739d63933bSSeth Howell rqpair->current_read_depth--; 45749d63933bSSeth Howell } 45759d63933bSSeth Howell break; 45769d63933bSSeth Howell case RDMA_WR_TYPE_SEND: 45778288fcf9SAlexey Marchuk cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, rsp_wr); 45789d63933bSSeth Howell break; 45799d63933bSSeth Howell default: 45809d63933bSSeth Howell SPDK_ERRLOG("Found a RECV in the list of pending SEND requests for qpair %p\n", rqpair); 45819d63933bSSeth Howell prev_rdma_req = cur_rdma_req; 45829d63933bSSeth Howell continue; 45839d63933bSSeth Howell } 45849d63933bSSeth Howell 45859d63933bSSeth Howell if (prev_rdma_req == cur_rdma_req) { 45869d63933bSSeth Howell /* this request was handled by an earlier wr. i.e. we were performing an nvme read. */ 45879d63933bSSeth Howell /* We only have to check against prev_wr since each requests wrs are contiguous in this list. 
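 * (a request posts its DATA WRs and its SEND WR as one contiguous chain, so several
 * consecutive bad WRs can belong to the same rdma_req and only need to be handled once)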
*/ 45889d63933bSSeth Howell continue; 45899d63933bSSeth Howell } 45909d63933bSSeth Howell 45919d63933bSSeth Howell switch (cur_rdma_req->state) { 45929d63933bSSeth Howell case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 45939d63933bSSeth Howell cur_rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 459404cd8e47SAlexey Marchuk STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, cur_rdma_req, state_link); 459504cd8e47SAlexey Marchuk cur_rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING; 45969d63933bSSeth Howell break; 45979d63933bSSeth Howell case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: 45989d63933bSSeth Howell case RDMA_REQUEST_STATE_COMPLETING: 45999d63933bSSeth Howell cur_rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 46009d63933bSSeth Howell break; 46019d63933bSSeth Howell default: 46029d63933bSSeth Howell SPDK_ERRLOG("Found a request in a bad state %d when draining pending SEND requests for qpair %p\n", 46039d63933bSSeth Howell cur_rdma_req->state, rqpair); 46049d63933bSSeth Howell continue; 46059d63933bSSeth Howell } 46069d63933bSSeth Howell 460755d8d943SSeth Howell nvmf_rdma_request_process(rtransport, cur_rdma_req); 46089d63933bSSeth Howell prev_rdma_req = cur_rdma_req; 46099d63933bSSeth Howell } 46109d63933bSSeth Howell 46113caf2080SKonrad Sztyber if (spdk_nvmf_qpair_is_active(&rqpair->qpair)) { 46129d63933bSSeth Howell /* Disconnect the connection. */ 4613608b54a2SKonrad Sztyber spdk_nvmf_qpair_disconnect(&rqpair->qpair); 46149d63933bSSeth Howell } 46159d63933bSSeth Howell 46169d63933bSSeth Howell } 46179d63933bSSeth Howell 46189d63933bSSeth Howell static void 46199d63933bSSeth Howell _poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport, 46209d63933bSSeth Howell struct spdk_nvmf_rdma_poller *rpoller) 46219d63933bSSeth Howell { 46229d63933bSSeth Howell struct spdk_nvmf_rdma_qpair *rqpair; 46239d63933bSSeth Howell struct ibv_send_wr *bad_wr = NULL; 46249d63933bSSeth Howell int rc; 46259d63933bSSeth Howell 4626b4dc10fbSSeth Howell while (!STAILQ_EMPTY(&rpoller->qpairs_pending_send)) { 4627b4dc10fbSSeth Howell rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_send); 4628cf151d60SAlexey Marchuk rc = spdk_rdma_provider_qp_flush_send_wrs(rqpair->rdma_qp, &bad_wr); 4629b4dc10fbSSeth Howell 46309d63933bSSeth Howell /* bad wr always points to the first wr that failed. 
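 * Everything chained after that WR was not posted either, which is why
 * _qp_reset_failed_sends() walks the list starting from bad_wr and unwinds the
 * send-depth accounting for each entry.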
*/ 4631e718d8caSAlexey Marchuk if (spdk_unlikely(rc)) { 46329d63933bSSeth Howell _qp_reset_failed_sends(rtransport, rqpair, bad_wr, rc); 46339d63933bSSeth Howell } 4634b4dc10fbSSeth Howell STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_send, send_link); 46359d63933bSSeth Howell } 46369d63933bSSeth Howell } 46379d63933bSSeth Howell 4638db09de98SAlexey Marchuk static const char * 4639db09de98SAlexey Marchuk nvmf_rdma_wr_type_str(enum spdk_nvmf_rdma_wr_type wr_type) 4640db09de98SAlexey Marchuk { 4641db09de98SAlexey Marchuk switch (wr_type) { 4642db09de98SAlexey Marchuk case RDMA_WR_TYPE_RECV: 4643db09de98SAlexey Marchuk return "RECV"; 4644db09de98SAlexey Marchuk case RDMA_WR_TYPE_SEND: 4645db09de98SAlexey Marchuk return "SEND"; 4646db09de98SAlexey Marchuk case RDMA_WR_TYPE_DATA: 4647db09de98SAlexey Marchuk return "DATA"; 4648db09de98SAlexey Marchuk default: 4649db09de98SAlexey Marchuk SPDK_ERRLOG("Unknown WR type %d\n", wr_type); 4650db09de98SAlexey Marchuk SPDK_UNREACHABLE(); 4651db09de98SAlexey Marchuk } 4652db09de98SAlexey Marchuk } 4653db09de98SAlexey Marchuk 4654db09de98SAlexey Marchuk static inline void 4655db09de98SAlexey Marchuk nvmf_rdma_log_wc_status(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_wc *wc) 4656db09de98SAlexey Marchuk { 4657db09de98SAlexey Marchuk enum spdk_nvmf_rdma_wr_type wr_type = ((struct spdk_nvmf_rdma_wr *)wc->wr_id)->type; 4658db09de98SAlexey Marchuk 4659db09de98SAlexey Marchuk if (wc->status == IBV_WC_WR_FLUSH_ERR) { 4660db09de98SAlexey Marchuk /* If qpair is in ERR state, we will receive completions for all posted and not completed 4661db09de98SAlexey Marchuk * Work Requests with IBV_WC_WR_FLUSH_ERR status. Don't log an error in that case */ 46622172c432STomasz Zawadzki SPDK_DEBUGLOG(rdma, 4663feeaa282SAlexey Marchuk "Error on CQ %p, (qp state %d, in_error %d) request 0x%lu, type %s, status: (%d): %s\n", 4664feeaa282SAlexey Marchuk rqpair->poller->cq, rqpair->qpair.state, rqpair->ibv_in_error_state, wc->wr_id, 4665db09de98SAlexey Marchuk nvmf_rdma_wr_type_str(wr_type), wc->status, ibv_wc_status_str(wc->status)); 4666db09de98SAlexey Marchuk } else { 4667feeaa282SAlexey Marchuk SPDK_ERRLOG("Error on CQ %p, (qp state %d, in_error %d) request 0x%lu, type %s, status: (%d): %s\n", 4668feeaa282SAlexey Marchuk rqpair->poller->cq, rqpair->qpair.state, rqpair->ibv_in_error_state, wc->wr_id, 4669db09de98SAlexey Marchuk nvmf_rdma_wr_type_str(wr_type), wc->status, ibv_wc_status_str(wc->status)); 4670db09de98SAlexey Marchuk } 4671db09de98SAlexey Marchuk } 4672db09de98SAlexey Marchuk 46731db3a037SBen Walker static int 467455d8d943SSeth Howell nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, 46752a0772e3SBen Walker struct spdk_nvmf_rdma_poller *rpoller) 46761db3a037SBen Walker { 46771db3a037SBen Walker struct ibv_wc wc[32]; 46785941ab03SBen Walker struct spdk_nvmf_rdma_wr *rdma_wr; 46791db3a037SBen Walker struct spdk_nvmf_rdma_request *rdma_req; 46801db3a037SBen Walker struct spdk_nvmf_rdma_recv *rdma_recv; 46818ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_qpair *rqpair, *tmp_rqpair; 46823c423f40SBen Walker int reaped, i; 46831db3a037SBen Walker int count = 0; 4684ca59dd5dSAlexey Marchuk int rc; 46851db3a037SBen Walker bool error = false; 4686fbe8f804SEvgeniy Kochetov uint64_t poll_tsc = spdk_get_ticks(); 46871db3a037SBen Walker 46888ddc5cd4Ssijie.sun if (spdk_unlikely(rpoller->need_destroy)) { 46898ddc5cd4Ssijie.sun /* If qpair is closed before poller destroy, nvmf_rdma_destroy_drained_qpair may not 46908ddc5cd4Ssijie.sun * be called because we cannot poll 
anything from cq. So we call that here to force 46918ddc5cd4Ssijie.sun * destroy the qpair after to_close turning true. 46928ddc5cd4Ssijie.sun */ 46938ddc5cd4Ssijie.sun RB_FOREACH_SAFE(rqpair, qpairs_tree, &rpoller->qpairs, tmp_rqpair) { 46948ddc5cd4Ssijie.sun nvmf_rdma_destroy_drained_qpair(rqpair); 46958ddc5cd4Ssijie.sun } 46968ddc5cd4Ssijie.sun return 0; 46978ddc5cd4Ssijie.sun } 46988ddc5cd4Ssijie.sun 46991db3a037SBen Walker /* Poll for completing operations. */ 47002a0772e3SBen Walker reaped = ibv_poll_cq(rpoller->cq, 32, wc); 4701e718d8caSAlexey Marchuk if (spdk_unlikely(reaped < 0)) { 47021db3a037SBen Walker SPDK_ERRLOG("Error polling CQ! (%d): %s\n", 4703891c12a6SPawel Wodkowski errno, spdk_strerror(errno)); 47041db3a037SBen Walker return -1; 47053caf2e71SAlexey Marchuk } else if (reaped == 0) { 47063caf2e71SAlexey Marchuk rpoller->stat.idle_polls++; 47071db3a037SBen Walker } 47081db3a037SBen Walker 470938ab383aSEvgeniy Kochetov rpoller->stat.polls++; 471038ab383aSEvgeniy Kochetov rpoller->stat.completions += reaped; 471138ab383aSEvgeniy Kochetov 47121db3a037SBen Walker for (i = 0; i < reaped; i++) { 47135941ab03SBen Walker 47145941ab03SBen Walker rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id; 47155941ab03SBen Walker 471650a438d3SBen Walker switch (rdma_wr->type) { 471750a438d3SBen Walker case RDMA_WR_TYPE_SEND: 47188288fcf9SAlexey Marchuk rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp_wr); 47190d7d3a04SBen Walker rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 47200d7d3a04SBen Walker 4721e718d8caSAlexey Marchuk if (spdk_likely(!wc[i].status)) { 4722ab79560eSSeth Howell count++; 4723ab79560eSSeth Howell assert(wc[i].opcode == IBV_WC_SEND); 472455d8d943SSeth Howell assert(nvmf_rdma_req_is_completing(rdma_req)); 4725ab79560eSSeth Howell } 4726ab79560eSSeth Howell 4727bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 4728ce6b8a13SAlexey Marchuk /* RDMA_WRITE operation completed. +1 since it was chained with rsp WR */ 4729a681f8d5SAlexey Marchuk assert(rqpair->current_send_depth >= (uint32_t)rdma_req->num_outstanding_data_wr + 1); 473053777de8SAlexey Marchuk rqpair->current_send_depth -= rdma_req->num_outstanding_data_wr + 1; 473153777de8SAlexey Marchuk rdma_req->num_outstanding_data_wr = 0; 473292f5548aSSeth Howell 473355d8d943SSeth Howell nvmf_rdma_request_process(rtransport, rdma_req); 47340d7d3a04SBen Walker break; 473550a438d3SBen Walker case RDMA_WR_TYPE_RECV: 47366cc18a64SSeth Howell /* rdma_recv->qpair will be invalid if using an SRQ. In that case we have to get the qpair from the wc. */ 47375941ab03SBen Walker rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); 47386cc18a64SSeth Howell if (rpoller->srq != NULL) { 4739ed0b611fSEvgeniy Kochetov rdma_recv->qpair = get_rdma_qpair_from_wc(rpoller, &wc[i]); 474087ebcb08SEvgeniy Kochetov /* It is possible that there are still some completions for destroyed QP 474187ebcb08SEvgeniy Kochetov * associated with SRQ. We just ignore these late completions and re-post 474287ebcb08SEvgeniy Kochetov * receive WRs back to SRQ. 
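 * The receive buffers belong to the poller's SRQ rather than to any single qpair, so
 * recycling them here is safe even though the qpair that produced the completion is gone.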
474387ebcb08SEvgeniy Kochetov */ 474487ebcb08SEvgeniy Kochetov if (spdk_unlikely(NULL == rdma_recv->qpair)) { 474587ebcb08SEvgeniy Kochetov struct ibv_recv_wr *bad_wr; 474687ebcb08SEvgeniy Kochetov 474787ebcb08SEvgeniy Kochetov rdma_recv->wr.next = NULL; 4748cf151d60SAlexey Marchuk spdk_rdma_provider_srq_queue_recv_wrs(rpoller->srq, &rdma_recv->wr); 4749cf151d60SAlexey Marchuk rc = spdk_rdma_provider_srq_flush_recv_wrs(rpoller->srq, &bad_wr); 475087ebcb08SEvgeniy Kochetov if (rc) { 475187ebcb08SEvgeniy Kochetov SPDK_ERRLOG("Failed to re-post recv WR to SRQ, err %d\n", rc); 475287ebcb08SEvgeniy Kochetov } 475387ebcb08SEvgeniy Kochetov continue; 475487ebcb08SEvgeniy Kochetov } 475501201d3eSSeth Howell } 47560d7d3a04SBen Walker rqpair = rdma_recv->qpair; 47570d7d3a04SBen Walker 475801201d3eSSeth Howell assert(rqpair != NULL); 4759e718d8caSAlexey Marchuk if (spdk_likely(!wc[i].status)) { 4760ab79560eSSeth Howell assert(wc[i].opcode == IBV_WC_RECV); 4761ab79560eSSeth Howell if (rqpair->current_recv_depth >= rqpair->max_queue_depth) { 4762608b54a2SKonrad Sztyber spdk_nvmf_qpair_disconnect(&rqpair->qpair); 4763ab79560eSSeth Howell break; 4764ab79560eSSeth Howell } 4765ab79560eSSeth Howell } 476601201d3eSSeth Howell 4767c3884f94SSeth Howell rdma_recv->wr.next = NULL; 4768158dc947SSeth Howell rqpair->current_recv_depth++; 4769fbe8f804SEvgeniy Kochetov rdma_recv->receive_tsc = poll_tsc; 4770fbe8f804SEvgeniy Kochetov rpoller->stat.requests++; 47715edb8edcSOr Gerlitz STAILQ_INSERT_HEAD(&rqpair->resources->incoming_queue, rdma_recv, link); 477246d7b94fSAtul Malakar rqpair->qpair.queue_depth++; 4773e06896b9SBen Walker break; 477450a438d3SBen Walker case RDMA_WR_TYPE_DATA: 47758288fcf9SAlexey Marchuk rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data_wr); 4776ab79560eSSeth Howell rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); 4777ab79560eSSeth Howell 4778ab79560eSSeth Howell assert(rdma_req->num_outstanding_data_wr > 0); 4779ab79560eSSeth Howell 4780ab79560eSSeth Howell rqpair->current_send_depth--; 4781ab79560eSSeth Howell rdma_req->num_outstanding_data_wr--; 4782e718d8caSAlexey Marchuk if (spdk_likely(!wc[i].status)) { 478353777de8SAlexey Marchuk assert(wc[i].opcode == IBV_WC_RDMA_READ); 47845e2101ceSAlexey Marchuk rqpair->current_read_depth--; 4785ab79560eSSeth Howell /* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */ 4786ab79560eSSeth Howell if (rdma_req->num_outstanding_data_wr == 0) { 47875b333e40SAlexey Marchuk if (rdma_req->num_remaining_data_wr) { 4788ca59dd5dSAlexey Marchuk /* Only part of RDMA_READ operations was submitted, process the rest */ 47898307ab43SAlexey Marchuk nvmf_rdma_request_reset_transfer_in(rdma_req, rtransport); 4790ca59dd5dSAlexey Marchuk rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING; 4791ca59dd5dSAlexey Marchuk nvmf_rdma_request_process(rtransport, rdma_req); 4792ca59dd5dSAlexey Marchuk break; 4793ca59dd5dSAlexey Marchuk } 4794ab79560eSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; 479555d8d943SSeth Howell nvmf_rdma_request_process(rtransport, rdma_req); 4796ab79560eSSeth Howell } 4797ab79560eSSeth Howell } else { 47984d5f288cSBen Walker /* If the data transfer fails still force the queue into the error state, 47995e2101ceSAlexey Marchuk * if we were performing an RDMA_READ, we need to force the request into a 48005e2101ceSAlexey Marchuk * completed state since it wasn't linked to a send. 
However, in the RDMA_WRITE 48015e2101ceSAlexey Marchuk * case, we should wait for the SEND to complete. */ 48025e2101ceSAlexey Marchuk if (rdma_req->data.wr.opcode == IBV_WR_RDMA_READ) { 48035e2101ceSAlexey Marchuk rqpair->current_read_depth--; 48041f9ac117SSeth Howell if (rdma_req->num_outstanding_data_wr == 0) { 48054e45c563SAlexey Marchuk if (rdma_req->num_remaining_data_wr) { 48064e45c563SAlexey Marchuk /* Partially sent request is still in the pending_rdma_read_queue, 48074e45c563SAlexey Marchuk * remove it now before completing */ 48084e45c563SAlexey Marchuk rdma_req->num_remaining_data_wr = 0; 48094e45c563SAlexey Marchuk STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link); 48104e45c563SAlexey Marchuk } 4811bfdc957cSSeth Howell rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; 48124e45c563SAlexey Marchuk nvmf_rdma_request_process(rtransport, rdma_req); 4813212fd219SSeth Howell } 48141f9ac117SSeth Howell } 48155e2101ceSAlexey Marchuk } 48164d5f288cSBen Walker break; 48170d7d3a04SBen Walker default: 48180d7d3a04SBen Walker SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); 48190d7d3a04SBen Walker continue; 48200d7d3a04SBen Walker } 48210d7d3a04SBen Walker 4822ab79560eSSeth Howell /* Handle error conditions */ 4823e718d8caSAlexey Marchuk if (spdk_unlikely(wc[i].status)) { 4824feeaa282SAlexey Marchuk rqpair->ibv_in_error_state = true; 4825db09de98SAlexey Marchuk nvmf_rdma_log_wc_status(rqpair, &wc[i]); 4826ab79560eSSeth Howell 4827ab79560eSSeth Howell error = true; 4828ab79560eSSeth Howell 48293caf2080SKonrad Sztyber if (spdk_nvmf_qpair_is_active(&rqpair->qpair)) { 48308e729503SBen Walker /* Disconnect the connection. */ 4831608b54a2SKonrad Sztyber spdk_nvmf_qpair_disconnect(&rqpair->qpair); 4832bb3e4413SSeth Howell } else { 483333668b22SSeth Howell nvmf_rdma_destroy_drained_qpair(rqpair); 48348e729503SBen Walker } 48351db3a037SBen Walker continue; 48361db3a037SBen Walker } 48371db3a037SBen Walker 483855d8d943SSeth Howell nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); 4839bb3e4413SSeth Howell 48403caf2080SKonrad Sztyber if (spdk_unlikely(!spdk_nvmf_qpair_is_active(&rqpair->qpair))) { 484133668b22SSeth Howell nvmf_rdma_destroy_drained_qpair(rqpair); 4842bb3e4413SSeth Howell } 48436e5f700bSDaniel Verkamp } 48442d75d67aSDaniel Verkamp 4845e718d8caSAlexey Marchuk if (spdk_unlikely(error == true)) { 4846fcdb601eSGangCao return -1; 4847fcdb601eSGangCao } 4848fcdb601eSGangCao 48498e8f0434SAlexey Marchuk if (reaped == 0) { 48508e8f0434SAlexey Marchuk /* In some cases we may not receive any CQE but we still may have pending IO requests waiting for 48518e8f0434SAlexey Marchuk * a resource (e.g. a WR from the data_wr_pool). 48528e8f0434SAlexey Marchuk * We need to start processing of such requests if no CQE reaped */ 48538e8f0434SAlexey Marchuk nvmf_rdma_poller_process_pending_buf_queue(rtransport, rpoller); 48548e8f0434SAlexey Marchuk } 48558e8f0434SAlexey Marchuk 48569d63933bSSeth Howell /* submit outstanding work requests. 
*/ 4857c3884f94SSeth Howell _poller_submit_recvs(rtransport, rpoller); 48589d63933bSSeth Howell _poller_submit_sends(rtransport, rpoller); 48599d63933bSSeth Howell 486004a0ac72SBen Walker return count; 48616e5f700bSDaniel Verkamp } 48626e5f700bSDaniel Verkamp 48638ddc5cd4Ssijie.sun static void 48648ddc5cd4Ssijie.sun _nvmf_rdma_remove_destroyed_device(void *c) 48658ddc5cd4Ssijie.sun { 48668ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_transport *rtransport = c; 48678ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_device *device, *device_tmp; 48688ddc5cd4Ssijie.sun int rc; 48698ddc5cd4Ssijie.sun 48708ddc5cd4Ssijie.sun TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { 48718ddc5cd4Ssijie.sun if (device->ready_to_destroy) { 48728ddc5cd4Ssijie.sun destroy_ib_device(rtransport, device); 48738ddc5cd4Ssijie.sun } 48748ddc5cd4Ssijie.sun } 48758ddc5cd4Ssijie.sun 48768ddc5cd4Ssijie.sun free_poll_fds(rtransport); 48778ddc5cd4Ssijie.sun rc = generate_poll_fds(rtransport); 48788ddc5cd4Ssijie.sun /* cannot handle fd allocation error here */ 48798ddc5cd4Ssijie.sun if (rc != 0) { 48808ddc5cd4Ssijie.sun SPDK_ERRLOG("Failed to generate poll fds after remove ib device.\n"); 48818ddc5cd4Ssijie.sun } 48828ddc5cd4Ssijie.sun } 48838ddc5cd4Ssijie.sun 48848ddc5cd4Ssijie.sun static void 48858ddc5cd4Ssijie.sun _nvmf_rdma_remove_poller_in_group_cb(void *c) 48868ddc5cd4Ssijie.sun { 48878ddc5cd4Ssijie.sun struct poller_manage_ctx *ctx = c; 48888ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_transport *rtransport = ctx->rtransport; 48898ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_device *device = ctx->device; 48908ddc5cd4Ssijie.sun struct spdk_thread *thread = ctx->thread; 48918ddc5cd4Ssijie.sun 4892549be9adSsijie.sun if (nvmf_rdma_all_pollers_management_done(c)) { 48938ddc5cd4Ssijie.sun /* destroy device when last poller is destroyed */ 48948ddc5cd4Ssijie.sun device->ready_to_destroy = true; 48958ddc5cd4Ssijie.sun spdk_thread_send_msg(thread, _nvmf_rdma_remove_destroyed_device, rtransport); 48968ddc5cd4Ssijie.sun } 48978ddc5cd4Ssijie.sun } 48988ddc5cd4Ssijie.sun 48998ddc5cd4Ssijie.sun static void 49008ddc5cd4Ssijie.sun _nvmf_rdma_remove_poller_in_group(void *c) 49018ddc5cd4Ssijie.sun { 49028ddc5cd4Ssijie.sun struct poller_manage_ctx *ctx = c; 49038ddc5cd4Ssijie.sun 49048ddc5cd4Ssijie.sun ctx->rpoller->need_destroy = true; 49058ddc5cd4Ssijie.sun ctx->rpoller->destroy_cb_ctx = ctx; 49068ddc5cd4Ssijie.sun ctx->rpoller->destroy_cb = _nvmf_rdma_remove_poller_in_group_cb; 49078ddc5cd4Ssijie.sun 4908549be9adSsijie.sun /* qp will be disconnected after receiving a RDMA_CM_EVENT_DEVICE_REMOVAL event. 
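 * The poller is therefore only marked for destruction at this point; if it still holds
 * qpairs, the actual teardown happens later, once those qpairs have been drained and removed.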
*/ 49098ddc5cd4Ssijie.sun if (RB_EMPTY(&ctx->rpoller->qpairs)) { 49108ddc5cd4Ssijie.sun nvmf_rdma_poller_destroy(ctx->rpoller); 49118ddc5cd4Ssijie.sun } 49128ddc5cd4Ssijie.sun } 49138ddc5cd4Ssijie.sun 4914d5ce9cffSBen Walker static int 491555d8d943SSeth Howell nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 4916d5ce9cffSBen Walker { 4917d5ce9cffSBen Walker struct spdk_nvmf_rdma_transport *rtransport; 4918d5ce9cffSBen Walker struct spdk_nvmf_rdma_poll_group *rgroup; 49198ddc5cd4Ssijie.sun struct spdk_nvmf_rdma_poller *rpoller, *tmp; 4920596f8a4aSAlexey Marchuk int count = 0, rc, rc2 = 0; 4921d5ce9cffSBen Walker 4922d5ce9cffSBen Walker rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); 4923d5ce9cffSBen Walker rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 4924d5ce9cffSBen Walker 49258ddc5cd4Ssijie.sun TAILQ_FOREACH_SAFE(rpoller, &rgroup->pollers, link, tmp) { 492655d8d943SSeth Howell rc = nvmf_rdma_poller_poll(rtransport, rpoller); 4927e718d8caSAlexey Marchuk if (spdk_unlikely(rc < 0)) { 4928596f8a4aSAlexey Marchuk if (rc2 == 0) { 4929596f8a4aSAlexey Marchuk rc2 = rc; 4930596f8a4aSAlexey Marchuk } 4931596f8a4aSAlexey Marchuk continue; 4932d5ce9cffSBen Walker } 4933d5ce9cffSBen Walker count += rc; 4934d5ce9cffSBen Walker } 4935d5ce9cffSBen Walker 4936596f8a4aSAlexey Marchuk return rc2 ? rc2 : count; 4937d5ce9cffSBen Walker } 4938d5ce9cffSBen Walker 49398f64db18SBen Walker static int 494055d8d943SSeth Howell nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, 4941311ce0e2SBen Walker struct spdk_nvme_transport_id *trid, 4942311ce0e2SBen Walker bool peer) 49438f64db18SBen Walker { 49448f64db18SBen Walker struct sockaddr *saddr; 4945683c70c2SBen Walker uint16_t port; 49468f64db18SBen Walker 49477ed0904bSSeth Howell spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_RDMA); 49488f64db18SBen Walker 4949311ce0e2SBen Walker if (peer) { 4950311ce0e2SBen Walker saddr = rdma_get_peer_addr(id); 4951311ce0e2SBen Walker } else { 4952311ce0e2SBen Walker saddr = rdma_get_local_addr(id); 4953311ce0e2SBen Walker } 49548f64db18SBen Walker switch (saddr->sa_family) { 49558f64db18SBen Walker case AF_INET: { 49568f64db18SBen Walker struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr; 49578f64db18SBen Walker 49588f64db18SBen Walker trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; 49598f64db18SBen Walker inet_ntop(AF_INET, &saddr_in->sin_addr, 49608f64db18SBen Walker trid->traddr, sizeof(trid->traddr)); 4961311ce0e2SBen Walker if (peer) { 4962311ce0e2SBen Walker port = ntohs(rdma_get_dst_port(id)); 4963311ce0e2SBen Walker } else { 4964311ce0e2SBen Walker port = ntohs(rdma_get_src_port(id)); 4965311ce0e2SBen Walker } 4966683c70c2SBen Walker snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 49678f64db18SBen Walker break; 49688f64db18SBen Walker } 49698f64db18SBen Walker case AF_INET6: { 49708f64db18SBen Walker struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr; 49718f64db18SBen Walker trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; 49728f64db18SBen Walker inet_ntop(AF_INET6, &saddr_in->sin6_addr, 49738f64db18SBen Walker trid->traddr, sizeof(trid->traddr)); 4974311ce0e2SBen Walker if (peer) { 4975311ce0e2SBen Walker port = ntohs(rdma_get_dst_port(id)); 4976311ce0e2SBen Walker } else { 4977311ce0e2SBen Walker port = ntohs(rdma_get_src_port(id)); 4978311ce0e2SBen Walker } 4979683c70c2SBen Walker snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); 49808f64db18SBen Walker break; 49818f64db18SBen Walker } 
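	/* Address families other than IPv4 and IPv6 are not supported and fail the translation. */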
49828f64db18SBen Walker default: 49838f64db18SBen Walker return -1; 49848f64db18SBen Walker 49858f64db18SBen Walker } 49868f64db18SBen Walker 49878f64db18SBen Walker return 0; 49888f64db18SBen Walker } 49898f64db18SBen Walker 4990311ce0e2SBen Walker static int 499155d8d943SSeth Howell nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 4992311ce0e2SBen Walker struct spdk_nvme_transport_id *trid) 4993311ce0e2SBen Walker { 4994311ce0e2SBen Walker struct spdk_nvmf_rdma_qpair *rqpair; 4995311ce0e2SBen Walker 4996311ce0e2SBen Walker rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 4997311ce0e2SBen Walker 499855d8d943SSeth Howell return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true); 4999311ce0e2SBen Walker } 5000311ce0e2SBen Walker 5001311ce0e2SBen Walker static int 500255d8d943SSeth Howell nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5003f10a91edSBen Walker struct spdk_nvme_transport_id *trid) 5004f10a91edSBen Walker { 5005f10a91edSBen Walker struct spdk_nvmf_rdma_qpair *rqpair; 5006f10a91edSBen Walker 5007f10a91edSBen Walker rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 5008f10a91edSBen Walker 500955d8d943SSeth Howell return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false); 5010f10a91edSBen Walker } 5011f10a91edSBen Walker 5012f10a91edSBen Walker static int 501355d8d943SSeth Howell nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5014311ce0e2SBen Walker struct spdk_nvme_transport_id *trid) 5015311ce0e2SBen Walker { 5016311ce0e2SBen Walker struct spdk_nvmf_rdma_qpair *rqpair; 5017311ce0e2SBen Walker 5018311ce0e2SBen Walker rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 5019311ce0e2SBen Walker 502055d8d943SSeth Howell return nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false); 5021311ce0e2SBen Walker } 5022311ce0e2SBen Walker 50238e2f0cdbSzkhatami88 void 50248e2f0cdbSzkhatami88 spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) 50258e2f0cdbSzkhatami88 { 50268e2f0cdbSzkhatami88 g_nvmf_hooks = *hooks; 50278e2f0cdbSzkhatami88 } 50288e2f0cdbSzkhatami88 5029604b4503SShuhei Matsumoto static void 5030c1305e71SShuhei Matsumoto nvmf_rdma_request_set_abort_status(struct spdk_nvmf_request *req, 503104cd8e47SAlexey Marchuk struct spdk_nvmf_rdma_request *rdma_req_to_abort, 503204cd8e47SAlexey Marchuk struct spdk_nvmf_rdma_qpair *rqpair) 5033c1305e71SShuhei Matsumoto { 5034c1305e71SShuhei Matsumoto rdma_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5035c1305e71SShuhei Matsumoto rdma_req_to_abort->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 5036c1305e71SShuhei Matsumoto 503704cd8e47SAlexey Marchuk STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req_to_abort, state_link); 503804cd8e47SAlexey Marchuk rdma_req_to_abort->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING; 5039c1305e71SShuhei Matsumoto 5040c1305e71SShuhei Matsumoto req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command was successfully aborted. 
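 * Per the NVMe specification, bit 0 of the Abort completion's Dword 0 is cleared when the
 * command was aborted and set when it was not, so clearing it here reports a successful abort.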
*/ 5041c1305e71SShuhei Matsumoto } 5042c1305e71SShuhei Matsumoto 50433e1ab5eaSShuhei Matsumoto static int 50443e1ab5eaSShuhei Matsumoto _nvmf_rdma_qpair_abort_request(void *ctx) 50453e1ab5eaSShuhei Matsumoto { 50463e1ab5eaSShuhei Matsumoto struct spdk_nvmf_request *req = ctx; 50473e1ab5eaSShuhei Matsumoto struct spdk_nvmf_rdma_request *rdma_req_to_abort = SPDK_CONTAINEROF( 50483e1ab5eaSShuhei Matsumoto req->req_to_abort, struct spdk_nvmf_rdma_request, req); 50493e1ab5eaSShuhei Matsumoto struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair, 50503e1ab5eaSShuhei Matsumoto struct spdk_nvmf_rdma_qpair, qpair); 50513e1ab5eaSShuhei Matsumoto int rc; 50523e1ab5eaSShuhei Matsumoto 50533e1ab5eaSShuhei Matsumoto spdk_poller_unregister(&req->poller); 50543e1ab5eaSShuhei Matsumoto 50553e1ab5eaSShuhei Matsumoto switch (rdma_req_to_abort->state) { 50563e1ab5eaSShuhei Matsumoto case RDMA_REQUEST_STATE_EXECUTING: 505781437ff6SShuhei Matsumoto rc = nvmf_ctrlr_abort_request(req); 50583e1ab5eaSShuhei Matsumoto if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) { 50593e1ab5eaSShuhei Matsumoto return SPDK_POLLER_BUSY; 50603e1ab5eaSShuhei Matsumoto } 50613e1ab5eaSShuhei Matsumoto break; 50623e1ab5eaSShuhei Matsumoto 50633e1ab5eaSShuhei Matsumoto case RDMA_REQUEST_STATE_NEED_BUFFER: 50643e1ab5eaSShuhei Matsumoto STAILQ_REMOVE(&rqpair->poller->group->group.pending_buf_queue, 50653e1ab5eaSShuhei Matsumoto &rdma_req_to_abort->req, spdk_nvmf_request, buf_link); 50663e1ab5eaSShuhei Matsumoto 506704cd8e47SAlexey Marchuk nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair); 50683e1ab5eaSShuhei Matsumoto break; 50693e1ab5eaSShuhei Matsumoto 50703e1ab5eaSShuhei Matsumoto case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: 50713e1ab5eaSShuhei Matsumoto STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req_to_abort, 50723e1ab5eaSShuhei Matsumoto spdk_nvmf_rdma_request, state_link); 50733e1ab5eaSShuhei Matsumoto 507404cd8e47SAlexey Marchuk nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair); 50753e1ab5eaSShuhei Matsumoto break; 50763e1ab5eaSShuhei Matsumoto 50773e1ab5eaSShuhei Matsumoto case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: 50783e1ab5eaSShuhei Matsumoto STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req_to_abort, 50793e1ab5eaSShuhei Matsumoto spdk_nvmf_rdma_request, state_link); 50803e1ab5eaSShuhei Matsumoto 508104cd8e47SAlexey Marchuk nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair); 508204cd8e47SAlexey Marchuk break; 508304cd8e47SAlexey Marchuk 508404cd8e47SAlexey Marchuk case RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING: 508504cd8e47SAlexey Marchuk /* Remove req from the list here to re-use common function */ 508604cd8e47SAlexey Marchuk STAILQ_REMOVE(&rqpair->pending_rdma_send_queue, rdma_req_to_abort, 508704cd8e47SAlexey Marchuk spdk_nvmf_rdma_request, state_link); 508804cd8e47SAlexey Marchuk 508904cd8e47SAlexey Marchuk nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair); 50903e1ab5eaSShuhei Matsumoto break; 50913e1ab5eaSShuhei Matsumoto 50923e1ab5eaSShuhei Matsumoto case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: 50933e1ab5eaSShuhei Matsumoto if (spdk_get_ticks() < req->timeout_tsc) { 50943e1ab5eaSShuhei Matsumoto req->poller = SPDK_POLLER_REGISTER(_nvmf_rdma_qpair_abort_request, req, 0); 50953e1ab5eaSShuhei Matsumoto return SPDK_POLLER_BUSY; 50963e1ab5eaSShuhei Matsumoto } 50973e1ab5eaSShuhei Matsumoto break; 50983e1ab5eaSShuhei Matsumoto 50993e1ab5eaSShuhei Matsumoto default: 
51003e1ab5eaSShuhei Matsumoto break; 51013e1ab5eaSShuhei Matsumoto } 51023e1ab5eaSShuhei Matsumoto 51033e1ab5eaSShuhei Matsumoto spdk_nvmf_request_complete(req); 51043e1ab5eaSShuhei Matsumoto return SPDK_POLLER_BUSY; 51053e1ab5eaSShuhei Matsumoto } 51063e1ab5eaSShuhei Matsumoto 5107c1305e71SShuhei Matsumoto static void 5108604b4503SShuhei Matsumoto nvmf_rdma_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5109604b4503SShuhei Matsumoto struct spdk_nvmf_request *req) 5110604b4503SShuhei Matsumoto { 5111deec1fc7SShuhei Matsumoto struct spdk_nvmf_rdma_qpair *rqpair; 511226e0ef9aSShuhei Matsumoto struct spdk_nvmf_rdma_transport *rtransport; 511326e0ef9aSShuhei Matsumoto struct spdk_nvmf_transport *transport; 5114deec1fc7SShuhei Matsumoto uint16_t cid; 5115137866e5SAlexey Marchuk uint32_t i, max_req_count; 5116137866e5SAlexey Marchuk struct spdk_nvmf_rdma_request *rdma_req_to_abort = NULL, *rdma_req; 5117deec1fc7SShuhei Matsumoto 5118deec1fc7SShuhei Matsumoto rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); 511926e0ef9aSShuhei Matsumoto rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); 512026e0ef9aSShuhei Matsumoto transport = &rtransport->transport; 512126e0ef9aSShuhei Matsumoto 5122deec1fc7SShuhei Matsumoto cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5123137866e5SAlexey Marchuk max_req_count = rqpair->srq == NULL ? rqpair->max_queue_depth : rqpair->poller->max_srq_depth; 5124deec1fc7SShuhei Matsumoto 5125137866e5SAlexey Marchuk for (i = 0; i < max_req_count; i++) { 5126137866e5SAlexey Marchuk rdma_req = &rqpair->resources->reqs[i]; 5127137866e5SAlexey Marchuk /* When SRQ == NULL, rqpair has its own requests and req.qpair pointer always points to the qpair 5128137866e5SAlexey Marchuk * When SRQ != NULL all rqpairs share common requests and qpair pointer is assigned when we start to 5129137866e5SAlexey Marchuk * process a request. 
So in both cases all requests which are not in FREE state have valid qpair ptr */ 5130137866e5SAlexey Marchuk if (rdma_req->state != RDMA_REQUEST_STATE_FREE && rdma_req->req.cmd->nvme_cmd.cid == cid && 5131137866e5SAlexey Marchuk rdma_req->req.qpair == qpair) { 5132137866e5SAlexey Marchuk rdma_req_to_abort = rdma_req; 5133deec1fc7SShuhei Matsumoto break; 5134deec1fc7SShuhei Matsumoto } 5135deec1fc7SShuhei Matsumoto } 5136deec1fc7SShuhei Matsumoto 5137deec1fc7SShuhei Matsumoto if (rdma_req_to_abort == NULL) { 51383e1ab5eaSShuhei Matsumoto spdk_nvmf_request_complete(req); 5139deec1fc7SShuhei Matsumoto return; 5140deec1fc7SShuhei Matsumoto } 5141c1305e71SShuhei Matsumoto 51423e1ab5eaSShuhei Matsumoto req->req_to_abort = &rdma_req_to_abort->req; 514326e0ef9aSShuhei Matsumoto req->timeout_tsc = spdk_get_ticks() + 514426e0ef9aSShuhei Matsumoto transport->opts.abort_timeout_sec * spdk_get_ticks_hz(); 51453e1ab5eaSShuhei Matsumoto req->poller = NULL; 5146c1305e71SShuhei Matsumoto 51473e1ab5eaSShuhei Matsumoto _nvmf_rdma_qpair_abort_request(req); 5148604b4503SShuhei Matsumoto } 5149604b4503SShuhei Matsumoto 51508dfa1067SMaciej Szulik static void 51518dfa1067SMaciej Szulik nvmf_rdma_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group, 51528dfa1067SMaciej Szulik struct spdk_json_write_ctx *w) 51538dfa1067SMaciej Szulik { 51548dfa1067SMaciej Szulik struct spdk_nvmf_rdma_poll_group *rgroup; 51558dfa1067SMaciej Szulik struct spdk_nvmf_rdma_poller *rpoller; 51568dfa1067SMaciej Szulik 51578dfa1067SMaciej Szulik assert(w != NULL); 51588dfa1067SMaciej Szulik 51598dfa1067SMaciej Szulik rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); 51608dfa1067SMaciej Szulik 51618dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "pending_data_buffer", rgroup->stat.pending_data_buffer); 51628dfa1067SMaciej Szulik 51638dfa1067SMaciej Szulik spdk_json_write_named_array_begin(w, "devices"); 51648dfa1067SMaciej Szulik 51658dfa1067SMaciej Szulik TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { 51668dfa1067SMaciej Szulik spdk_json_write_object_begin(w); 51678dfa1067SMaciej Szulik spdk_json_write_named_string(w, "name", 51688dfa1067SMaciej Szulik ibv_get_device_name(rpoller->device->context->device)); 51698dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "polls", 51708dfa1067SMaciej Szulik rpoller->stat.polls); 51718dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "idle_polls", 51728dfa1067SMaciej Szulik rpoller->stat.idle_polls); 51738dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "completions", 51748dfa1067SMaciej Szulik rpoller->stat.completions); 51758dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "requests", 51768dfa1067SMaciej Szulik rpoller->stat.requests); 51778dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "request_latency", 51788dfa1067SMaciej Szulik rpoller->stat.request_latency); 51798dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "pending_free_request", 51808dfa1067SMaciej Szulik rpoller->stat.pending_free_request); 51818dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "pending_rdma_read", 51828dfa1067SMaciej Szulik rpoller->stat.pending_rdma_read); 51838dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "pending_rdma_write", 51848dfa1067SMaciej Szulik rpoller->stat.pending_rdma_write); 518504cd8e47SAlexey Marchuk spdk_json_write_named_uint64(w, "pending_rdma_send", 518604cd8e47SAlexey Marchuk rpoller->stat.pending_rdma_send); 51878dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "total_send_wrs", 51888dfa1067SMaciej Szulik 
rpoller->stat.qp_stats.send.num_submitted_wrs); 51898dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "send_doorbell_updates", 51908dfa1067SMaciej Szulik rpoller->stat.qp_stats.send.doorbell_updates); 51918dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "total_recv_wrs", 51928dfa1067SMaciej Szulik rpoller->stat.qp_stats.recv.num_submitted_wrs); 51938dfa1067SMaciej Szulik spdk_json_write_named_uint64(w, "recv_doorbell_updates", 51948dfa1067SMaciej Szulik rpoller->stat.qp_stats.recv.doorbell_updates); 51958dfa1067SMaciej Szulik spdk_json_write_object_end(w); 51968dfa1067SMaciej Szulik } 51978dfa1067SMaciej Szulik 51988dfa1067SMaciej Szulik spdk_json_write_array_end(w); 51998dfa1067SMaciej Szulik } 52008dfa1067SMaciej Szulik 520131d033f9SBen Walker const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { 52025b3e6cd1SSeth Howell .name = "RDMA", 520329f6172aSBen Walker .type = SPDK_NVME_TRANSPORT_RDMA, 520455d8d943SSeth Howell .opts_init = nvmf_rdma_opts_init, 520555d8d943SSeth Howell .create = nvmf_rdma_create, 5206f766d1e4SDarek Stojaczyk .dump_opts = nvmf_rdma_dump_opts, 520755d8d943SSeth Howell .destroy = nvmf_rdma_destroy, 52084c6e4d49SDaniel Verkamp 520955d8d943SSeth Howell .listen = nvmf_rdma_listen, 521055d8d943SSeth Howell .stop_listen = nvmf_rdma_stop_listen, 5211000e6f5bSJacek Kalwas .cdata_init = nvmf_rdma_cdata_init, 521221c450e1SDaniel Verkamp 521355d8d943SSeth Howell .listener_discover = nvmf_rdma_discover, 52142641c31aSChangpeng Liu 521555d8d943SSeth Howell .poll_group_create = nvmf_rdma_poll_group_create, 521655d8d943SSeth Howell .get_optimal_poll_group = nvmf_rdma_get_optimal_poll_group, 521755d8d943SSeth Howell .poll_group_destroy = nvmf_rdma_poll_group_destroy, 521855d8d943SSeth Howell .poll_group_add = nvmf_rdma_poll_group_add, 52193d1d4fcfSAlexey Marchuk .poll_group_remove = nvmf_rdma_poll_group_remove, 522055d8d943SSeth Howell .poll_group_poll = nvmf_rdma_poll_group_poll, 5221d7b8da3bSBen Walker 522255d8d943SSeth Howell .req_free = nvmf_rdma_request_free, 522355d8d943SSeth Howell .req_complete = nvmf_rdma_request_complete, 522421c450e1SDaniel Verkamp 522555d8d943SSeth Howell .qpair_fini = nvmf_rdma_close_qpair, 522655d8d943SSeth Howell .qpair_get_peer_trid = nvmf_rdma_qpair_get_peer_trid, 522755d8d943SSeth Howell .qpair_get_local_trid = nvmf_rdma_qpair_get_local_trid, 522855d8d943SSeth Howell .qpair_get_listen_trid = nvmf_rdma_qpair_get_listen_trid, 5229604b4503SShuhei Matsumoto .qpair_abort_request = nvmf_rdma_qpair_abort_request, 52302641c31aSChangpeng Liu 52318dfa1067SMaciej Szulik .poll_group_dump_stat = nvmf_rdma_poll_group_dump_stat, 523221c450e1SDaniel Verkamp }; 523321c450e1SDaniel Verkamp 5234f038354eSSeth Howell SPDK_NVMF_TRANSPORT_REGISTER(rdma, &spdk_nvmf_transport_rdma); 52352172c432STomasz Zawadzki SPDK_LOG_REGISTER_COMPONENT(rdma) 5236
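/*
 * Usage sketch for the hooks entry point defined above (illustrative only): an application
 * fills a struct spdk_nvme_rdma_hooks with its own callbacks and registers it, typically
 * before the RDMA transport is created, since the hooks are copied into g_nvmf_hooks at
 * registration time. The initializer is left empty because the specific callback members
 * are not spelled out in this file.
 *
 *	static struct spdk_nvme_rdma_hooks g_my_hooks = {
 *		// set application-provided memory-registration callbacks here
 *	};
 *
 *	spdk_nvmf_rdma_init_hooks(&g_my_hooks);
 */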