xref: /spdk/lib/nvmf/rdma.c (revision 5469bd2d12b6f3fa914098168ea9ba8f214ca3ec)
1488570ebSJim Harris /*   SPDX-License-Identifier: BSD-3-Clause
2a6dbe372Spaul luse  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
34a2c27f7SAlexey Marchuk  *   Copyright (c) 2019-2021 Mellanox Technologies LTD. All rights reserved.
4a681f8d5SAlexey Marchuk  *   Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
50f912a0eSDaniel Verkamp  */
60f912a0eSDaniel Verkamp 
7b961d9ccSBen Walker #include "spdk/stdinc.h"
8b961d9ccSBen Walker 
9c4fee1e9SPawel Wodkowski #include "spdk/config.h"
10a83f91c2SBen Walker #include "spdk/thread.h"
11cc353f0eSBen Walker #include "spdk/likely.h"
12cc353f0eSBen Walker #include "spdk/nvmf_transport.h"
1337402f49SDaniel Verkamp #include "spdk/string.h"
140f912a0eSDaniel Verkamp #include "spdk/trace.h"
1552f7aeb7SShuhei Matsumoto #include "spdk/tree.h"
1650947d55SDaniel Verkamp #include "spdk/util.h"
170f912a0eSDaniel Verkamp 
18024127dcSyidong0635 #include "spdk_internal/assert.h"
194e8e97c8STomasz Zawadzki #include "spdk/log.h"
20cf151d60SAlexey Marchuk #include "spdk_internal/rdma_provider.h"
218a01b4d6SAlexey Marchuk #include "spdk_internal/rdma_utils.h"
22d27b24c9SDaniel Verkamp 
23deec1fc7SShuhei Matsumoto #include "nvmf_internal.h"
240db0c443SChunsong Feng #include "transport.h"
25deec1fc7SShuhei Matsumoto 
26c37e776eSKrzysztof Karas #include "spdk_internal/trace_defs.h"
27c37e776eSKrzysztof Karas 
288e2f0cdbSzkhatami88 struct spdk_nvme_rdma_hooks g_nvmf_hooks = {};
29f038354eSSeth Howell const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma;
308e2f0cdbSzkhatami88 
3154b538d1SDaniel Verkamp /*
326fa48bbfSChen Wang  RDMA Connection Resource Defaults
3354b538d1SDaniel Verkamp  */
34a3f09a8fSAlexey Marchuk #define NVMF_DEFAULT_MSDBD		16
351180bf83SSeth Howell #define NVMF_DEFAULT_TX_SGE		SPDK_NVMF_MAX_SGL_ENTRIES
361180bf83SSeth Howell #define NVMF_DEFAULT_RSP_SGE		1
3754b538d1SDaniel Verkamp #define NVMF_DEFAULT_RX_SGE		2
3854b538d1SDaniel Verkamp 
39e89ae156SAlexey Marchuk #define NVMF_RDMA_MAX_EVENTS_PER_POLL	32
40e89ae156SAlexey Marchuk 
41a3f09a8fSAlexey Marchuk SPDK_STATIC_ASSERT(NVMF_DEFAULT_MSDBD <= SPDK_NVMF_MAX_SGL_ENTRIES,
42a3f09a8fSAlexey Marchuk 		   "MSDBD must not exceed SPDK_NVMF_MAX_SGL_ENTRIES");
43a3f09a8fSAlexey Marchuk 
442a0772e3SBen Walker /* The RDMA completion queue size */
45db5c3ce3SXiaodong Liu #define DEFAULT_NVMF_RDMA_CQ_SIZE	4096
46db5c3ce3SXiaodong Liu #define MAX_WR_PER_QP(queue_depth)	(queue_depth * 3 + 2)
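/* Interpretation of the accounting above (not an upstream comment): nvmf_rdma_qpair_initialize()
 * sizes a qpair's send queue at 2 * max_queue_depth (one response SEND plus RDMA READ/WRITE data
 * WRs per request) and its receive queue at max_queue_depth, so roughly three WR completions can
 * land on the CQ per queue slot; the "+ 2" is a small safety margin.
 */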
472a0772e3SBen Walker 
483c423f40SBen Walker enum spdk_nvmf_rdma_request_state {
493c423f40SBen Walker 	/* The request is not currently in use */
503c423f40SBen Walker 	RDMA_REQUEST_STATE_FREE = 0,
513c423f40SBen Walker 
523c423f40SBen Walker 	/* Initial state when request first received */
533c423f40SBen Walker 	RDMA_REQUEST_STATE_NEW,
543c423f40SBen Walker 
553c423f40SBen Walker 	/* The request is queued until a data buffer is available. */
563c423f40SBen Walker 	RDMA_REQUEST_STATE_NEED_BUFFER,
573c423f40SBen Walker 
583c423f40SBen Walker 	/* The request is waiting on RDMA queue depth availability
591d0a8e1cSSeth Howell 	 * to transfer data from the host to the controller.
603c423f40SBen Walker 	 */
611d0a8e1cSSeth Howell 	RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,
623c423f40SBen Walker 
633c423f40SBen Walker 	/* The request is currently transferring data from the host to the controller. */
643c423f40SBen Walker 	RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
653c423f40SBen Walker 
663c423f40SBen Walker 	/* The request is ready to execute at the block device */
673c423f40SBen Walker 	RDMA_REQUEST_STATE_READY_TO_EXECUTE,
683c423f40SBen Walker 
693c423f40SBen Walker 	/* The request is currently executing at the block device */
703c423f40SBen Walker 	RDMA_REQUEST_STATE_EXECUTING,
713c423f40SBen Walker 
723c423f40SBen Walker 	/* The request finished executing at the block device */
733c423f40SBen Walker 	RDMA_REQUEST_STATE_EXECUTED,
743c423f40SBen Walker 
751d0a8e1cSSeth Howell 	/* The request is waiting on RDMA queue depth availability
761d0a8e1cSSeth Howell 	 * to transfer data from the controller to the host.
771d0a8e1cSSeth Howell 	 */
781d0a8e1cSSeth Howell 	RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,
791d0a8e1cSSeth Howell 
8004cd8e47SAlexey Marchuk 	/* The request is waiting on RDMA queue depth availability
8104cd8e47SAlexey Marchuk 	 * to send response to the host.
8204cd8e47SAlexey Marchuk 	 */
8304cd8e47SAlexey Marchuk 	RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING,
8404cd8e47SAlexey Marchuk 
853c423f40SBen Walker 	/* The request is ready to send a completion */
863c423f40SBen Walker 	RDMA_REQUEST_STATE_READY_TO_COMPLETE,
873c423f40SBen Walker 
88fdec444aSPhilipp Skadorov 	/* The request is currently transferring data from the controller to the host. */
89fdec444aSPhilipp Skadorov 	RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
90fdec444aSPhilipp Skadorov 
91fdec444aSPhilipp Skadorov 	/* The request currently has an outstanding completion without an
92fdec444aSPhilipp Skadorov 	 * associated data transfer.
93fdec444aSPhilipp Skadorov 	 */
943c423f40SBen Walker 	RDMA_REQUEST_STATE_COMPLETING,
953c423f40SBen Walker 
963c423f40SBen Walker 	/* The request completed and can be marked free. */
973c423f40SBen Walker 	RDMA_REQUEST_STATE_COMPLETED,
98fdec444aSPhilipp Skadorov 
99fdec444aSPhilipp Skadorov 	/* Terminator */
100fdec444aSPhilipp Skadorov 	RDMA_REQUEST_NUM_STATES,
1013c423f40SBen Walker };
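/* Rough sketch of the typical state flow, derived from the descriptions above
 * (a reading aid, not an upstream comment):
 *
 *   Host-to-controller data (e.g. a write command):
 *     NEW -> NEED_BUFFER -> DATA_TRANSFER_TO_CONTROLLER_PENDING ->
 *     TRANSFERRING_HOST_TO_CONTROLLER -> READY_TO_EXECUTE -> EXECUTING ->
 *     EXECUTED -> READY_TO_COMPLETE -> COMPLETING -> COMPLETED -> FREE
 *
 *   Controller-to-host data (e.g. a read command):
 *     NEW -> NEED_BUFFER -> READY_TO_EXECUTE -> EXECUTING -> EXECUTED ->
 *     DATA_TRANSFER_TO_HOST_PENDING -> READY_TO_COMPLETE ->
 *     TRANSFERRING_CONTROLLER_TO_HOST -> COMPLETED -> FREE
 *
 * A request may pass through the NEED_BUFFER and *_PENDING states without
 * actually waiting when buffers and RDMA queue depth are immediately available.
 */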
1023c423f40SBen Walker 
1030eae0106SJim Harris static void
1040eae0106SJim Harris nvmf_trace(void)
1056a5ae72bSBen Walker {
1066a5ae72bSBen Walker 	spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r');
10746d7b94fSAtul Malakar 
10846d7b94fSAtul Malakar 	struct spdk_trace_tpoint_opts opts[] = {
10946d7b94fSAtul Malakar 		{
11046d7b94fSAtul Malakar 			"RDMA_REQ_NEW", TRACE_RDMA_REQUEST_STATE_NEW,
11126d44a12SJim Harris 			OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 1,
11246d7b94fSAtul Malakar 			{
11346d7b94fSAtul Malakar 				{ "qpair", SPDK_TRACE_ARG_TYPE_PTR, 8 },
11446d7b94fSAtul Malakar 				{ "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
11546d7b94fSAtul Malakar 			}
11646d7b94fSAtul Malakar 		},
11746d7b94fSAtul Malakar 		{
11846d7b94fSAtul Malakar 			"RDMA_REQ_COMPLETED", TRACE_RDMA_REQUEST_STATE_COMPLETED,
11946d7b94fSAtul Malakar 			OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
12046d7b94fSAtul Malakar 			{
12146d7b94fSAtul Malakar 				{ "qpair", SPDK_TRACE_ARG_TYPE_PTR, 8 },
12246d7b94fSAtul Malakar 				{ "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
12346d7b94fSAtul Malakar 			}
12446d7b94fSAtul Malakar 		},
12546d7b94fSAtul Malakar 	};
12646d7b94fSAtul Malakar 
12746d7b94fSAtul Malakar 	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
128617184beSJim Harris 	spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", TRACE_RDMA_REQUEST_STATE_NEED_BUFFER,
12926d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
130441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
131b6206d65SJim Harris 	spdk_trace_register_description("RDMA_REQ_TX_PENDING_C2H",
1321d0a8e1cSSeth Howell 					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,
13326d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
134441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
135b6206d65SJim Harris 	spdk_trace_register_description("RDMA_REQ_TX_PENDING_H2C",
1361d0a8e1cSSeth Howell 					TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,
13726d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
138441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
139b6206d65SJim Harris 	spdk_trace_register_description("RDMA_REQ_TX_H2C",
1406a5ae72bSBen Walker 					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
14126d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
142441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
143617184beSJim Harris 	spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE",
1446a5ae72bSBen Walker 					TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE,
14526d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
146441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
147617184beSJim Harris 	spdk_trace_register_description("RDMA_REQ_EXECUTING",
1486a5ae72bSBen Walker 					TRACE_RDMA_REQUEST_STATE_EXECUTING,
14926d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
150441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
151617184beSJim Harris 	spdk_trace_register_description("RDMA_REQ_EXECUTED",
1526a5ae72bSBen Walker 					TRACE_RDMA_REQUEST_STATE_EXECUTED,
15326d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
154441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
155dd1939d8SAlexey Marchuk 	spdk_trace_register_description("RDMA_REQ_RDY2COMPL_PEND",
15604cd8e47SAlexey Marchuk 					TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING,
15726d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
15804cd8e47SAlexey Marchuk 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
159b6206d65SJim Harris 	spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPL",
1606a5ae72bSBen Walker 					TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE,
16126d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
162441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
163b6206d65SJim Harris 	spdk_trace_register_description("RDMA_REQ_COMPLETING_C2H",
164fdec444aSPhilipp Skadorov 					TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
16526d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
166441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
167b6206d65SJim Harris 	spdk_trace_register_description("RDMA_REQ_COMPLETING",
1686a5ae72bSBen Walker 					TRACE_RDMA_REQUEST_STATE_COMPLETING,
16926d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
170441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_PTR, "qpair");
171e8881867SJim Harris 
172617184beSJim Harris 	spdk_trace_register_description("RDMA_QP_CREATE", TRACE_RDMA_QP_CREATE,
17326d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NONE, 0,
17440cf86f2SJim Harris 					SPDK_TRACE_ARG_TYPE_INT, "");
175617184beSJim Harris 	spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", TRACE_RDMA_IBV_ASYNC_EVENT,
17626d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NONE, 0,
177441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_INT, "type");
178617184beSJim Harris 	spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", TRACE_RDMA_CM_ASYNC_EVENT,
17926d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NONE, 0,
180441431d2SKonrad Sztyber 					SPDK_TRACE_ARG_TYPE_INT, "type");
181617184beSJim Harris 	spdk_trace_register_description("RDMA_QP_DISCONNECT", TRACE_RDMA_QP_DISCONNECT,
18226d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NONE, 0,
18340cf86f2SJim Harris 					SPDK_TRACE_ARG_TYPE_INT, "");
184617184beSJim Harris 	spdk_trace_register_description("RDMA_QP_DESTROY", TRACE_RDMA_QP_DESTROY,
18526d44a12SJim Harris 					OWNER_TYPE_NONE, OBJECT_NONE, 0,
18640cf86f2SJim Harris 					SPDK_TRACE_ARG_TYPE_INT, "");
1879937c016Sxupeng-mingtu 
1889937c016Sxupeng-mingtu 	spdk_trace_tpoint_register_relation(TRACE_BDEV_IO_START, OBJECT_NVMF_RDMA_IO, 1);
1899937c016Sxupeng-mingtu 	spdk_trace_tpoint_register_relation(TRACE_BDEV_IO_DONE, OBJECT_NVMF_RDMA_IO, 0);
1906a5ae72bSBen Walker }
1910eae0106SJim Harris SPDK_TRACE_REGISTER_FN(nvmf_trace, "nvmf_rdma", TRACE_GROUP_NVMF_RDMA)
1926a5ae72bSBen Walker 
19350a438d3SBen Walker enum spdk_nvmf_rdma_wr_type {
19450a438d3SBen Walker 	RDMA_WR_TYPE_RECV,
19550a438d3SBen Walker 	RDMA_WR_TYPE_SEND,
19650a438d3SBen Walker 	RDMA_WR_TYPE_DATA,
19750a438d3SBen Walker };
19850a438d3SBen Walker 
19950a438d3SBen Walker struct spdk_nvmf_rdma_wr {
2008288fcf9SAlexey Marchuk 	/* Uses enum spdk_nvmf_rdma_wr_type */
2018288fcf9SAlexey Marchuk 	uint8_t type;
20250a438d3SBen Walker };
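/* A wr of this type is embedded in every recv and request object, and its address is stored in the
 * ibv work request's wr_id (see nvmf_rdma_resources_create()). When a completion is polled, the
 * wr_id is cast back to a spdk_nvmf_rdma_wr to tell whether it belongs to a RECV, a response SEND
 * or a data transfer, and the containing object can then be recovered (e.g. with SPDK_CONTAINEROF).
 */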
20350a438d3SBen Walker 
2041db3a037SBen Walker /* This structure holds commands as they are received off the wire.
2051db3a037SBen Walker  * It must be dynamically paired with a full request object
2061db3a037SBen Walker  * (spdk_nvmf_rdma_request) to service a request. It is separate
2071db3a037SBen Walker  * from the request because RDMA does not appear to order
2081db3a037SBen Walker  * completions, so occasionally we'll get a new incoming
2091db3a037SBen Walker  * command when there aren't any free request objects.
2101db3a037SBen Walker  */
2111db3a037SBen Walker struct spdk_nvmf_rdma_recv {
2121db3a037SBen Walker 	struct ibv_recv_wr			wr;
2131db3a037SBen Walker 	struct ibv_sge				sgl[NVMF_DEFAULT_RX_SGE];
2141db3a037SBen Walker 
2152a0772e3SBen Walker 	struct spdk_nvmf_rdma_qpair		*qpair;
2162a0772e3SBen Walker 
2171db3a037SBen Walker 	/* In-capsule data buffer */
2181db3a037SBen Walker 	uint8_t					*buf;
2191db3a037SBen Walker 
22050a438d3SBen Walker 	struct spdk_nvmf_rdma_wr		rdma_wr;
221fbe8f804SEvgeniy Kochetov 	uint64_t				receive_tsc;
22250a438d3SBen Walker 
22380eecdd8SSeth Howell 	STAILQ_ENTRY(spdk_nvmf_rdma_recv)	link;
2241db3a037SBen Walker };
2251db3a037SBen Walker 
226cf73fb2fSSeth Howell struct spdk_nvmf_rdma_request_data {
227cf73fb2fSSeth Howell 	struct ibv_send_wr		wr;
228cf73fb2fSSeth Howell 	struct ibv_sge			sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
229cf73fb2fSSeth Howell };
230cf73fb2fSSeth Howell 
2315e152960SBen Walker struct spdk_nvmf_rdma_request {
2325e152960SBen Walker 	struct spdk_nvmf_request		req;
2335e152960SBen Walker 
2348288fcf9SAlexey Marchuk 	bool					fused_failed;
2358288fcf9SAlexey Marchuk 
2368288fcf9SAlexey Marchuk 	struct spdk_nvmf_rdma_wr		data_wr;
2378288fcf9SAlexey Marchuk 	struct spdk_nvmf_rdma_wr		rsp_wr;
2388288fcf9SAlexey Marchuk 
2398288fcf9SAlexey Marchuk 	/* Uses enum spdk_nvmf_rdma_request_state */
2408288fcf9SAlexey Marchuk 	uint8_t					state;
2413c423f40SBen Walker 
242019a5361SAlexey Marchuk 	/* Data offset in req.iov */
243019a5361SAlexey Marchuk 	uint32_t				offset;
244019a5361SAlexey Marchuk 
2451db3a037SBen Walker 	struct spdk_nvmf_rdma_recv		*recv;
2460239003aSZiye Yang 
2470239003aSZiye Yang 	struct {
2484e742338SZiye Yang 		struct	ibv_send_wr		wr;
2491180bf83SSeth Howell 		struct	ibv_sge			sgl[NVMF_DEFAULT_RSP_SGE];
2504e742338SZiye Yang 	} rsp;
2514e742338SZiye Yang 
252ca59dd5dSAlexey Marchuk 	uint16_t				iovpos;
253ca59dd5dSAlexey Marchuk 	uint16_t				num_outstanding_data_wr;
254ca59dd5dSAlexey Marchuk 	/* Used to split Write IO with multi SGL payload */
255ca59dd5dSAlexey Marchuk 	uint16_t				num_remaining_data_wr;
256fbe8f804SEvgeniy Kochetov 	uint64_t				receive_tsc;
257183c3485SJim Harris 	struct spdk_nvmf_rdma_request		*fused_pair;
25804ebc6eaSSeth Howell 	STAILQ_ENTRY(spdk_nvmf_rdma_request)	state_link;
259ca59dd5dSAlexey Marchuk 	struct ibv_send_wr			*remaining_tranfer_in_wrs;
2605a6e7a41SAlexey Marchuk 	struct ibv_send_wr			*transfer_wr;
2618288fcf9SAlexey Marchuk 	struct spdk_nvmf_rdma_request_data	data;
2625e152960SBen Walker };
2635e152960SBen Walker 
2640d3fcd10SSeth Howell struct spdk_nvmf_rdma_resource_opts {
2650d3fcd10SSeth Howell 	struct spdk_nvmf_rdma_qpair	*qpair;
2660d3fcd10SSeth Howell 	/* qp points either to a spdk_rdma_provider_qp object or a spdk_rdma_provider_srq object, depending on the value of shared. */
2670d3fcd10SSeth Howell 	void				*qp;
2688a01b4d6SAlexey Marchuk 	struct spdk_rdma_utils_mem_map	*map;
2690d3fcd10SSeth Howell 	uint32_t			max_queue_depth;
2700d3fcd10SSeth Howell 	uint32_t			in_capsule_data_size;
2710d3fcd10SSeth Howell 	bool				shared;
2720d3fcd10SSeth Howell };
2730d3fcd10SSeth Howell 
274b25751d9SBen Walker struct spdk_nvmf_rdma_resources {
275b25751d9SBen Walker 	/* Array of size "max_queue_depth" containing RDMA requests. */
276b25751d9SBen Walker 	struct spdk_nvmf_rdma_request		*reqs;
277b25751d9SBen Walker 
278b25751d9SBen Walker 	/* Array of size "max_queue_depth" containing RDMA recvs. */
279b25751d9SBen Walker 	struct spdk_nvmf_rdma_recv		*recvs;
280b25751d9SBen Walker 
281b25751d9SBen Walker 	/* Array of size "max_queue_depth" containing 64 byte capsules
282b25751d9SBen Walker 	 * used for receive.
283b25751d9SBen Walker 	 */
284b25751d9SBen Walker 	union nvmf_h2c_msg			*cmds;
285b25751d9SBen Walker 
286b25751d9SBen Walker 	/* Array of size "max_queue_depth" containing 16 byte completions
287b25751d9SBen Walker 	 * to be sent back to the user.
288b25751d9SBen Walker 	 */
289b25751d9SBen Walker 	union nvmf_c2h_msg			*cpls;
290b25751d9SBen Walker 
291b25751d9SBen Walker 	/* Array of size "max_queue_depth * InCapsuleDataSize" containing
292b25751d9SBen Walker 	 * buffers to be used for in capsule data.
293b25751d9SBen Walker 	 */
294b25751d9SBen Walker 	void					*bufs;
295b25751d9SBen Walker 
296b25751d9SBen Walker 	/* Receives that are waiting for a request object */
297b25751d9SBen Walker 	STAILQ_HEAD(, spdk_nvmf_rdma_recv)	incoming_queue;
298b25751d9SBen Walker 
299b25751d9SBen Walker 	/* Queue to track free requests */
300b25751d9SBen Walker 	STAILQ_HEAD(, spdk_nvmf_rdma_request)	free_queue;
301b25751d9SBen Walker };
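/* Note: a resources object is allocated either per qpair (when no SRQ is used) or once per poller
 * and shared by all of its qpairs through the SRQ; see the opts.shared flag in
 * nvmf_rdma_resources_create() and the rqpair->resources assignment in nvmf_rdma_qpair_initialize().
 */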
302b25751d9SBen Walker 
303dc84fbaaSAlexey Marchuk typedef void (*spdk_nvmf_rdma_qpair_ibv_event)(struct spdk_nvmf_rdma_qpair *rqpair);
304dc84fbaaSAlexey Marchuk 
3058ddc5cd4Ssijie.sun typedef void (*spdk_poller_destroy_cb)(void *ctx);
3068ddc5cd4Ssijie.sun 
307dc84fbaaSAlexey Marchuk struct spdk_nvmf_rdma_ibv_event_ctx {
308dc84fbaaSAlexey Marchuk 	struct spdk_nvmf_rdma_qpair			*rqpair;
309dc84fbaaSAlexey Marchuk };
310dc84fbaaSAlexey Marchuk 
3111d304bc5SBen Walker struct spdk_nvmf_rdma_qpair {
3121d304bc5SBen Walker 	struct spdk_nvmf_qpair			qpair;
313dcc055e3SDaniel Verkamp 
3148209c8cfSSeth Howell 	struct spdk_nvmf_rdma_device		*device;
3158b79ef33SBen Walker 	struct spdk_nvmf_rdma_poller		*poller;
3162b7b41eeSBen Walker 
317cf151d60SAlexey Marchuk 	struct spdk_rdma_provider_qp		*rdma_qp;
318dcc055e3SDaniel Verkamp 	struct rdma_cm_id			*cm_id;
319cf151d60SAlexey Marchuk 	struct spdk_rdma_provider_srq		*srq;
320311ce0e2SBen Walker 	struct rdma_cm_id			*listen_id;
321dcc055e3SDaniel Verkamp 
32252f7aeb7SShuhei Matsumoto 	/* Cache the QP number to improve QP search by RB tree. */
32352f7aeb7SShuhei Matsumoto 	uint32_t				qp_num;
32452f7aeb7SShuhei Matsumoto 
325ca0c1338SBen Walker 	/* The maximum number of I/O outstanding on this connection at one time */
326caf88609SBen Walker 	uint16_t				max_queue_depth;
327ca0c1338SBen Walker 
3287289d370SSeth Howell 	/* The maximum number of active RDMA READ and ATOMIC operations at one time */
3297289d370SSeth Howell 	uint16_t				max_read_depth;
330ca0c1338SBen Walker 
3314e614b31SBen Walker 	/* The maximum number of RDMA SEND operations at one time */
332158dc947SSeth Howell 	uint32_t				max_send_depth;
333158dc947SSeth Howell 
334158dc947SSeth Howell 	/* The current number of outstanding WRs from this qpair's
335158dc947SSeth Howell 	 * recv queue. Should not exceed rqpair->max_queue_depth.
336158dc947SSeth Howell 	 */
337158dc947SSeth Howell 	uint16_t				current_recv_depth;
338158dc947SSeth Howell 
3399f7582c3SSeth Howell 	/* The current number of active RDMA READ operations */
3409f7582c3SSeth Howell 	uint16_t				current_read_depth;
3419f7582c3SSeth Howell 
342158dc947SSeth Howell 	/* The current number of posted WRs from this qpair's
343158dc947SSeth Howell 	 * send queue. Should not exceed max_send_depth.
344158dc947SSeth Howell 	 */
345158dc947SSeth Howell 	uint32_t				current_send_depth;
346158dc947SSeth Howell 
3471180bf83SSeth Howell 	/* The maximum number of SGEs per WR on the send queue */
3481180bf83SSeth Howell 	uint32_t				max_send_sge;
3491180bf83SSeth Howell 
3501180bf83SSeth Howell 	/* The maximum number of SGEs per WR on the recv queue */
3511180bf83SSeth Howell 	uint32_t				max_recv_sge;
3521180bf83SSeth Howell 
353b25751d9SBen Walker 	struct spdk_nvmf_rdma_resources		*resources;
35404ebc6eaSSeth Howell 
35504ebc6eaSSeth Howell 	STAILQ_HEAD(, spdk_nvmf_rdma_request)	pending_rdma_read_queue;
35604ebc6eaSSeth Howell 
35704ebc6eaSSeth Howell 	STAILQ_HEAD(, spdk_nvmf_rdma_request)	pending_rdma_write_queue;
3581db3a037SBen Walker 
35904cd8e47SAlexey Marchuk 	STAILQ_HEAD(, spdk_nvmf_rdma_request)	pending_rdma_send_queue;
36004cd8e47SAlexey Marchuk 
361bfdc957cSSeth Howell 	/* Number of requests not in the free state */
362bfdc957cSSeth Howell 	uint32_t				qd;
363caf88609SBen Walker 
364feeaa282SAlexey Marchuk 	bool					ibv_in_error_state;
365feeaa282SAlexey Marchuk 
36652f7aeb7SShuhei Matsumoto 	RB_ENTRY(spdk_nvmf_rdma_qpair)		node;
3676f95c325SZiye Yang 
36814777890SSeth Howell 	STAILQ_ENTRY(spdk_nvmf_rdma_qpair)	recv_link;
36914777890SSeth Howell 
370b4dc10fbSSeth Howell 	STAILQ_ENTRY(spdk_nvmf_rdma_qpair)	send_link;
371b4dc10fbSSeth Howell 
372183c3485SJim Harris 	/* Points to a request that has fuse bits set to
373183c3485SJim Harris 	 * SPDK_NVME_CMD_FUSE_FIRST, when the qpair is waiting
374183c3485SJim Harris 	 * for the request that has SPDK_NVME_CMD_FUSE_SECOND.
375183c3485SJim Harris 	 */
376183c3485SJim Harris 	struct spdk_nvmf_rdma_request		*fused_first;
377183c3485SJim Harris 
3783d1d4fcfSAlexey Marchuk 	/*
3793d1d4fcfSAlexey Marchuk 	 * io_channel which is used to destroy qpair when it is removed from poll group
3803d1d4fcfSAlexey Marchuk 	 */
3813d1d4fcfSAlexey Marchuk 	struct spdk_io_channel		*destruct_channel;
3823d1d4fcfSAlexey Marchuk 
38343f6d338SJim Harris 	/* ctx for async processing of last_wqe_reached event */
38443f6d338SJim Harris 	struct spdk_nvmf_rdma_ibv_event_ctx	*last_wqe_reached_ctx;
385dc84fbaaSAlexey Marchuk 
386a9fc7e1dSSeth Howell 	/* Lets us know that we have received the last_wqe event. */
387a9fc7e1dSSeth Howell 	bool					last_wqe_reached;
38885ff3fceSZiye Yang 
38985ff3fceSZiye Yang 	/* Indicate that nvmf_rdma_close_qpair is called */
39085ff3fceSZiye Yang 	bool					to_close;
391dcc055e3SDaniel Verkamp };
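/* The depth counters above implement the transport's flow control: current_send_depth is checked
 * against max_send_depth before posting SEND/READ/WRITE WRs, current_read_depth against
 * max_read_depth for outstanding READ/ATOMIC operations, and current_recv_depth against the queue
 * depth advertised to the host. Requests that cannot proceed are parked on the pending_rdma_*
 * queues until capacity frees up (a summary of the fields above, not an upstream comment).
 */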
392dcc055e3SDaniel Verkamp 
39338ab383aSEvgeniy Kochetov struct spdk_nvmf_rdma_poller_stat {
39438ab383aSEvgeniy Kochetov 	uint64_t				completions;
39538ab383aSEvgeniy Kochetov 	uint64_t				polls;
3963caf2e71SAlexey Marchuk 	uint64_t				idle_polls;
397fbe8f804SEvgeniy Kochetov 	uint64_t				requests;
398fbe8f804SEvgeniy Kochetov 	uint64_t				request_latency;
399251db814SEvgeniy Kochetov 	uint64_t				pending_free_request;
400251db814SEvgeniy Kochetov 	uint64_t				pending_rdma_read;
401251db814SEvgeniy Kochetov 	uint64_t				pending_rdma_write;
40204cd8e47SAlexey Marchuk 	uint64_t				pending_rdma_send;
403cf151d60SAlexey Marchuk 	struct spdk_rdma_provider_qp_stats	qp_stats;
40438ab383aSEvgeniy Kochetov };
40538ab383aSEvgeniy Kochetov 
4063ee93c32SBen Walker struct spdk_nvmf_rdma_poller {
4073ee93c32SBen Walker 	struct spdk_nvmf_rdma_device		*device;
4083ee93c32SBen Walker 	struct spdk_nvmf_rdma_poll_group	*group;
4093ee93c32SBen Walker 
410db5c3ce3SXiaodong Liu 	int					num_cqe;
411db5c3ce3SXiaodong Liu 	int					required_num_wr;
4122a0772e3SBen Walker 	struct ibv_cq				*cq;
4132a0772e3SBen Walker 
414ed0b611fSEvgeniy Kochetov 	/* The maximum number of I/O outstanding on the shared receive queue at one time */
415ed0b611fSEvgeniy Kochetov 	uint16_t				max_srq_depth;
4168ddc5cd4Ssijie.sun 	bool					need_destroy;
417ed0b611fSEvgeniy Kochetov 
418ed0b611fSEvgeniy Kochetov 	/* Shared receive queue */
419cf151d60SAlexey Marchuk 	struct spdk_rdma_provider_srq		*srq;
420ed0b611fSEvgeniy Kochetov 
421b25751d9SBen Walker 	struct spdk_nvmf_rdma_resources		*resources;
42238ab383aSEvgeniy Kochetov 	struct spdk_nvmf_rdma_poller_stat	stat;
423ed0b611fSEvgeniy Kochetov 
4248ddc5cd4Ssijie.sun 	spdk_poller_destroy_cb			destroy_cb;
4258ddc5cd4Ssijie.sun 	void					*destroy_cb_ctx;
4268ddc5cd4Ssijie.sun 
42752f7aeb7SShuhei Matsumoto 	RB_HEAD(qpairs_tree, spdk_nvmf_rdma_qpair) qpairs;
4283ee93c32SBen Walker 
42914777890SSeth Howell 	STAILQ_HEAD(, spdk_nvmf_rdma_qpair)	qpairs_pending_recv;
43014777890SSeth Howell 
431b4dc10fbSSeth Howell 	STAILQ_HEAD(, spdk_nvmf_rdma_qpair)	qpairs_pending_send;
432b4dc10fbSSeth Howell 
4333ee93c32SBen Walker 	TAILQ_ENTRY(spdk_nvmf_rdma_poller)	link;
4343ee93c32SBen Walker };
4353ee93c32SBen Walker 
436251db814SEvgeniy Kochetov struct spdk_nvmf_rdma_poll_group_stat {
437251db814SEvgeniy Kochetov 	uint64_t				pending_data_buffer;
438251db814SEvgeniy Kochetov };
439251db814SEvgeniy Kochetov 
440baa936a1SBen Walker struct spdk_nvmf_rdma_poll_group {
441c1535ca0SBen Walker 	struct spdk_nvmf_transport_poll_group		group;
442251db814SEvgeniy Kochetov 	struct spdk_nvmf_rdma_poll_group_stat		stat;
443645d5944SAlexey Marchuk 	TAILQ_HEAD(, spdk_nvmf_rdma_poller)		pollers;
444645d5944SAlexey Marchuk 	TAILQ_ENTRY(spdk_nvmf_rdma_poll_group)		link;
445d7b8da3bSBen Walker };
446d7b8da3bSBen Walker 
447645d5944SAlexey Marchuk struct spdk_nvmf_rdma_conn_sched {
448645d5944SAlexey Marchuk 	struct spdk_nvmf_rdma_poll_group *next_admin_pg;
449645d5944SAlexey Marchuk 	struct spdk_nvmf_rdma_poll_group *next_io_pg;
450645d5944SAlexey Marchuk };
451645d5944SAlexey Marchuk 
452958c68f1SBen Walker /* Assuming rdma_cm uses just one protection domain per ibv_context. */
453958c68f1SBen Walker struct spdk_nvmf_rdma_device {
454958c68f1SBen Walker 	struct ibv_device_attr			attr;
455958c68f1SBen Walker 	struct ibv_context			*context;
456958c68f1SBen Walker 
4578a01b4d6SAlexey Marchuk 	struct spdk_rdma_utils_mem_map		*map;
458916d1f4fSBen Walker 	struct ibv_pd				*pd;
459916d1f4fSBen Walker 
46061948a1cSSeth Howell 	int					num_srq;
4618ddc5cd4Ssijie.sun 	bool					need_destroy;
4628ddc5cd4Ssijie.sun 	bool					ready_to_destroy;
463549be9adSsijie.sun 	bool					is_ready;
46461948a1cSSeth Howell 
465958c68f1SBen Walker 	TAILQ_ENTRY(spdk_nvmf_rdma_device)	link;
466958c68f1SBen Walker };
467958c68f1SBen Walker 
4681cbc2b16SBen Walker struct spdk_nvmf_rdma_port {
4696d8f1fc6SJacek Kalwas 	const struct spdk_nvme_transport_id	*trid;
4702641c31aSChangpeng Liu 	struct rdma_cm_id			*id;
471958c68f1SBen Walker 	struct spdk_nvmf_rdma_device		*device;
4721cbc2b16SBen Walker 	TAILQ_ENTRY(spdk_nvmf_rdma_port)	link;
4732641c31aSChangpeng Liu };
4742641c31aSChangpeng Liu 
475f766d1e4SDarek Stojaczyk struct rdma_transport_opts {
47697ef8701SMonica Kenguva 	int		num_cqe;
477f766d1e4SDarek Stojaczyk 	uint32_t	max_srq_depth;
478f766d1e4SDarek Stojaczyk 	bool		no_srq;
479c818233bSIvan Betsis 	bool		no_wr_batching;
480f766d1e4SDarek Stojaczyk 	int		acceptor_backlog;
481f766d1e4SDarek Stojaczyk };
482f766d1e4SDarek Stojaczyk 
483ecc436fcSBen Walker struct spdk_nvmf_rdma_transport {
484ecc436fcSBen Walker 	struct spdk_nvmf_transport	transport;
485f766d1e4SDarek Stojaczyk 	struct rdma_transport_opts	rdma_opts;
486ecc436fcSBen Walker 
487645d5944SAlexey Marchuk 	struct spdk_nvmf_rdma_conn_sched conn_sched;
488645d5944SAlexey Marchuk 
489a0a92ff4SBen Walker 	struct rdma_event_channel	*event_channel;
490a0a92ff4SBen Walker 
491cf73fb2fSSeth Howell 	struct spdk_mempool		*data_wr_pool;
492cf73fb2fSSeth Howell 
49343022da3SJacek Kalwas 	struct spdk_poller		*accept_poller;
494756df044SBen Walker 
495b6f90c52SPhilipp Skadorov 	/* fields used to poll RDMA/IB events */
496b6f90c52SPhilipp Skadorov 	nfds_t			npoll_fds;
497b6f90c52SPhilipp Skadorov 	struct pollfd		*poll_fds;
498b6f90c52SPhilipp Skadorov 
499958c68f1SBen Walker 	TAILQ_HEAD(, spdk_nvmf_rdma_device)	devices;
5001cbc2b16SBen Walker 	TAILQ_HEAD(, spdk_nvmf_rdma_port)	ports;
501645d5944SAlexey Marchuk 	TAILQ_HEAD(, spdk_nvmf_rdma_poll_group)	poll_groups;
502549be9adSsijie.sun 
503549be9adSsijie.sun 	/* ports that were removed unexpectedly and need to retry listening */
504549be9adSsijie.sun 	TAILQ_HEAD(, spdk_nvmf_rdma_port)		retry_ports;
5051b9cc2a9SBen Walker };
5060f912a0eSDaniel Verkamp 
5078ddc5cd4Ssijie.sun struct poller_manage_ctx {
5088ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_transport		*rtransport;
5098ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_poll_group	*rgroup;
5108ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_poller		*rpoller;
5118ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_device		*device;
5128ddc5cd4Ssijie.sun 
5138ddc5cd4Ssijie.sun 	struct spdk_thread			*thread;
5148ddc5cd4Ssijie.sun 	volatile int				*inflight_op_counter;
5158ddc5cd4Ssijie.sun };
5168ddc5cd4Ssijie.sun 
517f766d1e4SDarek Stojaczyk static const struct spdk_json_object_decoder rdma_transport_opts_decoder[] = {
518f766d1e4SDarek Stojaczyk 	{
51997ef8701SMonica Kenguva 		"num_cqe", offsetof(struct rdma_transport_opts, num_cqe),
52097ef8701SMonica Kenguva 		spdk_json_decode_int32, true
52197ef8701SMonica Kenguva 	},
52297ef8701SMonica Kenguva 	{
523f766d1e4SDarek Stojaczyk 		"max_srq_depth", offsetof(struct rdma_transport_opts, max_srq_depth),
524f766d1e4SDarek Stojaczyk 		spdk_json_decode_uint32, true
525f766d1e4SDarek Stojaczyk 	},
526f766d1e4SDarek Stojaczyk 	{
527f766d1e4SDarek Stojaczyk 		"no_srq", offsetof(struct rdma_transport_opts, no_srq),
528f766d1e4SDarek Stojaczyk 		spdk_json_decode_bool, true
529f766d1e4SDarek Stojaczyk 	},
530f766d1e4SDarek Stojaczyk 	{
531bd3840a7SIvan Betsis 		"no_wr_batching", offsetof(struct rdma_transport_opts, no_wr_batching),
532bd3840a7SIvan Betsis 		spdk_json_decode_bool, true
533bd3840a7SIvan Betsis 	},
534bd3840a7SIvan Betsis 	{
535f766d1e4SDarek Stojaczyk 		"acceptor_backlog", offsetof(struct rdma_transport_opts, acceptor_backlog),
536f766d1e4SDarek Stojaczyk 		spdk_json_decode_int32, true
537f766d1e4SDarek Stojaczyk 	},
538f766d1e4SDarek Stojaczyk };
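/* Illustrative only: these decoders consume the transport-specific part of the
 * nvmf_create_transport RPC parameters, e.g. a JSON object of the form
 *   { "num_cqe": 4096, "max_srq_depth": 4096, "no_srq": false,
 *     "no_wr_batching": false, "acceptor_backlog": 100 }
 * Every field is optional, as indicated by the trailing "true" in each entry.
 */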
539f766d1e4SDarek Stojaczyk 
54052f7aeb7SShuhei Matsumoto static int
54152f7aeb7SShuhei Matsumoto nvmf_rdma_qpair_compare(struct spdk_nvmf_rdma_qpair *rqpair1, struct spdk_nvmf_rdma_qpair *rqpair2)
54252f7aeb7SShuhei Matsumoto {
543d8a10574SShuhei Matsumoto 	return rqpair1->qp_num < rqpair2->qp_num ? -1 : rqpair1->qp_num > rqpair2->qp_num;
54452f7aeb7SShuhei Matsumoto }
54552f7aeb7SShuhei Matsumoto 
54652f7aeb7SShuhei Matsumoto RB_GENERATE_STATIC(qpairs_tree, spdk_nvmf_rdma_qpair, node, nvmf_rdma_qpair_compare);
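/* Each poller keeps its qpairs in this RB tree keyed by qp_num (cached in the qpair), so the qpair
 * owning a completion can be found in O(log n). A lookup is typically done by filling a stub
 * spdk_nvmf_rdma_qpair with the qp_num of interest and calling RB_FIND(qpairs_tree, ...) - a usage
 * sketch, not a quote from this file.
 */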
54752f7aeb7SShuhei Matsumoto 
5488dd1cd21SBen Walker static bool nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
549f8cbdf2cSAlexey Marchuk 				      struct spdk_nvmf_rdma_request *rdma_req);
550f8cbdf2cSAlexey Marchuk 
5518dd1cd21SBen Walker static void _poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
552c818233bSIvan Betsis 				 struct spdk_nvmf_rdma_poller *rpoller);
553c818233bSIvan Betsis 
5548dd1cd21SBen Walker static void _poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport,
555c818233bSIvan Betsis 				 struct spdk_nvmf_rdma_poller *rpoller);
556c818233bSIvan Betsis 
5578ddc5cd4Ssijie.sun static void _nvmf_rdma_remove_destroyed_device(void *c);
5588ddc5cd4Ssijie.sun 
5597545e8c8SAlexey Marchuk static inline enum spdk_nvme_media_error_status_code
56055d8d943SSeth Howell nvmf_rdma_dif_error_to_compl_status(uint8_t err_type) {
5617545e8c8SAlexey Marchuk 	enum spdk_nvme_media_error_status_code result;
5627545e8c8SAlexey Marchuk 	switch (err_type)
5637545e8c8SAlexey Marchuk 	{
5647545e8c8SAlexey Marchuk 	case SPDK_DIF_REFTAG_ERROR:
5657545e8c8SAlexey Marchuk 		result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR;
5667545e8c8SAlexey Marchuk 		break;
5677545e8c8SAlexey Marchuk 	case SPDK_DIF_APPTAG_ERROR:
5687545e8c8SAlexey Marchuk 		result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR;
5697545e8c8SAlexey Marchuk 		break;
5707545e8c8SAlexey Marchuk 	case SPDK_DIF_GUARD_ERROR:
5717545e8c8SAlexey Marchuk 		result = SPDK_NVME_SC_GUARD_CHECK_ERROR;
5727545e8c8SAlexey Marchuk 		break;
5737545e8c8SAlexey Marchuk 	default:
5747545e8c8SAlexey Marchuk 		SPDK_UNREACHABLE();
5757545e8c8SAlexey Marchuk 	}
5767545e8c8SAlexey Marchuk 
5777545e8c8SAlexey Marchuk 	return result;
5787545e8c8SAlexey Marchuk }
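/* Used when end-to-end data protection (DIF) verification fails in the transport: the spdk_dif
 * error type is translated into the NVMe media error status code reported back to the host.
 */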
5797545e8c8SAlexey Marchuk 
58090d91cd3SAlexey Marchuk /*
58190d91cd3SAlexey Marchuk  * Return data_wrs to pool starting from \b data_wr
58290d91cd3SAlexey Marchuk  * Request's own response and data WR are excluded
58390d91cd3SAlexey Marchuk  */
5841cfff49fSBen Walker static void
58590d91cd3SAlexey Marchuk _nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
5865a6e7a41SAlexey Marchuk 			     struct ibv_send_wr *data_wr,
58790d91cd3SAlexey Marchuk 			     struct spdk_mempool *pool)
58862700dacSSeth Howell {
5896375b60cSAlexey Marchuk 	struct spdk_nvmf_rdma_request_data	*work_requests[SPDK_NVMF_MAX_SGL_ENTRIES];
5905a6e7a41SAlexey Marchuk 	struct spdk_nvmf_rdma_request_data	*nvmf_data;
59101887d3cSEvgeniy Kochetov 	struct ibv_send_wr			*next_send_wr;
592ca59dd5dSAlexey Marchuk 	uint64_t				req_wrid = (uint64_t)&rdma_req->data_wr;
5936375b60cSAlexey Marchuk 	uint32_t				num_wrs = 0;
59462700dacSSeth Howell 
5955a6e7a41SAlexey Marchuk 	while (data_wr && data_wr->wr_id == req_wrid) {
5965a6e7a41SAlexey Marchuk 		nvmf_data = SPDK_CONTAINEROF(data_wr, struct spdk_nvmf_rdma_request_data, wr);
5975a6e7a41SAlexey Marchuk 		memset(nvmf_data->sgl, 0, sizeof(data_wr->sg_list[0]) * data_wr->num_sge);
5985a6e7a41SAlexey Marchuk 		data_wr->num_sge = 0;
5995a6e7a41SAlexey Marchuk 		next_send_wr = data_wr->next;
6005a6e7a41SAlexey Marchuk 		if (data_wr != &rdma_req->data.wr) {
6015a6e7a41SAlexey Marchuk 			data_wr->next = NULL;
6026375b60cSAlexey Marchuk 			assert(num_wrs < SPDK_NVMF_MAX_SGL_ENTRIES);
6035a6e7a41SAlexey Marchuk 			work_requests[num_wrs] = nvmf_data;
6046375b60cSAlexey Marchuk 			num_wrs++;
60562700dacSSeth Howell 		}
6065a6e7a41SAlexey Marchuk 		data_wr = (!next_send_wr || next_send_wr == &rdma_req->rsp.wr) ? NULL : next_send_wr;
60762700dacSSeth Howell 	}
6086375b60cSAlexey Marchuk 
6096375b60cSAlexey Marchuk 	if (num_wrs) {
6106375b60cSAlexey Marchuk 		spdk_mempool_put_bulk(pool, (void **) work_requests, num_wrs);
6116375b60cSAlexey Marchuk 	}
61290d91cd3SAlexey Marchuk }
61390d91cd3SAlexey Marchuk 
61490d91cd3SAlexey Marchuk static void
61590d91cd3SAlexey Marchuk nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
61690d91cd3SAlexey Marchuk 			    struct spdk_nvmf_rdma_transport *rtransport)
61790d91cd3SAlexey Marchuk {
61890d91cd3SAlexey Marchuk 	rdma_req->num_outstanding_data_wr = 0;
61990d91cd3SAlexey Marchuk 
6205a6e7a41SAlexey Marchuk 	_nvmf_rdma_request_free_data(rdma_req, rdma_req->transfer_wr, rtransport->data_wr_pool);
62190d91cd3SAlexey Marchuk 
62270683284SAlexey Marchuk 	if (rdma_req->remaining_tranfer_in_wrs) {
62370683284SAlexey Marchuk 		_nvmf_rdma_request_free_data(rdma_req, rdma_req->remaining_tranfer_in_wrs,
62470683284SAlexey Marchuk 					     rtransport->data_wr_pool);
62570683284SAlexey Marchuk 		rdma_req->remaining_tranfer_in_wrs = NULL;
62670683284SAlexey Marchuk 	}
62770683284SAlexey Marchuk 
6287fbda6d9SAlexey Marchuk 	rdma_req->data.wr.next = NULL;
6297fbda6d9SAlexey Marchuk 	rdma_req->rsp.wr.next = NULL;
63062700dacSSeth Howell }
63162700dacSSeth Howell 
63262700dacSSeth Howell static void
633fa757dc9SSeth Howell nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req)
634fa757dc9SSeth Howell {
635005b053aSShuhei Matsumoto 	SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", req->req.data_from_pool);
6369d838d24Syidong0635 	if (req->req.cmd) {
637fa757dc9SSeth Howell 		SPDK_ERRLOG("\t\tRequest opcode: %d\n", req->req.cmd->nvmf_cmd.opcode);
6389d838d24Syidong0635 	}
6399d838d24Syidong0635 	if (req->recv) {
640fa757dc9SSeth Howell 		SPDK_ERRLOG("\t\tRequest recv wr_id%lu\n", req->recv->wr.wr_id);
641fa757dc9SSeth Howell 	}
6429d838d24Syidong0635 }
643fa757dc9SSeth Howell 
644fa757dc9SSeth Howell static void
645fa757dc9SSeth Howell nvmf_rdma_dump_qpair_contents(struct spdk_nvmf_rdma_qpair *rqpair)
646fa757dc9SSeth Howell {
647fa757dc9SSeth Howell 	int i;
64804ebc6eaSSeth Howell 
649fa757dc9SSeth Howell 	SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", rqpair->qpair.qid);
65004ebc6eaSSeth Howell 	for (i = 0; i < rqpair->max_queue_depth; i++) {
651b25751d9SBen Walker 		if (rqpair->resources->reqs[i].state != RDMA_REQUEST_STATE_FREE) {
652b25751d9SBen Walker 			nvmf_rdma_dump_request(&rqpair->resources->reqs[i]);
653fa757dc9SSeth Howell 		}
654fa757dc9SSeth Howell 	}
655fa757dc9SSeth Howell }
656fa757dc9SSeth Howell 
657fa757dc9SSeth Howell static void
658353fbcdaSBen Walker nvmf_rdma_resources_destroy(struct spdk_nvmf_rdma_resources *resources)
659353fbcdaSBen Walker {
66096ec8bffSDarek Stojaczyk 	spdk_free(resources->cmds);
66196ec8bffSDarek Stojaczyk 	spdk_free(resources->cpls);
66296ec8bffSDarek Stojaczyk 	spdk_free(resources->bufs);
663bfcfdb79SOr Gerlitz 	spdk_free(resources->reqs);
664bfcfdb79SOr Gerlitz 	spdk_free(resources->recvs);
665353fbcdaSBen Walker 	free(resources);
666353fbcdaSBen Walker }
667353fbcdaSBen Walker 
6680d3fcd10SSeth Howell 
6690d3fcd10SSeth Howell static struct spdk_nvmf_rdma_resources *
6700d3fcd10SSeth Howell nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts)
6710d3fcd10SSeth Howell {
6720d3fcd10SSeth Howell 	struct spdk_nvmf_rdma_resources		*resources;
6730d3fcd10SSeth Howell 	struct spdk_nvmf_rdma_request		*rdma_req;
6740d3fcd10SSeth Howell 	struct spdk_nvmf_rdma_recv		*rdma_recv;
675cf151d60SAlexey Marchuk 	struct spdk_rdma_provider_qp		*qp = NULL;
676cf151d60SAlexey Marchuk 	struct spdk_rdma_provider_srq		*srq = NULL;
677696e8580SAlexey Marchuk 	struct ibv_recv_wr			*bad_wr = NULL;
6788a01b4d6SAlexey Marchuk 	struct spdk_rdma_utils_memory_translation translation;
6790d3fcd10SSeth Howell 	uint32_t				i;
680696e8580SAlexey Marchuk 	int					rc = 0;
6810d3fcd10SSeth Howell 
6820d3fcd10SSeth Howell 	resources = calloc(1, sizeof(struct spdk_nvmf_rdma_resources));
6830d3fcd10SSeth Howell 	if (!resources) {
6840d3fcd10SSeth Howell 		SPDK_ERRLOG("Unable to allocate resources for receive queue.\n");
6850d3fcd10SSeth Howell 		return NULL;
6860d3fcd10SSeth Howell 	}
6870d3fcd10SSeth Howell 
688bfcfdb79SOr Gerlitz 	resources->reqs = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->reqs),
689bfcfdb79SOr Gerlitz 				       0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
690bfcfdb79SOr Gerlitz 	resources->recvs = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->recvs),
691bfcfdb79SOr Gerlitz 					0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
69296ec8bffSDarek Stojaczyk 	resources->cmds = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cmds),
69396ec8bffSDarek Stojaczyk 				       0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
69496ec8bffSDarek Stojaczyk 	resources->cpls = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cpls),
69596ec8bffSDarek Stojaczyk 				       0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
6960d3fcd10SSeth Howell 
6970d3fcd10SSeth Howell 	if (opts->in_capsule_data_size > 0) {
69896ec8bffSDarek Stojaczyk 		resources->bufs = spdk_zmalloc(opts->max_queue_depth * opts->in_capsule_data_size,
69996ec8bffSDarek Stojaczyk 					       0x1000, NULL, SPDK_ENV_LCORE_ID_ANY,
70096ec8bffSDarek Stojaczyk 					       SPDK_MALLOC_DMA);
7010d3fcd10SSeth Howell 	}
7020d3fcd10SSeth Howell 
7030d3fcd10SSeth Howell 	if (!resources->reqs || !resources->recvs || !resources->cmds ||
7040d3fcd10SSeth Howell 	    !resources->cpls || (opts->in_capsule_data_size && !resources->bufs)) {
7050d3fcd10SSeth Howell 		SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n");
7060d3fcd10SSeth Howell 		goto cleanup;
7070d3fcd10SSeth Howell 	}
7080d3fcd10SSeth Howell 
709bf41b46cSAleksey Marchuk 	SPDK_DEBUGLOG(rdma, "Command Array: %p Length: %lx\n",
710bf41b46cSAleksey Marchuk 		      resources->cmds, opts->max_queue_depth * sizeof(*resources->cmds));
711bf41b46cSAleksey Marchuk 	SPDK_DEBUGLOG(rdma, "Completion Array: %p Length: %lx\n",
712bf41b46cSAleksey Marchuk 		      resources->cpls, opts->max_queue_depth * sizeof(*resources->cpls));
713bf41b46cSAleksey Marchuk 	if (resources->bufs) {
714bf41b46cSAleksey Marchuk 		SPDK_DEBUGLOG(rdma, "In Capsule Data Array: %p Length: %x\n",
7150d3fcd10SSeth Howell 			      resources->bufs, opts->max_queue_depth *
716bf41b46cSAleksey Marchuk 			      opts->in_capsule_data_size);
7170d3fcd10SSeth Howell 	}
7180d3fcd10SSeth Howell 
7190d3fcd10SSeth Howell 	/* Initialize queues */
7200d3fcd10SSeth Howell 	STAILQ_INIT(&resources->incoming_queue);
7210d3fcd10SSeth Howell 	STAILQ_INIT(&resources->free_queue);
7220d3fcd10SSeth Howell 
723696e8580SAlexey Marchuk 	if (opts->shared) {
724cf151d60SAlexey Marchuk 		srq = (struct spdk_rdma_provider_srq *)opts->qp;
725696e8580SAlexey Marchuk 	} else {
726cf151d60SAlexey Marchuk 		qp = (struct spdk_rdma_provider_qp *)opts->qp;
727696e8580SAlexey Marchuk 	}
7280d3fcd10SSeth Howell 
729696e8580SAlexey Marchuk 	for (i = 0; i < opts->max_queue_depth; i++) {
7300d3fcd10SSeth Howell 		rdma_recv = &resources->recvs[i];
7310d3fcd10SSeth Howell 		rdma_recv->qpair = opts->qpair;
7320d3fcd10SSeth Howell 
7330d3fcd10SSeth Howell 		/* Set up memory to receive commands */
7340d3fcd10SSeth Howell 		if (resources->bufs) {
7350d3fcd10SSeth Howell 			rdma_recv->buf = (void *)((uintptr_t)resources->bufs + (i *
7360d3fcd10SSeth Howell 						  opts->in_capsule_data_size));
7370d3fcd10SSeth Howell 		}
7380d3fcd10SSeth Howell 
7390d3fcd10SSeth Howell 		rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV;
7400d3fcd10SSeth Howell 
7410d3fcd10SSeth Howell 		rdma_recv->sgl[0].addr = (uintptr_t)&resources->cmds[i];
7420d3fcd10SSeth Howell 		rdma_recv->sgl[0].length = sizeof(resources->cmds[i]);
7438a01b4d6SAlexey Marchuk 		rc = spdk_rdma_utils_get_translation(opts->map, &resources->cmds[i], sizeof(resources->cmds[i]),
744bf41b46cSAleksey Marchuk 						     &translation);
745bf41b46cSAleksey Marchuk 		if (rc) {
746bf41b46cSAleksey Marchuk 			goto cleanup;
747bf41b46cSAleksey Marchuk 		}
7488a01b4d6SAlexey Marchuk 		rdma_recv->sgl[0].lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation);
7490d3fcd10SSeth Howell 		rdma_recv->wr.num_sge = 1;
7500d3fcd10SSeth Howell 
751bf41b46cSAleksey Marchuk 		if (rdma_recv->buf) {
7520d3fcd10SSeth Howell 			rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf;
7530d3fcd10SSeth Howell 			rdma_recv->sgl[1].length = opts->in_capsule_data_size;
7548a01b4d6SAlexey Marchuk 			rc = spdk_rdma_utils_get_translation(opts->map, rdma_recv->buf, opts->in_capsule_data_size,
7558a01b4d6SAlexey Marchuk 							     &translation);
756bf41b46cSAleksey Marchuk 			if (rc) {
757bf41b46cSAleksey Marchuk 				goto cleanup;
758bf41b46cSAleksey Marchuk 			}
7598a01b4d6SAlexey Marchuk 			rdma_recv->sgl[1].lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation);
7600d3fcd10SSeth Howell 			rdma_recv->wr.num_sge++;
7610d3fcd10SSeth Howell 		}
7620d3fcd10SSeth Howell 
7630d3fcd10SSeth Howell 		rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr;
7640d3fcd10SSeth Howell 		rdma_recv->wr.sg_list = rdma_recv->sgl;
765d9ff7d09SAlexey Marchuk 		if (srq) {
766cf151d60SAlexey Marchuk 			spdk_rdma_provider_srq_queue_recv_wrs(srq, &rdma_recv->wr);
7670d3fcd10SSeth Howell 		} else {
768cf151d60SAlexey Marchuk 			spdk_rdma_provider_qp_queue_recv_wrs(qp, &rdma_recv->wr);
7690d3fcd10SSeth Howell 		}
7700d3fcd10SSeth Howell 	}
7710d3fcd10SSeth Howell 
7720d3fcd10SSeth Howell 	for (i = 0; i < opts->max_queue_depth; i++) {
7730d3fcd10SSeth Howell 		rdma_req = &resources->reqs[i];
7740d3fcd10SSeth Howell 
7750d3fcd10SSeth Howell 		if (opts->qpair != NULL) {
7760d3fcd10SSeth Howell 			rdma_req->req.qpair = &opts->qpair->qpair;
7770d3fcd10SSeth Howell 		} else {
7780d3fcd10SSeth Howell 			rdma_req->req.qpair = NULL;
7790d3fcd10SSeth Howell 		}
7800d3fcd10SSeth Howell 		rdma_req->req.cmd = NULL;
7810db0c443SChunsong Feng 		rdma_req->req.iovcnt = 0;
7820db0c443SChunsong Feng 		rdma_req->req.stripped_data = NULL;
7830d3fcd10SSeth Howell 
7840d3fcd10SSeth Howell 		/* Set up memory to send responses */
7850d3fcd10SSeth Howell 		rdma_req->req.rsp = &resources->cpls[i];
7860d3fcd10SSeth Howell 
7870d3fcd10SSeth Howell 		rdma_req->rsp.sgl[0].addr = (uintptr_t)&resources->cpls[i];
7880d3fcd10SSeth Howell 		rdma_req->rsp.sgl[0].length = sizeof(resources->cpls[i]);
7898a01b4d6SAlexey Marchuk 		rc = spdk_rdma_utils_get_translation(opts->map, &resources->cpls[i], sizeof(resources->cpls[i]),
790bf41b46cSAleksey Marchuk 						     &translation);
791bf41b46cSAleksey Marchuk 		if (rc) {
792bf41b46cSAleksey Marchuk 			goto cleanup;
793bf41b46cSAleksey Marchuk 		}
7948a01b4d6SAlexey Marchuk 		rdma_req->rsp.sgl[0].lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation);
7950d3fcd10SSeth Howell 
7968288fcf9SAlexey Marchuk 		rdma_req->rsp_wr.type = RDMA_WR_TYPE_SEND;
7978288fcf9SAlexey Marchuk 		rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp_wr;
7980d3fcd10SSeth Howell 		rdma_req->rsp.wr.next = NULL;
7990d3fcd10SSeth Howell 		rdma_req->rsp.wr.opcode = IBV_WR_SEND;
8000d3fcd10SSeth Howell 		rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED;
8010d3fcd10SSeth Howell 		rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl;
8020d3fcd10SSeth Howell 		rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl);
8030d3fcd10SSeth Howell 
8040d3fcd10SSeth Howell 		/* Set up memory for data buffers */
8058288fcf9SAlexey Marchuk 		rdma_req->data_wr.type = RDMA_WR_TYPE_DATA;
8068288fcf9SAlexey Marchuk 		rdma_req->data.wr.wr_id = (uintptr_t)&rdma_req->data_wr;
8070d3fcd10SSeth Howell 		rdma_req->data.wr.next = NULL;
8080d3fcd10SSeth Howell 		rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED;
8090d3fcd10SSeth Howell 		rdma_req->data.wr.sg_list = rdma_req->data.sgl;
8100d3fcd10SSeth Howell 		rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl);
8110d3fcd10SSeth Howell 
8120d3fcd10SSeth Howell 		/* Initialize request state to FREE */
8130d3fcd10SSeth Howell 		rdma_req->state = RDMA_REQUEST_STATE_FREE;
8140d3fcd10SSeth Howell 		STAILQ_INSERT_TAIL(&resources->free_queue, rdma_req, state_link);
8150d3fcd10SSeth Howell 	}
8160d3fcd10SSeth Howell 
817d9ff7d09SAlexey Marchuk 	if (srq) {
818cf151d60SAlexey Marchuk 		rc = spdk_rdma_provider_srq_flush_recv_wrs(srq, &bad_wr);
819d9ff7d09SAlexey Marchuk 	} else {
820cf151d60SAlexey Marchuk 		rc = spdk_rdma_provider_qp_flush_recv_wrs(qp, &bad_wr);
821d9ff7d09SAlexey Marchuk 	}
822d9ff7d09SAlexey Marchuk 
823696e8580SAlexey Marchuk 	if (rc) {
824696e8580SAlexey Marchuk 		goto cleanup;
825696e8580SAlexey Marchuk 	}
826696e8580SAlexey Marchuk 
8270d3fcd10SSeth Howell 	return resources;
8280d3fcd10SSeth Howell 
8290d3fcd10SSeth Howell cleanup:
8300d3fcd10SSeth Howell 	nvmf_rdma_resources_destroy(resources);
8310d3fcd10SSeth Howell 	return NULL;
8320d3fcd10SSeth Howell }
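/* Summary of the setup above: every recv WR carries one SGE for the 64-byte command capsule and,
 * when in-capsule data is enabled, a second SGE for the per-entry data buffer; every request
 * pre-builds its response SEND WR and a data WR, with wr_id pointing at the embedded
 * spdk_nvmf_rdma_wr for type dispatch. Recv WRs are queued and then flushed in one batch through
 * the spdk_rdma_provider_{srq,qp} helpers.
 */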
8330d3fcd10SSeth Howell 
834353fbcdaSBen Walker static void
83555d8d943SSeth Howell nvmf_rdma_qpair_clean_ibv_events(struct spdk_nvmf_rdma_qpair *rqpair)
836dc84fbaaSAlexey Marchuk {
83743f6d338SJim Harris 	struct spdk_nvmf_rdma_ibv_event_ctx *ctx;
83843f6d338SJim Harris 
83943f6d338SJim Harris 	ctx = rqpair->last_wqe_reached_ctx;
84043f6d338SJim Harris 	if (ctx) {
841dc84fbaaSAlexey Marchuk 		ctx->rqpair = NULL;
8429645421cSJim Harris 		/* Memory allocated for ctx is freed in nvmf_rdma_qpair_process_last_wqe_event */
8435e156a6eSJim Harris 		rqpair->last_wqe_reached_ctx = NULL;
844dc84fbaaSAlexey Marchuk 	}
845dc84fbaaSAlexey Marchuk }
846dc84fbaaSAlexey Marchuk 
8478ddc5cd4Ssijie.sun static void nvmf_rdma_poller_destroy(struct spdk_nvmf_rdma_poller *poller);
8488ddc5cd4Ssijie.sun 
849dc84fbaaSAlexey Marchuk static void
85055d8d943SSeth Howell nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
8515e152960SBen Walker {
852ed0b611fSEvgeniy Kochetov 	struct spdk_nvmf_rdma_recv	*rdma_recv, *recv_tmp;
853ed0b611fSEvgeniy Kochetov 	struct ibv_recv_wr		*bad_recv_wr = NULL;
854ed0b611fSEvgeniy Kochetov 	int				rc;
855ed0b611fSEvgeniy Kochetov 
856c556b6b8SKonrad Sztyber 	spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair);
85790b4bd6cSEvgeniy Kochetov 
858bfdc957cSSeth Howell 	if (rqpair->qd != 0) {
859f8cbdf2cSAlexey Marchuk 		struct spdk_nvmf_qpair *qpair = &rqpair->qpair;
860f8cbdf2cSAlexey Marchuk 		struct spdk_nvmf_rdma_transport	*rtransport = SPDK_CONTAINEROF(qpair->transport,
861f8cbdf2cSAlexey Marchuk 				struct spdk_nvmf_rdma_transport, transport);
862f8cbdf2cSAlexey Marchuk 		struct spdk_nvmf_rdma_request *req;
863f8cbdf2cSAlexey Marchuk 		uint32_t i, max_req_count = 0;
864f8cbdf2cSAlexey Marchuk 
865f8cbdf2cSAlexey Marchuk 		SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", rqpair->qd);
866f8cbdf2cSAlexey Marchuk 
867fa79f64aSSeth Howell 		if (rqpair->srq == NULL) {
868fa757dc9SSeth Howell 			nvmf_rdma_dump_qpair_contents(rqpair);
869f8cbdf2cSAlexey Marchuk 			max_req_count = rqpair->max_queue_depth;
870f8cbdf2cSAlexey Marchuk 		} else if (rqpair->poller && rqpair->resources) {
871f8cbdf2cSAlexey Marchuk 			max_req_count = rqpair->poller->max_srq_depth;
87201201d3eSSeth Howell 		}
873f8cbdf2cSAlexey Marchuk 
8742172c432STomasz Zawadzki 		SPDK_DEBUGLOG(rdma, "Release incomplete requests\n");
875f8cbdf2cSAlexey Marchuk 		for (i = 0; i < max_req_count; i++) {
876f8cbdf2cSAlexey Marchuk 			req = &rqpair->resources->reqs[i];
877f8cbdf2cSAlexey Marchuk 			if (req->req.qpair == qpair && req->state != RDMA_REQUEST_STATE_FREE) {
87855d8d943SSeth Howell 				/* nvmf_rdma_request_process checks qpair ibv and internal state
879f8cbdf2cSAlexey Marchuk 				 * and completes a request */
88055d8d943SSeth Howell 				nvmf_rdma_request_process(rtransport, req);
881f8cbdf2cSAlexey Marchuk 			}
882f8cbdf2cSAlexey Marchuk 		}
883f8cbdf2cSAlexey Marchuk 		assert(rqpair->qd == 0);
8849b47c7e7SBen Walker 	}
8859b47c7e7SBen Walker 
8868b79ef33SBen Walker 	if (rqpair->poller) {
88752f7aeb7SShuhei Matsumoto 		RB_REMOVE(qpairs_tree, &rqpair->poller->qpairs, rqpair);
8888b79ef33SBen Walker 
889dd90ff7aSJinYu 		if (rqpair->srq != NULL && rqpair->resources != NULL) {
890ed0b611fSEvgeniy Kochetov 			/* Drop all received but unprocessed commands for this queue and return them to SRQ */
891b25751d9SBen Walker 			STAILQ_FOREACH_SAFE(rdma_recv, &rqpair->resources->incoming_queue, link, recv_tmp) {
892ed0b611fSEvgeniy Kochetov 				if (rqpair == rdma_recv->qpair) {
8934af2b9bfSAlexey Marchuk 					STAILQ_REMOVE(&rqpair->resources->incoming_queue, rdma_recv, spdk_nvmf_rdma_recv, link);
894cf151d60SAlexey Marchuk 					spdk_rdma_provider_srq_queue_recv_wrs(rqpair->srq, &rdma_recv->wr);
895cf151d60SAlexey Marchuk 					rc = spdk_rdma_provider_srq_flush_recv_wrs(rqpair->srq, &bad_recv_wr);
896ed0b611fSEvgeniy Kochetov 					if (rc) {
897ed0b611fSEvgeniy Kochetov 						SPDK_ERRLOG("Unable to re-post rx descriptor\n");
898ed0b611fSEvgeniy Kochetov 					}
899ed0b611fSEvgeniy Kochetov 				}
900ed0b611fSEvgeniy Kochetov 			}
90101201d3eSSeth Howell 		}
90201201d3eSSeth Howell 	}
9035e152960SBen Walker 
90455a624edSBen Walker 	if (rqpair->cm_id) {
905ea7a4f3cSAlexey Marchuk 		if (rqpair->rdma_qp != NULL) {
906cf151d60SAlexey Marchuk 			spdk_rdma_provider_qp_destroy(rqpair->rdma_qp);
907ea7a4f3cSAlexey Marchuk 			rqpair->rdma_qp = NULL;
908dd90ff7aSJinYu 		}
909db5c3ce3SXiaodong Liu 
9107dd3cf44SSeth Howell 		if (rqpair->poller != NULL && rqpair->srq == NULL) {
911db5c3ce3SXiaodong Liu 			rqpair->poller->required_num_wr -= MAX_WR_PER_QP(rqpair->max_queue_depth);
912db5c3ce3SXiaodong Liu 		}
9135e152960SBen Walker 	}
9145e152960SBen Walker 
915dd90ff7aSJinYu 	if (rqpair->srq == NULL && rqpair->resources != NULL) {
916353fbcdaSBen Walker 		nvmf_rdma_resources_destroy(rqpair->resources);
91701201d3eSSeth Howell 	}
918353fbcdaSBen Walker 
91955d8d943SSeth Howell 	nvmf_rdma_qpair_clean_ibv_events(rqpair);
920dc84fbaaSAlexey Marchuk 
9213d1d4fcfSAlexey Marchuk 	if (rqpair->destruct_channel) {
9223d1d4fcfSAlexey Marchuk 		spdk_put_io_channel(rqpair->destruct_channel);
9233d1d4fcfSAlexey Marchuk 		rqpair->destruct_channel = NULL;
9243d1d4fcfSAlexey Marchuk 	}
9253d1d4fcfSAlexey Marchuk 
9268ddc5cd4Ssijie.sun 	if (rqpair->poller && rqpair->poller->need_destroy && RB_EMPTY(&rqpair->poller->qpairs)) {
9278ddc5cd4Ssijie.sun 		nvmf_rdma_poller_destroy(rqpair->poller);
9288ddc5cd4Ssijie.sun 	}
929549be9adSsijie.sun 
930549be9adSsijie.sun 	/* destroy cm_id last so cma device will not be freed before we destroy the cq. */
931549be9adSsijie.sun 	if (rqpair->cm_id) {
932549be9adSsijie.sun 		rdma_destroy_id(rqpair->cm_id);
933549be9adSsijie.sun 	}
934549be9adSsijie.sun 
93555a624edSBen Walker 	free(rqpair);
9365e152960SBen Walker }
9379d9dc845SBen Walker 
938ee691fefSBen Walker static int
93997a43680SSeth Howell nvmf_rdma_resize_cq(struct spdk_nvmf_rdma_qpair *rqpair, struct spdk_nvmf_rdma_device *device)
94097a43680SSeth Howell {
94197a43680SSeth Howell 	struct spdk_nvmf_rdma_poller	*rpoller;
94297a43680SSeth Howell 	int				rc, num_cqe, required_num_wr;
94397a43680SSeth Howell 
94497a43680SSeth Howell 	/* Enlarge CQ size dynamically */
94597a43680SSeth Howell 	rpoller = rqpair->poller;
94697a43680SSeth Howell 	required_num_wr = rpoller->required_num_wr + MAX_WR_PER_QP(rqpair->max_queue_depth);
94797a43680SSeth Howell 	num_cqe = rpoller->num_cqe;
94897a43680SSeth Howell 	if (num_cqe < required_num_wr) {
94997a43680SSeth Howell 		num_cqe = spdk_max(num_cqe * 2, required_num_wr);
95097a43680SSeth Howell 		num_cqe = spdk_min(num_cqe, device->attr.max_cqe);
95197a43680SSeth Howell 	}
95297a43680SSeth Howell 
95397a43680SSeth Howell 	if (rpoller->num_cqe != num_cqe) {
9544a2c27f7SAlexey Marchuk 		if (device->context->device->transport_type == IBV_TRANSPORT_IWARP) {
9554a2c27f7SAlexey Marchuk 			SPDK_ERRLOG("iWARP doesn't support CQ resize. Current capacity %u, required %u\n"
9564a2c27f7SAlexey Marchuk 				    "Using CQ of insufficient size may lead to CQ overrun\n", rpoller->num_cqe, num_cqe);
9574a2c27f7SAlexey Marchuk 			return -1;
9584a2c27f7SAlexey Marchuk 		}
95997a43680SSeth Howell 		if (required_num_wr > device->attr.max_cqe) {
96097a43680SSeth Howell 			SPDK_ERRLOG("RDMA CQE requirement (%d) exceeds device max_cqe limitation (%d)\n",
96197a43680SSeth Howell 				    required_num_wr, device->attr.max_cqe);
96297a43680SSeth Howell 			return -1;
96397a43680SSeth Howell 		}
96497a43680SSeth Howell 
9652172c432STomasz Zawadzki 		SPDK_DEBUGLOG(rdma, "Resize RDMA CQ from %d to %d\n", rpoller->num_cqe, num_cqe);
96697a43680SSeth Howell 		rc = ibv_resize_cq(rpoller->cq, num_cqe);
96797a43680SSeth Howell 		if (rc) {
96897a43680SSeth Howell 			SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno));
96997a43680SSeth Howell 			return -1;
97097a43680SSeth Howell 		}
97197a43680SSeth Howell 
97297a43680SSeth Howell 		rpoller->num_cqe = num_cqe;
97397a43680SSeth Howell 	}
97497a43680SSeth Howell 
97597a43680SSeth Howell 	rpoller->required_num_wr = required_num_wr;
97697a43680SSeth Howell 	return 0;
97797a43680SSeth Howell }
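/* Worked example (assuming the defaults above): with a CQ of DEFAULT_NVMF_RDMA_CQ_SIZE (4096)
 * entries, admitting a qpair of depth 128 adds MAX_WR_PER_QP(128) = 128 * 3 + 2 = 386 to
 * required_num_wr. The CQ is resized only once required_num_wr exceeds the current num_cqe, and
 * then grows to max(2 * num_cqe, required_num_wr), capped at device->attr.max_cqe.
 */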
97897a43680SSeth Howell 
97997a43680SSeth Howell static int
98055d8d943SSeth Howell nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
9811ade6e1dSDaniel Verkamp {
98255a624edSBen Walker 	struct spdk_nvmf_rdma_qpair		*rqpair;
983ed0b611fSEvgeniy Kochetov 	struct spdk_nvmf_rdma_transport		*rtransport;
9848e808490SJohn Barnard 	struct spdk_nvmf_transport		*transport;
9850d3fcd10SSeth Howell 	struct spdk_nvmf_rdma_resource_opts	opts;
9861180bf83SSeth Howell 	struct spdk_nvmf_rdma_device		*device;
987cf151d60SAlexey Marchuk 	struct spdk_rdma_provider_qp_init_attr	qp_init_attr = {};
9881ade6e1dSDaniel Verkamp 
989ee691fefSBen Walker 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
9908209c8cfSSeth Howell 	device = rqpair->device;
991ecc436fcSBen Walker 
992ea7a4f3cSAlexey Marchuk 	qp_init_attr.qp_context	= rqpair;
993ea7a4f3cSAlexey Marchuk 	qp_init_attr.pd		= device->pd;
994ea7a4f3cSAlexey Marchuk 	qp_init_attr.send_cq	= rqpair->poller->cq;
995ea7a4f3cSAlexey Marchuk 	qp_init_attr.recv_cq	= rqpair->poller->cq;
99601201d3eSSeth Howell 
997fa79f64aSSeth Howell 	if (rqpair->srq) {
998696e8580SAlexey Marchuk 		qp_init_attr.srq		= rqpair->srq->srq;
99901201d3eSSeth Howell 	} else {
1000ea7a4f3cSAlexey Marchuk 		qp_init_attr.cap.max_recv_wr	= rqpair->max_queue_depth;
100101201d3eSSeth Howell 	}
100201201d3eSSeth Howell 
1003ea7a4f3cSAlexey Marchuk 	/* SEND, READ, and WRITE operations */
1004ea7a4f3cSAlexey Marchuk 	qp_init_attr.cap.max_send_wr	= (uint32_t)rqpair->max_queue_depth * 2;
1005ea7a4f3cSAlexey Marchuk 	qp_init_attr.cap.max_send_sge	= spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_TX_SGE);
1006ea7a4f3cSAlexey Marchuk 	qp_init_attr.cap.max_recv_sge	= spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_RX_SGE);
100736ac75b9SAlexey Marchuk 	qp_init_attr.stats		= &rqpair->poller->stat.qp_stats;
10081b17e4eeSBen Walker 
10097dd3cf44SSeth Howell 	if (rqpair->srq == NULL && nvmf_rdma_resize_cq(rqpair, device) < 0) {
101097a43680SSeth Howell 		SPDK_ERRLOG("Failed to resize the completion queue. Cannot initialize qpair.\n");
1011a5972c62SSeth Howell 		goto error;
1012db5c3ce3SXiaodong Liu 	}
1013db5c3ce3SXiaodong Liu 
1014cf151d60SAlexey Marchuk 	rqpair->rdma_qp = spdk_rdma_provider_qp_create(rqpair->cm_id, &qp_init_attr);
1015ea7a4f3cSAlexey Marchuk 	if (!rqpair->rdma_qp) {
1016a5972c62SSeth Howell 		goto error;
10171b17e4eeSBen Walker 	}
10187e23841dSBen Walker 
101952f7aeb7SShuhei Matsumoto 	rqpair->qp_num = rqpair->rdma_qp->qp->qp_num;
102052f7aeb7SShuhei Matsumoto 
10211f626649SAlexey Marchuk 	rqpair->max_send_depth = spdk_min((uint32_t)(rqpair->max_queue_depth * 2),
1022ea7a4f3cSAlexey Marchuk 					  qp_init_attr.cap.max_send_wr);
1023ea7a4f3cSAlexey Marchuk 	rqpair->max_send_sge = spdk_min(NVMF_DEFAULT_TX_SGE, qp_init_attr.cap.max_send_sge);
1024ea7a4f3cSAlexey Marchuk 	rqpair->max_recv_sge = spdk_min(NVMF_DEFAULT_RX_SGE, qp_init_attr.cap.max_recv_sge);
1025c556b6b8SKonrad Sztyber 	spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair);
10262172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma, "New RDMA Connection: %p\n", qpair);
1027130fec66SBen Walker 
102801201d3eSSeth Howell 	if (rqpair->poller->srq == NULL) {
1029ed0b611fSEvgeniy Kochetov 		rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
1030ed0b611fSEvgeniy Kochetov 		transport = &rtransport->transport;
1031ed0b611fSEvgeniy Kochetov 
1032d9ff7d09SAlexey Marchuk 		opts.qp = rqpair->rdma_qp;
1033bf41b46cSAleksey Marchuk 		opts.map = device->map;
10340d3fcd10SSeth Howell 		opts.qpair = rqpair;
10350d3fcd10SSeth Howell 		opts.shared = false;
10360d3fcd10SSeth Howell 		opts.max_queue_depth = rqpair->max_queue_depth;
10370d3fcd10SSeth Howell 		opts.in_capsule_data_size = transport->opts.in_capsule_data_size;
10386138d3bcSSenthil Kumar V 
10390d3fcd10SSeth Howell 		rqpair->resources = nvmf_rdma_resources_create(&opts);
10408e808490SJohn Barnard 
10410d3fcd10SSeth Howell 		if (!rqpair->resources) {
10420d3fcd10SSeth Howell 			SPDK_ERRLOG("Unable to allocate resources for receive queue.\n");
10438a14af68SJacek Kalwas 			rdma_destroy_qp(rqpair->cm_id);
1044a5972c62SSeth Howell 			goto error;
10459d9dc845SBen Walker 		}
104601201d3eSSeth Howell 	} else {
10470d3fcd10SSeth Howell 		rqpair->resources = rqpair->poller->resources;
104801201d3eSSeth Howell 	}
10493d52e57cSBen Walker 
10500d3fcd10SSeth Howell 	rqpair->current_recv_depth = 0;
105104ebc6eaSSeth Howell 	STAILQ_INIT(&rqpair->pending_rdma_read_queue);
105204ebc6eaSSeth Howell 	STAILQ_INIT(&rqpair->pending_rdma_write_queue);
105304cd8e47SAlexey Marchuk 	STAILQ_INIT(&rqpair->pending_rdma_send_queue);
105446d7b94fSAtul Malakar 	rqpair->qpair.queue_depth = 0;
105504ebc6eaSSeth Howell 
1056ee691fefSBen Walker 	return 0;
1057a5972c62SSeth Howell 
1058a5972c62SSeth Howell error:
1059a5972c62SSeth Howell 	rdma_destroy_id(rqpair->cm_id);
1060a5972c62SSeth Howell 	rqpair->cm_id = NULL;
1061a5972c62SSeth Howell 	return -1;
1062989859bbSBen Walker }
1063989859bbSBen Walker 
1064c3884f94SSeth Howell /* Queue the given recv wr to be posted to the qpair's QP or the poller's shared SRQ. */
1065c3884f94SSeth Howell /* This function accepts either a single wr or the first wr in a linked list. */
1066c3884f94SSeth Howell static void
1067c3884f94SSeth Howell nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first)
1068c3884f94SSeth Howell {
1069c818233bSIvan Betsis 	struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
1070c818233bSIvan Betsis 			struct spdk_nvmf_rdma_transport, transport);
1071c3884f94SSeth Howell 
1072696e8580SAlexey Marchuk 	if (rqpair->srq != NULL) {
1073cf151d60SAlexey Marchuk 		spdk_rdma_provider_srq_queue_recv_wrs(rqpair->srq, first);
1074c3884f94SSeth Howell 	} else {
1075cf151d60SAlexey Marchuk 		if (spdk_rdma_provider_qp_queue_recv_wrs(rqpair->rdma_qp, first)) {
1076d9ff7d09SAlexey Marchuk 			STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_recv, rqpair, recv_link);
1077d9ff7d09SAlexey Marchuk 		}
1078c3884f94SSeth Howell 	}
1079c818233bSIvan Betsis 
1080c818233bSIvan Betsis 	if (rtransport->rdma_opts.no_wr_batching) {
1081c818233bSIvan Betsis 		_poller_submit_recvs(rtransport, rqpair->poller);
1082c818233bSIvan Betsis 	}
1083c3884f94SSeth Howell }
10849d63933bSSeth Howell 
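/* Post the request's chain of RDMA_READ work requests to pull data from the host
 * into the controller's buffers. The qpair's read and send depth counters are
 * charged for the outstanding WRs; sends are flushed immediately when WR batching
 * is disabled.
 */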
10858b9c92d3SAlexey Marchuk static inline void
1086cc294653SBen Walker request_transfer_in(struct spdk_nvmf_request *req)
10872e550d51SDaniel Verkamp {
10886fb90732SBen Walker 	struct spdk_nvmf_rdma_request	*rdma_req;
10896fb90732SBen Walker 	struct spdk_nvmf_qpair		*qpair;
109055a624edSBen Walker 	struct spdk_nvmf_rdma_qpair	*rqpair;
1091c818233bSIvan Betsis 	struct spdk_nvmf_rdma_transport *rtransport;
1092caf88609SBen Walker 
10936fb90732SBen Walker 	qpair = req->qpair;
10946fb90732SBen Walker 	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
109555a624edSBen Walker 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
1096c818233bSIvan Betsis 	rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
1097c818233bSIvan Betsis 				      struct spdk_nvmf_rdma_transport, transport);
10986fb90732SBen Walker 
1099cc294653SBen Walker 	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
1100158dc947SSeth Howell 	assert(rdma_req != NULL);
1101caf88609SBen Walker 
1102cf151d60SAlexey Marchuk 	if (spdk_rdma_provider_qp_queue_send_wrs(rqpair->rdma_qp, rdma_req->transfer_wr)) {
1103bbb493ceSAlexey Marchuk 		STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link);
1104bbb493ceSAlexey Marchuk 	}
1105c818233bSIvan Betsis 	if (rtransport->rdma_opts.no_wr_batching) {
1106c818233bSIvan Betsis 		_poller_submit_sends(rtransport, rqpair->poller);
1107c818233bSIvan Betsis 	}
1108bbb493ceSAlexey Marchuk 
1109a681f8d5SAlexey Marchuk 	assert(rqpair->current_read_depth + rdma_req->num_outstanding_data_wr <= rqpair->max_read_depth);
1110dfdd76cfSSeth Howell 	rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
1111a681f8d5SAlexey Marchuk 	assert(rqpair->current_send_depth + rdma_req->num_outstanding_data_wr <= rqpair->max_send_depth);
1112158dc947SSeth Howell 	rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
11132e550d51SDaniel Verkamp }
11142e550d51SDaniel Verkamp 
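/* After a partial transfer-in completes, release the WRs that were just posted and
 * make the saved remainder of the RDMA_READ chain the new transfer_wr, so the rest
 * of the data can be read in a subsequent pass.
 */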
11158307ab43SAlexey Marchuk static inline void
1116ca59dd5dSAlexey Marchuk nvmf_rdma_request_reset_transfer_in(struct spdk_nvmf_rdma_request *rdma_req,
1117ca59dd5dSAlexey Marchuk 				    struct spdk_nvmf_rdma_transport *rtransport)
1118ca59dd5dSAlexey Marchuk {
1119ca59dd5dSAlexey Marchuk 	/* Put completed WRs back to pool and move transfer_wr pointer */
1120ca59dd5dSAlexey Marchuk 	_nvmf_rdma_request_free_data(rdma_req, rdma_req->transfer_wr, rtransport->data_wr_pool);
1121ca59dd5dSAlexey Marchuk 	rdma_req->transfer_wr = rdma_req->remaining_tranfer_in_wrs;
1122ca59dd5dSAlexey Marchuk 	rdma_req->remaining_tranfer_in_wrs = NULL;
1123ca59dd5dSAlexey Marchuk 	rdma_req->num_outstanding_data_wr = rdma_req->num_remaining_data_wr;
1124ca59dd5dSAlexey Marchuk 	rdma_req->num_remaining_data_wr = 0;
1125ca59dd5dSAlexey Marchuk }
1126ca59dd5dSAlexey Marchuk 
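/* Only num_reads_available RDMA_READ WRs can be posted right now. Split the
 * request's transfer_wr chain at that point: the first part is posted immediately,
 * the rest is stashed in remaining_tranfer_in_wrs and sent once this portion
 * completes.
 */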
1127ca59dd5dSAlexey Marchuk static inline int
1128ca59dd5dSAlexey Marchuk request_prepare_transfer_in_part(struct spdk_nvmf_request *req, uint32_t num_reads_available)
1129ca59dd5dSAlexey Marchuk {
1130ca59dd5dSAlexey Marchuk 	struct spdk_nvmf_rdma_request	*rdma_req;
1131ca59dd5dSAlexey Marchuk 	struct ibv_send_wr		*wr;
1132ca59dd5dSAlexey Marchuk 	uint32_t i;
1133ca59dd5dSAlexey Marchuk 
1134ca59dd5dSAlexey Marchuk 	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
1135ca59dd5dSAlexey Marchuk 
1136ca59dd5dSAlexey Marchuk 	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
1137ca59dd5dSAlexey Marchuk 	assert(rdma_req != NULL);
1138ca59dd5dSAlexey Marchuk 	assert(num_reads_available > 0);
1139ca59dd5dSAlexey Marchuk 	assert(rdma_req->num_outstanding_data_wr > num_reads_available);
1140ca59dd5dSAlexey Marchuk 	wr = rdma_req->transfer_wr;
1141ca59dd5dSAlexey Marchuk 
1142ca59dd5dSAlexey Marchuk 	for (i = 0; i < num_reads_available - 1; i++) {
1143ca59dd5dSAlexey Marchuk 		wr = wr->next;
1144ca59dd5dSAlexey Marchuk 	}
1145ca59dd5dSAlexey Marchuk 
1146ca59dd5dSAlexey Marchuk 	rdma_req->remaining_tranfer_in_wrs = wr->next;
1147ca59dd5dSAlexey Marchuk 	rdma_req->num_remaining_data_wr = rdma_req->num_outstanding_data_wr - num_reads_available;
1148ca59dd5dSAlexey Marchuk 	rdma_req->num_outstanding_data_wr = num_reads_available;
1149ca59dd5dSAlexey Marchuk 	/* Break the chain of WRs so that only this portion is posted now. Once it completes, we continue sending the remaining RDMA_READs */
1150ca59dd5dSAlexey Marchuk 	wr->next = NULL;
1151ca59dd5dSAlexey Marchuk 
1152ca59dd5dSAlexey Marchuk 	return 0;
1153ca59dd5dSAlexey Marchuk }
1154ca59dd5dSAlexey Marchuk 
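/* Complete a request back to the host: advance sq_head, re-post the capsule's recv
 * WR, then post an optional chain of RDMA_WRITE WRs (for controller-to-host data)
 * followed by the RDMA_SEND carrying the completion. *data_posted reports whether
 * data WRs were queued ahead of the response.
 */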
1155411df9adSDaniel Verkamp static int
1156fdec444aSPhilipp Skadorov request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
1157eee64c69SBen Walker {
1158c7395a11SJinYu 	int				num_outstanding_data_wr = 0;
11596fb90732SBen Walker 	struct spdk_nvmf_rdma_request	*rdma_req;
11606fb90732SBen Walker 	struct spdk_nvmf_qpair		*qpair;
116155a624edSBen Walker 	struct spdk_nvmf_rdma_qpair	*rqpair;
11626fb90732SBen Walker 	struct spdk_nvme_cpl		*rsp;
11639d63933bSSeth Howell 	struct ibv_send_wr		*first = NULL;
1164c818233bSIvan Betsis 	struct spdk_nvmf_rdma_transport *rtransport;
1165eee64c69SBen Walker 
1166fdec444aSPhilipp Skadorov 	*data_posted = 0;
11676fb90732SBen Walker 	qpair = req->qpair;
11686fb90732SBen Walker 	rsp = &req->rsp->nvme_cpl;
11696fb90732SBen Walker 	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
117055a624edSBen Walker 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
1171c818233bSIvan Betsis 	rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
1172c818233bSIvan Betsis 				      struct spdk_nvmf_rdma_transport, transport);
11736fb90732SBen Walker 
1174eee64c69SBen Walker 	/* Advance our sq_head pointer */
11751d304bc5SBen Walker 	if (qpair->sq_head == qpair->sq_head_max) {
11761d304bc5SBen Walker 		qpair->sq_head = 0;
117718498460SDaniel Verkamp 	} else {
11781d304bc5SBen Walker 		qpair->sq_head++;
1179eee64c69SBen Walker 	}
11801d304bc5SBen Walker 	rsp->sqhd = qpair->sq_head;
1181eee64c69SBen Walker 
1182c3884f94SSeth Howell 	/* queue the capsule for the recv buffer */
11831db3a037SBen Walker 	assert(rdma_req->recv != NULL);
118401201d3eSSeth Howell 
1185c3884f94SSeth Howell 	nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr);
1186c3884f94SSeth Howell 
11871db3a037SBen Walker 	rdma_req->recv = NULL;
1188e1dd85a5SBen Walker 	assert(rqpair->current_recv_depth > 0);
1189158dc947SSeth Howell 	rqpair->current_recv_depth--;
1190eee64c69SBen Walker 
119162700dacSSeth Howell 	/* Build the response which consists of optional
119262700dacSSeth Howell 	 * RDMA WRITEs to transfer data, plus an RDMA SEND
11931f382439SBen Walker 	 * containing the response.
11941f382439SBen Walker 	 */
11959d63933bSSeth Howell 	first = &rdma_req->rsp.wr;
11961f382439SBen Walker 
1197e718d8caSAlexey Marchuk 	if (spdk_unlikely(rsp->status.sc != SPDK_NVME_SC_SUCCESS)) {
1198e0cd084bSShuhei Matsumoto 		/* On failure, data was not read from the controller. So clear the
1199e0cd084bSShuhei Matsumoto 		 * number of outstanding data WRs to zero.
1200e0cd084bSShuhei Matsumoto 		 */
1201e0cd084bSShuhei Matsumoto 		rdma_req->num_outstanding_data_wr = 0;
1202e0cd084bSShuhei Matsumoto 	} else if (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
12035a6e7a41SAlexey Marchuk 		first = rdma_req->transfer_wr;
12045301be93SSeth Howell 		*data_posted = 1;
1205c7395a11SJinYu 		num_outstanding_data_wr = rdma_req->num_outstanding_data_wr;
1206cc294653SBen Walker 	}
1207cf151d60SAlexey Marchuk 	if (spdk_rdma_provider_qp_queue_send_wrs(rqpair->rdma_qp, first)) {
1208bbb493ceSAlexey Marchuk 		STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link);
1209bbb493ceSAlexey Marchuk 	}
1210c818233bSIvan Betsis 	if (rtransport->rdma_opts.no_wr_batching) {
1211c818233bSIvan Betsis 		_poller_submit_sends(rtransport, rqpair->poller);
1212c818233bSIvan Betsis 	}
1213bbb493ceSAlexey Marchuk 
1214158dc947SSeth Howell 	/* +1 for the rsp wr */
1215a681f8d5SAlexey Marchuk 	assert(rqpair->current_send_depth + num_outstanding_data_wr + 1 <= rqpair->max_send_depth);
1216c7395a11SJinYu 	rqpair->current_send_depth += num_outstanding_data_wr + 1;
1217eee64c69SBen Walker 
1218dfdd76cfSSeth Howell 	return 0;
1219eee64c69SBen Walker }
1220eee64c69SBen Walker 
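/* Accept an incoming RDMA connection: reply with the NVMe-oF accept private data
 * (recfmt 0, crqsize = negotiated queue depth) and connection parameters such as
 * initiator_depth, infinite RNR retries, and the SRQ/qp_num hints used when the
 * qpair was not created through the rdma cm API.
 */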
1221eee64c69SBen Walker static int
122255d8d943SSeth Howell nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair)
1223ba3d96e8SBen Walker {
1224ba3d96e8SBen Walker 	struct spdk_nvmf_rdma_accept_private_data	accept_data;
1225ba3d96e8SBen Walker 	struct rdma_conn_param				ctrlr_event_data = {};
1226ba3d96e8SBen Walker 	int						rc;
1227ba3d96e8SBen Walker 
1228ba3d96e8SBen Walker 	accept_data.recfmt = 0;
1229ba3d96e8SBen Walker 	accept_data.crqsize = rqpair->max_queue_depth;
1230ba3d96e8SBen Walker 
1231ba3d96e8SBen Walker 	ctrlr_event_data.private_data = &accept_data;
1232ba3d96e8SBen Walker 	ctrlr_event_data.private_data_len = sizeof(accept_data);
1233ba3d96e8SBen Walker 	if (id->ps == RDMA_PS_TCP) {
1234ba3d96e8SBen Walker 		ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */
12357289d370SSeth Howell 		ctrlr_event_data.initiator_depth = rqpair->max_read_depth;
1236ba3d96e8SBen Walker 	}
1237ba3d96e8SBen Walker 
123889d2efe0SSeth Howell 	/* Configure infinite retries for the initiator side qpair.
123957dc541cSChunsong Feng 	 * We need to pass this value to the initiator to prevent the
124089d2efe0SSeth Howell 	 * initiator side NIC from completing SEND requests back to the
124189d2efe0SSeth Howell 	 * initiator with status rnr_retry_count_exceeded. */
124289d2efe0SSeth Howell 	ctrlr_event_data.rnr_retry_count = 0x7;
124389d2efe0SSeth Howell 
1244ea7a4f3cSAlexey Marchuk 	/* When the qpair is created without using the rdma cm API, additional
1245ea7a4f3cSAlexey Marchuk 	 * information must be provided to the initiator in the connection response:
1246ea7a4f3cSAlexey Marchuk 	 * whether the qpair uses an SRQ, and its qp_num.
1247ea7a4f3cSAlexey Marchuk 	 * The fields below are ignored by rdma cm if the qpair has been
1248ea7a4f3cSAlexey Marchuk 	 * created using the rdma cm API. */
1249ea7a4f3cSAlexey Marchuk 	ctrlr_event_data.srq = rqpair->srq ? 1 : 0;
125052f7aeb7SShuhei Matsumoto 	ctrlr_event_data.qp_num = rqpair->qp_num;
1251ea7a4f3cSAlexey Marchuk 
1252cf151d60SAlexey Marchuk 	rc = spdk_rdma_provider_qp_accept(rqpair->rdma_qp, &ctrlr_event_data);
1253ba3d96e8SBen Walker 	if (rc) {
1254cf151d60SAlexey Marchuk 		SPDK_ERRLOG("Error %d on spdk_rdma_provider_qp_accept\n", errno);
1255ba3d96e8SBen Walker 	} else {
12562172c432STomasz Zawadzki 		SPDK_DEBUGLOG(rdma, "Sent back the accept\n");
1257ba3d96e8SBen Walker 	}
1258ba3d96e8SBen Walker 
1259ba3d96e8SBen Walker 	return rc;
1260ba3d96e8SBen Walker }
1261ba3d96e8SBen Walker 
1262ba3d96e8SBen Walker static void
126355d8d943SSeth Howell nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error)
1264ba3d96e8SBen Walker {
1265ba3d96e8SBen Walker 	struct spdk_nvmf_rdma_reject_private_data	rej_data;
1266ba3d96e8SBen Walker 
1267ba3d96e8SBen Walker 	rej_data.recfmt = 0;
1268ba3d96e8SBen Walker 	rej_data.sts = error;
1269ba3d96e8SBen Walker 
1270ba3d96e8SBen Walker 	rdma_reject(id, &rej_data, sizeof(rej_data));
1271ba3d96e8SBen Walker }
1272ba3d96e8SBen Walker 
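/* Handle an RDMA CM connect request: validate the NVMe-oF private data, negotiate
 * the queue depth and RDMA READ depth from the target's limits, both NICs'
 * capabilities and the host-provided hsqsize/hrqsize, then allocate the rqpair and
 * hand it to the generic nvmf layer via spdk_nvmf_tgt_new_qpair().
 */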
1273ba3d96e8SBen Walker static int
12745584232cSBen Walker nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event)
12750f912a0eSDaniel Verkamp {
1276ecc436fcSBen Walker 	struct spdk_nvmf_rdma_transport *rtransport;
127755a624edSBen Walker 	struct spdk_nvmf_rdma_qpair	*rqpair = NULL;
12781cbc2b16SBen Walker 	struct spdk_nvmf_rdma_port	*port;
1279a9f5ffbdSBen Walker 	struct rdma_conn_param		*rdma_param = NULL;
1280a9f5ffbdSBen Walker 	const struct spdk_nvmf_rdma_request_private_data *private_data = NULL;
1281caf88609SBen Walker 	uint16_t			max_queue_depth;
12827289d370SSeth Howell 	uint16_t			max_read_depth;
12830f912a0eSDaniel Verkamp 
1284ecc436fcSBen Walker 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
1285ecc436fcSBen Walker 
1286ba3d96e8SBen Walker 	assert(event->id != NULL); /* Impossible. Can't even reject the connection. */
1287ba3d96e8SBen Walker 	assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */
1288765173a7SBen Walker 
1289765173a7SBen Walker 	rdma_param = &event->param.conn;
1290765173a7SBen Walker 	if (rdma_param->private_data == NULL ||
1291765173a7SBen Walker 	    rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) {
1292765173a7SBen Walker 		SPDK_ERRLOG("connect request: no private data provided\n");
129355d8d943SSeth Howell 		nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH);
1294ba3d96e8SBen Walker 		return -1;
1295765173a7SBen Walker 	}
1296ba3d96e8SBen Walker 
1297765173a7SBen Walker 	private_data = rdma_param->private_data;
1298ba3d96e8SBen Walker 	if (private_data->recfmt != 0) {
1299ba3d96e8SBen Walker 		SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n");
130055d8d943SSeth Howell 		nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT);
1301ba3d96e8SBen Walker 		return -1;
1302ba3d96e8SBen Walker 	}
1303765173a7SBen Walker 
13042172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma, "Connect Recv on fabric intf name %s, dev_name %s\n",
13056a61126fSBen Walker 		      event->id->verbs->device->name, event->id->verbs->device->dev_name);
13060f912a0eSDaniel Verkamp 
13071cbc2b16SBen Walker 	port = event->listen_id->context;
13082172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma, "Listen Id was %p with verbs %p. ListenAddr: %p\n",
13091cbc2b16SBen Walker 		      event->listen_id, event->listen_id->verbs, port);
1310a0a92ff4SBen Walker 
1311a9f5ffbdSBen Walker 	/* Figure out the supported queue depth. This is a multi-step process
1312a9f5ffbdSBen Walker 	 * that takes into account hardware maximums, host provided values,
1313a9f5ffbdSBen Walker 	 * and our target's internal memory limits */
13140f912a0eSDaniel Verkamp 
13152172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma, "Calculating Queue Depth\n");
1316a9f5ffbdSBen Walker 
1317a9f5ffbdSBen Walker 	/* Start with the maximum queue depth allowed by the target */
13188e808490SJohn Barnard 	max_queue_depth = rtransport->transport.opts.max_queue_depth;
13197289d370SSeth Howell 	max_read_depth = rtransport->transport.opts.max_queue_depth;
13202172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma, "Target Max Queue Depth: %d\n",
13218e808490SJohn Barnard 		      rtransport->transport.opts.max_queue_depth);
1322a9f5ffbdSBen Walker 
1323a9f5ffbdSBen Walker 	/* Next check the local NIC's hardware limitations */
13242172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma,
132535bc1e93SBen Walker 		      "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
13261cbc2b16SBen Walker 		      port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom);
13271cbc2b16SBen Walker 	max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr);
132841cd5ff4SSeth Howell 	max_read_depth = spdk_min(max_read_depth, port->device->attr.max_qp_init_rd_atom);
1329a9f5ffbdSBen Walker 
1330a9f5ffbdSBen Walker 	/* Next check the remote NIC's hardware limitations */
13312172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma,
1332b2a86421SBen Walker 		      "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
1333ca0c1338SBen Walker 		      rdma_param->initiator_depth, rdma_param->responder_resources);
13346bc8d265SAlexey Marchuk 	/* from man3 rdma_get_cm_event
13356bc8d265SAlexey Marchuk 	 * responder_resources - Specifies the number of responder resources that is requested by the recipient.
13366bc8d265SAlexey Marchuk 	 * The responder_resources field must match the initiator depth specified by the remote node when running
13376bc8d265SAlexey Marchuk 	 * the rdma_connect and rdma_accept functions. */
13386bc8d265SAlexey Marchuk 	if (rdma_param->responder_resources != 0) {
1339c8b9bbafSAlexey Marchuk 		if (private_data->qid) {
1340c8b9bbafSAlexey Marchuk 			SPDK_DEBUGLOG(rdma, "Host (Initiator) is not allowed to use RDMA operations,"
1341c8b9bbafSAlexey Marchuk 				      " responder_resources must be 0 but set to %u\n",
13426bc8d265SAlexey Marchuk 				      rdma_param->responder_resources);
1343c8b9bbafSAlexey Marchuk 		} else {
1344c8b9bbafSAlexey Marchuk 			SPDK_WARNLOG("Host (Initiator) is not allowed to use RDMA operations,"
1345c8b9bbafSAlexey Marchuk 				     " responder_resources must be 0 but set to %u\n",
1346c8b9bbafSAlexey Marchuk 				     rdma_param->responder_resources);
1347c8b9bbafSAlexey Marchuk 		}
1348f64690d4SBen Walker 	}
13496bc8d265SAlexey Marchuk 	/* from man3 rdma_get_cm_event
13506bc8d265SAlexey Marchuk 	 * initiator_depth - Specifies the maximum number of outstanding RDMA read operations that the recipient holds.
13516bc8d265SAlexey Marchuk 	 * The initiator_depth field must match the responder resources specified by the remote node when running
13526bc8d265SAlexey Marchuk 	 * the rdma_connect and rdma_accept functions. */
13536bc8d265SAlexey Marchuk 	if (rdma_param->initiator_depth == 0) {
13546bc8d265SAlexey Marchuk 		SPDK_ERRLOG("Host (Initiator) doesn't support RDMA_READ or atomic operations\n");
13556bc8d265SAlexey Marchuk 		nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_IRD);
13566bc8d265SAlexey Marchuk 		return -1;
13576bc8d265SAlexey Marchuk 	}
13586bc8d265SAlexey Marchuk 	max_read_depth = spdk_min(max_read_depth, rdma_param->initiator_depth);
1359a9f5ffbdSBen Walker 
13602172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma, "Host Receive Queue Size: %d\n", private_data->hrqsize);
13612172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma, "Host Send Queue Size: %d\n", private_data->hsqsize);
136284d90484SDaniel Verkamp 	max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize);
1363b4ed77efSBen Walker 	max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1);
1364a9f5ffbdSBen Walker 
13652172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
13667289d370SSeth Howell 		      max_queue_depth, max_read_depth);
1367ca0c1338SBen Walker 
1368ee691fefSBen Walker 	rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair));
136955a624edSBen Walker 	if (rqpair == NULL) {
1370ee691fefSBen Walker 		SPDK_ERRLOG("Could not allocate new connection.\n");
137155d8d943SSeth Howell 		nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
1372ba3d96e8SBen Walker 		return -1;
13730f912a0eSDaniel Verkamp 	}
13740f912a0eSDaniel Verkamp 
13758209c8cfSSeth Howell 	rqpair->device = port->device;
1376ee691fefSBen Walker 	rqpair->max_queue_depth = max_queue_depth;
13777289d370SSeth Howell 	rqpair->max_read_depth = max_read_depth;
1378ee691fefSBen Walker 	rqpair->cm_id = event->id;
1379311ce0e2SBen Walker 	rqpair->listen_id = event->listen_id;
1380ee691fefSBen Walker 	rqpair->qpair.transport = transport;
138173e87ed2SAlexey Marchuk 	/* Use qid from the private data to determine the qpair type;
138273e87ed2SAlexey Marchuk 	   qid will be set to the appropriate value when the controller is created. */
138373e87ed2SAlexey Marchuk 	rqpair->qpair.qid = private_data->qid;
138445f2e732SJim Harris 	rqpair->qpair.numa.id_valid = 1;
138545f2e732SJim Harris 	rqpair->qpair.numa.id = spdk_rdma_cm_id_get_numa_id(rqpair->cm_id);
1386b25751d9SBen Walker 
1387ee691fefSBen Walker 	event->id->context = &rqpair->qpair;
1388ee691fefSBen Walker 
13895584232cSBen Walker 	spdk_nvmf_tgt_new_qpair(transport->tgt, &rqpair->qpair);
13906f95c325SZiye Yang 
13910f912a0eSDaniel Verkamp 	return 0;
13920f912a0eSDaniel Verkamp }
13930f912a0eSDaniel Verkamp 
1394568f4d2bSAlexey Marchuk static inline void
1395568f4d2bSAlexey Marchuk nvmf_rdma_setup_wr(struct ibv_send_wr *wr, struct ibv_send_wr *next,
1396568f4d2bSAlexey Marchuk 		   enum spdk_nvme_data_transfer xfer)
1397568f4d2bSAlexey Marchuk {
1398568f4d2bSAlexey Marchuk 	if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
1399568f4d2bSAlexey Marchuk 		wr->opcode = IBV_WR_RDMA_WRITE;
1400568f4d2bSAlexey Marchuk 		wr->send_flags = 0;
1401568f4d2bSAlexey Marchuk 		wr->next = next;
1402568f4d2bSAlexey Marchuk 	} else if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
1403568f4d2bSAlexey Marchuk 		wr->opcode = IBV_WR_RDMA_READ;
1404568f4d2bSAlexey Marchuk 		wr->send_flags = IBV_SEND_SIGNALED;
1405568f4d2bSAlexey Marchuk 		wr->next = NULL;
1406568f4d2bSAlexey Marchuk 	} else {
1407568f4d2bSAlexey Marchuk 		assert(0);
1408568f4d2bSAlexey Marchuk 	}
1409568f4d2bSAlexey Marchuk }
1410568f4d2bSAlexey Marchuk 
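/* Pull num_sgl_descriptors extra WR contexts from the transport's data_wr_pool and
 * chain them behind rdma_req->data.wr, setting each WR's opcode for the request's
 * transfer direction. Used for multi-SGL and multi-WR (DIF) requests.
 */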
14111ff5f4abSBen Walker static int
141262700dacSSeth Howell nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
141362700dacSSeth Howell 		       struct spdk_nvmf_rdma_request *rdma_req,
141462700dacSSeth Howell 		       uint32_t num_sgl_descriptors)
141562700dacSSeth Howell {
141662700dacSSeth Howell 	struct spdk_nvmf_rdma_request_data	*work_requests[SPDK_NVMF_MAX_SGL_ENTRIES];
141762700dacSSeth Howell 	struct spdk_nvmf_rdma_request_data	*current_data_wr;
141862700dacSSeth Howell 	uint32_t				i;
141962700dacSSeth Howell 
1420e718d8caSAlexey Marchuk 	if (spdk_unlikely(num_sgl_descriptors > SPDK_NVMF_MAX_SGL_ENTRIES)) {
14216a77723eSAlexey Marchuk 		SPDK_ERRLOG("Requested too many entries (%u), the limit is %u\n",
14226a77723eSAlexey Marchuk 			    num_sgl_descriptors, SPDK_NVMF_MAX_SGL_ENTRIES);
14236a77723eSAlexey Marchuk 		return -EINVAL;
14246a77723eSAlexey Marchuk 	}
14256a77723eSAlexey Marchuk 
1426e718d8caSAlexey Marchuk 	if (spdk_unlikely(spdk_mempool_get_bulk(rtransport->data_wr_pool, (void **)work_requests,
1427e718d8caSAlexey Marchuk 						num_sgl_descriptors))) {
142862700dacSSeth Howell 		return -ENOMEM;
142962700dacSSeth Howell 	}
143062700dacSSeth Howell 
143162700dacSSeth Howell 	current_data_wr = &rdma_req->data;
143262700dacSSeth Howell 
143362700dacSSeth Howell 	for (i = 0; i < num_sgl_descriptors; i++) {
1434568f4d2bSAlexey Marchuk 		nvmf_rdma_setup_wr(&current_data_wr->wr, &work_requests[i]->wr, rdma_req->req.xfer);
143562700dacSSeth Howell 		current_data_wr->wr.next = &work_requests[i]->wr;
143662700dacSSeth Howell 		current_data_wr = work_requests[i];
1437568f4d2bSAlexey Marchuk 		current_data_wr->wr.sg_list = current_data_wr->sgl;
1438568f4d2bSAlexey Marchuk 		current_data_wr->wr.wr_id = rdma_req->data.wr.wr_id;
143962700dacSSeth Howell 	}
144062700dacSSeth Howell 
1441568f4d2bSAlexey Marchuk 	nvmf_rdma_setup_wr(&current_data_wr->wr, &rdma_req->rsp.wr, rdma_req->req.xfer);
1442568f4d2bSAlexey Marchuk 
144362700dacSSeth Howell 	return 0;
144462700dacSSeth Howell }
144562700dacSSeth Howell 
1446a335a524SAlexey Marchuk static inline void
1447a335a524SAlexey Marchuk nvmf_rdma_setup_request(struct spdk_nvmf_rdma_request *rdma_req)
1448a335a524SAlexey Marchuk {
1449a335a524SAlexey Marchuk 	struct ibv_send_wr		*wr = &rdma_req->data.wr;
1450a335a524SAlexey Marchuk 	struct spdk_nvme_sgl_descriptor	*sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1;
1451a335a524SAlexey Marchuk 
1452a335a524SAlexey Marchuk 	wr->wr.rdma.rkey = sgl->keyed.key;
1453a335a524SAlexey Marchuk 	wr->wr.rdma.remote_addr = sgl->address;
1454568f4d2bSAlexey Marchuk 	nvmf_rdma_setup_wr(wr, &rdma_req->rsp.wr, rdma_req->req.xfer);
1455a335a524SAlexey Marchuk }
1456a335a524SAlexey Marchuk 
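/* When a single keyed SGL is split across multiple WRs, recompute each WR's remote
 * address as the cumulative offset of the preceding SGEs into the host buffer
 * described by sgl1.
 */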
14576a77723eSAlexey Marchuk static inline void
14586a77723eSAlexey Marchuk nvmf_rdma_update_remote_addr(struct spdk_nvmf_rdma_request *rdma_req, uint32_t num_wrs)
14596a77723eSAlexey Marchuk {
14606a77723eSAlexey Marchuk 	struct ibv_send_wr		*wr = &rdma_req->data.wr;
14616a77723eSAlexey Marchuk 	struct spdk_nvme_sgl_descriptor	*sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1;
14626a77723eSAlexey Marchuk 	uint32_t			i;
14636a77723eSAlexey Marchuk 	int				j;
14646a77723eSAlexey Marchuk 	uint64_t			remote_addr_offset = 0;
14656a77723eSAlexey Marchuk 
14666a77723eSAlexey Marchuk 	for (i = 0; i < num_wrs; ++i) {
14676a77723eSAlexey Marchuk 		wr->wr.rdma.rkey = sgl->keyed.key;
14686a77723eSAlexey Marchuk 		wr->wr.rdma.remote_addr = sgl->address + remote_addr_offset;
14696a77723eSAlexey Marchuk 		for (j = 0; j < wr->num_sge; ++j) {
14706a77723eSAlexey Marchuk 			remote_addr_offset += wr->sg_list[j].length;
14716a77723eSAlexey Marchuk 		}
14726a77723eSAlexey Marchuk 		wr = wr->next;
14736a77723eSAlexey Marchuk 	}
14746a77723eSAlexey Marchuk }
14756a77723eSAlexey Marchuk 
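/* Fill the WR's scatter-gather list from the request's iovecs, translating each
 * buffer through the device's memory map to obtain its lkey. rdma_req->offset and
 * iovpos persist across calls so multi-SGL requests can continue where the
 * previous WR left off.
 */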
147689a28bfdSShuhei Matsumoto static int
1477ad0221afSShuhei Matsumoto nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_device *device,
14785593b61fSShuhei Matsumoto 		      struct spdk_nvmf_rdma_request *rdma_req,
147989a28bfdSShuhei Matsumoto 		      struct ibv_send_wr *wr,
14809db2571dSShuhei Matsumoto 		      uint32_t total_length)
148189a28bfdSShuhei Matsumoto {
14828a01b4d6SAlexey Marchuk 	struct spdk_rdma_utils_memory_translation mem_translation;
14834642d7b2SAlexey Marchuk 	struct ibv_sge	*sg_ele;
14844642d7b2SAlexey Marchuk 	struct iovec *iov;
1485019a5361SAlexey Marchuk 	uint32_t lkey, remaining;
14864642d7b2SAlexey Marchuk 	int rc;
148716365fd8SShuhei Matsumoto 
148889a28bfdSShuhei Matsumoto 	wr->num_sge = 0;
148916365fd8SShuhei Matsumoto 
14909db2571dSShuhei Matsumoto 	while (total_length && wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES) {
1491019a5361SAlexey Marchuk 		iov = &rdma_req->req.iov[rdma_req->iovpos];
14928a01b4d6SAlexey Marchuk 		rc = spdk_rdma_utils_get_translation(device->map, iov->iov_base, iov->iov_len, &mem_translation);
14934642d7b2SAlexey Marchuk 		if (spdk_unlikely(rc)) {
14949db2571dSShuhei Matsumoto 			return rc;
14955593b61fSShuhei Matsumoto 		}
14965593b61fSShuhei Matsumoto 
14978a01b4d6SAlexey Marchuk 		lkey = spdk_rdma_utils_memory_translation_get_lkey(&mem_translation);
14984642d7b2SAlexey Marchuk 		sg_ele = &wr->sg_list[wr->num_sge];
1499019a5361SAlexey Marchuk 		remaining = spdk_min((uint32_t)iov->iov_len - rdma_req->offset, total_length);
15004642d7b2SAlexey Marchuk 
15014642d7b2SAlexey Marchuk 		sg_ele->lkey = lkey;
1502019a5361SAlexey Marchuk 		sg_ele->addr = (uintptr_t)iov->iov_base + rdma_req->offset;
1503019a5361SAlexey Marchuk 		sg_ele->length = remaining;
1504019a5361SAlexey Marchuk 		SPDK_DEBUGLOG(rdma, "sge[%d] %p addr 0x%"PRIx64", len %u\n", wr->num_sge, sg_ele, sg_ele->addr,
1505019a5361SAlexey Marchuk 			      sg_ele->length);
1506019a5361SAlexey Marchuk 		rdma_req->offset += sg_ele->length;
1507019a5361SAlexey Marchuk 		total_length -= sg_ele->length;
15084642d7b2SAlexey Marchuk 		wr->num_sge++;
1509019a5361SAlexey Marchuk 
1510019a5361SAlexey Marchuk 		if (rdma_req->offset == iov->iov_len) {
1511019a5361SAlexey Marchuk 			rdma_req->offset = 0;
1512019a5361SAlexey Marchuk 			rdma_req->iovpos++;
1513019a5361SAlexey Marchuk 		}
15149db2571dSShuhei Matsumoto 	}
15159db2571dSShuhei Matsumoto 
1516e718d8caSAlexey Marchuk 	if (spdk_unlikely(total_length)) {
15179db2571dSShuhei Matsumoto 		SPDK_ERRLOG("Not enough SG entries to hold data buffer\n");
15189db2571dSShuhei Matsumoto 		return -EINVAL;
15199db2571dSShuhei Matsumoto 	}
15209db2571dSShuhei Matsumoto 
15219db2571dSShuhei Matsumoto 	return 0;
15229db2571dSShuhei Matsumoto }
15239db2571dSShuhei Matsumoto 
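/* DIF-aware variant of nvmf_rdma_fill_wr_sgl(): SGEs are split at data-block
 * boundaries so that metadata bytes interleaved in the local buffers are skipped
 * (or, for stripped buffers, absent), optionally spilling into num_extra_wrs
 * additional WRs when more than SPDK_NVMF_MAX_SGL_ENTRIES SGEs are needed.
 */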
15249db2571dSShuhei Matsumoto static int
1525ad0221afSShuhei Matsumoto nvmf_rdma_fill_wr_sgl_with_dif(struct spdk_nvmf_rdma_device *device,
15269db2571dSShuhei Matsumoto 			       struct spdk_nvmf_rdma_request *rdma_req,
15279db2571dSShuhei Matsumoto 			       struct ibv_send_wr *wr,
15289db2571dSShuhei Matsumoto 			       uint32_t total_length,
15299db2571dSShuhei Matsumoto 			       uint32_t num_extra_wrs)
15309db2571dSShuhei Matsumoto {
15318a01b4d6SAlexey Marchuk 	struct spdk_rdma_utils_memory_translation mem_translation;
15329db2571dSShuhei Matsumoto 	struct spdk_dif_ctx *dif_ctx = &rdma_req->req.dif.dif_ctx;
15339db2571dSShuhei Matsumoto 	struct ibv_sge *sg_ele;
15349db2571dSShuhei Matsumoto 	struct iovec *iov;
15350db0c443SChunsong Feng 	struct iovec *rdma_iov;
15369db2571dSShuhei Matsumoto 	uint32_t lkey, remaining;
15379db2571dSShuhei Matsumoto 	uint32_t remaining_data_block, data_block_size, md_size;
15384642d7b2SAlexey Marchuk 	uint32_t sge_len;
15399db2571dSShuhei Matsumoto 	int rc;
15409db2571dSShuhei Matsumoto 
15419db2571dSShuhei Matsumoto 	data_block_size = dif_ctx->block_size - dif_ctx->md_size;
15420db0c443SChunsong Feng 
15430db0c443SChunsong Feng 	if (spdk_likely(!rdma_req->req.stripped_data)) {
15440db0c443SChunsong Feng 		rdma_iov = rdma_req->req.iov;
15459db2571dSShuhei Matsumoto 		remaining_data_block = data_block_size;
15460db0c443SChunsong Feng 		md_size = dif_ctx->md_size;
15470db0c443SChunsong Feng 	} else {
15480db0c443SChunsong Feng 		rdma_iov = rdma_req->req.stripped_data->iov;
15490db0c443SChunsong Feng 		total_length = total_length / dif_ctx->block_size * data_block_size;
15500db0c443SChunsong Feng 		remaining_data_block = total_length;
15510db0c443SChunsong Feng 		md_size = 0;
15520db0c443SChunsong Feng 	}
15539db2571dSShuhei Matsumoto 
15549db2571dSShuhei Matsumoto 	wr->num_sge = 0;
15559db2571dSShuhei Matsumoto 
15569db2571dSShuhei Matsumoto 	while (total_length && (num_extra_wrs || wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES)) {
15570db0c443SChunsong Feng 		iov = rdma_iov + rdma_req->iovpos;
15588a01b4d6SAlexey Marchuk 		rc = spdk_rdma_utils_get_translation(device->map, iov->iov_base, iov->iov_len, &mem_translation);
15599db2571dSShuhei Matsumoto 		if (spdk_unlikely(rc)) {
15609db2571dSShuhei Matsumoto 			return rc;
15619db2571dSShuhei Matsumoto 		}
15629db2571dSShuhei Matsumoto 
15638a01b4d6SAlexey Marchuk 		lkey = spdk_rdma_utils_memory_translation_get_lkey(&mem_translation);
15649db2571dSShuhei Matsumoto 		sg_ele = &wr->sg_list[wr->num_sge];
15659db2571dSShuhei Matsumoto 		remaining = spdk_min((uint32_t)iov->iov_len - rdma_req->offset, total_length);
15664642d7b2SAlexey Marchuk 
15674642d7b2SAlexey Marchuk 		while (remaining) {
15684642d7b2SAlexey Marchuk 			if (wr->num_sge >= SPDK_NVMF_MAX_SGL_ENTRIES) {
15694642d7b2SAlexey Marchuk 				if (num_extra_wrs > 0 && wr->next) {
15704642d7b2SAlexey Marchuk 					wr = wr->next;
15714642d7b2SAlexey Marchuk 					wr->num_sge = 0;
15724642d7b2SAlexey Marchuk 					sg_ele = &wr->sg_list[wr->num_sge];
15734642d7b2SAlexey Marchuk 					num_extra_wrs--;
15744642d7b2SAlexey Marchuk 				} else {
15754642d7b2SAlexey Marchuk 					break;
15764642d7b2SAlexey Marchuk 				}
15774642d7b2SAlexey Marchuk 			}
15784642d7b2SAlexey Marchuk 			sg_ele->lkey = lkey;
1579019a5361SAlexey Marchuk 			sg_ele->addr = (uintptr_t)((char *)iov->iov_base + rdma_req->offset);
15804642d7b2SAlexey Marchuk 			sge_len = spdk_min(remaining, remaining_data_block);
15814642d7b2SAlexey Marchuk 			sg_ele->length = sge_len;
15829db2571dSShuhei Matsumoto 			SPDK_DEBUGLOG(rdma, "sge[%d] %p addr 0x%"PRIx64", len %u\n", wr->num_sge, sg_ele,
15839db2571dSShuhei Matsumoto 				      sg_ele->addr, sg_ele->length);
15844642d7b2SAlexey Marchuk 			remaining -= sge_len;
15854642d7b2SAlexey Marchuk 			remaining_data_block -= sge_len;
1586019a5361SAlexey Marchuk 			rdma_req->offset += sge_len;
1587019a5361SAlexey Marchuk 			total_length -= sge_len;
15884642d7b2SAlexey Marchuk 
15894642d7b2SAlexey Marchuk 			sg_ele++;
15904642d7b2SAlexey Marchuk 			wr->num_sge++;
15914642d7b2SAlexey Marchuk 
15924642d7b2SAlexey Marchuk 			if (remaining_data_block == 0) {
15934642d7b2SAlexey Marchuk 				/* skip metadata */
1594019a5361SAlexey Marchuk 				rdma_req->offset += md_size;
1595019a5361SAlexey Marchuk 				total_length -= md_size;
15964642d7b2SAlexey Marchuk 				/* Metadata that does not fit in this IO buffer will be included in the next IO buffer */
15974642d7b2SAlexey Marchuk 				remaining -= spdk_min(remaining, md_size);
15984642d7b2SAlexey Marchuk 				remaining_data_block = data_block_size;
15994642d7b2SAlexey Marchuk 			}
16004642d7b2SAlexey Marchuk 
16014642d7b2SAlexey Marchuk 			if (remaining == 0) {
16024642d7b2SAlexey Marchuk 				/* By subtracting the size of the last IOV from the offset, we ensure that we skip
16034642d7b2SAlexey Marchuk 				   the remaining metadata bits at the beginning of the next buffer */
1604019a5361SAlexey Marchuk 				rdma_req->offset -= spdk_min(iov->iov_len, rdma_req->offset);
16055593b61fSShuhei Matsumoto 				rdma_req->iovpos++;
16065593b61fSShuhei Matsumoto 			}
1607019a5361SAlexey Marchuk 		}
1608019a5361SAlexey Marchuk 	}
16095593b61fSShuhei Matsumoto 
1610e718d8caSAlexey Marchuk 	if (spdk_unlikely(total_length)) {
161116365fd8SShuhei Matsumoto 		SPDK_ERRLOG("Not enough SG entries to hold data buffer\n");
161216365fd8SShuhei Matsumoto 		return -EINVAL;
161316365fd8SShuhei Matsumoto 	}
161416365fd8SShuhei Matsumoto 
16155593b61fSShuhei Matsumoto 	return 0;
16165593b61fSShuhei Matsumoto }
16175593b61fSShuhei Matsumoto 
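/* Estimate how many WRs are needed for a DIF request: each io_unit_size buffer is
 * split into SGEs at block_size boundaries (rounded up conservatively), and one WR
 * is needed per SPDK_NVMF_MAX_SGL_ENTRIES SGEs. For example, assuming a 4 KiB io
 * unit and a 520-byte extended block, one buffer is counted as 9 SGEs.
 */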
1618653496d2SAlexey Marchuk static inline uint32_t
1619653496d2SAlexey Marchuk nvmf_rdma_calc_num_wrs(uint32_t length, uint32_t io_unit_size, uint32_t block_size)
1620653496d2SAlexey Marchuk {
1621653496d2SAlexey Marchuk 	/* estimate the number of SG entries and WRs needed to process the request */
1622653496d2SAlexey Marchuk 	uint32_t num_sge = 0;
1623653496d2SAlexey Marchuk 	uint32_t i;
1624653496d2SAlexey Marchuk 	uint32_t num_buffers = SPDK_CEIL_DIV(length, io_unit_size);
1625653496d2SAlexey Marchuk 
1626653496d2SAlexey Marchuk 	for (i = 0; i < num_buffers && length > 0; i++) {
1627653496d2SAlexey Marchuk 		uint32_t buffer_len = spdk_min(length, io_unit_size);
1628653496d2SAlexey Marchuk 		uint32_t num_sge_in_block = SPDK_CEIL_DIV(buffer_len, block_size);
1629653496d2SAlexey Marchuk 
1630653496d2SAlexey Marchuk 		if (num_sge_in_block * block_size > buffer_len) {
1631653496d2SAlexey Marchuk 			++num_sge_in_block;
1632653496d2SAlexey Marchuk 		}
1633653496d2SAlexey Marchuk 		num_sge += num_sge_in_block;
1634653496d2SAlexey Marchuk 		length -= buffer_len;
1635653496d2SAlexey Marchuk 	}
1636653496d2SAlexey Marchuk 	return SPDK_CEIL_DIV(num_sge, SPDK_NVMF_MAX_SGL_ENTRIES);
1637653496d2SAlexey Marchuk }
1638653496d2SAlexey Marchuk 
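/* Fill the request's iovecs and RDMA WR(s) for a single keyed SGL: compute the
 * (possibly DIF-extended) length, acquire buffers from the transport pool, and
 * build the SGE list; the DIF path may allocate extra WRs and use stripped
 * buffers for controller-to-host transfers.
 */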
1639e70a7594SSeth Howell static int
164055d8d943SSeth Howell nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
1641e70a7594SSeth Howell 			    struct spdk_nvmf_rdma_device *device,
1642e48475b7SShuhei Matsumoto 			    struct spdk_nvmf_rdma_request *rdma_req)
1643e70a7594SSeth Howell {
1644e70a7594SSeth Howell 	struct spdk_nvmf_rdma_qpair		*rqpair;
1645e70a7594SSeth Howell 	struct spdk_nvmf_rdma_poll_group	*rgroup;
16460b068f85SShuhei Matsumoto 	struct spdk_nvmf_request		*req = &rdma_req->req;
1647d409da0cSShuhei Matsumoto 	struct ibv_send_wr			*wr = &rdma_req->data.wr;
1648fda0e558SShuhei Matsumoto 	int					rc;
1649653496d2SAlexey Marchuk 	uint32_t				num_wrs = 1;
1650e48475b7SShuhei Matsumoto 	uint32_t				length;
1651e70a7594SSeth Howell 
16520b068f85SShuhei Matsumoto 	rqpair = SPDK_CONTAINEROF(req->qpair, struct spdk_nvmf_rdma_qpair, qpair);
1653e70a7594SSeth Howell 	rgroup = rqpair->poller->group;
1654e70a7594SSeth Howell 
1655838c45c8SAlexey Marchuk 	/* rdma wr specifics */
1656838c45c8SAlexey Marchuk 	nvmf_rdma_setup_request(rdma_req);
1657838c45c8SAlexey Marchuk 
1658e48475b7SShuhei Matsumoto 	length = req->length;
1659e48475b7SShuhei Matsumoto 	if (spdk_unlikely(req->dif_enabled)) {
1660e48475b7SShuhei Matsumoto 		req->dif.orig_length = length;
1661e48475b7SShuhei Matsumoto 		length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx);
1662e48475b7SShuhei Matsumoto 		req->dif.elba_length = length;
1663e48475b7SShuhei Matsumoto 	}
1664e48475b7SShuhei Matsumoto 
1665fda0e558SShuhei Matsumoto 	rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport,
1666fda0e558SShuhei Matsumoto 					   length);
1667e718d8caSAlexey Marchuk 	if (spdk_unlikely(rc != 0)) {
1668fda0e558SShuhei Matsumoto 		return rc;
16696812b63cSSeth Howell 	}
16706812b63cSSeth Howell 
1671c0ee8ef7SShuhei Matsumoto 	assert(req->iovcnt <= rqpair->max_send_sge);
16726812b63cSSeth Howell 
16730db0c443SChunsong Feng 	/* When dif_insert_or_strip is true and the I/O data length is greater than one block,
16740db0c443SChunsong Feng 	 * the stripped_buffers are allocated for DIF stripping. */
16750db0c443SChunsong Feng 	if (spdk_unlikely(req->dif_enabled && (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST)
16760db0c443SChunsong Feng 			  && (req->dif.elba_length > req->dif.dif_ctx.block_size))) {
16770db0c443SChunsong Feng 		rc = nvmf_request_get_stripped_buffers(req, &rgroup->group,
16780db0c443SChunsong Feng 						       &rtransport->transport, req->dif.orig_length);
16790db0c443SChunsong Feng 		if (rc != 0) {
16810db0c443SChunsong Feng 			SPDK_INFOLOG(rdma, "Failed to get stripped buffers (%d), falling back to req.iov\n", rc);
16810db0c443SChunsong Feng 		}
16820db0c443SChunsong Feng 	}
16830db0c443SChunsong Feng 
1684c0ee8ef7SShuhei Matsumoto 	rdma_req->iovpos = 0;
1685b48a97d4SShuhei Matsumoto 
168615ae31fbSBen Walker 	if (spdk_unlikely(req->dif_enabled)) {
1687653496d2SAlexey Marchuk 		num_wrs = nvmf_rdma_calc_num_wrs(length, rtransport->transport.opts.io_unit_size,
1688653496d2SAlexey Marchuk 						 req->dif.dif_ctx.block_size);
1689653496d2SAlexey Marchuk 		if (num_wrs > 1) {
1690653496d2SAlexey Marchuk 			rc = nvmf_request_alloc_wrs(rtransport, rdma_req, num_wrs - 1);
1691e718d8caSAlexey Marchuk 			if (spdk_unlikely(rc != 0)) {
1692e70a7594SSeth Howell 				goto err_exit;
1693e70a7594SSeth Howell 			}
1694653496d2SAlexey Marchuk 		}
1695653496d2SAlexey Marchuk 
1696ad0221afSShuhei Matsumoto 		rc = nvmf_rdma_fill_wr_sgl_with_dif(device, rdma_req, wr, length, num_wrs - 1);
1697653496d2SAlexey Marchuk 		if (spdk_unlikely(rc != 0)) {
1698653496d2SAlexey Marchuk 			goto err_exit;
1699653496d2SAlexey Marchuk 		}
1700e70a7594SSeth Howell 
17019db2571dSShuhei Matsumoto 		if (num_wrs > 1) {
17026a77723eSAlexey Marchuk 			nvmf_rdma_update_remote_addr(rdma_req, num_wrs);
17036a77723eSAlexey Marchuk 		}
17049db2571dSShuhei Matsumoto 	} else {
1705ad0221afSShuhei Matsumoto 		rc = nvmf_rdma_fill_wr_sgl(device, rdma_req, wr, length);
17069db2571dSShuhei Matsumoto 		if (spdk_unlikely(rc != 0)) {
17079db2571dSShuhei Matsumoto 			goto err_exit;
17089db2571dSShuhei Matsumoto 		}
17099db2571dSShuhei Matsumoto 	}
17106a77723eSAlexey Marchuk 
1711838c45c8SAlexey Marchuk 	/* set the number of outstanding data WRs for this request. */
1712653496d2SAlexey Marchuk 	rdma_req->num_outstanding_data_wr = num_wrs;
1713838c45c8SAlexey Marchuk 
1714a451c838SSeth Howell 	return rc;
17158580daa1SSrikanth kaligotla 
1716a451c838SSeth Howell err_exit:
171779945ef0SShuhei Matsumoto 	spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport);
1718653496d2SAlexey Marchuk 	nvmf_rdma_request_free_data(rdma_req, rtransport);
17190b068f85SShuhei Matsumoto 	req->iovcnt = 0;
1720a451c838SSeth Howell 	return rc;
17218580daa1SSrikanth kaligotla }
17228580daa1SSrikanth kaligotla 
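/* Handle an in-capsule SGL last-segment descriptor: validate each keyed data block
 * descriptor it contains, allocate one WR per descriptor, acquire buffers for the
 * total length, and fill every WR's SGE list with the matching rkey and remote
 * address.
 */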
17238580daa1SSrikanth kaligotla static int
172462700dacSSeth Howell nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtransport,
172562700dacSSeth Howell 				      struct spdk_nvmf_rdma_device *device,
172662700dacSSeth Howell 				      struct spdk_nvmf_rdma_request *rdma_req)
172762700dacSSeth Howell {
172862700dacSSeth Howell 	struct spdk_nvmf_rdma_qpair		*rqpair;
172962700dacSSeth Howell 	struct spdk_nvmf_rdma_poll_group	*rgroup;
173062700dacSSeth Howell 	struct ibv_send_wr			*current_wr;
173162700dacSSeth Howell 	struct spdk_nvmf_request		*req = &rdma_req->req;
173262700dacSSeth Howell 	struct spdk_nvme_sgl_descriptor		*inline_segment, *desc;
173362700dacSSeth Howell 	uint32_t				num_sgl_descriptors;
1734019a5361SAlexey Marchuk 	uint32_t				lengths[SPDK_NVMF_MAX_SGL_ENTRIES], total_length = 0;
173562700dacSSeth Howell 	uint32_t				i;
173662700dacSSeth Howell 	int					rc;
173762700dacSSeth Howell 
173862700dacSSeth Howell 	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
173962700dacSSeth Howell 	rgroup = rqpair->poller->group;
174062700dacSSeth Howell 
174162700dacSSeth Howell 	inline_segment = &req->cmd->nvme_cmd.dptr.sgl1;
174262700dacSSeth Howell 	assert(inline_segment->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT);
174362700dacSSeth Howell 	assert(inline_segment->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET);
174462700dacSSeth Howell 
174562700dacSSeth Howell 	num_sgl_descriptors = inline_segment->unkeyed.length / sizeof(struct spdk_nvme_sgl_descriptor);
174662700dacSSeth Howell 	assert(num_sgl_descriptors <= SPDK_NVMF_MAX_SGL_ENTRIES);
1747410455e4SShuhei Matsumoto 
174804621576SShuhei Matsumoto 	desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address;
174904621576SShuhei Matsumoto 	for (i = 0; i < num_sgl_descriptors; i++) {
175015ae31fbSBen Walker 		if (spdk_likely(!req->dif_enabled)) {
175104621576SShuhei Matsumoto 			lengths[i] = desc->keyed.length;
17526ec974edSAlexey Marchuk 		} else {
17536ec974edSAlexey Marchuk 			req->dif.orig_length += desc->keyed.length;
17546ec974edSAlexey Marchuk 			lengths[i] = spdk_dif_get_length_with_md(desc->keyed.length, &req->dif.dif_ctx);
17556ec974edSAlexey Marchuk 			req->dif.elba_length += lengths[i];
17566ec974edSAlexey Marchuk 		}
1757019a5361SAlexey Marchuk 		total_length += lengths[i];
175804621576SShuhei Matsumoto 		desc++;
175904621576SShuhei Matsumoto 	}
176004621576SShuhei Matsumoto 
1761e718d8caSAlexey Marchuk 	if (spdk_unlikely(total_length > rtransport->transport.opts.max_io_size)) {
1762019a5361SAlexey Marchuk 		SPDK_ERRLOG("Multi SGL length 0x%x exceeds max io size 0x%x\n",
1763019a5361SAlexey Marchuk 			    total_length, rtransport->transport.opts.max_io_size);
1764019a5361SAlexey Marchuk 		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
1765019a5361SAlexey Marchuk 		return -EINVAL;
1766019a5361SAlexey Marchuk 	}
1767019a5361SAlexey Marchuk 
1768e718d8caSAlexey Marchuk 	rc = nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1);
1769e718d8caSAlexey Marchuk 	if (spdk_unlikely(rc != 0)) {
1770019a5361SAlexey Marchuk 		return -ENOMEM;
1771019a5361SAlexey Marchuk 	}
1772019a5361SAlexey Marchuk 
1773019a5361SAlexey Marchuk 	rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport, total_length);
1774e718d8caSAlexey Marchuk 	if (spdk_unlikely(rc != 0)) {
177504621576SShuhei Matsumoto 		nvmf_rdma_request_free_data(rdma_req, rtransport);
1776fda0e558SShuhei Matsumoto 		return rc;
177704621576SShuhei Matsumoto 	}
177804621576SShuhei Matsumoto 
17790db0c443SChunsong Feng 	/* When dif_insert_or_strip is true and the I/O data length is greater than one block,
17800db0c443SChunsong Feng 	 * the stripped_buffers are allocated for DIF stripping. */
17810db0c443SChunsong Feng 	if (spdk_unlikely(req->dif_enabled && (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST)
17820db0c443SChunsong Feng 			  && (req->dif.elba_length > req->dif.dif_ctx.block_size))) {
17830db0c443SChunsong Feng 		rc = nvmf_request_get_stripped_buffers(req, &rgroup->group,
17840db0c443SChunsong Feng 						       &rtransport->transport, req->dif.orig_length);
1785e718d8caSAlexey Marchuk 		if (spdk_unlikely(rc != 0)) {
17860db0c443SChunsong Feng 			SPDK_INFOLOG(rdma, "Failed to get stripped buffers (%d), falling back to req.iov\n", rc);
17870db0c443SChunsong Feng 		}
17880db0c443SChunsong Feng 	}
17890db0c443SChunsong Feng 
179062700dacSSeth Howell 	/* The first WR must always be the embedded data WR. This is how we unwind them later. */
179162700dacSSeth Howell 	current_wr = &rdma_req->data.wr;
179273a171a0SHailiang Wang 	assert(current_wr != NULL);
179362700dacSSeth Howell 
1794f0c21261SShuhei Matsumoto 	req->length = 0;
17955593b61fSShuhei Matsumoto 	rdma_req->iovpos = 0;
17966812b63cSSeth Howell 	desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address;
179762700dacSSeth Howell 	for (i = 0; i < num_sgl_descriptors; i++) {
179862700dacSSeth Howell 		/* The descriptors must be keyed data block descriptors with an address, not an offset. */
179962700dacSSeth Howell 		if (spdk_unlikely(desc->generic.type != SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK ||
180062700dacSSeth Howell 				  desc->keyed.subtype != SPDK_NVME_SGL_SUBTYPE_ADDRESS)) {
180162700dacSSeth Howell 			rc = -EINVAL;
180262700dacSSeth Howell 			goto err_exit;
180362700dacSSeth Howell 		}
180462700dacSSeth Howell 
18059db2571dSShuhei Matsumoto 		if (spdk_likely(!req->dif_enabled)) {
1806ad0221afSShuhei Matsumoto 			rc = nvmf_rdma_fill_wr_sgl(device, rdma_req, current_wr, lengths[i]);
18079db2571dSShuhei Matsumoto 		} else {
1808ad0221afSShuhei Matsumoto 			rc = nvmf_rdma_fill_wr_sgl_with_dif(device, rdma_req, current_wr,
18099db2571dSShuhei Matsumoto 							    lengths[i], 0);
18109db2571dSShuhei Matsumoto 		}
1811e718d8caSAlexey Marchuk 		if (spdk_unlikely(rc != 0)) {
181262700dacSSeth Howell 			rc = -ENOMEM;
181362700dacSSeth Howell 			goto err_exit;
181462700dacSSeth Howell 		}
181562700dacSSeth Howell 
1816f0c21261SShuhei Matsumoto 		req->length += desc->keyed.length;
181762700dacSSeth Howell 		current_wr->wr.rdma.rkey = desc->keyed.key;
181862700dacSSeth Howell 		current_wr->wr.rdma.remote_addr = desc->address;
181962700dacSSeth Howell 		current_wr = current_wr->next;
182062700dacSSeth Howell 		desc++;
182162700dacSSeth Howell 	}
182262700dacSSeth Howell 
182362700dacSSeth Howell #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
18247d7b44f2SSeth Howell 	/* Go back to the last descriptor in the list. */
18257d7b44f2SSeth Howell 	desc--;
182662700dacSSeth Howell 	if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) {
182762700dacSSeth Howell 		if (desc->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) {
182862700dacSSeth Howell 			rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV;
182962700dacSSeth Howell 			rdma_req->rsp.wr.imm_data = desc->keyed.key;
183062700dacSSeth Howell 		}
183162700dacSSeth Howell 	}
183262700dacSSeth Howell #endif
183362700dacSSeth Howell 
183462700dacSSeth Howell 	rdma_req->num_outstanding_data_wr = num_sgl_descriptors;
183562700dacSSeth Howell 
183662700dacSSeth Howell 	return 0;
183762700dacSSeth Howell 
183862700dacSSeth Howell err_exit:
183979945ef0SShuhei Matsumoto 	spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport);
184062700dacSSeth Howell 	nvmf_rdma_request_free_data(rdma_req, rtransport);
184162700dacSSeth Howell 	return rc;
184262700dacSSeth Howell }
184362700dacSSeth Howell 
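/* Parse the command's SGL and prepare data transfer: keyed data blocks map to RDMA
 * READ/WRITE WRs, data blocks with an offset point at in-capsule data in the recv
 * buffer, and last-segment descriptors are handled as multi-SGL requests.
 */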
184462700dacSSeth Howell static int
184555d8d943SSeth Howell nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
18461ff5f4abSBen Walker 			    struct spdk_nvmf_rdma_device *device,
18471ff5f4abSBen Walker 			    struct spdk_nvmf_rdma_request *rdma_req)
18482625cf42SBen Walker {
184991f9c6f3SShuhei Matsumoto 	struct spdk_nvmf_request		*req = &rdma_req->req;
18506fb90732SBen Walker 	struct spdk_nvme_cpl			*rsp;
1851f1a584a9SBen Walker 	struct spdk_nvme_sgl_descriptor		*sgl;
1852a8169c37SSeth Howell 	int					rc;
18531bc5710aSAlexey Marchuk 	uint32_t				length;
18542625cf42SBen Walker 
185591f9c6f3SShuhei Matsumoto 	rsp = &req->rsp->nvme_cpl;
1856a335a524SAlexey Marchuk 	sgl = &req->cmd->nvme_cmd.dptr.sgl1;
18572625cf42SBen Walker 
18582625cf42SBen Walker 	if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
18592625cf42SBen Walker 	    (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
18602625cf42SBen Walker 	     sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
18611bc5710aSAlexey Marchuk 
18621bc5710aSAlexey Marchuk 		length = sgl->keyed.length;
1863e718d8caSAlexey Marchuk 		if (spdk_unlikely(length > rtransport->transport.opts.max_io_size)) {
18648a701c3fSBen Walker 			SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
18651bc5710aSAlexey Marchuk 				    length, rtransport->transport.opts.max_io_size);
18662625cf42SBen Walker 			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
18671ff5f4abSBen Walker 			return -1;
18682625cf42SBen Walker 		}
1869b4de8e11SSeth Howell #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
1870efe4c272SBen Walker 		if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) {
1871b4de8e11SSeth Howell 			if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) {
1872b4de8e11SSeth Howell 				rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV;
1873b4de8e11SSeth Howell 				rdma_req->rsp.wr.imm_data = sgl->keyed.key;
1874b4de8e11SSeth Howell 			}
18751570c87fSSeth Howell 		}
1876b4de8e11SSeth Howell #endif
18772625cf42SBen Walker 
18788580daa1SSrikanth kaligotla 		/* fill request length and populate iovs */
187991f9c6f3SShuhei Matsumoto 		req->length = length;
18808580daa1SSrikanth kaligotla 
1881e48475b7SShuhei Matsumoto 		rc = nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req);
188206481fc2SAlexey Marchuk 		if (spdk_unlikely(rc < 0)) {
188306481fc2SAlexey Marchuk 			if (rc == -EINVAL) {
188406481fc2SAlexey Marchuk 				SPDK_ERRLOG("SGL length exceeds the max I/O size\n");
1885f2065513SJacek Kalwas 				rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
188606481fc2SAlexey Marchuk 				return -1;
188706481fc2SAlexey Marchuk 			}
1888f1a584a9SBen Walker 			/* No available buffers. Queue this request up. */
18892172c432STomasz Zawadzki 			SPDK_DEBUGLOG(rdma, "No available large data buffers. Queueing request %p\n", rdma_req);
18901ff5f4abSBen Walker 			return 0;
1891f1a584a9SBen Walker 		}
18928580daa1SSrikanth kaligotla 
18932172c432STomasz Zawadzki 		SPDK_DEBUGLOG(rdma, "Request %p took %d buffer/s from central pool\n", rdma_req,
189491f9c6f3SShuhei Matsumoto 			      req->iovcnt);
1895a4a3b5e7SBen Walker 
18961ff5f4abSBen Walker 		return 0;
18972625cf42SBen Walker 	} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
18982625cf42SBen Walker 		   sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
18992625cf42SBen Walker 		uint64_t offset = sgl->address;
19008e808490SJohn Barnard 		uint32_t max_len = rtransport->transport.opts.in_capsule_data_size;
19012625cf42SBen Walker 
19022172c432STomasz Zawadzki 		SPDK_DEBUGLOG(nvmf, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
19032625cf42SBen Walker 			      offset, sgl->unkeyed.length);
19042625cf42SBen Walker 
1905e718d8caSAlexey Marchuk 		if (spdk_unlikely(offset > max_len)) {
19062625cf42SBen Walker 			SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n",
19072625cf42SBen Walker 				    offset, max_len);
19082625cf42SBen Walker 			rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET;
19091ff5f4abSBen Walker 			return -1;
19102625cf42SBen Walker 		}
19112625cf42SBen Walker 		max_len -= (uint32_t)offset;
19122625cf42SBen Walker 
1913e718d8caSAlexey Marchuk 		if (spdk_unlikely(sgl->unkeyed.length > max_len)) {
19142625cf42SBen Walker 			SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
19152625cf42SBen Walker 				    sgl->unkeyed.length, max_len);
19162625cf42SBen Walker 			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
19171ff5f4abSBen Walker 			return -1;
1918f1a584a9SBen Walker 		}
1919f1a584a9SBen Walker 
19201f9ac117SSeth Howell 		rdma_req->num_outstanding_data_wr = 0;
192191f9c6f3SShuhei Matsumoto 		req->data_from_pool = false;
192291f9c6f3SShuhei Matsumoto 		req->length = sgl->unkeyed.length;
19238580daa1SSrikanth kaligotla 
1924e1413e91SJohn Levon 		req->iov[0].iov_base = rdma_req->recv->buf + offset;
192591f9c6f3SShuhei Matsumoto 		req->iov[0].iov_len = req->length;
192691f9c6f3SShuhei Matsumoto 		req->iovcnt = 1;
19278580daa1SSrikanth kaligotla 
19281ff5f4abSBen Walker 		return 0;
192962700dacSSeth Howell 	} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT &&
193062700dacSSeth Howell 		   sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
1931a8169c37SSeth Howell 
1932a8169c37SSeth Howell 		rc = nvmf_rdma_request_fill_iovs_multi_sgl(rtransport, device, rdma_req);
1933e718d8caSAlexey Marchuk 		if (spdk_unlikely(rc == -ENOMEM)) {
19342172c432STomasz Zawadzki 			SPDK_DEBUGLOG(rdma, "No available large data buffers. Queueing request %p\n", rdma_req);
193562700dacSSeth Howell 			return 0;
1936e718d8caSAlexey Marchuk 		} else if (spdk_unlikely(rc == -EINVAL)) {
1937a8169c37SSeth Howell 			SPDK_ERRLOG("Multi SGL element request length exceeds the max I/O size\n");
1938f2065513SJacek Kalwas 			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
1939a8169c37SSeth Howell 			return -1;
194062700dacSSeth Howell 		}
194162700dacSSeth Howell 
19422172c432STomasz Zawadzki 		SPDK_DEBUGLOG(rdma, "Request %p took %d buffer/s from central pool\n", rdma_req,
194391f9c6f3SShuhei Matsumoto 			      req->iovcnt);
194462700dacSSeth Howell 
194562700dacSSeth Howell 		return 0;
1946f1a584a9SBen Walker 	}
1947f1a584a9SBen Walker 
19482625cf42SBen Walker 	SPDK_ERRLOG("Invalid NVMf I/O Command SGL:  Type 0x%x, Subtype 0x%x\n",
19492625cf42SBen Walker 		    sgl->generic.type, sgl->generic.subtype);
19502625cf42SBen Walker 	rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
19511ff5f4abSBen Walker 	return -1;
19521ff5f4abSBen Walker }
19531ff5f4abSBen Walker 
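/* Return a completed or failed request to the free queue: release data and
 * stripped buffers, free extra WRs, reset per-request state, decrement the qpair's
 * queue depth, and fail the partner of a fused pair that did not make it to
 * READY_TO_EXECUTE.
 */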
19542b787d48SZiye Yang static void
195555d8d943SSeth Howell _nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
19562b787d48SZiye Yang 			struct spdk_nvmf_rdma_transport	*rtransport)
19572b787d48SZiye Yang {
1958e6ddb7dfSSeth Howell 	struct spdk_nvmf_rdma_qpair		*rqpair;
1959e6ddb7dfSSeth Howell 	struct spdk_nvmf_rdma_poll_group	*rgroup;
1960e6ddb7dfSSeth Howell 
1961e6ddb7dfSSeth Howell 	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
1962005b053aSShuhei Matsumoto 	if (rdma_req->req.data_from_pool) {
1963e6ddb7dfSSeth Howell 		rgroup = rqpair->poller->group;
1964e6ddb7dfSSeth Howell 
196579945ef0SShuhei Matsumoto 		spdk_nvmf_request_free_buffers(&rdma_req->req, &rgroup->group, &rtransport->transport);
19662b787d48SZiye Yang 	}
19670db0c443SChunsong Feng 	if (rdma_req->req.stripped_data) {
19680db0c443SChunsong Feng 		nvmf_request_free_stripped_buffers(&rdma_req->req,
19690db0c443SChunsong Feng 						   &rqpair->poller->group->group,
19700db0c443SChunsong Feng 						   &rtransport->transport);
19710db0c443SChunsong Feng 	}
197262700dacSSeth Howell 	nvmf_rdma_request_free_data(rdma_req, rtransport);
19732b787d48SZiye Yang 	rdma_req->req.length = 0;
19742b787d48SZiye Yang 	rdma_req->req.iovcnt = 0;
1975019a5361SAlexey Marchuk 	rdma_req->offset = 0;
19764f36a2a6SChunsong Feng 	rdma_req->req.dif_enabled = false;
1977183c3485SJim Harris 	rdma_req->fused_failed = false;
197831beb3edSKonrad Sztyber 	rdma_req->transfer_wr = NULL;
1979183c3485SJim Harris 	if (rdma_req->fused_pair) {
1980183c3485SJim Harris 		/* This req was part of a valid fused pair, but failed before it got to
1981183c3485SJim Harris 		 * READ_TO_EXECUTE state.  This means we need to fail the other request
1982183c3485SJim Harris 		 * READY_TO_EXECUTE state.  This means we need to fail the other request
1983183c3485SJim Harris 		 * already reached READY_TO_EXECUTE state, we need to kick it.
1984183c3485SJim Harris 		 */
1985183c3485SJim Harris 		rdma_req->fused_pair->fused_failed = true;
1986183c3485SJim Harris 		if (rdma_req->fused_pair->state == RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
1987183c3485SJim Harris 			nvmf_rdma_request_process(rtransport, rdma_req->fused_pair);
1988183c3485SJim Harris 		}
1989183c3485SJim Harris 		rdma_req->fused_pair = NULL;
1990183c3485SJim Harris 	}
1991e1101529SAlexey Marchuk 	memset(&rdma_req->req.dif, 0, sizeof(rdma_req->req.dif));
1992bfdc957cSSeth Howell 	rqpair->qd--;
1993b25751d9SBen Walker 
1994b25751d9SBen Walker 	STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link);
199546d7b94fSAtul Malakar 	rqpair->qpair.queue_depth--;
1996bfdc957cSSeth Howell 	rdma_req->state = RDMA_REQUEST_STATE_FREE;
19972b787d48SZiye Yang }
19982b787d48SZiye Yang 
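/* Enforce fused command ordering on the qpair: link a FUSE_FIRST request with the
 * FUSE_SECOND that follows it, or mark requests that break the sequence as failed. */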
1999183c3485SJim Harris static void
2000183c3485SJim Harris nvmf_rdma_check_fused_ordering(struct spdk_nvmf_rdma_transport *rtransport,
2001183c3485SJim Harris 			       struct spdk_nvmf_rdma_qpair *rqpair,
2002183c3485SJim Harris 			       struct spdk_nvmf_rdma_request *rdma_req)
2003183c3485SJim Harris {
2004183c3485SJim Harris 	enum spdk_nvme_cmd_fuse last, next;
2005183c3485SJim Harris 
2006183c3485SJim Harris 	last = rqpair->fused_first ? rqpair->fused_first->req.cmd->nvme_cmd.fuse : SPDK_NVME_CMD_FUSE_NONE;
2007183c3485SJim Harris 	next = rdma_req->req.cmd->nvme_cmd.fuse;
2008183c3485SJim Harris 
2009183c3485SJim Harris 	assert(last != SPDK_NVME_CMD_FUSE_SECOND);
2010183c3485SJim Harris 
2011183c3485SJim Harris 	if (spdk_likely(last == SPDK_NVME_CMD_FUSE_NONE && next == SPDK_NVME_CMD_FUSE_NONE)) {
2012183c3485SJim Harris 		return;
2013183c3485SJim Harris 	}
2014183c3485SJim Harris 
2015183c3485SJim Harris 	if (last == SPDK_NVME_CMD_FUSE_FIRST) {
2016183c3485SJim Harris 		if (next == SPDK_NVME_CMD_FUSE_SECOND) {
2017183c3485SJim Harris 			/* This is a valid pair of fused commands.  Point them at each other
2018183c3485SJim Harris 			 * so they can be submitted consecutively once ready to be executed.
2019183c3485SJim Harris 			 */
2020183c3485SJim Harris 			rqpair->fused_first->fused_pair = rdma_req;
2021183c3485SJim Harris 			rdma_req->fused_pair = rqpair->fused_first;
2022183c3485SJim Harris 			rqpair->fused_first = NULL;
2023183c3485SJim Harris 			return;
2024183c3485SJim Harris 		} else {
2025183c3485SJim Harris 			/* Mark the last req as failed since it wasn't followed by a SECOND. */
2026183c3485SJim Harris 			rqpair->fused_first->fused_failed = true;
2027183c3485SJim Harris 
2028183c3485SJim Harris 			/* If the last req is in READY_TO_EXECUTE state, then call
2029183c3485SJim Harris 			 * nvmf_rdma_request_process(), otherwise nothing else will kick it.
2030183c3485SJim Harris 			 */
2031183c3485SJim Harris 			if (rqpair->fused_first->state == RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
2032183c3485SJim Harris 				nvmf_rdma_request_process(rtransport, rqpair->fused_first);
2033183c3485SJim Harris 			}
2034183c3485SJim Harris 
2035183c3485SJim Harris 			rqpair->fused_first = NULL;
2036183c3485SJim Harris 		}
2037183c3485SJim Harris 	}
2038183c3485SJim Harris 
2039183c3485SJim Harris 	if (next == SPDK_NVME_CMD_FUSE_FIRST) {
2040183c3485SJim Harris 		/* Set rqpair->fused_first here so that we know to check that the next request
2041183c3485SJim Harris 		 * is a SECOND (and to fail this one if it isn't).
2042183c3485SJim Harris 		 */
2043183c3485SJim Harris 		rqpair->fused_first = rdma_req;
2044183c3485SJim Harris 	} else if (next == SPDK_NVME_CMD_FUSE_SECOND) {
2045183c3485SJim Harris 		/* Mark this req as failed since it is a SECOND and the last one was not a FIRST. */
2046183c3485SJim Harris 		rdma_req->fused_failed = true;
2047183c3485SJim Harris 	}
2048183c3485SJim Harris }
2049183c3485SJim Harris 
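/* Queue a request that needs a data buffer, letting FABRIC commands (i.e. CONNECT)
 * bypass non-FABRIC requests already waiting in the poll group's pending queue. */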
20501d36ed84SJim Harris static void
20511d36ed84SJim Harris nvmf_rdma_poll_group_insert_need_buffer_req(struct spdk_nvmf_rdma_poll_group *rgroup,
20521d36ed84SJim Harris 		struct spdk_nvmf_rdma_request *rdma_req)
20531d36ed84SJim Harris {
20547251e4c2SJim Harris 	struct spdk_nvmf_request *r;
20557251e4c2SJim Harris 
20567251e4c2SJim Harris 	/* CONNECT commands have a timeout, so we need to avoid a CONNECT command
20577251e4c2SJim Harris 	 * from getting buried behind a long list of other non-FABRIC requests
20587251e4c2SJim Harris 	 * waiting for a buffer. Note that even though the CONNECT command's data is
20597251e4c2SJim Harris 	 * in-capsule, the request still goes to this STAILQ.
20607251e4c2SJim Harris 	 */
20617251e4c2SJim Harris 	if (spdk_likely(rdma_req->req.cmd->nvme_cmd.opc != SPDK_NVME_OPC_FABRIC)) {
20627251e4c2SJim Harris 		/* This is the most likely case. */
20631d36ed84SJim Harris 		STAILQ_INSERT_TAIL(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link);
20647251e4c2SJim Harris 		return;
20657251e4c2SJim Harris 	} else {
20667251e4c2SJim Harris 		/* STAILQ doesn't have INSERT_BEFORE, so we need to either INSERT_HEAD
20677251e4c2SJim Harris 		 * or INSERT_AFTER. Put it after any other FABRIC commands that are
20687251e4c2SJim Harris 		 * already in the queue.
20697251e4c2SJim Harris 		 */
20707251e4c2SJim Harris 		r = STAILQ_FIRST(&rgroup->group.pending_buf_queue);
20717251e4c2SJim Harris 		if (r == NULL || r->cmd->nvme_cmd.opc != SPDK_NVME_OPC_FABRIC) {
20727251e4c2SJim Harris 			STAILQ_INSERT_HEAD(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link);
20737251e4c2SJim Harris 			return;
20747251e4c2SJim Harris 		}
20757251e4c2SJim Harris 		while (true) {
20767251e4c2SJim Harris 			struct spdk_nvmf_request *next;
20777251e4c2SJim Harris 
20787251e4c2SJim Harris 			next = STAILQ_NEXT(r, buf_link);
20797251e4c2SJim Harris 			if (next == NULL || next->cmd->nvme_cmd.opc != SPDK_NVME_OPC_FABRIC) {
20807251e4c2SJim Harris 				STAILQ_INSERT_AFTER(&rgroup->group.pending_buf_queue, r, &rdma_req->req, buf_link);
20817251e4c2SJim Harris 				return;
20827251e4c2SJim Harris 			}
20837251e4c2SJim Harris 			r = next;
20847251e4c2SJim Harris 		}
20857251e4c2SJim Harris 	}
20861d36ed84SJim Harris }
20871d36ed84SJim Harris 
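/* Drive a request through the RDMA request state machine until no further state
 * transition is possible. Returns true if the request made any progress. */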
2088f8cbdf2cSAlexey Marchuk bool
208955d8d943SSeth Howell nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
20903c423f40SBen Walker 			  struct spdk_nvmf_rdma_request *rdma_req)
20911ff5f4abSBen Walker {
20923c423f40SBen Walker 	struct spdk_nvmf_rdma_qpair	*rqpair;
20931ff5f4abSBen Walker 	struct spdk_nvmf_rdma_device	*device;
2094608d80a0SBen Walker 	struct spdk_nvmf_rdma_poll_group *rgroup;
20953c423f40SBen Walker 	struct spdk_nvme_cpl		*rsp = &rdma_req->req.rsp->nvme_cpl;
20961ff5f4abSBen Walker 	int				rc;
20973c423f40SBen Walker 	struct spdk_nvmf_rdma_recv	*rdma_recv;
20983c423f40SBen Walker 	enum spdk_nvmf_rdma_request_state prev_state;
20993c423f40SBen Walker 	bool				progress = false;
2100fdec444aSPhilipp Skadorov 	int				data_posted;
2101ca59dd5dSAlexey Marchuk 	uint32_t			num_blocks, num_rdma_reads_available, qdepth;
21021ff5f4abSBen Walker 
21033c423f40SBen Walker 	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
21048209c8cfSSeth Howell 	device = rqpair->device;
2105608d80a0SBen Walker 	rgroup = rqpair->poller->group;
21061ff5f4abSBen Walker 
21073c423f40SBen Walker 	assert(rdma_req->state != RDMA_REQUEST_STATE_FREE);
21081ff5f4abSBen Walker 
2109745a54e4SBen Walker 	/* If the queue pair is in an error state, force the request to the completed state
2110745a54e4SBen Walker 	 * to release resources. */
21113caf2080SKonrad Sztyber 	if (spdk_unlikely(rqpair->ibv_in_error_state || !spdk_nvmf_qpair_is_active(&rqpair->qpair))) {
211204cd8e47SAlexey Marchuk 		switch (rdma_req->state) {
211304cd8e47SAlexey Marchuk 		case RDMA_REQUEST_STATE_NEED_BUFFER:
211497967681SShuhei Matsumoto 			STAILQ_REMOVE(&rgroup->group.pending_buf_queue, &rdma_req->req, spdk_nvmf_request, buf_link);
211504cd8e47SAlexey Marchuk 			break;
211604cd8e47SAlexey Marchuk 		case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING:
2117e59ac513SSeth Howell 			STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
211804cd8e47SAlexey Marchuk 			break;
21194e45c563SAlexey Marchuk 		case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
21204e45c563SAlexey Marchuk 			if (rdma_req->num_remaining_data_wr) {
21214e45c563SAlexey Marchuk 				/* Partially sent request is still in the pending_rdma_read_queue,
21224e45c563SAlexey Marchuk 				 * remove it before completing */
21234e45c563SAlexey Marchuk 				rdma_req->num_remaining_data_wr = 0;
21244e45c563SAlexey Marchuk 				STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
21254e45c563SAlexey Marchuk 			}
21264e45c563SAlexey Marchuk 			break;
212704cd8e47SAlexey Marchuk 		case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
2128e59ac513SSeth Howell 			STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
212904cd8e47SAlexey Marchuk 			break;
213004cd8e47SAlexey Marchuk 		case RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING:
213104cd8e47SAlexey Marchuk 			STAILQ_REMOVE(&rqpair->pending_rdma_send_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
213204cd8e47SAlexey Marchuk 			break;
213304cd8e47SAlexey Marchuk 		default:
213404cd8e47SAlexey Marchuk 			break;
2135e6b2caeeSBen Walker 		}
2136bfdc957cSSeth Howell 		rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
2137745a54e4SBen Walker 	}
2138745a54e4SBen Walker 
21393c423f40SBen Walker 	/* The loop here is to allow for several back-to-back state changes. */
21403c423f40SBen Walker 	do {
21413c423f40SBen Walker 		prev_state = rdma_req->state;
21423c423f40SBen Walker 
21432172c432STomasz Zawadzki 		SPDK_DEBUGLOG(rdma, "Request %p entering state %d\n", rdma_req, prev_state);
21443c423f40SBen Walker 
21453c423f40SBen Walker 		switch (rdma_req->state) {
21463c423f40SBen Walker 		case RDMA_REQUEST_STATE_FREE:
21473c423f40SBen Walker 			/* Some external code must kick a request into RDMA_REQUEST_STATE_NEW
21483c423f40SBen Walker 			 * to escape this state. */
21493c423f40SBen Walker 			break;
21503c423f40SBen Walker 		case RDMA_REQUEST_STATE_NEW:
21518bcbe397SJim Harris 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0,
215246d7b94fSAtul Malakar 					  (uintptr_t)rdma_req, (uintptr_t)rqpair, rqpair->qpair.queue_depth);
21533c423f40SBen Walker 			rdma_recv = rdma_req->recv;
21543c423f40SBen Walker 
21553c423f40SBen Walker 			/* The first element of the SGL is the NVMe command */
21563c423f40SBen Walker 			rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr;
21573c423f40SBen Walker 			memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp));
21585a6e7a41SAlexey Marchuk 			rdma_req->transfer_wr = &rdma_req->data.wr;
21593c423f40SBen Walker 
21603caf2080SKonrad Sztyber 			if (spdk_unlikely(rqpair->ibv_in_error_state || !spdk_nvmf_qpair_is_active(&rqpair->qpair))) {
2161bfdc957cSSeth Howell 				rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
2162531fd76dSBen Walker 				break;
2163531fd76dSBen Walker 			}
2164531fd76dSBen Walker 
2165e1101529SAlexey Marchuk 			if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&rdma_req->req, &rdma_req->req.dif.dif_ctx))) {
216615ae31fbSBen Walker 				rdma_req->req.dif_enabled = true;
21671bc5710aSAlexey Marchuk 			}
21681bc5710aSAlexey Marchuk 
2169183c3485SJim Harris 			nvmf_rdma_check_fused_ordering(rtransport, rqpair, rdma_req);
2170183c3485SJim Harris 
2171bc13d022SChangpeng Liu #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
2172bc13d022SChangpeng Liu 			rdma_req->rsp.wr.opcode = IBV_WR_SEND;
2173bc13d022SChangpeng Liu 			rdma_req->rsp.wr.imm_data = 0;
2174bc13d022SChangpeng Liu #endif
2175bc13d022SChangpeng Liu 
21763c423f40SBen Walker 			/* The next state transition depends on the data transfer needs of this request. */
2177bc13d022SChangpeng Liu 			rdma_req->req.xfer = spdk_nvmf_req_get_xfer(&rdma_req->req);
21783c423f40SBen Walker 
2179864d93c0SAlexey Marchuk 			if (spdk_unlikely(rdma_req->req.xfer == SPDK_NVME_DATA_BIDIRECTIONAL)) {
2180864d93c0SAlexey Marchuk 				rsp->status.sct = SPDK_NVME_SCT_GENERIC;
2181864d93c0SAlexey Marchuk 				rsp->status.sc = SPDK_NVME_SC_INVALID_OPCODE;
218204cd8e47SAlexey Marchuk 				STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link);
218304cd8e47SAlexey Marchuk 				rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
21842172c432STomasz Zawadzki 				SPDK_DEBUGLOG(rdma, "Request %p: invalid xfer type (BIDIRECTIONAL)\n", rdma_req);
2185864d93c0SAlexey Marchuk 				break;
2186864d93c0SAlexey Marchuk 			}
2187864d93c0SAlexey Marchuk 
21883c423f40SBen Walker 			/* If no data to transfer, ready to execute. */
21893c423f40SBen Walker 			if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) {
2190bfdc957cSSeth Howell 				rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
21913c423f40SBen Walker 				break;
21921ff5f4abSBen Walker 			}
21931ff5f4abSBen Walker 
2194bfdc957cSSeth Howell 			rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER;
21951d36ed84SJim Harris 			nvmf_rdma_poll_group_insert_need_buffer_req(rgroup, rdma_req);
21963c423f40SBen Walker 			break;
21973c423f40SBen Walker 		case RDMA_REQUEST_STATE_NEED_BUFFER:
21988bcbe397SJim Harris 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0,
219962aa8bd8SKonrad Sztyber 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
22006a5ae72bSBen Walker 
22013c423f40SBen Walker 			assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE);
22021ff5f4abSBen Walker 
220397967681SShuhei Matsumoto 			if (&rdma_req->req != STAILQ_FIRST(&rgroup->group.pending_buf_queue)) {
22043c423f40SBen Walker 				/* This request needs to wait in line to obtain a buffer */
22053c423f40SBen Walker 				break;
22063c423f40SBen Walker 			}
22073c423f40SBen Walker 
22083c423f40SBen Walker 			/* Try to get a data buffer */
220955d8d943SSeth Howell 			rc = nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req);
2210e718d8caSAlexey Marchuk 			if (spdk_unlikely(rc < 0)) {
221197967681SShuhei Matsumoto 				STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link);
221204cd8e47SAlexey Marchuk 				STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link);
221304cd8e47SAlexey Marchuk 				rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
22143c423f40SBen Walker 				break;
22152625cf42SBen Walker 			}
22162625cf42SBen Walker 
2217e1413e91SJohn Levon 			if (rdma_req->req.iovcnt == 0) {
2218847c1c3aSZiye Yang 				/* No buffers available. */
2219251db814SEvgeniy Kochetov 				rgroup->stat.pending_data_buffer++;
22203c423f40SBen Walker 				break;
22211ff5f4abSBen Walker 			}
22221ff5f4abSBen Walker 
222397967681SShuhei Matsumoto 			STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link);
2224847c1c3aSZiye Yang 
22251ff5f4abSBen Walker 			/* If data is transferring from host to controller and the data didn't
22261ff5f4abSBen Walker 			 * arrive using in-capsule data, we need to do a transfer from the host.
22271ff5f4abSBen Walker 			 */
2228005b053aSShuhei Matsumoto 			if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER &&
2229005b053aSShuhei Matsumoto 			    rdma_req->req.data_from_pool) {
223004ebc6eaSSeth Howell 				STAILQ_INSERT_TAIL(&rqpair->pending_rdma_read_queue, rdma_req, state_link);
2231bfdc957cSSeth Howell 				rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING;
2232349295caSBen Walker 				break;
2233349295caSBen Walker 			}
2234349295caSBen Walker 
2235bfdc957cSSeth Howell 			rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
22363c423f40SBen Walker 			break;
22371d0a8e1cSSeth Howell 		case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING:
22381d0a8e1cSSeth Howell 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 0, 0,
223962aa8bd8SKonrad Sztyber 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
22406a5ae72bSBen Walker 
224104ebc6eaSSeth Howell 			if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_read_queue)) {
22423c423f40SBen Walker 				/* This request needs to wait in line to perform RDMA */
22433c423f40SBen Walker 				break;
22443c423f40SBen Walker 			}
2245a681f8d5SAlexey Marchuk 			assert(rqpair->max_send_depth >= rqpair->current_send_depth);
2246ca59dd5dSAlexey Marchuk 			qdepth = rqpair->max_send_depth - rqpair->current_send_depth;
2247a681f8d5SAlexey Marchuk 			assert(rqpair->max_read_depth >= rqpair->current_read_depth);
2248ca59dd5dSAlexey Marchuk 			num_rdma_reads_available = rqpair->max_read_depth - rqpair->current_read_depth;
2249ca59dd5dSAlexey Marchuk 			if (rdma_req->num_outstanding_data_wr > qdepth ||
2250ca59dd5dSAlexey Marchuk 			    rdma_req->num_outstanding_data_wr > num_rdma_reads_available) {
2251ca59dd5dSAlexey Marchuk 				if (num_rdma_reads_available && qdepth) {
2252ca59dd5dSAlexey Marchuk 					/* Send as much as we can */
2253ca59dd5dSAlexey Marchuk 					request_prepare_transfer_in_part(&rdma_req->req, spdk_min(num_rdma_reads_available, qdepth));
2254ca59dd5dSAlexey Marchuk 				} else {
2255158dc947SSeth Howell 					/* We can only have so many WRs outstanding. We have to wait until some finish. */
2256251db814SEvgeniy Kochetov 					rqpair->poller->stat.pending_rdma_read++;
22577289d370SSeth Howell 					break;
22587289d370SSeth Howell 				}
2259ca59dd5dSAlexey Marchuk 			}
226004ebc6eaSSeth Howell 
226104ebc6eaSSeth Howell 			/* We have already verified that this request is the head of the queue. */
2262e53bd98aSxupeng-mingtu 			if (rdma_req->num_remaining_data_wr == 0) {
226304ebc6eaSSeth Howell 				STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link);
2264e53bd98aSxupeng-mingtu 			}
226504ebc6eaSSeth Howell 
22668b9c92d3SAlexey Marchuk 			request_transfer_in(&rdma_req->req);
2267bfdc957cSSeth Howell 			rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER;
22688b9c92d3SAlexey Marchuk 
22693c423f40SBen Walker 			break;
22703c423f40SBen Walker 		case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
22716a5ae72bSBen Walker 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0,
227262aa8bd8SKonrad Sztyber 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
22733c423f40SBen Walker 			/* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE
22743c423f40SBen Walker 			 * to escape this state. */
22753c423f40SBen Walker 			break;
22763c423f40SBen Walker 		case RDMA_REQUEST_STATE_READY_TO_EXECUTE:
22778bcbe397SJim Harris 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0,
227862aa8bd8SKonrad Sztyber 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
22791bc5710aSAlexey Marchuk 
228015ae31fbSBen Walker 			if (spdk_unlikely(rdma_req->req.dif_enabled)) {
22811bc5710aSAlexey Marchuk 				if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
22821bc5710aSAlexey Marchuk 					/* generate DIF for write operation */
2283e1101529SAlexey Marchuk 					num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size);
22841bc5710aSAlexey Marchuk 					assert(num_blocks > 0);
22851bc5710aSAlexey Marchuk 
22861bc5710aSAlexey Marchuk 					rc = spdk_dif_generate(rdma_req->req.iov, rdma_req->req.iovcnt,
2287e1101529SAlexey Marchuk 							       num_blocks, &rdma_req->req.dif.dif_ctx);
22881bc5710aSAlexey Marchuk 					if (rc != 0) {
22891bc5710aSAlexey Marchuk 						SPDK_ERRLOG("DIF generation failed\n");
22901bc5710aSAlexey Marchuk 						rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
2291608b54a2SKonrad Sztyber 						spdk_nvmf_qpair_disconnect(&rqpair->qpair);
22921bc5710aSAlexey Marchuk 						break;
22931bc5710aSAlexey Marchuk 					}
22941bc5710aSAlexey Marchuk 				}
22951bc5710aSAlexey Marchuk 
2296e1101529SAlexey Marchuk 				assert(rdma_req->req.dif.elba_length >= rdma_req->req.length);
22971bc5710aSAlexey Marchuk 				/* set extended length before IO operation */
2298e1101529SAlexey Marchuk 				rdma_req->req.length = rdma_req->req.dif.elba_length;
22991bc5710aSAlexey Marchuk 			}
23001bc5710aSAlexey Marchuk 
2301183c3485SJim Harris 			if (rdma_req->req.cmd->nvme_cmd.fuse != SPDK_NVME_CMD_FUSE_NONE) {
2302183c3485SJim Harris 				if (rdma_req->fused_failed) {
2303183c3485SJim Harris 					/* This request failed FUSED semantics.  Fail it immediately, without
2304183c3485SJim Harris 					 * even sending it to the target layer.
2305183c3485SJim Harris 					 */
2306183c3485SJim Harris 					rsp->status.sct = SPDK_NVME_SCT_GENERIC;
2307183c3485SJim Harris 					rsp->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED;
230804cd8e47SAlexey Marchuk 					STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link);
230904cd8e47SAlexey Marchuk 					rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
2310183c3485SJim Harris 					break;
2311183c3485SJim Harris 				}
2312183c3485SJim Harris 
2313183c3485SJim Harris 				if (rdma_req->fused_pair == NULL ||
2314183c3485SJim Harris 				    rdma_req->fused_pair->state != RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
2315183c3485SJim Harris 					/* This request is ready to execute, but either we don't know yet if it's
2316183c3485SJim Harris 					 * valid - i.e. this is a FIRST but we haven't received the next
2317183c3485SJim Harris 					 * request yet or the other request of this fused pair isn't ready to
2318183c3485SJim Harris 					 * execute.  So break here and this request will get processed later either
2319183c3485SJim Harris 					 * when the other request is ready or we find that this request isn't valid.
2320183c3485SJim Harris 					 */
2321183c3485SJim Harris 					break;
2322183c3485SJim Harris 				}
2323183c3485SJim Harris 			}
2324183c3485SJim Harris 
2325183c3485SJim Harris 			/* If we get to this point, and this request is a fused command, we know that
2326183c3485SJim Harris 			 * it is part of valid sequence (FIRST followed by a SECOND) and that both
2327183c3485SJim Harris 			 * requests are READY_TO_EXECUTE. So call spdk_nvmf_request_exec() both on this
2328183c3485SJim Harris 			 * request, and the other request of the fused pair, in the correct order.
2329183c3485SJim Harris 			 * Also clear the ->fused_pair pointers on both requests, since after this point
2330183c3485SJim Harris 			 * we no longer need to maintain the relationship between these two requests.
2331183c3485SJim Harris 			 */
2332183c3485SJim Harris 			if (rdma_req->req.cmd->nvme_cmd.fuse == SPDK_NVME_CMD_FUSE_SECOND) {
2333183c3485SJim Harris 				assert(rdma_req->fused_pair != NULL);
2334183c3485SJim Harris 				assert(rdma_req->fused_pair->fused_pair != NULL);
2335183c3485SJim Harris 				rdma_req->fused_pair->state = RDMA_REQUEST_STATE_EXECUTING;
2336183c3485SJim Harris 				spdk_nvmf_request_exec(&rdma_req->fused_pair->req);
2337183c3485SJim Harris 				rdma_req->fused_pair->fused_pair = NULL;
2338183c3485SJim Harris 				rdma_req->fused_pair = NULL;
2339183c3485SJim Harris 			}
2340bfdc957cSSeth Howell 			rdma_req->state = RDMA_REQUEST_STATE_EXECUTING;
23413c423f40SBen Walker 			spdk_nvmf_request_exec(&rdma_req->req);
2342183c3485SJim Harris 			if (rdma_req->req.cmd->nvme_cmd.fuse == SPDK_NVME_CMD_FUSE_FIRST) {
2343183c3485SJim Harris 				assert(rdma_req->fused_pair != NULL);
2344183c3485SJim Harris 				assert(rdma_req->fused_pair->fused_pair != NULL);
2345183c3485SJim Harris 				rdma_req->fused_pair->state = RDMA_REQUEST_STATE_EXECUTING;
2346183c3485SJim Harris 				spdk_nvmf_request_exec(&rdma_req->fused_pair->req);
2347183c3485SJim Harris 				rdma_req->fused_pair->fused_pair = NULL;
2348183c3485SJim Harris 				rdma_req->fused_pair = NULL;
2349183c3485SJim Harris 			}
23503c423f40SBen Walker 			break;
23513c423f40SBen Walker 		case RDMA_REQUEST_STATE_EXECUTING:
23528bcbe397SJim Harris 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0,
235362aa8bd8SKonrad Sztyber 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
23543c423f40SBen Walker 			/* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED
23553c423f40SBen Walker 			 * to escape this state. */
23563c423f40SBen Walker 			break;
23573c423f40SBen Walker 		case RDMA_REQUEST_STATE_EXECUTED:
23588bcbe397SJim Harris 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0,
235962aa8bd8SKonrad Sztyber 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
2360af61ab90SShuhei Matsumoto 			if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
2361af61ab90SShuhei Matsumoto 			    rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
236204ebc6eaSSeth Howell 				STAILQ_INSERT_TAIL(&rqpair->pending_rdma_write_queue, rdma_req, state_link);
2363bfdc957cSSeth Howell 				rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING;
23643c423f40SBen Walker 			} else {
236504cd8e47SAlexey Marchuk 				STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link);
236604cd8e47SAlexey Marchuk 				rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
23673c423f40SBen Walker 			}
236815ae31fbSBen Walker 			if (spdk_unlikely(rdma_req->req.dif_enabled)) {
23691bc5710aSAlexey Marchuk 				/* restore the original length */
2370e1101529SAlexey Marchuk 				rdma_req->req.length = rdma_req->req.dif.orig_length;
23717545e8c8SAlexey Marchuk 
23727545e8c8SAlexey Marchuk 				if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
23737545e8c8SAlexey Marchuk 					struct spdk_dif_error error_blk;
23747545e8c8SAlexey Marchuk 
2375e1101529SAlexey Marchuk 					num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size);
23760db0c443SChunsong Feng 					if (!rdma_req->req.stripped_data) {
2377e1101529SAlexey Marchuk 						rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks,
2378e1101529SAlexey Marchuk 								     &rdma_req->req.dif.dif_ctx, &error_blk);
23790db0c443SChunsong Feng 					} else {
23800db0c443SChunsong Feng 						rc = spdk_dif_verify_copy(rdma_req->req.stripped_data->iov,
23810db0c443SChunsong Feng 									  rdma_req->req.stripped_data->iovcnt,
23820db0c443SChunsong Feng 									  rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks,
23830db0c443SChunsong Feng 									  &rdma_req->req.dif.dif_ctx, &error_blk);
23840db0c443SChunsong Feng 					}
23857545e8c8SAlexey Marchuk 					if (rc) {
23867545e8c8SAlexey Marchuk 						struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl;
23877545e8c8SAlexey Marchuk 
23887545e8c8SAlexey Marchuk 						SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", error_blk.err_type,
23897545e8c8SAlexey Marchuk 							    error_blk.err_offset);
23907545e8c8SAlexey Marchuk 						rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR;
239155d8d943SSeth Howell 						rsp->status.sc = nvmf_rdma_dif_error_to_compl_status(error_blk.err_type);
23927545e8c8SAlexey Marchuk 						STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
239304cd8e47SAlexey Marchuk 						STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link);
239404cd8e47SAlexey Marchuk 						rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
23957545e8c8SAlexey Marchuk 					}
23967545e8c8SAlexey Marchuk 				}
23971bc5710aSAlexey Marchuk 			}
23983c423f40SBen Walker 			break;
23991d0a8e1cSSeth Howell 		case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
24001d0a8e1cSSeth Howell 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0,
240162aa8bd8SKonrad Sztyber 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
24021d0a8e1cSSeth Howell 
240304ebc6eaSSeth Howell 			if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_write_queue)) {
24041d0a8e1cSSeth Howell 				/* This request needs to wait in line to perform RDMA */
24051d0a8e1cSSeth Howell 				break;
24061d0a8e1cSSeth Howell 			}
24071d0a8e1cSSeth Howell 			if ((rqpair->current_send_depth + rdma_req->num_outstanding_data_wr + 1) >
24081d0a8e1cSSeth Howell 			    rqpair->max_send_depth) {
24091d0a8e1cSSeth Howell 				/* We can only have so many WRs outstanding. We have to wait until some finish.
24101d0a8e1cSSeth Howell 				 * +1 since each request has an additional WR for the response. */
2411251db814SEvgeniy Kochetov 				rqpair->poller->stat.pending_rdma_write++;
24121d0a8e1cSSeth Howell 				break;
24131d0a8e1cSSeth Howell 			}
241404ebc6eaSSeth Howell 
241504ebc6eaSSeth Howell 			/* We have already verified that this request is the head of the queue. */
241604ebc6eaSSeth Howell 			STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_write_queue, state_link);
241704ebc6eaSSeth Howell 
24181d0a8e1cSSeth Howell 			/* The data transfer will be kicked off from
24191d0a8e1cSSeth Howell 			 * RDMA_REQUEST_STATE_READY_TO_COMPLETE state.
242004cd8e47SAlexey Marchuk 			 * We verified that the data + response fit into the send queue, so we can go to the next state directly.
242104cd8e47SAlexey Marchuk 			 */
242204cd8e47SAlexey Marchuk 			rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
242304cd8e47SAlexey Marchuk 			break;
242404cd8e47SAlexey Marchuk 		case RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING:
242504cd8e47SAlexey Marchuk 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING, 0, 0,
242604cd8e47SAlexey Marchuk 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
242704cd8e47SAlexey Marchuk 
242804cd8e47SAlexey Marchuk 			if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_send_queue)) {
242904cd8e47SAlexey Marchuk 				/* This request needs to wait in line to send the completion */
243004cd8e47SAlexey Marchuk 				break;
243104cd8e47SAlexey Marchuk 			}
243204cd8e47SAlexey Marchuk 
2433a681f8d5SAlexey Marchuk 			assert(rqpair->current_send_depth <= rqpair->max_send_depth);
243404cd8e47SAlexey Marchuk 			if (rqpair->current_send_depth == rqpair->max_send_depth) {
243504cd8e47SAlexey Marchuk 				/* We can only have so many WRs outstanding. We have to wait until some finish. */
243604cd8e47SAlexey Marchuk 				rqpair->poller->stat.pending_rdma_send++;
243704cd8e47SAlexey Marchuk 				break;
243804cd8e47SAlexey Marchuk 			}
243904cd8e47SAlexey Marchuk 
244004cd8e47SAlexey Marchuk 			/* We have already verified that this request is the head of the queue. */
244104cd8e47SAlexey Marchuk 			STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_send_queue, state_link);
244204cd8e47SAlexey Marchuk 
244304cd8e47SAlexey Marchuk 			/* The response sending will be kicked off from
244404cd8e47SAlexey Marchuk 			 * RDMA_REQUEST_STATE_READY_TO_COMPLETE state.
24451d0a8e1cSSeth Howell 			 */
2446bfdc957cSSeth Howell 			rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
24471d0a8e1cSSeth Howell 			break;
24483c423f40SBen Walker 		case RDMA_REQUEST_STATE_READY_TO_COMPLETE:
24498bcbe397SJim Harris 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0,
245062aa8bd8SKonrad Sztyber 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
2451fdec444aSPhilipp Skadorov 			rc = request_transfer_out(&rdma_req->req, &data_posted);
24523c423f40SBen Walker 			assert(rc == 0); /* No good way to handle this currently */
2453e718d8caSAlexey Marchuk 			if (spdk_unlikely(rc)) {
2454bfdc957cSSeth Howell 				rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
2455e7988759SBen Walker 			} else {
2456bfdc957cSSeth Howell 				rdma_req->state = data_posted ? RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST :
2457bfdc957cSSeth Howell 						  RDMA_REQUEST_STATE_COMPLETING;
2458e7988759SBen Walker 			}
2459fdec444aSPhilipp Skadorov 			break;
2460fdec444aSPhilipp Skadorov 		case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST:
2461fdec444aSPhilipp Skadorov 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0,
246262aa8bd8SKonrad Sztyber 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
2463fdec444aSPhilipp Skadorov 			/* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED
2464fdec444aSPhilipp Skadorov 			 * to escape this state. */
24653c423f40SBen Walker 			break;
24663c423f40SBen Walker 		case RDMA_REQUEST_STATE_COMPLETING:
24678bcbe397SJim Harris 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0,
246862aa8bd8SKonrad Sztyber 					  (uintptr_t)rdma_req, (uintptr_t)rqpair);
24693c423f40SBen Walker 			/* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED
24703c423f40SBen Walker 			 * to escape this state. */
24713c423f40SBen Walker 			break;
24723c423f40SBen Walker 		case RDMA_REQUEST_STATE_COMPLETED:
24738bcbe397SJim Harris 			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0,
247446d7b94fSAtul Malakar 					  (uintptr_t)rdma_req, (uintptr_t)rqpair, rqpair->qpair.queue_depth);
24753c423f40SBen Walker 
2476fbe8f804SEvgeniy Kochetov 			rqpair->poller->stat.request_latency += spdk_get_ticks() - rdma_req->receive_tsc;
247755d8d943SSeth Howell 			_nvmf_rdma_request_free(rdma_req, rtransport);
2478fdec444aSPhilipp Skadorov 			break;
2479fdec444aSPhilipp Skadorov 		case RDMA_REQUEST_NUM_STATES:
2480fdec444aSPhilipp Skadorov 		default:
2481fdec444aSPhilipp Skadorov 			assert(0);
24823c423f40SBen Walker 			break;
24833c423f40SBen Walker 		}
24843c423f40SBen Walker 
24853c423f40SBen Walker 		if (rdma_req->state != prev_state) {
24863c423f40SBen Walker 			progress = true;
24873c423f40SBen Walker 		}
24883c423f40SBen Walker 	} while (rdma_req->state != prev_state);
24893c423f40SBen Walker 
24903c423f40SBen Walker 	return progress;
2491349295caSBen Walker }
2492349295caSBen Walker 
2493349295caSBen Walker /* Public API callbacks begin here */
2494349295caSBen Walker 
2495183d81d0SJohn Barnard #define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128
2496183d81d0SJohn Barnard #define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128
2497ed0b611fSEvgeniy Kochetov #define SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH 4096
2498bf647c16SJim Harris #define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 128
2499183d81d0SJohn Barnard #define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096
2500183d81d0SJohn Barnard #define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072
2501b6b0a0baSSeth Howell #define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES)
250233f60621SSeth Howell #define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4095
25033b138377SJim Harris #define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE UINT32_MAX
25041bc5710aSAlexey Marchuk #define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false
25051bc5710aSAlexey Marchuk #define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false
25063b830202SSeth Howell #define SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG 100
250726e0ef9aSShuhei Matsumoto #define SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC 1
2508c818233bSIvan Betsis #define SPDK_NVMF_RDMA_DEFAULT_NO_WR_BATCHING false
25097dab13c0SAlexey Marchuk #define SPDK_NVMF_RDMA_DEFAULT_DATA_WR_POOL_SIZE 4095
2510183d81d0SJohn Barnard 
2511183d81d0SJohn Barnard static void
251255d8d943SSeth Howell nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
2513183d81d0SJohn Barnard {
2514183d81d0SJohn Barnard 	opts->max_queue_depth =		SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH;
2515183d81d0SJohn Barnard 	opts->max_qpairs_per_ctrlr =	SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR;
2516183d81d0SJohn Barnard 	opts->in_capsule_data_size =	SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE;
2517183d81d0SJohn Barnard 	opts->max_io_size =		SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE;
2518b6b0a0baSSeth Howell 	opts->io_unit_size =		SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE;
2519183d81d0SJohn Barnard 	opts->max_aq_depth =		SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH;
252058f16244SZiye Yang 	opts->num_shared_buffers =	SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS;
2521e816c8fdSSeth Howell 	opts->buf_cache_size =		SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE;
25221bc5710aSAlexey Marchuk 	opts->dif_insert_or_strip =	SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP;
252326e0ef9aSShuhei Matsumoto 	opts->abort_timeout_sec =	SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC;
2524f766d1e4SDarek Stojaczyk 	opts->transport_specific =      NULL;
25257dab13c0SAlexey Marchuk 	opts->data_wr_pool_size	=	SPDK_NVMF_RDMA_DEFAULT_DATA_WR_POOL_SIZE;
2526183d81d0SJohn Barnard }
2527183d81d0SJohn Barnard 
25280d98a949SNaresh Gottumukkala static int nvmf_rdma_destroy(struct spdk_nvmf_transport *transport,
25290d98a949SNaresh Gottumukkala 			     spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg);
25305518a327SDaniel Verkamp 
2531efb6081cSAlexey Marchuk static inline bool
2532efb6081cSAlexey Marchuk nvmf_rdma_is_rxe_device(struct spdk_nvmf_rdma_device *device)
2533efb6081cSAlexey Marchuk {
25349cea3232SAlexey Marchuk 	return device->attr.vendor_id == SPDK_RDMA_RXE_VENDOR_ID_OLD ||
25359cea3232SAlexey Marchuk 	       device->attr.vendor_id == SPDK_RDMA_RXE_VENDOR_ID_NEW;
2536efb6081cSAlexey Marchuk }
2537efb6081cSAlexey Marchuk 
25388dd1cd21SBen Walker static int nvmf_rdma_accept(void *ctx);
2539549be9adSsijie.sun static bool nvmf_rdma_retry_listen_port(struct spdk_nvmf_rdma_transport *rtransport);
2540549be9adSsijie.sun static void destroy_ib_device(struct spdk_nvmf_rdma_transport *rtransport,
2541549be9adSsijie.sun 			      struct spdk_nvmf_rdma_device *device);
2542549be9adSsijie.sun 
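/* Create a transport device for the given verbs context: query its attributes,
 * make the async event fd non-blocking, and allocate its PD and memory map. */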
2543a5283034Ssijie.sun static int
2544a5283034Ssijie.sun create_ib_device(struct spdk_nvmf_rdma_transport *rtransport, struct ibv_context *context,
2545a5283034Ssijie.sun 		 struct spdk_nvmf_rdma_device **new_device)
2546a5283034Ssijie.sun {
2547a5283034Ssijie.sun 	struct spdk_nvmf_rdma_device	*device;
2548a5283034Ssijie.sun 	int				flag = 0;
2549a5283034Ssijie.sun 	int				rc = 0;
2550a5283034Ssijie.sun 
2551a5283034Ssijie.sun 	device = calloc(1, sizeof(*device));
2552a5283034Ssijie.sun 	if (!device) {
2553a5283034Ssijie.sun 		SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n");
2554a5283034Ssijie.sun 		return -ENOMEM;
2555a5283034Ssijie.sun 	}
2556a5283034Ssijie.sun 	device->context = context;
2557a5283034Ssijie.sun 	rc = ibv_query_device(device->context, &device->attr);
2558a5283034Ssijie.sun 	if (rc < 0) {
2559a5283034Ssijie.sun 		SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
2560a5283034Ssijie.sun 		free(device);
2561a5283034Ssijie.sun 		return rc;
2562a5283034Ssijie.sun 	}
2563a5283034Ssijie.sun 
2564a5283034Ssijie.sun #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
2565a5283034Ssijie.sun 	if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) {
2566a5283034Ssijie.sun 		SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,");
2567a5283034Ssijie.sun 		SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id);
2568a5283034Ssijie.sun 	}
2569a5283034Ssijie.sun 
2570a5283034Ssijie.sun 	/**
2571a5283034Ssijie.sun 	 * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE.
2572a5283034Ssijie.sun 	 * The Soft-RoCE RXE driver does not currently support send with invalidate,
2573a5283034Ssijie.sun 	 * but incorrectly reports that it does. There are changes making their way
2574a5283034Ssijie.sun 	 * through the kernel now that will enable this feature. When they are merged,
2575a5283034Ssijie.sun 	 * we can conditionally enable this feature.
2576a5283034Ssijie.sun 	 *
2577a5283034Ssijie.sun 	 * TODO: enable this for versions of the kernel rxe driver that support it.
2578a5283034Ssijie.sun 	 */
2579a5283034Ssijie.sun 	if (nvmf_rdma_is_rxe_device(device)) {
2580a5283034Ssijie.sun 		device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS);
2581a5283034Ssijie.sun 	}
2582a5283034Ssijie.sun #endif
2583a5283034Ssijie.sun 
2584a5283034Ssijie.sun 	/* set up device context async ev fd as NON_BLOCKING */
2585a5283034Ssijie.sun 	flag = fcntl(device->context->async_fd, F_GETFL);
2586a5283034Ssijie.sun 	rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK);
2587a5283034Ssijie.sun 	if (rc < 0) {
2588a5283034Ssijie.sun 		SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n");
2589a5283034Ssijie.sun 		free(device);
2590a5283034Ssijie.sun 		return rc;
2591a5283034Ssijie.sun 	}
2592a5283034Ssijie.sun 
2593a5283034Ssijie.sun 	TAILQ_INSERT_TAIL(&rtransport->devices, device, link);
259434edd9f1SKamil Godzwon 	SPDK_DEBUGLOG(rdma, "New device %p is added to RDMA transport\n", device);
2595a5283034Ssijie.sun 
2596a5283034Ssijie.sun 	if (g_nvmf_hooks.get_ibv_pd) {
2597a5283034Ssijie.sun 		device->pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context);
2598a5283034Ssijie.sun 	} else {
2599a5283034Ssijie.sun 		device->pd = ibv_alloc_pd(device->context);
2600a5283034Ssijie.sun 	}
2601a5283034Ssijie.sun 
2602a5283034Ssijie.sun 	if (!device->pd) {
2603a5283034Ssijie.sun 		SPDK_ERRLOG("Unable to allocate protection domain.\n");
2604549be9adSsijie.sun 		destroy_ib_device(rtransport, device);
2605a5283034Ssijie.sun 		return -ENOMEM;
2606a5283034Ssijie.sun 	}
2607a5283034Ssijie.sun 
2608a5283034Ssijie.sun 	assert(device->map == NULL);
2609a5283034Ssijie.sun 
26108ffb2c09SAlexey Marchuk 	device->map = spdk_rdma_utils_create_mem_map(device->pd, &g_nvmf_hooks, IBV_ACCESS_LOCAL_WRITE);
2611a5283034Ssijie.sun 	if (!device->map) {
2612a5283034Ssijie.sun 		SPDK_ERRLOG("Unable to allocate memory map for listen address\n");
2613549be9adSsijie.sun 		destroy_ib_device(rtransport, device);
2614a5283034Ssijie.sun 		return -ENOMEM;
2615a5283034Ssijie.sun 	}
2616a5283034Ssijie.sun 
2617a5283034Ssijie.sun 	assert(device->map != NULL);
2618a5283034Ssijie.sun 	assert(device->pd != NULL);
2619a5283034Ssijie.sun 
2620a5283034Ssijie.sun 	if (new_device) {
2621a5283034Ssijie.sun 		*new_device = device;
2622a5283034Ssijie.sun 	}
2623549be9adSsijie.sun 	SPDK_NOTICELOG("Created IB device %s(%p/%p) successfully.\n", ibv_get_device_name(context->device),
2624549be9adSsijie.sun 		       device, context);
2625549be9adSsijie.sun 
2626a5283034Ssijie.sun 	return 0;
2627a5283034Ssijie.sun }
2628a5283034Ssijie.sun 
2629a5283034Ssijie.sun static void
2630a5283034Ssijie.sun free_poll_fds(struct spdk_nvmf_rdma_transport *rtransport)
2631a5283034Ssijie.sun {
2632a5283034Ssijie.sun 	if (rtransport->poll_fds) {
2633a5283034Ssijie.sun 		free(rtransport->poll_fds);
2634a5283034Ssijie.sun 		rtransport->poll_fds = NULL;
2635a5283034Ssijie.sun 	}
2636a5283034Ssijie.sun 	rtransport->npoll_fds = 0;
2637a5283034Ssijie.sun }
2638a5283034Ssijie.sun 
2639a5283034Ssijie.sun static int
2640a5283034Ssijie.sun generate_poll_fds(struct spdk_nvmf_rdma_transport *rtransport)
2641a5283034Ssijie.sun {
2642a5283034Ssijie.sun 	/* Set up poll descriptor array to monitor events from RDMA and IB
2643a5283034Ssijie.sun 	 * in a single poll syscall
2644a5283034Ssijie.sun 	 */
2645a5283034Ssijie.sun 	int device_count = 0;
2646a5283034Ssijie.sun 	int i = 0;
2647a5283034Ssijie.sun 	struct spdk_nvmf_rdma_device *device, *tmp;
2648a5283034Ssijie.sun 
2649a5283034Ssijie.sun 	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
2650a5283034Ssijie.sun 		device_count++;
2651a5283034Ssijie.sun 	}
2652a5283034Ssijie.sun 
2653a5283034Ssijie.sun 	rtransport->npoll_fds = device_count + 1;
2654a5283034Ssijie.sun 
2655a5283034Ssijie.sun 	rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd));
2656a5283034Ssijie.sun 	if (rtransport->poll_fds == NULL) {
2657a5283034Ssijie.sun 		SPDK_ERRLOG("poll_fds allocation failed\n");
2658a5283034Ssijie.sun 		return -ENOMEM;
2659a5283034Ssijie.sun 	}
2660a5283034Ssijie.sun 
2661a5283034Ssijie.sun 	rtransport->poll_fds[i].fd = rtransport->event_channel->fd;
2662a5283034Ssijie.sun 	rtransport->poll_fds[i++].events = POLLIN;
2663a5283034Ssijie.sun 
2664a5283034Ssijie.sun 	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
2665a5283034Ssijie.sun 		rtransport->poll_fds[i].fd = device->context->async_fd;
2666a5283034Ssijie.sun 		rtransport->poll_fds[i++].events = POLLIN;
2667a5283034Ssijie.sun 	}
2668a5283034Ssijie.sun 
2669a5283034Ssijie.sun 	return 0;
2670a5283034Ssijie.sun }
267143022da3SJacek Kalwas 
267231d033f9SBen Walker static struct spdk_nvmf_transport *
267355d8d943SSeth Howell nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
2674349295caSBen Walker {
2675349295caSBen Walker 	int rc;
2676ecc436fcSBen Walker 	struct spdk_nvmf_rdma_transport *rtransport;
2677a5283034Ssijie.sun 	struct spdk_nvmf_rdma_device	*device;
2678958c68f1SBen Walker 	struct ibv_context		**contexts;
26797dab13c0SAlexey Marchuk 	size_t				data_wr_pool_size;
2680958c68f1SBen Walker 	uint32_t			i;
2681161a3002STomasz Zawadzki 	int				flag;
26828580daa1SSrikanth kaligotla 	uint32_t			sge_count;
2683e816c8fdSSeth Howell 	uint32_t			min_shared_buffers;
2684ed1a6c7dSAlexey Marchuk 	uint32_t			min_in_capsule_data_size;
2685b6b0a0baSSeth Howell 	int				max_device_sge = SPDK_NVMF_MAX_SGL_ENTRIES;
268631d033f9SBen Walker 
2687ecc436fcSBen Walker 	rtransport = calloc(1, sizeof(*rtransport));
2688ecc436fcSBen Walker 	if (!rtransport) {
268931d033f9SBen Walker 		return NULL;
269031d033f9SBen Walker 	}
269131d033f9SBen Walker 
2692958c68f1SBen Walker 	TAILQ_INIT(&rtransport->devices);
26931cbc2b16SBen Walker 	TAILQ_INIT(&rtransport->ports);
2694645d5944SAlexey Marchuk 	TAILQ_INIT(&rtransport->poll_groups);
2695549be9adSsijie.sun 	TAILQ_INIT(&rtransport->retry_ports);
2696ecc436fcSBen Walker 
2697ecc436fcSBen Walker 	rtransport->transport.ops = &spdk_nvmf_transport_rdma;
269897ef8701SMonica Kenguva 	rtransport->rdma_opts.num_cqe = DEFAULT_NVMF_RDMA_CQ_SIZE;
2699f766d1e4SDarek Stojaczyk 	rtransport->rdma_opts.max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH;
2700f766d1e4SDarek Stojaczyk 	rtransport->rdma_opts.no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ;
2701f766d1e4SDarek Stojaczyk 	rtransport->rdma_opts.acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG;
2702c818233bSIvan Betsis 	rtransport->rdma_opts.no_wr_batching = SPDK_NVMF_RDMA_DEFAULT_NO_WR_BATCHING;
2703f766d1e4SDarek Stojaczyk 	if (opts->transport_specific != NULL &&
2704f766d1e4SDarek Stojaczyk 	    spdk_json_decode_object_relaxed(opts->transport_specific, rdma_transport_opts_decoder,
2705f766d1e4SDarek Stojaczyk 					    SPDK_COUNTOF(rdma_transport_opts_decoder),
2706f766d1e4SDarek Stojaczyk 					    &rtransport->rdma_opts)) {
2707f766d1e4SDarek Stojaczyk 		SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n");
27080d98a949SNaresh Gottumukkala 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
2709f766d1e4SDarek Stojaczyk 		return NULL;
2710f766d1e4SDarek Stojaczyk 	}
2711349295caSBen Walker 
27122172c432STomasz Zawadzki 	SPDK_INFOLOG(rdma, "*** RDMA Transport Init ***\n"
27138e808490SJohn Barnard 		     "  Transport opts:  max_ioq_depth=%d, max_io_size=%d,\n"
27141551197dSAlexey Marchuk 		     "  max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
2715ed0b611fSEvgeniy Kochetov 		     "  in_capsule_data_size=%d, max_aq_depth=%d,\n"
271697ef8701SMonica Kenguva 		     "  num_shared_buffers=%d, num_cqe=%d, max_srq_depth=%d, no_srq=%d,"
2717c818233bSIvan Betsis 		     "  acceptor_backlog=%d, no_wr_batching=%d abort_timeout_sec=%d\n",
27188e808490SJohn Barnard 		     opts->max_queue_depth,
27198e808490SJohn Barnard 		     opts->max_io_size,
27201551197dSAlexey Marchuk 		     opts->max_qpairs_per_ctrlr - 1,
27218e808490SJohn Barnard 		     opts->io_unit_size,
27228e808490SJohn Barnard 		     opts->in_capsule_data_size,
272358f16244SZiye Yang 		     opts->max_aq_depth,
2724ed0b611fSEvgeniy Kochetov 		     opts->num_shared_buffers,
272597ef8701SMonica Kenguva 		     rtransport->rdma_opts.num_cqe,
2726f766d1e4SDarek Stojaczyk 		     rtransport->rdma_opts.max_srq_depth,
2727f766d1e4SDarek Stojaczyk 		     rtransport->rdma_opts.no_srq,
2728f766d1e4SDarek Stojaczyk 		     rtransport->rdma_opts.acceptor_backlog,
2729c818233bSIvan Betsis 		     rtransport->rdma_opts.no_wr_batching,
273026e0ef9aSShuhei Matsumoto 		     opts->abort_timeout_sec);
2731349295caSBen Walker 
27328580daa1SSrikanth kaligotla 	/* I/O unit size cannot be larger than max I/O size */
27338e808490SJohn Barnard 	if (opts->io_unit_size > opts->max_io_size) {
27348e808490SJohn Barnard 		opts->io_unit_size = opts->max_io_size;
27358580daa1SSrikanth kaligotla 	}
27368580daa1SSrikanth kaligotla 
2737f766d1e4SDarek Stojaczyk 	if (rtransport->rdma_opts.acceptor_backlog <= 0) {
27383b830202SSeth Howell 		SPDK_ERRLOG("The acceptor backlog cannot be less than 1, setting to the default value of (%d).\n",
27393b830202SSeth Howell 			    SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG);
2740f766d1e4SDarek Stojaczyk 		rtransport->rdma_opts.acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG;
27413b830202SSeth Howell 	}
27423b830202SSeth Howell 
27430b20f2e5SZiye Yang 	if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) {
27440b20f2e5SZiye Yang 		SPDK_ERRLOG("The number of shared data buffers (%d) is less than "
27450b20f2e5SZiye Yang 			    "the minimum number required to guarantee that forward progress can be made (%d)\n",
27460b20f2e5SZiye Yang 			    opts->num_shared_buffers, (SPDK_NVMF_MAX_SGL_ENTRIES * 2));
27470d98a949SNaresh Gottumukkala 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
27480b20f2e5SZiye Yang 		return NULL;
27490b20f2e5SZiye Yang 	}
27500b20f2e5SZiye Yang 
27513b138377SJim Harris 	/* If buf_cache_size == UINT32_MAX, we will dynamically pick a cache size later that we know will fit. */
27523b138377SJim Harris 	if (opts->buf_cache_size < UINT32_MAX) {
2753e9b9510aSAlexey Marchuk 		min_shared_buffers = spdk_env_get_core_count() * opts->buf_cache_size;
2754e816c8fdSSeth Howell 		if (min_shared_buffers > opts->num_shared_buffers) {
2755e816c8fdSSeth Howell 			SPDK_ERRLOG("There are not enough buffers to satisfy "
2756e816c8fdSSeth Howell 				    "per-poll group caches for each thread. (%" PRIu32 ") "
2757e816c8fdSSeth Howell 				    "supplied. (%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers);
2758e816c8fdSSeth Howell 			SPDK_ERRLOG("Please specify a larger number of shared buffers\n");
27590d98a949SNaresh Gottumukkala 			nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
2760e816c8fdSSeth Howell 			return NULL;
2761e816c8fdSSeth Howell 		}
27623b138377SJim Harris 	}
2763e816c8fdSSeth Howell 
27648e808490SJohn Barnard 	sge_count = opts->max_io_size / opts->io_unit_size;
27651180bf83SSeth Howell 	if (sge_count > NVMF_DEFAULT_TX_SGE) {
27668e808490SJohn Barnard 		SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size);
27670d98a949SNaresh Gottumukkala 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
27688580daa1SSrikanth kaligotla 		return NULL;
27698580daa1SSrikanth kaligotla 	}
27708580daa1SSrikanth kaligotla 
2771ed1a6c7dSAlexey Marchuk 	min_in_capsule_data_size = sizeof(struct spdk_nvme_sgl_descriptor) * SPDK_NVMF_MAX_SGL_ENTRIES;
277295d710ddSAlexey Marchuk 	if (opts->in_capsule_data_size < min_in_capsule_data_size) {
2773ed1a6c7dSAlexey Marchuk 		SPDK_WARNLOG("In-capsule data size is set to %u, the minimum size required to support msdbd=16\n",
2774ed1a6c7dSAlexey Marchuk 			     min_in_capsule_data_size);
2775ed1a6c7dSAlexey Marchuk 		opts->in_capsule_data_size = min_in_capsule_data_size;
2776ed1a6c7dSAlexey Marchuk 	}
2777ed1a6c7dSAlexey Marchuk 
2778ecc436fcSBen Walker 	rtransport->event_channel = rdma_create_event_channel();
2779ecc436fcSBen Walker 	if (rtransport->event_channel == NULL) {
2780891c12a6SPawel Wodkowski 		SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno));
27810d98a949SNaresh Gottumukkala 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
278231d033f9SBen Walker 		return NULL;
2783349295caSBen Walker 	}
2784349295caSBen Walker 
2785161a3002STomasz Zawadzki 	flag = fcntl(rtransport->event_channel->fd, F_GETFL);
2786161a3002STomasz Zawadzki 	if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
2787161a3002STomasz Zawadzki 		SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n",
2788891c12a6SPawel Wodkowski 			    rtransport->event_channel->fd, spdk_strerror(errno));
27890d98a949SNaresh Gottumukkala 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
279031d033f9SBen Walker 		return NULL;
2791349295caSBen Walker 	}
2792349295caSBen Walker 
27937dab13c0SAlexey Marchuk 	data_wr_pool_size = opts->data_wr_pool_size;
27947dab13c0SAlexey Marchuk 	if (data_wr_pool_size < SPDK_NVMF_MAX_SGL_ENTRIES * 2 * spdk_env_get_core_count()) {
27957dab13c0SAlexey Marchuk 		data_wr_pool_size = SPDK_NVMF_MAX_SGL_ENTRIES * 2 * spdk_env_get_core_count();
27967dab13c0SAlexey Marchuk 		SPDK_NOTICELOG("data_wr_pool_size is changed to %zu to guarantee enough cache for handling "
27977dab13c0SAlexey Marchuk 			       "at least one IO in each core\n", data_wr_pool_size);
27987dab13c0SAlexey Marchuk 	}
27997dab13c0SAlexey Marchuk 	rtransport->data_wr_pool = spdk_mempool_create("spdk_nvmf_rdma_wr_data", data_wr_pool_size,
28007dab13c0SAlexey Marchuk 				   sizeof(struct spdk_nvmf_rdma_request_data), SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
2801186b109dSJim Harris 				   SPDK_ENV_NUMA_ID_ANY);
2802cf73fb2fSSeth Howell 	if (!rtransport->data_wr_pool) {
2803475b86aaSKonrad Sztyber 		if (spdk_mempool_lookup("spdk_nvmf_rdma_wr_data") != NULL) {
2804475b86aaSKonrad Sztyber 			SPDK_ERRLOG("Unable to allocate work request pool for poll group: already exists\n");
2805475b86aaSKonrad Sztyber 			SPDK_ERRLOG("Probably running in multiprocess environment, which is "
2806475b86aaSKonrad Sztyber 				    "unsupported by the nvmf library\n");
2807475b86aaSKonrad Sztyber 		} else {
2808cf73fb2fSSeth Howell 			SPDK_ERRLOG("Unable to allocate work request pool for poll group\n");
2809475b86aaSKonrad Sztyber 		}
28100d98a949SNaresh Gottumukkala 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
2811cf73fb2fSSeth Howell 		return NULL;
2812cf73fb2fSSeth Howell 	}
2813cf73fb2fSSeth Howell 
2814958c68f1SBen Walker 	contexts = rdma_get_devices(NULL);
2815043e5edbSDaniel Verkamp 	if (contexts == NULL) {
2816043e5edbSDaniel Verkamp 		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
28170d98a949SNaresh Gottumukkala 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
2818043e5edbSDaniel Verkamp 		return NULL;
2819043e5edbSDaniel Verkamp 	}
2820043e5edbSDaniel Verkamp 
2821958c68f1SBen Walker 	i = 0;
2822958c68f1SBen Walker 	rc = 0;
2823958c68f1SBen Walker 	while (contexts[i] != NULL) {
2824a5283034Ssijie.sun 		rc = create_ib_device(rtransport, contexts[i], &device);
2825958c68f1SBen Walker 		if (rc < 0) {
2826b6f90c52SPhilipp Skadorov 			break;
2827b6f90c52SPhilipp Skadorov 		}
2828958c68f1SBen Walker 		i++;
2829a5283034Ssijie.sun 		max_device_sge = spdk_min(max_device_sge, device->attr.max_sge);
2830549be9adSsijie.sun 		device->is_ready = true;
2831958c68f1SBen Walker 	}
28325518a327SDaniel Verkamp 	rdma_free_devices(contexts);
2833958c68f1SBen Walker 
2834b6b0a0baSSeth Howell 	if (opts->io_unit_size * max_device_sge < opts->max_io_size) {
2835b6b0a0baSSeth Howell 		/* divide and round up. */
2836b6b0a0baSSeth Howell 		opts->io_unit_size = (opts->max_io_size + max_device_sge - 1) / max_device_sge;
2837b6b0a0baSSeth Howell 
2838b6b0a0baSSeth Howell 		/* round up to the nearest 4k. */
2839b6b0a0baSSeth Howell 		opts->io_unit_size = (opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT - 1) & ~NVMF_DATA_BUFFER_MASK;
2840b6b0a0baSSeth Howell 
2841b6b0a0baSSeth Howell 		opts->io_unit_size = spdk_max(opts->io_unit_size, SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE);
2842b6b0a0baSSeth Howell 		SPDK_NOTICELOG("Adjusting the io unit size to fit the device's maximum I/O size. New I/O unit size %u\n",
2843b6b0a0baSSeth Howell 			       opts->io_unit_size);
2844b6b0a0baSSeth Howell 	}
2845b6b0a0baSSeth Howell 
2846958c68f1SBen Walker 	if (rc < 0) {
28470d98a949SNaresh Gottumukkala 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
2848958c68f1SBen Walker 		return NULL;
28495518a327SDaniel Verkamp 	}
28505518a327SDaniel Verkamp 
2851a5283034Ssijie.sun 	rc = generate_poll_fds(rtransport);
2852a5283034Ssijie.sun 	if (rc < 0) {
28530d98a949SNaresh Gottumukkala 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
28545518a327SDaniel Verkamp 		return NULL;
28555518a327SDaniel Verkamp 	}
28565518a327SDaniel Verkamp 
285743022da3SJacek Kalwas 	rtransport->accept_poller = SPDK_POLLER_REGISTER(nvmf_rdma_accept, &rtransport->transport,
2858355806b5SAlexey Marchuk 				    opts->acceptor_poll_rate);
285943022da3SJacek Kalwas 	if (!rtransport->accept_poller) {
286043022da3SJacek Kalwas 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
286143022da3SJacek Kalwas 		return NULL;
286243022da3SJacek Kalwas 	}
286343022da3SJacek Kalwas 
2864ecc436fcSBen Walker 	return &rtransport->transport;
2865349295caSBen Walker }
2866349295caSBen Walker 
2867f766d1e4SDarek Stojaczyk static void
2868a5283034Ssijie.sun destroy_ib_device(struct spdk_nvmf_rdma_transport *rtransport,
2869a5283034Ssijie.sun 		  struct spdk_nvmf_rdma_device *device)
2870a5283034Ssijie.sun {
2871a5283034Ssijie.sun 	TAILQ_REMOVE(&rtransport->devices, device, link);
28728a01b4d6SAlexey Marchuk 	spdk_rdma_utils_free_mem_map(&device->map);
2873a5283034Ssijie.sun 	if (device->pd) {
2874a5283034Ssijie.sun 		if (!g_nvmf_hooks.get_ibv_pd) {
2875a5283034Ssijie.sun 			ibv_dealloc_pd(device->pd);
2876a5283034Ssijie.sun 		}
2877a5283034Ssijie.sun 	}
2878549be9adSsijie.sun 	SPDK_DEBUGLOG(rdma, "IB device [%p] is destroyed.\n", device);
2879a5283034Ssijie.sun 	free(device);
2880a5283034Ssijie.sun }
2881a5283034Ssijie.sun 
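/*
 * Dump the RDMA-specific transport options into a JSON write context. With illustrative values,
 * the emitted fragment looks roughly like the following (num_cqe only appears when no_srq is
 * true):
 *
 *   "max_srq_depth": 4096, "no_srq": false, "acceptor_backlog": 100, "no_wr_batching": false
 */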
2882a5283034Ssijie.sun static void
2883f766d1e4SDarek Stojaczyk nvmf_rdma_dump_opts(struct spdk_nvmf_transport *transport, struct spdk_json_write_ctx *w)
2884f766d1e4SDarek Stojaczyk {
2885f766d1e4SDarek Stojaczyk 	struct spdk_nvmf_rdma_transport	*rtransport;
2886f766d1e4SDarek Stojaczyk 	assert(w != NULL);
2887f766d1e4SDarek Stojaczyk 
2888f766d1e4SDarek Stojaczyk 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
2889f766d1e4SDarek Stojaczyk 	spdk_json_write_named_uint32(w, "max_srq_depth", rtransport->rdma_opts.max_srq_depth);
2890f766d1e4SDarek Stojaczyk 	spdk_json_write_named_bool(w, "no_srq", rtransport->rdma_opts.no_srq);
289197ef8701SMonica Kenguva 	if (rtransport->rdma_opts.no_srq == true) {
289297ef8701SMonica Kenguva 		spdk_json_write_named_int32(w, "num_cqe", rtransport->rdma_opts.num_cqe);
289397ef8701SMonica Kenguva 	}
2894f766d1e4SDarek Stojaczyk 	spdk_json_write_named_int32(w, "acceptor_backlog", rtransport->rdma_opts.acceptor_backlog);
2895bd3840a7SIvan Betsis 	spdk_json_write_named_bool(w, "no_wr_batching", rtransport->rdma_opts.no_wr_batching);
2896f766d1e4SDarek Stojaczyk }
2897f766d1e4SDarek Stojaczyk 
2898349295caSBen Walker static int
28990d98a949SNaresh Gottumukkala nvmf_rdma_destroy(struct spdk_nvmf_transport *transport,
29000d98a949SNaresh Gottumukkala 		  spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
2901349295caSBen Walker {
2902ecc436fcSBen Walker 	struct spdk_nvmf_rdma_transport	*rtransport;
29036428de9eSBen Walker 	struct spdk_nvmf_rdma_port	*port, *port_tmp;
2904958c68f1SBen Walker 	struct spdk_nvmf_rdma_device	*device, *device_tmp;
290538980dedSZiye Yang 
2906ecc436fcSBen Walker 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
2907ecc436fcSBen Walker 
2908549be9adSsijie.sun 	TAILQ_FOREACH_SAFE(port, &rtransport->retry_ports, link, port_tmp) {
2909549be9adSsijie.sun 		TAILQ_REMOVE(&rtransport->retry_ports, port, link);
2910549be9adSsijie.sun 		free(port);
2911549be9adSsijie.sun 	}
2912549be9adSsijie.sun 
29136428de9eSBen Walker 	TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) {
29146428de9eSBen Walker 		TAILQ_REMOVE(&rtransport->ports, port, link);
29156428de9eSBen Walker 		rdma_destroy_id(port->id);
29166428de9eSBen Walker 		free(port);
29176428de9eSBen Walker 	}
29186428de9eSBen Walker 
2919a5283034Ssijie.sun 	free_poll_fds(rtransport);
2920b6f90c52SPhilipp Skadorov 
2921ecc436fcSBen Walker 	if (rtransport->event_channel != NULL) {
2922ecc436fcSBen Walker 		rdma_destroy_event_channel(rtransport->event_channel);
29231290f02fSDaniel Verkamp 	}
2924349295caSBen Walker 
2925958c68f1SBen Walker 	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) {
2926a5283034Ssijie.sun 		destroy_ib_device(rtransport, device);
2927958c68f1SBen Walker 	}
2928958c68f1SBen Walker 
2929cf73fb2fSSeth Howell 	if (rtransport->data_wr_pool != NULL) {
2930ab2395bbSAlexey Marchuk 		if (spdk_mempool_count(rtransport->data_wr_pool) != transport->opts.data_wr_pool_size) {
2931cf73fb2fSSeth Howell 			SPDK_ERRLOG("transport wr pool count is %zu but should be %u\n",
2932cf73fb2fSSeth Howell 				    spdk_mempool_count(rtransport->data_wr_pool),
2933cf73fb2fSSeth Howell 				    transport->opts.data_wr_pool_size);
2934cf73fb2fSSeth Howell 		}
2935cf73fb2fSSeth Howell 	}
2936cf73fb2fSSeth Howell 
2937cf73fb2fSSeth Howell 	spdk_mempool_free(rtransport->data_wr_pool);
2938645d5944SAlexey Marchuk 
293943022da3SJacek Kalwas 	spdk_poller_unregister(&rtransport->accept_poller);
2940ecc436fcSBen Walker 	free(rtransport);
294131d033f9SBen Walker 
29420d98a949SNaresh Gottumukkala 	if (cb_fn) {
29430d98a949SNaresh Gottumukkala 		cb_fn(cb_arg);
29440d98a949SNaresh Gottumukkala 	}
2945349295caSBen Walker 	return 0;
2946349295caSBen Walker }
2947349295caSBen Walker 
29488dd1cd21SBen Walker static int nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id,
29498e2f0cdbSzkhatami88 				     struct spdk_nvme_transport_id *trid,
29508e2f0cdbSzkhatami88 				     bool peer);
29518e2f0cdbSzkhatami88 
2952549be9adSsijie.sun static bool nvmf_rdma_rescan_devices(struct spdk_nvmf_rdma_transport *rtransport);
2953549be9adSsijie.sun 
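/*
 * Start listening for new RDMA connections on the address described by 'trid'. This callback is
 * normally reached through the generic target listen path (e.g. spdk_nvmf_tgt_listen_ext()); a
 * minimal sketch of how a caller might fill in the transport id (illustrative only, not part of
 * this file):
 *
 *   struct spdk_nvme_transport_id trid = {};
 *
 *   spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_RDMA);
 *   trid.adrfam = SPDK_NVMF_ADRFAM_IPV4;
 *   snprintf(trid.traddr, sizeof(trid.traddr), "192.168.1.10");
 *   snprintf(trid.trsvcid, sizeof(trid.trsvcid), "4420");
 */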
29548e2f0cdbSzkhatami88 static int
295587a062e6SJacek Kalwas nvmf_rdma_listen(struct spdk_nvmf_transport *transport, const struct spdk_nvme_transport_id *trid,
295687a062e6SJacek Kalwas 		 struct spdk_nvmf_listen_opts *listen_opts)
29577e3b9f25SBen Walker {
2958ecc436fcSBen Walker 	struct spdk_nvmf_rdma_transport	*rtransport;
2959958c68f1SBen Walker 	struct spdk_nvmf_rdma_device	*device;
2960549be9adSsijie.sun 	struct spdk_nvmf_rdma_port	*port, *tmp_port;
2961c7b8b414SDaniel Verkamp 	struct addrinfo			*res;
2962c7b8b414SDaniel Verkamp 	struct addrinfo			hints;
2963c7b8b414SDaniel Verkamp 	int				family;
29647e3b9f25SBen Walker 	int				rc;
2965c3d90406SJim Harris 	long int			port_val;
2966549be9adSsijie.sun 	bool				is_retry = false;
29677e3b9f25SBen Walker 
29689a1cf1c5SJacek Kalwas 	if (!strlen(trid->trsvcid)) {
29699a1cf1c5SJacek Kalwas 		SPDK_ERRLOG("Service id is required\n");
29709a1cf1c5SJacek Kalwas 		return -EINVAL;
29719a1cf1c5SJacek Kalwas 	}
29729a1cf1c5SJacek Kalwas 
2973ecc436fcSBen Walker 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
29747cd56fb3SJacek Kalwas 	assert(rtransport->event_channel != NULL);
29757cd56fb3SJacek Kalwas 
29761cbc2b16SBen Walker 	port = calloc(1, sizeof(*port));
29771cbc2b16SBen Walker 	if (!port) {
29787cd56fb3SJacek Kalwas 		SPDK_ERRLOG("Port allocation failed\n");
29797e3b9f25SBen Walker 		return -ENOMEM;
29807e3b9f25SBen Walker 	}
29817e3b9f25SBen Walker 
29826d8f1fc6SJacek Kalwas 	port->trid = trid;
29837e3b9f25SBen Walker 
29846d8f1fc6SJacek Kalwas 	switch (trid->adrfam) {
2985c7b8b414SDaniel Verkamp 	case SPDK_NVMF_ADRFAM_IPV4:
2986c7b8b414SDaniel Verkamp 		family = AF_INET;
2987c7b8b414SDaniel Verkamp 		break;
2988c7b8b414SDaniel Verkamp 	case SPDK_NVMF_ADRFAM_IPV6:
2989c7b8b414SDaniel Verkamp 		family = AF_INET6;
2990c7b8b414SDaniel Verkamp 		break;
2991c7b8b414SDaniel Verkamp 	default:
29926d8f1fc6SJacek Kalwas 		SPDK_ERRLOG("Unhandled ADRFAM %d\n", trid->adrfam);
2993c7b8b414SDaniel Verkamp 		free(port);
2994c7b8b414SDaniel Verkamp 		return -EINVAL;
2995c7b8b414SDaniel Verkamp 	}
2996c7b8b414SDaniel Verkamp 
2997c7b8b414SDaniel Verkamp 	memset(&hints, 0, sizeof(hints));
2998c7b8b414SDaniel Verkamp 	hints.ai_family = family;
29991c34d1a4SBen Walker 	hints.ai_flags = AI_NUMERICSERV;
3000c7b8b414SDaniel Verkamp 	hints.ai_socktype = SOCK_STREAM;
3001c7b8b414SDaniel Verkamp 	hints.ai_protocol = 0;
3002c7b8b414SDaniel Verkamp 
3003c3d90406SJim Harris 	/* Range check the trsvcid. Fail in 3 cases:
3004c3d90406SJim Harris 	 * < 0: means that spdk_strtol hit an error
3005c3d90406SJim Harris 	 * 0: this results in ephemeral port which we don't want
3006c3d90406SJim Harris 	 * > 65535: port too high
3007c3d90406SJim Harris 	 */
3008c3d90406SJim Harris 	port_val = spdk_strtol(trid->trsvcid, 10);
3009c3d90406SJim Harris 	if (port_val <= 0 || port_val > 65535) {
3010c3d90406SJim Harris 		SPDK_ERRLOG("invalid trsvcid %s\n", trid->trsvcid);
3011c3d90406SJim Harris 		free(port);
3012c3d90406SJim Harris 		return -EINVAL;
3013c3d90406SJim Harris 	}
3014c3d90406SJim Harris 
30156d8f1fc6SJacek Kalwas 	rc = getaddrinfo(trid->traddr, trid->trsvcid, &hints, &res);
3016c7b8b414SDaniel Verkamp 	if (rc) {
3017c7b8b414SDaniel Verkamp 		SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc);
3018c7b8b414SDaniel Verkamp 		free(port);
3019f6866117STomasz Zawadzki 		return -(abs(rc));
3020e95e4028SJacek Kalwas 	}
3021e95e4028SJacek Kalwas 
3022e95e4028SJacek Kalwas 	rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP);
3023e95e4028SJacek Kalwas 	if (rc < 0) {
3024e95e4028SJacek Kalwas 		SPDK_ERRLOG("rdma_create_id() failed\n");
3025e95e4028SJacek Kalwas 		freeaddrinfo(res);
3026e95e4028SJacek Kalwas 		free(port);
3027e95e4028SJacek Kalwas 		return rc;
3028e95e4028SJacek Kalwas 	}
3029e95e4028SJacek Kalwas 
3030c7b8b414SDaniel Verkamp 	rc = rdma_bind_addr(port->id, res->ai_addr);
3031c7b8b414SDaniel Verkamp 	freeaddrinfo(res);
3032c7b8b414SDaniel Verkamp 
30337e3b9f25SBen Walker 	if (rc < 0) {
3034549be9adSsijie.sun 		TAILQ_FOREACH(tmp_port, &rtransport->retry_ports, link) {
3035549be9adSsijie.sun 			if (spdk_nvme_transport_id_compare(tmp_port->trid, trid) == 0) {
3036549be9adSsijie.sun 				is_retry = true;
3037549be9adSsijie.sun 				break;
3038549be9adSsijie.sun 			}
3039549be9adSsijie.sun 		}
3040549be9adSsijie.sun 		if (!is_retry) {
30417e3b9f25SBen Walker 			SPDK_ERRLOG("rdma_bind_addr() failed\n");
3042549be9adSsijie.sun 		}
30431cbc2b16SBen Walker 		rdma_destroy_id(port->id);
30441cbc2b16SBen Walker 		free(port);
30457e3b9f25SBen Walker 		return rc;
30467e3b9f25SBen Walker 	}
30477e3b9f25SBen Walker 
3048a0246f65Sshahar salzman 	if (!port->id->verbs) {
3049a0246f65Sshahar salzman 		SPDK_ERRLOG("ibv_context is null\n");
3050a0246f65Sshahar salzman 		rdma_destroy_id(port->id);
3051a0246f65Sshahar salzman 		free(port);
3052a0246f65Sshahar salzman 		return -1;
3053a0246f65Sshahar salzman 	}
3054a0246f65Sshahar salzman 
3055f766d1e4SDarek Stojaczyk 	rc = rdma_listen(port->id, rtransport->rdma_opts.acceptor_backlog);
30567e3b9f25SBen Walker 	if (rc < 0) {
30577e3b9f25SBen Walker 		SPDK_ERRLOG("rdma_listen() failed\n");
30581cbc2b16SBen Walker 		rdma_destroy_id(port->id);
30591cbc2b16SBen Walker 		free(port);
30607e3b9f25SBen Walker 		return rc;
30617e3b9f25SBen Walker 	}
30627e3b9f25SBen Walker 
3063958c68f1SBen Walker 	TAILQ_FOREACH(device, &rtransport->devices, link) {
3064549be9adSsijie.sun 		if (device->context == port->id->verbs && device->is_ready) {
30651cbc2b16SBen Walker 			port->device = device;
3066958c68f1SBen Walker 			break;
3067958c68f1SBen Walker 		}
3068958c68f1SBen Walker 	}
30691cbc2b16SBen Walker 	if (!port->device) {
3070958c68f1SBen Walker 		SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n",
30711cbc2b16SBen Walker 			    port->id->verbs);
30721cbc2b16SBen Walker 		rdma_destroy_id(port->id);
30731cbc2b16SBen Walker 		free(port);
30742470b995Ssijie.sun 		nvmf_rdma_rescan_devices(rtransport);
3075958c68f1SBen Walker 		return -EINVAL;
3076958c68f1SBen Walker 	}
3077958c68f1SBen Walker 
30787cd56fb3SJacek Kalwas 	SPDK_NOTICELOG("*** NVMe/RDMA Target Listening on %s port %s ***\n",
30797cd56fb3SJacek Kalwas 		       trid->traddr, trid->trsvcid);
30807e3b9f25SBen Walker 
30811cbc2b16SBen Walker 	TAILQ_INSERT_TAIL(&rtransport->ports, port, link);
30827e3b9f25SBen Walker 	return 0;
30837e3b9f25SBen Walker }
30847e3b9f25SBen Walker 
30856d8f1fc6SJacek Kalwas static void
3086549be9adSsijie.sun nvmf_rdma_stop_listen_ex(struct spdk_nvmf_transport *transport,
3087549be9adSsijie.sun 			 const struct spdk_nvme_transport_id *trid, bool need_retry)
30884440cd8dSZiye Yang {
3089ecc436fcSBen Walker 	struct spdk_nvmf_rdma_transport	*rtransport;
30901cbc2b16SBen Walker 	struct spdk_nvmf_rdma_port	*port, *tmp;
30917e3b9f25SBen Walker 
3092ecc436fcSBen Walker 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
3093ecc436fcSBen Walker 
3094549be9adSsijie.sun 	if (!need_retry) {
3095549be9adSsijie.sun 		TAILQ_FOREACH_SAFE(port, &rtransport->retry_ports, link, tmp) {
3096549be9adSsijie.sun 			if (spdk_nvme_transport_id_compare(port->trid, trid) == 0) {
3097549be9adSsijie.sun 				TAILQ_REMOVE(&rtransport->retry_ports, port, link);
3098549be9adSsijie.sun 				free(port);
3099549be9adSsijie.sun 			}
3100549be9adSsijie.sun 		}
3101549be9adSsijie.sun 	}
3102549be9adSsijie.sun 
31031cbc2b16SBen Walker 	TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) {
31046d8f1fc6SJacek Kalwas 		if (spdk_nvme_transport_id_compare(port->trid, trid) == 0) {
3105549be9adSsijie.sun 			SPDK_DEBUGLOG(rdma, "Port %s:%s removed. need retry: %d\n",
3106549be9adSsijie.sun 				      port->trid->traddr, port->trid->trsvcid, need_retry);
31071cbc2b16SBen Walker 			TAILQ_REMOVE(&rtransport->ports, port, link);
31081cbc2b16SBen Walker 			rdma_destroy_id(port->id);
3109549be9adSsijie.sun 			port->id = NULL;
3110549be9adSsijie.sun 			port->device = NULL;
3111549be9adSsijie.sun 			if (need_retry) {
3112549be9adSsijie.sun 				TAILQ_INSERT_TAIL(&rtransport->retry_ports, port, link);
3113549be9adSsijie.sun 			} else {
31141cbc2b16SBen Walker 				free(port);
3115549be9adSsijie.sun 			}
31164440cd8dSZiye Yang 			break;
31174440cd8dSZiye Yang 		}
31184440cd8dSZiye Yang 	}
31194440cd8dSZiye Yang }
31204440cd8dSZiye Yang 
3121549be9adSsijie.sun static void
3122549be9adSsijie.sun nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport,
3123549be9adSsijie.sun 		      const struct spdk_nvme_transport_id *trid)
3124549be9adSsijie.sun {
3125549be9adSsijie.sun 	nvmf_rdma_stop_listen_ex(transport, trid, false);
3126549be9adSsijie.sun }
3127549be9adSsijie.sun 
3128549be9adSsijie.sun static void _nvmf_rdma_register_poller_in_group(void *c);
31298ddc5cd4Ssijie.sun static void _nvmf_rdma_remove_poller_in_group(void *c);
31308ddc5cd4Ssijie.sun 
31318ddc5cd4Ssijie.sun static bool
3132549be9adSsijie.sun nvmf_rdma_all_pollers_management_done(void *c)
31338ddc5cd4Ssijie.sun {
31348ddc5cd4Ssijie.sun 	struct poller_manage_ctx	*ctx = c;
31358ddc5cd4Ssijie.sun 	int				counter;
31368ddc5cd4Ssijie.sun 
31378ddc5cd4Ssijie.sun 	counter = __atomic_sub_fetch(ctx->inflight_op_counter, 1, __ATOMIC_SEQ_CST);
3138549be9adSsijie.sun 	SPDK_DEBUGLOG(rdma, "nvmf_rdma_all_pollers_management_done called. counter: %d, poller: %p\n",
31398ddc5cd4Ssijie.sun 		      counter, ctx->rpoller);
31408ddc5cd4Ssijie.sun 
31418ddc5cd4Ssijie.sun 	if (counter == 0) {
31428ddc5cd4Ssijie.sun 		free((void *)ctx->inflight_op_counter);
31438ddc5cd4Ssijie.sun 	}
31448ddc5cd4Ssijie.sun 	free(ctx);
31458ddc5cd4Ssijie.sun 
31468ddc5cd4Ssijie.sun 	return counter == 0;
31478ddc5cd4Ssijie.sun }
31488ddc5cd4Ssijie.sun 
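/*
 * Add (is_add == true) or remove a poller for 'device' on every poll group of the transport.
 * Each poll group that actually needs a change receives a poller_manage_ctx message on its own
 * thread; the contexts share a heap-allocated counter that starts at the number of poll groups
 * and is decremented as each group is handled, so the final completion (counter hitting zero)
 * frees the counter. *has_inflight reports whether any message is still outstanding.
 */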
31498ddc5cd4Ssijie.sun static int
3150549be9adSsijie.sun nvmf_rdma_manage_poller(struct spdk_nvmf_rdma_transport *rtransport,
3151549be9adSsijie.sun 			struct spdk_nvmf_rdma_device *device, bool *has_inflight, bool is_add)
31528ddc5cd4Ssijie.sun {
31538ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_poll_group	*rgroup;
31548ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_poller		*rpoller;
31558ddc5cd4Ssijie.sun 	struct spdk_nvmf_poll_group		*poll_group;
31568ddc5cd4Ssijie.sun 	struct poller_manage_ctx		*ctx;
31578ddc5cd4Ssijie.sun 	bool					found;
31588ddc5cd4Ssijie.sun 	int					*inflight_counter;
31598ddc5cd4Ssijie.sun 	spdk_msg_fn				do_fn;
31608ddc5cd4Ssijie.sun 
31618ddc5cd4Ssijie.sun 	*has_inflight = false;
3162549be9adSsijie.sun 	do_fn = is_add ? _nvmf_rdma_register_poller_in_group : _nvmf_rdma_remove_poller_in_group;
31638ddc5cd4Ssijie.sun 	inflight_counter = calloc(1, sizeof(int));
31648ddc5cd4Ssijie.sun 	if (!inflight_counter) {
31658ddc5cd4Ssijie.sun 		SPDK_ERRLOG("Failed to allocate inflight counter when removing pollers\n");
31668ddc5cd4Ssijie.sun 		return -ENOMEM;
31678ddc5cd4Ssijie.sun 	}
31688ddc5cd4Ssijie.sun 
31698ddc5cd4Ssijie.sun 	TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) {
31708ddc5cd4Ssijie.sun 		(*inflight_counter)++;
31718ddc5cd4Ssijie.sun 	}
31728ddc5cd4Ssijie.sun 
31738ddc5cd4Ssijie.sun 	TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) {
31748ddc5cd4Ssijie.sun 		found = false;
31758ddc5cd4Ssijie.sun 		TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
31768ddc5cd4Ssijie.sun 			if (rpoller->device == device) {
31778ddc5cd4Ssijie.sun 				found = true;
31788ddc5cd4Ssijie.sun 				break;
31798ddc5cd4Ssijie.sun 			}
31808ddc5cd4Ssijie.sun 		}
3181549be9adSsijie.sun 		if (found == is_add) {
31828ddc5cd4Ssijie.sun 			__atomic_fetch_sub(inflight_counter, 1, __ATOMIC_SEQ_CST);
31838ddc5cd4Ssijie.sun 			continue;
31848ddc5cd4Ssijie.sun 		}
31858ddc5cd4Ssijie.sun 
31868ddc5cd4Ssijie.sun 		ctx = calloc(1, sizeof(struct poller_manage_ctx));
31878ddc5cd4Ssijie.sun 		if (!ctx) {
31888ddc5cd4Ssijie.sun 			SPDK_ERRLOG("Failed to allocate poller_manage_ctx when removing pollers\n");
31898ddc5cd4Ssijie.sun 			if (!*has_inflight) {
31908ddc5cd4Ssijie.sun 				free(inflight_counter);
31918ddc5cd4Ssijie.sun 			}
31928ddc5cd4Ssijie.sun 			return -ENOMEM;
31938ddc5cd4Ssijie.sun 		}
31948ddc5cd4Ssijie.sun 
31958ddc5cd4Ssijie.sun 		ctx->rtransport = rtransport;
31968ddc5cd4Ssijie.sun 		ctx->rgroup = rgroup;
31978ddc5cd4Ssijie.sun 		ctx->rpoller = rpoller;
31988ddc5cd4Ssijie.sun 		ctx->device = device;
31998ddc5cd4Ssijie.sun 		ctx->thread = spdk_get_thread();
32008ddc5cd4Ssijie.sun 		ctx->inflight_op_counter = inflight_counter;
32018ddc5cd4Ssijie.sun 		*has_inflight = true;
32028ddc5cd4Ssijie.sun 
32038ddc5cd4Ssijie.sun 		poll_group = rgroup->group.group;
32048ddc5cd4Ssijie.sun 		if (poll_group->thread != spdk_get_thread()) {
32058ddc5cd4Ssijie.sun 			spdk_thread_send_msg(poll_group->thread, do_fn, ctx);
32068ddc5cd4Ssijie.sun 		} else {
32078ddc5cd4Ssijie.sun 			do_fn(ctx);
32088ddc5cd4Ssijie.sun 		}
32098ddc5cd4Ssijie.sun 	}
32108ddc5cd4Ssijie.sun 
32118ddc5cd4Ssijie.sun 	if (!*has_inflight) {
32128ddc5cd4Ssijie.sun 		free(inflight_counter);
32138ddc5cd4Ssijie.sun 	}
32148ddc5cd4Ssijie.sun 
32158ddc5cd4Ssijie.sun 	return 0;
32168ddc5cd4Ssijie.sun }
32178ddc5cd4Ssijie.sun 
3218549be9adSsijie.sun static void nvmf_rdma_handle_device_removal(struct spdk_nvmf_rdma_transport *rtransport,
3219549be9adSsijie.sun 		struct spdk_nvmf_rdma_device *device);
3220549be9adSsijie.sun 
3221549be9adSsijie.sun static struct spdk_nvmf_rdma_device *
3222549be9adSsijie.sun nvmf_rdma_find_ib_device(struct spdk_nvmf_rdma_transport *rtransport,
3223549be9adSsijie.sun 			 struct ibv_context *context)
3224549be9adSsijie.sun {
3225549be9adSsijie.sun 	struct spdk_nvmf_rdma_device	*device, *tmp_device;
3226549be9adSsijie.sun 
3227549be9adSsijie.sun 	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp_device) {
3228549be9adSsijie.sun 		if (device->need_destroy) {
3229549be9adSsijie.sun 			continue;
3230549be9adSsijie.sun 		}
3231549be9adSsijie.sun 
3232549be9adSsijie.sun 		if (strcmp(device->context->device->dev_name, context->device->dev_name) == 0) {
3233549be9adSsijie.sun 			return device;
3234549be9adSsijie.sun 		}
3235549be9adSsijie.sun 	}
3236549be9adSsijie.sun 
3237549be9adSsijie.sun 	return NULL;
3238549be9adSsijie.sun }
3239549be9adSsijie.sun 
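/*
 * Reconcile one ibv_context reported by rdma_get_devices() with the transport's device list.
 * If the device name is already known but its context pointer changed, the old context is stale
 * and the old device is scheduled for removal. If the device is unknown, a new device and its
 * pollers are created. Returns true only when a new device was created.
 */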
3240549be9adSsijie.sun static bool
3241549be9adSsijie.sun nvmf_rdma_check_devices_context(struct spdk_nvmf_rdma_transport *rtransport,
3242549be9adSsijie.sun 				struct ibv_context *context)
3243549be9adSsijie.sun {
3244549be9adSsijie.sun 	struct spdk_nvmf_rdma_device	*old_device, *new_device;
3245549be9adSsijie.sun 	int				rc = 0;
3246549be9adSsijie.sun 	bool				has_inflight;
3247549be9adSsijie.sun 
3248549be9adSsijie.sun 	old_device = nvmf_rdma_find_ib_device(rtransport, context);
3249549be9adSsijie.sun 
3250549be9adSsijie.sun 	if (old_device) {
3251549be9adSsijie.sun 		if (old_device->context != context && !old_device->need_destroy && old_device->is_ready) {
3252549be9adSsijie.sun 			/* The old context may not have been cleaned up by the time we rescan. Exactly one
3253549be9adSsijie.sun 			 * context is valid for a device, so this context must be stale; just remove it. */
3254549be9adSsijie.sun 			SPDK_WARNLOG("Device %p has an invalid context %p\n", old_device, old_device->context);
3255549be9adSsijie.sun 			old_device->need_destroy = true;
3256549be9adSsijie.sun 			nvmf_rdma_handle_device_removal(rtransport, old_device);
3257549be9adSsijie.sun 		}
3258549be9adSsijie.sun 		return false;
3259549be9adSsijie.sun 	}
3260549be9adSsijie.sun 
3261549be9adSsijie.sun 	rc = create_ib_device(rtransport, context, &new_device);
3262549be9adSsijie.sun 	/* TODO: update transport opts. */
3263549be9adSsijie.sun 	if (rc < 0) {
3264549be9adSsijie.sun 		SPDK_ERRLOG("Failed to create ib device for context: %s(%p)\n",
3265549be9adSsijie.sun 			    ibv_get_device_name(context->device), context);
3266549be9adSsijie.sun 		return false;
3267549be9adSsijie.sun 	}
3268549be9adSsijie.sun 
3269549be9adSsijie.sun 	rc = nvmf_rdma_manage_poller(rtransport, new_device, &has_inflight, true);
3270549be9adSsijie.sun 	if (rc < 0) {
3271549be9adSsijie.sun 		SPDK_ERRLOG("Failed to add poller for device context: %s(%p)\n",
3272549be9adSsijie.sun 			    ibv_get_device_name(context->device), context);
3273549be9adSsijie.sun 		return false;
3274549be9adSsijie.sun 	}
3275549be9adSsijie.sun 
3276549be9adSsijie.sun 	if (has_inflight) {
3277549be9adSsijie.sun 		new_device->is_ready = true;
3278549be9adSsijie.sun 	}
3279549be9adSsijie.sun 
3280549be9adSsijie.sun 	return true;
3281549be9adSsijie.sun }
3282549be9adSsijie.sun 
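/*
 * Rescan the RDMA devices visible to the process and pick up any device that appeared (or came
 * back) since the last scan. Each device is first probed with ibv_open_device() to make sure its
 * verbs context is healthy, then the contexts from rdma_get_devices() are reconciled with the
 * known device list; if anything new was created, the poll fds are regenerated. Returns true if
 * at least one new device was added.
 */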
3283549be9adSsijie.sun static bool
3284549be9adSsijie.sun nvmf_rdma_rescan_devices(struct spdk_nvmf_rdma_transport *rtransport)
3285549be9adSsijie.sun {
3286549be9adSsijie.sun 	struct spdk_nvmf_rdma_device	*device;
3287549be9adSsijie.sun 	struct ibv_device		**ibv_device_list = NULL;
3288549be9adSsijie.sun 	struct ibv_context		**contexts = NULL;
3289549be9adSsijie.sun 	int				i = 0;
3290549be9adSsijie.sun 	int				num_dev = 0;
3291549be9adSsijie.sun 	bool				new_create = false, has_new_device = false;
3292549be9adSsijie.sun 	struct ibv_context		*tmp_verbs = NULL;
3293549be9adSsijie.sun 
3294549be9adSsijie.sun 	/* Do not rescan while any device is being destroyed, or its context may be freed
3295549be9adSsijie.sun 	 * while the poll fds are being regenerated.
3296549be9adSsijie.sun 	 */
3297549be9adSsijie.sun 	TAILQ_FOREACH(device, &rtransport->devices, link) {
3298549be9adSsijie.sun 		if (device->need_destroy) {
3299549be9adSsijie.sun 			return false;
3300549be9adSsijie.sun 		}
3301549be9adSsijie.sun 	}
3302549be9adSsijie.sun 
3303549be9adSsijie.sun 	ibv_device_list = ibv_get_device_list(&num_dev);
3304549be9adSsijie.sun 
3305549be9adSsijie.sun 	/* There is a bug in librdmacm: if verbs initialization fails inside rdma_get_devices, the
3306549be9adSsijie.sun 	 * device is marked as dead and is never initialized again. So we need to make sure the
3307549be9adSsijie.sun 	 * verbs context can be opened before we call rdma_get_devices. */
3308549be9adSsijie.sun 	if (num_dev >= 0) {
3309549be9adSsijie.sun 		for (i = 0; i < num_dev; i++) {
3310549be9adSsijie.sun 			tmp_verbs = ibv_open_device(ibv_device_list[i]);
3311549be9adSsijie.sun 			if (!tmp_verbs) {
3312549be9adSsijie.sun 				SPDK_WARNLOG("Failed to open ibv device %p, errno %d. Skipping rescan.\n", ibv_device_list[i], errno);
3313549be9adSsijie.sun 				break;
3314549be9adSsijie.sun 			}
3315549be9adSsijie.sun 			if (nvmf_rdma_find_ib_device(rtransport, tmp_verbs) == NULL) {
3316549be9adSsijie.sun 				SPDK_DEBUGLOG(rdma, "Found new verbs-initialized ibv device %p (%s).\n", ibv_device_list[i],
3317549be9adSsijie.sun 					      tmp_verbs->device->dev_name);
3318549be9adSsijie.sun 				has_new_device = true;
3319549be9adSsijie.sun 			}
3320549be9adSsijie.sun 			ibv_close_device(tmp_verbs);
3321549be9adSsijie.sun 		}
3322549be9adSsijie.sun 		ibv_free_device_list(ibv_device_list);
3323549be9adSsijie.sun 		if (!tmp_verbs || !has_new_device) {
3324549be9adSsijie.sun 			return false;
3325549be9adSsijie.sun 		}
3326549be9adSsijie.sun 	}
3327549be9adSsijie.sun 
3328549be9adSsijie.sun 	contexts = rdma_get_devices(NULL);
3329549be9adSsijie.sun 
3330549be9adSsijie.sun 	for (i = 0; contexts && contexts[i] != NULL; i++) {
3331549be9adSsijie.sun 		new_create |= nvmf_rdma_check_devices_context(rtransport, contexts[i]);
3332549be9adSsijie.sun 	}
3333549be9adSsijie.sun 
3334549be9adSsijie.sun 	if (new_create) {
3335549be9adSsijie.sun 		free_poll_fds(rtransport);
3336549be9adSsijie.sun 		generate_poll_fds(rtransport);
3337549be9adSsijie.sun 	}
3338549be9adSsijie.sun 
3339549be9adSsijie.sun 	if (contexts) {
3340549be9adSsijie.sun 		rdma_free_devices(contexts);
3341549be9adSsijie.sun 	}
3342549be9adSsijie.sun 
3343549be9adSsijie.sun 	return new_create;
3344549be9adSsijie.sun }
3345549be9adSsijie.sun 
3346549be9adSsijie.sun static bool
3347549be9adSsijie.sun nvmf_rdma_retry_listen_port(struct spdk_nvmf_rdma_transport *rtransport)
3348549be9adSsijie.sun {
3349549be9adSsijie.sun 	struct spdk_nvmf_rdma_port	*port, *tmp_port;
3350549be9adSsijie.sun 	int				rc = 0;
3351549be9adSsijie.sun 	bool				new_create = false;
3352549be9adSsijie.sun 
3353549be9adSsijie.sun 	if (TAILQ_EMPTY(&rtransport->retry_ports)) {
3354549be9adSsijie.sun 		return false;
3355549be9adSsijie.sun 	}
3356549be9adSsijie.sun 
3357549be9adSsijie.sun 	new_create = nvmf_rdma_rescan_devices(rtransport);
3358549be9adSsijie.sun 
3359549be9adSsijie.sun 	TAILQ_FOREACH_SAFE(port, &rtransport->retry_ports, link, tmp_port) {
3360549be9adSsijie.sun 		rc = nvmf_rdma_listen(&rtransport->transport, port->trid, NULL);
3361549be9adSsijie.sun 
3362549be9adSsijie.sun 		TAILQ_REMOVE(&rtransport->retry_ports, port, link);
3363549be9adSsijie.sun 		if (rc) {
3364549be9adSsijie.sun 			if (new_create) {
3365549be9adSsijie.sun 				SPDK_ERRLOG("Found a new IB device, but port %s:%s still failed to listen, rc %d.\n",
3366549be9adSsijie.sun 					    port->trid->traddr, port->trid->trsvcid, rc);
3367549be9adSsijie.sun 			}
3368549be9adSsijie.sun 			TAILQ_INSERT_TAIL(&rtransport->retry_ports, port, link);
3369549be9adSsijie.sun 			break;
3370549be9adSsijie.sun 		} else {
3371549be9adSsijie.sun 			SPDK_NOTICELOG("Port %s:%s came back\n", port->trid->traddr, port->trid->trsvcid);
3372549be9adSsijie.sun 			free(port);
3373549be9adSsijie.sun 		}
3374549be9adSsijie.sun 	}
3375549be9adSsijie.sun 
3376549be9adSsijie.sun 	return true;
3377549be9adSsijie.sun }
3378549be9adSsijie.sun 
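/*
 * Drive the request state machine for everything queued on this qpair. The order below is
 * intentional: requests waiting to send a response first, then pending RDMA reads, then pending
 * RDMA writes, then requests waiting for data buffers, and finally new commands pulled from the
 * free/incoming resource queues. When 'drain' is set, processing does not stop at the first
 * request that cannot make progress.
 */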
3379349295caSBen Walker static void
338055d8d943SSeth Howell nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport,
3381e0280b11SSeth Howell 				struct spdk_nvmf_rdma_qpair *rqpair, bool drain)
33824bfb557dSPhilipp Skadorov {
338397967681SShuhei Matsumoto 	struct spdk_nvmf_request *req, *tmp;
33844bfb557dSPhilipp Skadorov 	struct spdk_nvmf_rdma_request	*rdma_req, *req_tmp;
3385b25751d9SBen Walker 	struct spdk_nvmf_rdma_resources *resources;
33864bfb557dSPhilipp Skadorov 
338704cd8e47SAlexey Marchuk 	/* First, process requests that are waiting for a response to be sent. */
338804cd8e47SAlexey Marchuk 	STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_send_queue, state_link, req_tmp) {
338904cd8e47SAlexey Marchuk 		if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
339004cd8e47SAlexey Marchuk 			break;
339104cd8e47SAlexey Marchuk 		}
339204cd8e47SAlexey Marchuk 	}
339304cd8e47SAlexey Marchuk 
339404cd8e47SAlexey Marchuk 	/* Then process I/O waiting in the data transfer pending queues, starting with RDMA reads. */
339504ebc6eaSSeth Howell 	STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_read_queue, state_link, req_tmp) {
33964e45c563SAlexey Marchuk 		if (rdma_req->state != RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING) {
33974e45c563SAlexey Marchuk 			/* Requests in this queue might be in state RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER;
33984e45c563SAlexey Marchuk 			 * they are transmitting data over the network, but we keep them in the list to guarantee
33994e45c563SAlexey Marchuk 			 * fair processing. */
34004e45c563SAlexey Marchuk 			continue;
34014e45c563SAlexey Marchuk 		}
340255d8d943SSeth Howell 		if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
34031d0a8e1cSSeth Howell 			break;
34041d0a8e1cSSeth Howell 		}
34051d0a8e1cSSeth Howell 	}
34061d0a8e1cSSeth Howell 
34071d0a8e1cSSeth Howell 	/* Then RDMA writes since reads have stronger restrictions than writes */
340804ebc6eaSSeth Howell 	STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_write_queue, state_link, req_tmp) {
340955d8d943SSeth Howell 		if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
34104bfb557dSPhilipp Skadorov 			break;
34114bfb557dSPhilipp Skadorov 		}
34124bfb557dSPhilipp Skadorov 	}
34134bfb557dSPhilipp Skadorov 
3414ac74de2fSZiye Yang 	/* Then we handle requests waiting on memory buffers. */
341597967681SShuhei Matsumoto 	STAILQ_FOREACH_SAFE(req, &rqpair->poller->group->group.pending_buf_queue, buf_link, tmp) {
341697967681SShuhei Matsumoto 		rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
341755d8d943SSeth Howell 		if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
34184bfb557dSPhilipp Skadorov 			break;
34194bfb557dSPhilipp Skadorov 		}
34204bfb557dSPhilipp Skadorov 	}
34214bfb557dSPhilipp Skadorov 
3422b25751d9SBen Walker 	resources = rqpair->resources;
3423b25751d9SBen Walker 	while (!STAILQ_EMPTY(&resources->free_queue) && !STAILQ_EMPTY(&resources->incoming_queue)) {
3424b25751d9SBen Walker 		rdma_req = STAILQ_FIRST(&resources->free_queue);
3425b25751d9SBen Walker 		STAILQ_REMOVE_HEAD(&resources->free_queue, state_link);
3426b25751d9SBen Walker 		rdma_req->recv = STAILQ_FIRST(&resources->incoming_queue);
3427b25751d9SBen Walker 		STAILQ_REMOVE_HEAD(&resources->incoming_queue, link);
342801201d3eSSeth Howell 
3429fa79f64aSSeth Howell 		if (rqpair->srq != NULL) {
3430ed0b611fSEvgeniy Kochetov 			rdma_req->req.qpair = &rdma_req->recv->qpair->qpair;
3431b25751d9SBen Walker 			rdma_req->recv->qpair->qd++;
343201201d3eSSeth Howell 		} else {
3433bfdc957cSSeth Howell 			rqpair->qd++;
343401201d3eSSeth Howell 		}
343501201d3eSSeth Howell 
3436fbe8f804SEvgeniy Kochetov 		rdma_req->receive_tsc = rdma_req->recv->receive_tsc;
3437bfdc957cSSeth Howell 		rdma_req->state = RDMA_REQUEST_STATE_NEW;
343855d8d943SSeth Howell 		if (nvmf_rdma_request_process(rtransport, rdma_req) == false) {
34394bfb557dSPhilipp Skadorov 			break;
34404bfb557dSPhilipp Skadorov 		}
34414bfb557dSPhilipp Skadorov 	}
3442251db814SEvgeniy Kochetov 	if (!STAILQ_EMPTY(&resources->incoming_queue) && STAILQ_EMPTY(&resources->free_queue)) {
3443251db814SEvgeniy Kochetov 		rqpair->poller->stat.pending_free_request++;
3444251db814SEvgeniy Kochetov 	}
34454bfb557dSPhilipp Skadorov }
34464bfb557dSPhilipp Skadorov 
34478e8f0434SAlexey Marchuk static void
34488e8f0434SAlexey Marchuk nvmf_rdma_poller_process_pending_buf_queue(struct spdk_nvmf_rdma_transport *rtransport,
34498e8f0434SAlexey Marchuk 		struct spdk_nvmf_rdma_poller *rpoller)
34508e8f0434SAlexey Marchuk {
34518e8f0434SAlexey Marchuk 	struct spdk_nvmf_request *req, *tmp;
34528e8f0434SAlexey Marchuk 	struct spdk_nvmf_rdma_request *rdma_req;
34538e8f0434SAlexey Marchuk 
34548e8f0434SAlexey Marchuk 	STAILQ_FOREACH_SAFE(req, &rpoller->group->group.pending_buf_queue, buf_link, tmp) {
34558e8f0434SAlexey Marchuk 		rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
34568e8f0434SAlexey Marchuk 		if (nvmf_rdma_request_process(rtransport, rdma_req) == false) {
34578e8f0434SAlexey Marchuk 			break;
34588e8f0434SAlexey Marchuk 		}
34598e8f0434SAlexey Marchuk 	}
34608e8f0434SAlexey Marchuk }
34618e8f0434SAlexey Marchuk 
34624ede9053SAlexey Marchuk static inline bool
3463e655d178SJim Harris nvmf_rdma_device_supports_last_wqe_reached(struct spdk_nvmf_rdma_device *device)
34644ede9053SAlexey Marchuk {
34654ede9053SAlexey Marchuk 	/* The iWARP transport and the SoftRoCE driver don't support the LAST_WQE_REACHED ibv async event. */
3466e655d178SJim Harris 	return !nvmf_rdma_is_rxe_device(device) &&
3467e655d178SJim Harris 	       device->context->device->transport_type != IBV_TRANSPORT_IWARP;
34684ede9053SAlexey Marchuk }
34694ede9053SAlexey Marchuk 
347085ff3fceSZiye Yang static void
347185ff3fceSZiye Yang nvmf_rdma_destroy_drained_qpair(struct spdk_nvmf_rdma_qpair *rqpair)
3472bb3e4413SSeth Howell {
347333668b22SSeth Howell 	struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
347433668b22SSeth Howell 			struct spdk_nvmf_rdma_transport, transport);
347533668b22SSeth Howell 
347685ff3fceSZiye Yang 	nvmf_rdma_qpair_process_pending(rtransport, rqpair, true);
347785ff3fceSZiye Yang 
3478bf41b46cSAleksey Marchuk 	/* nvmf_rdma_close_qpair() has not been called yet. */
347985ff3fceSZiye Yang 	if (!rqpair->to_close) {
348085ff3fceSZiye Yang 		return;
348185ff3fceSZiye Yang 	}
348285ff3fceSZiye Yang 
34838ddc5cd4Ssijie.sun 	/* The device has already been destroyed, so force-destroy this qpair. */
34848ddc5cd4Ssijie.sun 	if (rqpair->poller && rqpair->poller->need_destroy) {
34858ddc5cd4Ssijie.sun 		nvmf_rdma_qpair_destroy(rqpair);
34868ddc5cd4Ssijie.sun 		return;
34878ddc5cd4Ssijie.sun 	}
34888ddc5cd4Ssijie.sun 
3489a9fc7e1dSSeth Howell 	/* In the non-SRQ path, we will reach rqpair->max_queue_depth. In the SRQ path, we will get the last_wqe event. */
3490a9fc7e1dSSeth Howell 	if (rqpair->current_send_depth != 0) {
3491a9fc7e1dSSeth Howell 		return;
3492a9fc7e1dSSeth Howell 	}
3493a9fc7e1dSSeth Howell 
3494a9fc7e1dSSeth Howell 	if (rqpair->srq == NULL && rqpair->current_recv_depth != rqpair->max_queue_depth) {
3495a9fc7e1dSSeth Howell 		return;
3496a9fc7e1dSSeth Howell 	}
3497a9fc7e1dSSeth Howell 
3498e655d178SJim Harris 	/* For devices that support LAST_WQE_REACHED with srq, we need to
3499e655d178SJim Harris 	 * wait to destroy the qpair until that event has been received.
3500e655d178SJim Harris 	 */
3501efb6081cSAlexey Marchuk 	if (rqpair->srq != NULL && rqpair->last_wqe_reached == false &&
3502e655d178SJim Harris 	    nvmf_rdma_device_supports_last_wqe_reached(rqpair->device)) {
3503a9fc7e1dSSeth Howell 		return;
3504a9fc7e1dSSeth Howell 	}
3505a9fc7e1dSSeth Howell 
3506*5469bd2dSAlexey Marchuk 	assert(rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED ||
3507*5469bd2dSAlexey Marchuk 	       rqpair->qpair.state == SPDK_NVMF_QPAIR_ERROR);
3508b4d30668Slorneli 
350955d8d943SSeth Howell 	nvmf_rdma_qpair_destroy(rqpair);
3510bb3e4413SSeth Howell }
3511bb3e4413SSeth Howell 
3512d3fa0181SBen Walker static int
35132470b995Ssijie.sun nvmf_rdma_disconnect(struct rdma_cm_event *evt, bool *event_acked)
3514d3fa0181SBen Walker {
3515d3fa0181SBen Walker 	struct spdk_nvmf_qpair		*qpair;
3516d3fa0181SBen Walker 	struct spdk_nvmf_rdma_qpair	*rqpair;
3517d3fa0181SBen Walker 
3518d3fa0181SBen Walker 	if (evt->id == NULL) {
3519d3fa0181SBen Walker 		SPDK_ERRLOG("disconnect request: missing cm_id\n");
3520d3fa0181SBen Walker 		return -1;
3521d3fa0181SBen Walker 	}
3522d3fa0181SBen Walker 
3523d3fa0181SBen Walker 	qpair = evt->id->context;
3524d3fa0181SBen Walker 	if (qpair == NULL) {
3525d3fa0181SBen Walker 		SPDK_ERRLOG("disconnect request: no active connection\n");
3526d3fa0181SBen Walker 		return -1;
3527d3fa0181SBen Walker 	}
3528d3fa0181SBen Walker 
35292470b995Ssijie.sun 	rdma_ack_cm_event(evt);
35302470b995Ssijie.sun 	*event_acked = true;
35312470b995Ssijie.sun 
3532d3fa0181SBen Walker 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
3533d3fa0181SBen Walker 
3534c556b6b8SKonrad Sztyber 	spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair);
3535d3fa0181SBen Walker 
3536608b54a2SKonrad Sztyber 	spdk_nvmf_qpair_disconnect(&rqpair->qpair);
3537d3fa0181SBen Walker 
3538d3fa0181SBen Walker 	return 0;
3539d3fa0181SBen Walker }
3540d3fa0181SBen Walker 
3541d3fa0181SBen Walker #ifdef DEBUG
3542d3fa0181SBen Walker static const char *CM_EVENT_STR[] = {
3543d3fa0181SBen Walker 	"RDMA_CM_EVENT_ADDR_RESOLVED",
3544d3fa0181SBen Walker 	"RDMA_CM_EVENT_ADDR_ERROR",
3545d3fa0181SBen Walker 	"RDMA_CM_EVENT_ROUTE_RESOLVED",
3546d3fa0181SBen Walker 	"RDMA_CM_EVENT_ROUTE_ERROR",
3547d3fa0181SBen Walker 	"RDMA_CM_EVENT_CONNECT_REQUEST",
3548d3fa0181SBen Walker 	"RDMA_CM_EVENT_CONNECT_RESPONSE",
3549d3fa0181SBen Walker 	"RDMA_CM_EVENT_CONNECT_ERROR",
3550d3fa0181SBen Walker 	"RDMA_CM_EVENT_UNREACHABLE",
3551d3fa0181SBen Walker 	"RDMA_CM_EVENT_REJECTED",
3552d3fa0181SBen Walker 	"RDMA_CM_EVENT_ESTABLISHED",
3553d3fa0181SBen Walker 	"RDMA_CM_EVENT_DISCONNECTED",
3554d3fa0181SBen Walker 	"RDMA_CM_EVENT_DEVICE_REMOVAL",
3555d3fa0181SBen Walker 	"RDMA_CM_EVENT_MULTICAST_JOIN",
3556d3fa0181SBen Walker 	"RDMA_CM_EVENT_MULTICAST_ERROR",
3557d3fa0181SBen Walker 	"RDMA_CM_EVENT_ADDR_CHANGE",
3558d3fa0181SBen Walker 	"RDMA_CM_EVENT_TIMEWAIT_EXIT"
3559d3fa0181SBen Walker };
3560d3fa0181SBen Walker #endif /* DEBUG */
3561d3fa0181SBen Walker 
3562804b0669SAlexey Marchuk static void
3563804b0669SAlexey Marchuk nvmf_rdma_disconnect_qpairs_on_port(struct spdk_nvmf_rdma_transport *rtransport,
3564804b0669SAlexey Marchuk 				    struct spdk_nvmf_rdma_port *port)
3565804b0669SAlexey Marchuk {
3566804b0669SAlexey Marchuk 	struct spdk_nvmf_rdma_poll_group	*rgroup;
3567804b0669SAlexey Marchuk 	struct spdk_nvmf_rdma_poller		*rpoller;
3568804b0669SAlexey Marchuk 	struct spdk_nvmf_rdma_qpair		*rqpair;
3569804b0669SAlexey Marchuk 
3570804b0669SAlexey Marchuk 	TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) {
3571804b0669SAlexey Marchuk 		TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
357252f7aeb7SShuhei Matsumoto 			RB_FOREACH(rqpair, qpairs_tree, &rpoller->qpairs) {
3573804b0669SAlexey Marchuk 				if (rqpair->listen_id == port->id) {
3574608b54a2SKonrad Sztyber 					spdk_nvmf_qpair_disconnect(&rqpair->qpair);
3575804b0669SAlexey Marchuk 				}
3576804b0669SAlexey Marchuk 			}
3577804b0669SAlexey Marchuk 		}
3578804b0669SAlexey Marchuk 	}
3579804b0669SAlexey Marchuk }
3580804b0669SAlexey Marchuk 
358150cb6a04SSeth Howell static bool
358250cb6a04SSeth Howell nvmf_rdma_handle_cm_event_addr_change(struct spdk_nvmf_transport *transport,
358350cb6a04SSeth Howell 				      struct rdma_cm_event *event)
358450cb6a04SSeth Howell {
35856d8f1fc6SJacek Kalwas 	const struct spdk_nvme_transport_id	*trid;
358650cb6a04SSeth Howell 	struct spdk_nvmf_rdma_port		*port;
358750cb6a04SSeth Howell 	struct spdk_nvmf_rdma_transport		*rtransport;
358850cb6a04SSeth Howell 	bool					event_acked = false;
358950cb6a04SSeth Howell 
359050cb6a04SSeth Howell 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
359150cb6a04SSeth Howell 	TAILQ_FOREACH(port, &rtransport->ports, link) {
359250cb6a04SSeth Howell 		if (port->id == event->id) {
35936d8f1fc6SJacek Kalwas 			SPDK_ERRLOG("ADDR_CHANGE: IP %s:%s migrated\n", port->trid->traddr, port->trid->trsvcid);
359450cb6a04SSeth Howell 			rdma_ack_cm_event(event);
359550cb6a04SSeth Howell 			event_acked = true;
359650cb6a04SSeth Howell 			trid = port->trid;
359750cb6a04SSeth Howell 			break;
359850cb6a04SSeth Howell 		}
359950cb6a04SSeth Howell 	}
36006d8f1fc6SJacek Kalwas 
360150cb6a04SSeth Howell 	if (event_acked) {
3602804b0669SAlexey Marchuk 		nvmf_rdma_disconnect_qpairs_on_port(rtransport, port);
360350cb6a04SSeth Howell 
360455d8d943SSeth Howell 		nvmf_rdma_stop_listen(transport, trid);
360587a062e6SJacek Kalwas 		nvmf_rdma_listen(transport, trid, NULL);
360650cb6a04SSeth Howell 	}
36076d8f1fc6SJacek Kalwas 
360850cb6a04SSeth Howell 	return event_acked;
360950cb6a04SSeth Howell }
361050cb6a04SSeth Howell 
3611d3fa0181SBen Walker static void
36128ddc5cd4Ssijie.sun nvmf_rdma_handle_device_removal(struct spdk_nvmf_rdma_transport *rtransport,
36138ddc5cd4Ssijie.sun 				struct spdk_nvmf_rdma_device *device)
36148ddc5cd4Ssijie.sun {
36158ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_port	*port, *port_tmp;
36168ddc5cd4Ssijie.sun 	int				rc;
36178ddc5cd4Ssijie.sun 	bool				has_inflight;
36188ddc5cd4Ssijie.sun 
3619549be9adSsijie.sun 	rc = nvmf_rdma_manage_poller(rtransport, device, &has_inflight, false);
36208ddc5cd4Ssijie.sun 	if (rc) {
36218ddc5cd4Ssijie.sun 		SPDK_ERRLOG("Failed to handle device removal, rc %d\n", rc);
36228ddc5cd4Ssijie.sun 		return;
36238ddc5cd4Ssijie.sun 	}
36248ddc5cd4Ssijie.sun 
36258ddc5cd4Ssijie.sun 	if (!has_inflight) {
36268ddc5cd4Ssijie.sun 		/* no pollers, destroy the device */
36278ddc5cd4Ssijie.sun 		device->ready_to_destroy = true;
36288ddc5cd4Ssijie.sun 		spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_remove_destroyed_device, rtransport);
36298ddc5cd4Ssijie.sun 	}
36308ddc5cd4Ssijie.sun 
36318ddc5cd4Ssijie.sun 	TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) {
36328ddc5cd4Ssijie.sun 		if (port->device == device) {
36338ddc5cd4Ssijie.sun 			SPDK_NOTICELOG("Port %s:%s on device %s is being removed.\n",
36348ddc5cd4Ssijie.sun 				       port->trid->traddr,
36358ddc5cd4Ssijie.sun 				       port->trid->trsvcid,
36368ddc5cd4Ssijie.sun 				       ibv_get_device_name(port->device->context->device));
36378ddc5cd4Ssijie.sun 
36388ddc5cd4Ssijie.sun 			/* Keep the NVMF listener and only destroy the structures of the
36398ddc5cd4Ssijie.sun 			 * RDMA transport. When the device comes back, we can retry listening
36408ddc5cd4Ssijie.sun 			 * and the application's workflow will not be interrupted.
36418ddc5cd4Ssijie.sun 			 */
3642549be9adSsijie.sun 			nvmf_rdma_stop_listen_ex(&rtransport->transport, port->trid, true);
36438ddc5cd4Ssijie.sun 		}
36448ddc5cd4Ssijie.sun 	}
36458ddc5cd4Ssijie.sun }
36468ddc5cd4Ssijie.sun 
36478ddc5cd4Ssijie.sun static void
3648804b0669SAlexey Marchuk nvmf_rdma_handle_cm_event_port_removal(struct spdk_nvmf_transport *transport,
3649804b0669SAlexey Marchuk 				       struct rdma_cm_event *event)
3650804b0669SAlexey Marchuk {
3651549be9adSsijie.sun 	struct spdk_nvmf_rdma_port		*port, *tmp_port;
3652804b0669SAlexey Marchuk 	struct spdk_nvmf_rdma_transport		*rtransport;
3653804b0669SAlexey Marchuk 
3654804b0669SAlexey Marchuk 	port = event->id->context;
3655804b0669SAlexey Marchuk 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
3656804b0669SAlexey Marchuk 
3657804b0669SAlexey Marchuk 	rdma_ack_cm_event(event);
3658804b0669SAlexey Marchuk 
3659549be9adSsijie.sun 	/* If device removal happens while a controller qpair is disconnecting, it's possible to receive
3660549be9adSsijie.sun 	 * a DEVICE_REMOVAL event on the qpair while id->qp is just NULL. So we should make sure that
3661549be9adSsijie.sun 	 * we are really handling a port event here.
3662549be9adSsijie.sun 	 */
3663549be9adSsijie.sun 	TAILQ_FOREACH(tmp_port, &rtransport->ports, link) {
3664549be9adSsijie.sun 		if (port == tmp_port && port->device && !port->device->need_destroy) {
36658ddc5cd4Ssijie.sun 			port->device->need_destroy = true;
36668ddc5cd4Ssijie.sun 			nvmf_rdma_handle_device_removal(rtransport, port->device);
3667804b0669SAlexey Marchuk 		}
3668804b0669SAlexey Marchuk 	}
3669549be9adSsijie.sun }
3670804b0669SAlexey Marchuk 
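/*
 * Drain up to 'max_events' events from the RDMA CM event channel. The channel is operated in
 * non-blocking mode, so rdma_get_cm_event() failing with EAGAIN/EWOULDBLOCK simply ends the
 * loop. Each event is acknowledged exactly once: either inside its handler (event_acked set)
 * or at the bottom of the loop.
 */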
3671804b0669SAlexey Marchuk static void
3672596f8a4aSAlexey Marchuk nvmf_process_cm_events(struct spdk_nvmf_transport *transport, uint32_t max_events)
3673d3fa0181SBen Walker {
3674d3fa0181SBen Walker 	struct spdk_nvmf_rdma_transport *rtransport;
3675d3fa0181SBen Walker 	struct rdma_cm_event		*event;
3676e89ae156SAlexey Marchuk 	uint32_t			i;
3677d3fa0181SBen Walker 	int				rc;
367850cb6a04SSeth Howell 	bool				event_acked;
3679d3fa0181SBen Walker 
3680d3fa0181SBen Walker 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
3681d3fa0181SBen Walker 
3682d3fa0181SBen Walker 	if (rtransport->event_channel == NULL) {
3683d3fa0181SBen Walker 		return;
3684d3fa0181SBen Walker 	}
3685d3fa0181SBen Walker 
3686596f8a4aSAlexey Marchuk 	for (i = 0; i < max_events; i++) {
368750cb6a04SSeth Howell 		event_acked = false;
3688d3fa0181SBen Walker 		rc = rdma_get_cm_event(rtransport->event_channel, &event);
3689804b0669SAlexey Marchuk 		if (rc) {
3690804b0669SAlexey Marchuk 			if (errno != EAGAIN && errno != EWOULDBLOCK) {
3691804b0669SAlexey Marchuk 				SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno));
3692804b0669SAlexey Marchuk 			}
3693804b0669SAlexey Marchuk 			break;
3694804b0669SAlexey Marchuk 		}
3695804b0669SAlexey Marchuk 
36962172c432STomasz Zawadzki 		SPDK_DEBUGLOG(rdma, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]);
3697d3fa0181SBen Walker 
3698d3fa0181SBen Walker 		spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event);
3699d3fa0181SBen Walker 
3700d3fa0181SBen Walker 		switch (event->event) {
3701d3fa0181SBen Walker 		case RDMA_CM_EVENT_ADDR_RESOLVED:
3702d3fa0181SBen Walker 		case RDMA_CM_EVENT_ADDR_ERROR:
3703d3fa0181SBen Walker 		case RDMA_CM_EVENT_ROUTE_RESOLVED:
3704d3fa0181SBen Walker 		case RDMA_CM_EVENT_ROUTE_ERROR:
3705d3fa0181SBen Walker 			/* No action required. The target never attempts to resolve routes. */
3706d3fa0181SBen Walker 			break;
3707d3fa0181SBen Walker 		case RDMA_CM_EVENT_CONNECT_REQUEST:
37085584232cSBen Walker 			rc = nvmf_rdma_connect(transport, event);
3709d3fa0181SBen Walker 			if (rc < 0) {
3710d3fa0181SBen Walker 				SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc);
3711d3fa0181SBen Walker 				break;
3712d3fa0181SBen Walker 			}
3713d3fa0181SBen Walker 			break;
3714d3fa0181SBen Walker 		case RDMA_CM_EVENT_CONNECT_RESPONSE:
3715d3fa0181SBen Walker 			/* The target never initiates a new connection. So this will not occur. */
3716d3fa0181SBen Walker 			break;
3717d3fa0181SBen Walker 		case RDMA_CM_EVENT_CONNECT_ERROR:
3718d3fa0181SBen Walker 			/* Can this happen? The docs say it can, but not sure what causes it. */
3719d3fa0181SBen Walker 			break;
3720d3fa0181SBen Walker 		case RDMA_CM_EVENT_UNREACHABLE:
3721d3fa0181SBen Walker 		case RDMA_CM_EVENT_REJECTED:
3722d3fa0181SBen Walker 			/* These only occur on the client side. */
3723d3fa0181SBen Walker 			break;
3724d3fa0181SBen Walker 		case RDMA_CM_EVENT_ESTABLISHED:
3725d3fa0181SBen Walker 			/* TODO: Should we be waiting for this event anywhere? */
3726d3fa0181SBen Walker 			break;
3727d3fa0181SBen Walker 		case RDMA_CM_EVENT_DISCONNECTED:
37282470b995Ssijie.sun 			rc = nvmf_rdma_disconnect(event, &event_acked);
3729d3fa0181SBen Walker 			if (rc < 0) {
3730d3fa0181SBen Walker 				SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
3731d3fa0181SBen Walker 				break;
3732d3fa0181SBen Walker 			}
3733d3fa0181SBen Walker 			break;
3734804b0669SAlexey Marchuk 		case RDMA_CM_EVENT_DEVICE_REMOVAL:
3735804b0669SAlexey Marchuk 			/* In case of device removal, the kernel IB layer triggers IBV_EVENT_DEVICE_FATAL,
3736804b0669SAlexey Marchuk 			 * which triggers RDMA_CM_EVENT_DEVICE_REMOVAL on all cma_ids.
3737804b0669SAlexey Marchuk 			 * Once these events are sent to SPDK, we should release all IB resources and
3738804b0669SAlexey Marchuk 			 * must not attempt to call any ibv_query/modify/create functions. We can only call
3739cc6920a4SJosh Soref 			 * ibv_destroy* functions to release user-space memory allocated by IB. All kernel
3740804b0669SAlexey Marchuk 			 * resources have already been cleaned up. */
3741549be9adSsijie.sun 			if (event->id->qp) {
3742804b0669SAlexey Marchuk 				/* If the rdma_cm event has a valid `qp` pointer, then the event refers to the
3743549be9adSsijie.sun 				 * corresponding qpair. Otherwise, the event refers to a listening device. */
37442470b995Ssijie.sun 				rc = nvmf_rdma_disconnect(event, &event_acked);
3745549be9adSsijie.sun 				if (rc < 0) {
3746549be9adSsijie.sun 					SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
3747549be9adSsijie.sun 					break;
3748549be9adSsijie.sun 				}
3749549be9adSsijie.sun 			} else {
3750804b0669SAlexey Marchuk 				nvmf_rdma_handle_cm_event_port_removal(transport, event);
3751804b0669SAlexey Marchuk 				event_acked = true;
3752804b0669SAlexey Marchuk 			}
3753804b0669SAlexey Marchuk 			break;
3754d3fa0181SBen Walker 		case RDMA_CM_EVENT_MULTICAST_JOIN:
3755d3fa0181SBen Walker 		case RDMA_CM_EVENT_MULTICAST_ERROR:
3756d3fa0181SBen Walker 			/* Multicast is not used */
3757d3fa0181SBen Walker 			break;
3758d3fa0181SBen Walker 		case RDMA_CM_EVENT_ADDR_CHANGE:
375950cb6a04SSeth Howell 			event_acked = nvmf_rdma_handle_cm_event_addr_change(transport, event);
3760d3fa0181SBen Walker 			break;
3761d3fa0181SBen Walker 		case RDMA_CM_EVENT_TIMEWAIT_EXIT:
3762d3fa0181SBen Walker 			/* For now, do nothing. The target never re-uses queue pairs. */
3763d3fa0181SBen Walker 			break;
3764d3fa0181SBen Walker 		default:
3765d3fa0181SBen Walker 			SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event);
3766d3fa0181SBen Walker 			break;
3767d3fa0181SBen Walker 		}
376850cb6a04SSeth Howell 		if (!event_acked) {
3769d3fa0181SBen Walker 			rdma_ack_cm_event(event);
377050cb6a04SSeth Howell 		}
3771d3fa0181SBen Walker 	}
3772d3fa0181SBen Walker }
3773d3fa0181SBen Walker 
3774d3fa0181SBen Walker static void
3775dc84fbaaSAlexey Marchuk nvmf_rdma_handle_last_wqe_reached(struct spdk_nvmf_rdma_qpair *rqpair)
3776dc84fbaaSAlexey Marchuk {
3777dc84fbaaSAlexey Marchuk 	rqpair->last_wqe_reached = true;
3778dc84fbaaSAlexey Marchuk 	nvmf_rdma_destroy_drained_qpair(rqpair);
3779dc84fbaaSAlexey Marchuk }
3780dc84fbaaSAlexey Marchuk 
3781dc84fbaaSAlexey Marchuk static void
37829645421cSJim Harris nvmf_rdma_qpair_process_last_wqe_event(void *ctx)
3783dc84fbaaSAlexey Marchuk {
3784dc84fbaaSAlexey Marchuk 	struct spdk_nvmf_rdma_ibv_event_ctx *event_ctx = ctx;
378543f6d338SJim Harris 	struct spdk_nvmf_rdma_qpair *rqpair;
3786dc84fbaaSAlexey Marchuk 
378743f6d338SJim Harris 	rqpair = event_ctx->rqpair;
378843f6d338SJim Harris 
378943f6d338SJim Harris 	if (rqpair) {
379043f6d338SJim Harris 		assert(event_ctx == rqpair->last_wqe_reached_ctx);
379143f6d338SJim Harris 		rqpair->last_wqe_reached_ctx = NULL;
37925e156a6eSJim Harris 		nvmf_rdma_handle_last_wqe_reached(rqpair);
3793dc84fbaaSAlexey Marchuk 	}
3794dc84fbaaSAlexey Marchuk 	free(event_ctx);
3795dc84fbaaSAlexey Marchuk }
3796dc84fbaaSAlexey Marchuk 
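/*
 * Forward an IBV_EVENT_QP_LAST_WQE_REACHED notification to the thread that owns the qpair (its
 * poll group thread, or the destruct channel's thread if the qpair has already left its group).
 * The context is remembered in rqpair->last_wqe_reached_ctx so that a duplicate event is
 * rejected with -EALREADY and so the message can be ignored safely if the qpair is destroyed
 * before it is processed.
 */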
3797dc84fbaaSAlexey Marchuk static int
3798e6da32eeSJim Harris nvmf_rdma_send_qpair_last_wqe_event(struct spdk_nvmf_rdma_qpair *rqpair)
3799dc84fbaaSAlexey Marchuk {
3800dc84fbaaSAlexey Marchuk 	struct spdk_nvmf_rdma_ibv_event_ctx *ctx;
38013d1d4fcfSAlexey Marchuk 	struct spdk_thread *thr = NULL;
38023d1d4fcfSAlexey Marchuk 	int rc;
3803dc84fbaaSAlexey Marchuk 
38043d1d4fcfSAlexey Marchuk 	if (rqpair->qpair.group) {
38053d1d4fcfSAlexey Marchuk 		thr = rqpair->qpair.group->thread;
38063d1d4fcfSAlexey Marchuk 	} else if (rqpair->destruct_channel) {
38073d1d4fcfSAlexey Marchuk 		thr = spdk_io_channel_get_thread(rqpair->destruct_channel);
38083d1d4fcfSAlexey Marchuk 	}
38093d1d4fcfSAlexey Marchuk 
38103d1d4fcfSAlexey Marchuk 	if (!thr) {
38112172c432STomasz Zawadzki 		SPDK_DEBUGLOG(rdma, "rqpair %p has no thread\n", rqpair);
38123d1d4fcfSAlexey Marchuk 		return -EINVAL;
3813dc84fbaaSAlexey Marchuk 	}
3814dc84fbaaSAlexey Marchuk 
381543f6d338SJim Harris 	if (rqpair->last_wqe_reached || rqpair->last_wqe_reached_ctx != NULL) {
381643f6d338SJim Harris 		SPDK_ERRLOG("LAST_WQE_REACHED already received for rqpair %p\n", rqpair);
381743f6d338SJim Harris 		return -EALREADY;
381843f6d338SJim Harris 	}
381943f6d338SJim Harris 
3820dc84fbaaSAlexey Marchuk 	ctx = calloc(1, sizeof(*ctx));
3821dc84fbaaSAlexey Marchuk 	if (!ctx) {
38223d1d4fcfSAlexey Marchuk 		return -ENOMEM;
3823dc84fbaaSAlexey Marchuk 	}
3824dc84fbaaSAlexey Marchuk 
3825dc84fbaaSAlexey Marchuk 	ctx->rqpair = rqpair;
382643f6d338SJim Harris 	rqpair->last_wqe_reached_ctx = ctx;
3827dc84fbaaSAlexey Marchuk 
38289645421cSJim Harris 	rc = spdk_thread_send_msg(thr, nvmf_rdma_qpair_process_last_wqe_event, ctx);
38293d1d4fcfSAlexey Marchuk 	if (rc) {
383043f6d338SJim Harris 		rqpair->last_wqe_reached_ctx = NULL;
38313d1d4fcfSAlexey Marchuk 		free(ctx);
38323d1d4fcfSAlexey Marchuk 	}
38333d1d4fcfSAlexey Marchuk 
38343d1d4fcfSAlexey Marchuk 	return rc;
3835dc84fbaaSAlexey Marchuk }
3836dc84fbaaSAlexey Marchuk 
383758f43df1SAlexey Marchuk static int
383855d8d943SSeth Howell nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device)
3839b6f90c52SPhilipp Skadorov {
3840b6f90c52SPhilipp Skadorov 	int				rc;
3841a9fc7e1dSSeth Howell 	struct spdk_nvmf_rdma_qpair	*rqpair = NULL;
3842b6f90c52SPhilipp Skadorov 	struct ibv_async_event		event;
3843b6f90c52SPhilipp Skadorov 
3844b6f90c52SPhilipp Skadorov 	rc = ibv_get_async_event(device->context, &event);
3845b6f90c52SPhilipp Skadorov 
3846b6f90c52SPhilipp Skadorov 	if (rc) {
384758f43df1SAlexey Marchuk 		/* In non-blocking mode, -1 means there are no events available. */
384858f43df1SAlexey Marchuk 		return rc;
3849b6f90c52SPhilipp Skadorov 	}
3850b6f90c52SPhilipp Skadorov 
38514bfb557dSPhilipp Skadorov 	switch (event.event_type) {
38524bfb557dSPhilipp Skadorov 	case IBV_EVENT_QP_FATAL:
3853b3e1db32SShuhei Matsumoto 	case IBV_EVENT_QP_LAST_WQE_REACHED:
3854b3e1db32SShuhei Matsumoto 	case IBV_EVENT_QP_REQ_ERR:
3855b3e1db32SShuhei Matsumoto 	case IBV_EVENT_QP_ACCESS_ERR:
3856b3e1db32SShuhei Matsumoto 	case IBV_EVENT_COMM_EST:
3857b3e1db32SShuhei Matsumoto 	case IBV_EVENT_PATH_MIG:
3858b3e1db32SShuhei Matsumoto 	case IBV_EVENT_PATH_MIG_ERR:
385944ab0033SMaciej Szwed 		rqpair = event.element.qp->qp_context;
3860b3e1db32SShuhei Matsumoto 		if (!rqpair) {
3861b3e1db32SShuhei Matsumoto 			/* Any QP event for an NVMe-RDMA initiator may be returned here. */
3862b3e1db32SShuhei Matsumoto 			SPDK_NOTICELOG("Async QP event for unknown QP: %s\n",
3863b3e1db32SShuhei Matsumoto 				       ibv_event_type_str(event.event_type));
3864b3e1db32SShuhei Matsumoto 			break;
3865b3e1db32SShuhei Matsumoto 		}
3866b3e1db32SShuhei Matsumoto 
3867b3e1db32SShuhei Matsumoto 		switch (event.event_type) {
3868b3e1db32SShuhei Matsumoto 		case IBV_EVENT_QP_FATAL:
3869d05c5538SSeth Howell 			SPDK_ERRLOG("Fatal event received for rqpair %p\n", rqpair);
3870e8881867SJim Harris 			spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0,
387162aa8bd8SKonrad Sztyber 					  (uintptr_t)rqpair, event.event_type);
3872feeaa282SAlexey Marchuk 			rqpair->ibv_in_error_state = true;
3873608b54a2SKonrad Sztyber 			spdk_nvmf_qpair_disconnect(&rqpair->qpair);
38744bfb557dSPhilipp Skadorov 			break;
38759f6d509bSBen Walker 		case IBV_EVENT_QP_LAST_WQE_REACHED:
3876a9fc7e1dSSeth Howell 			/* This event only occurs for shared receive queues. */
38772172c432STomasz Zawadzki 			SPDK_DEBUGLOG(rdma, "Last WQE reached event received for rqpair %p\n", rqpair);
3878e6da32eeSJim Harris 			rc = nvmf_rdma_send_qpair_last_wqe_event(rqpair);
38793d1d4fcfSAlexey Marchuk 			if (rc) {
38803d1d4fcfSAlexey Marchuk 				SPDK_WARNLOG("Failed to send LAST_WQE_REACHED event. rqpair %p, err %d\n", rqpair, rc);
3881f0b7a6e7SAlexey Marchuk 				rqpair->last_wqe_reached = true;
3882a9fc7e1dSSeth Howell 			}
38839f6d509bSBen Walker 			break;
38844bfb557dSPhilipp Skadorov 		case IBV_EVENT_QP_REQ_ERR:
38854bfb557dSPhilipp Skadorov 		case IBV_EVENT_QP_ACCESS_ERR:
38864bfb557dSPhilipp Skadorov 		case IBV_EVENT_COMM_EST:
38874bfb557dSPhilipp Skadorov 		case IBV_EVENT_PATH_MIG:
38884bfb557dSPhilipp Skadorov 		case IBV_EVENT_PATH_MIG_ERR:
3889b3e1db32SShuhei Matsumoto 			SPDK_NOTICELOG("Async QP event: %s\n",
3890d05c5538SSeth Howell 				       ibv_event_type_str(event.event_type));
3891e8881867SJim Harris 			spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0,
389262aa8bd8SKonrad Sztyber 					  (uintptr_t)rqpair, event.event_type);
3893feeaa282SAlexey Marchuk 			rqpair->ibv_in_error_state = true;
3894242201d2SMaciej Szwed 			break;
3895b3e1db32SShuhei Matsumoto 		default:
3896b3e1db32SShuhei Matsumoto 			break;
3897b3e1db32SShuhei Matsumoto 		}
3898b3e1db32SShuhei Matsumoto 		break;
38994bfb557dSPhilipp Skadorov 	case IBV_EVENT_DEVICE_FATAL:
39008ddc5cd4Ssijie.sun 		SPDK_ERRLOG("Device Fatal event[%s] received on %s. device: %p\n",
39018ddc5cd4Ssijie.sun 			    ibv_event_type_str(event.event_type), ibv_get_device_name(device->context->device), device);
39028ddc5cd4Ssijie.sun 		device->need_destroy = true;
39038ddc5cd4Ssijie.sun 		break;
39048ddc5cd4Ssijie.sun 	case IBV_EVENT_CQ_ERR:
39054bfb557dSPhilipp Skadorov 	case IBV_EVENT_PORT_ACTIVE:
39064bfb557dSPhilipp Skadorov 	case IBV_EVENT_PORT_ERR:
39074bfb557dSPhilipp Skadorov 	case IBV_EVENT_LID_CHANGE:
39084bfb557dSPhilipp Skadorov 	case IBV_EVENT_PKEY_CHANGE:
39094bfb557dSPhilipp Skadorov 	case IBV_EVENT_SM_CHANGE:
39104bfb557dSPhilipp Skadorov 	case IBV_EVENT_SRQ_ERR:
39114bfb557dSPhilipp Skadorov 	case IBV_EVENT_SRQ_LIMIT_REACHED:
39124bfb557dSPhilipp Skadorov 	case IBV_EVENT_CLIENT_REREGISTER:
39134bfb557dSPhilipp Skadorov 	case IBV_EVENT_GID_CHANGE:
3914feeaa282SAlexey Marchuk 	case IBV_EVENT_SQ_DRAINED:
39154bfb557dSPhilipp Skadorov 	default:
3916d05c5538SSeth Howell 		SPDK_NOTICELOG("Async event: %s\n",
3917d05c5538SSeth Howell 			       ibv_event_type_str(event.event_type));
3918e8881867SJim Harris 		spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type);
39194bfb557dSPhilipp Skadorov 		break;
39204bfb557dSPhilipp Skadorov 	}
3921b6f90c52SPhilipp Skadorov 	ibv_ack_async_event(&event);
392258f43df1SAlexey Marchuk 
392358f43df1SAlexey Marchuk 	return 0;
392458f43df1SAlexey Marchuk }
392558f43df1SAlexey Marchuk 
392658f43df1SAlexey Marchuk static void
392758f43df1SAlexey Marchuk nvmf_process_ib_events(struct spdk_nvmf_rdma_device *device, uint32_t max_events)
392858f43df1SAlexey Marchuk {
392958f43df1SAlexey Marchuk 	int rc = 0;
393058f43df1SAlexey Marchuk 	uint32_t i = 0;
393158f43df1SAlexey Marchuk 
393258f43df1SAlexey Marchuk 	for (i = 0; i < max_events; i++) {
393358f43df1SAlexey Marchuk 		rc = nvmf_process_ib_event(device);
393458f43df1SAlexey Marchuk 		if (rc) {
393558f43df1SAlexey Marchuk 			break;
393658f43df1SAlexey Marchuk 		}
393758f43df1SAlexey Marchuk 	}
393858f43df1SAlexey Marchuk 
39392172c432STomasz Zawadzki 	SPDK_DEBUGLOG(rdma, "Device %s: %u events processed\n", device->context->device->name, i);
3940b6f90c52SPhilipp Skadorov }
3941b6f90c52SPhilipp Skadorov 
394243022da3SJacek Kalwas static int
394343022da3SJacek Kalwas nvmf_rdma_accept(void *ctx)
3944b6f90c52SPhilipp Skadorov {
3945b6f90c52SPhilipp Skadorov 	int	nfds, i = 0;
394643022da3SJacek Kalwas 	struct spdk_nvmf_transport *transport = ctx;
3947b6f90c52SPhilipp Skadorov 	struct spdk_nvmf_rdma_transport *rtransport;
3948b6f90c52SPhilipp Skadorov 	struct spdk_nvmf_rdma_device *device, *tmp;
3949e7e10859SMaciej Szwed 	uint32_t count;
39508ddc5cd4Ssijie.sun 	short revents;
3951549be9adSsijie.sun 	bool do_retry;
3952b6f90c52SPhilipp Skadorov 
3953b6f90c52SPhilipp Skadorov 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
3954549be9adSsijie.sun 	do_retry = nvmf_rdma_retry_listen_port(rtransport);
3955549be9adSsijie.sun 
3956e7e10859SMaciej Szwed 	count = nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0);
3957b6f90c52SPhilipp Skadorov 
3958b6f90c52SPhilipp Skadorov 	if (nfds <= 0) {
3959549be9adSsijie.sun 		return do_retry ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
3960b6f90c52SPhilipp Skadorov 	}
3961b6f90c52SPhilipp Skadorov 
3962b6f90c52SPhilipp Skadorov 	/* The first poll descriptor is RDMA CM event */
3963b6f90c52SPhilipp Skadorov 	if (rtransport->poll_fds[i++].revents & POLLIN) {
3964596f8a4aSAlexey Marchuk 		nvmf_process_cm_events(transport, NVMF_RDMA_MAX_EVENTS_PER_POLL);
3965b6f90c52SPhilipp Skadorov 		nfds--;
3966b6f90c52SPhilipp Skadorov 	}
3967b6f90c52SPhilipp Skadorov 
3968b6f90c52SPhilipp Skadorov 	if (nfds == 0) {
396943022da3SJacek Kalwas 		return SPDK_POLLER_BUSY;
3970b6f90c52SPhilipp Skadorov 	}
3971b6f90c52SPhilipp Skadorov 
3972b6f90c52SPhilipp Skadorov 	/* Second and subsequent poll descriptors are IB async events */
3973b6f90c52SPhilipp Skadorov 	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
39748ddc5cd4Ssijie.sun 		revents = rtransport->poll_fds[i++].revents;
39758ddc5cd4Ssijie.sun 		if (revents & POLLIN) {
39768ddc5cd4Ssijie.sun 			if (spdk_likely(!device->need_destroy)) {
3977e89ae156SAlexey Marchuk 				nvmf_process_ib_events(device, NVMF_RDMA_MAX_EVENTS_PER_POLL);
39788ddc5cd4Ssijie.sun 				if (spdk_unlikely(device->need_destroy)) {
39798ddc5cd4Ssijie.sun 					nvmf_rdma_handle_device_removal(rtransport, device);
39808ddc5cd4Ssijie.sun 				}
39818ddc5cd4Ssijie.sun 			}
39828ddc5cd4Ssijie.sun 			nfds--;
39838ddc5cd4Ssijie.sun 		} else if (revents & POLLNVAL || revents & POLLHUP) {
39848ddc5cd4Ssijie.sun 			SPDK_ERRLOG("Received unexpected revent %x on device %p\n", (int)revents, device);
3985b6f90c52SPhilipp Skadorov 			nfds--;
3986b6f90c52SPhilipp Skadorov 		}
3987b6f90c52SPhilipp Skadorov 	}
3988b6f90c52SPhilipp Skadorov 	/* Check that all flagged fds have been served. */
3989b6f90c52SPhilipp Skadorov 	assert(nfds == 0);
3990e7e10859SMaciej Szwed 
399143022da3SJacek Kalwas 	return count > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
3992b6f90c52SPhilipp Skadorov }
3993b6f90c52SPhilipp Skadorov 
3994b6f90c52SPhilipp Skadorov static void
3995000e6f5bSJacek Kalwas nvmf_rdma_cdata_init(struct spdk_nvmf_transport *transport, struct spdk_nvmf_subsystem *subsystem,
3996000e6f5bSJacek Kalwas 		     struct spdk_nvmf_ctrlr_data *cdata)
3997000e6f5bSJacek Kalwas {
3998a3f09a8fSAlexey Marchuk 	cdata->nvmf_specific.msdbd = NVMF_DEFAULT_MSDBD;
3999000e6f5bSJacek Kalwas 
4000000e6f5bSJacek Kalwas 	/* Disable in-capsule data transfer for the RDMA controller when dif_insert_or_strip is enabled,
4001000e6f5bSJacek Kalwas 	 * since in-capsule data only works with NVMe drives that support the SGL memory layout. */
4002000e6f5bSJacek Kalwas 	if (transport->opts.dif_insert_or_strip) {
4003000e6f5bSJacek Kalwas 		cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16;
4004000e6f5bSJacek Kalwas 	}
400534392f23SBen Walker 
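	/* ioccsz counts the 64-byte command plus in-capsule data in 16-byte units, so this check
	 * fires when the advertised in-capsule data size exceeds 4 KiB.
	 */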
400634392f23SBen Walker 	if (cdata->nvmf_specific.ioccsz > ((sizeof(struct spdk_nvme_cmd) + 0x1000) / 16)) {
400734392f23SBen Walker 		SPDK_WARNLOG("RDMA is configured to support up to 16 SGL entries while in-capsule"
400834392f23SBen Walker 			     " data is greater than 4KiB.\n");
400934392f23SBen Walker 		SPDK_WARNLOG("When used in conjunction with the NVMe-oF initiator from the Linux "
401034392f23SBen Walker 			     "kernel between versions 5.4 and 5.12 data corruption may occur for "
401134392f23SBen Walker 			     "writes that are not a multiple of 4KiB in size.\n");
401234392f23SBen Walker 	}
4013000e6f5bSJacek Kalwas }
4014000e6f5bSJacek Kalwas 
4015000e6f5bSJacek Kalwas static void
401655d8d943SSeth Howell nvmf_rdma_discover(struct spdk_nvmf_transport *transport,
40176428de9eSBen Walker 		   struct spdk_nvme_transport_id *trid,
4018349295caSBen Walker 		   struct spdk_nvmf_discovery_log_page_entry *entry)
4019349295caSBen Walker {
4020349295caSBen Walker 	entry->trtype = SPDK_NVMF_TRTYPE_RDMA;
40216428de9eSBen Walker 	entry->adrfam = trid->adrfam;
402262615117SMichal Ben Haim 	entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED;
4023349295caSBen Walker 
40246428de9eSBen Walker 	spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' ');
40256428de9eSBen Walker 	spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' ');
4026349295caSBen Walker 
4027349295caSBen Walker 	entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED;
4028349295caSBen Walker 	entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE;
4029349295caSBen Walker 	entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM;
4030349295caSBen Walker }
4031349295caSBen Walker 
4032a5283034Ssijie.sun static int
4033a5283034Ssijie.sun nvmf_rdma_poller_create(struct spdk_nvmf_rdma_transport *rtransport,
4034a5283034Ssijie.sun 			struct spdk_nvmf_rdma_poll_group *rgroup, struct spdk_nvmf_rdma_device *device,
4035a5283034Ssijie.sun 			struct spdk_nvmf_rdma_poller **out_poller)
4036d7b8da3bSBen Walker {
4037ed0b611fSEvgeniy Kochetov 	struct spdk_nvmf_rdma_poller		*poller;
4038cf151d60SAlexey Marchuk 	struct spdk_rdma_provider_srq_init_attr	srq_init_attr;
40390d3fcd10SSeth Howell 	struct spdk_nvmf_rdma_resource_opts	opts;
40407dd3cf44SSeth Howell 	int					num_cqe;
40410d3fcd10SSeth Howell 
40423ee93c32SBen Walker 	poller = calloc(1, sizeof(*poller));
40433ee93c32SBen Walker 	if (!poller) {
40443ee93c32SBen Walker 		SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n");
4045a5283034Ssijie.sun 		return -1;
40463ee93c32SBen Walker 	}
40473ee93c32SBen Walker 
40483ee93c32SBen Walker 	poller->device = device;
40493ee93c32SBen Walker 	poller->group = rgroup;
4050a5283034Ssijie.sun 	*out_poller = poller;
40513ee93c32SBen Walker 
405252f7aeb7SShuhei Matsumoto 	RB_INIT(&poller->qpairs);
4053b4dc10fbSSeth Howell 	STAILQ_INIT(&poller->qpairs_pending_send);
405414777890SSeth Howell 	STAILQ_INIT(&poller->qpairs_pending_recv);
40553ee93c32SBen Walker 
40563ee93c32SBen Walker 	TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link);
4057a5283034Ssijie.sun 	SPDK_DEBUGLOG(rdma, "Create poller %p on device %p in poll group %p.\n", poller, device, rgroup);
4058f766d1e4SDarek Stojaczyk 	if (rtransport->rdma_opts.no_srq == false && device->num_srq < device->attr.max_srq) {
40593838d4d2SAlexey Marchuk 		if ((int)rtransport->rdma_opts.max_srq_depth > device->attr.max_srq_wr) {
40603838d4d2SAlexey Marchuk 			SPDK_WARNLOG("Requested SRQ depth %u, max supported by dev %s is %d\n",
40613838d4d2SAlexey Marchuk 				     rtransport->rdma_opts.max_srq_depth, device->context->device->name, device->attr.max_srq_wr);
40623838d4d2SAlexey Marchuk 		}
40633838d4d2SAlexey Marchuk 		poller->max_srq_depth = spdk_min((int)rtransport->rdma_opts.max_srq_depth, device->attr.max_srq_wr);
4064ed0b611fSEvgeniy Kochetov 
406561948a1cSSeth Howell 		device->num_srq++;
4066696e8580SAlexey Marchuk 		memset(&srq_init_attr, 0, sizeof(srq_init_attr));
4067696e8580SAlexey Marchuk 		srq_init_attr.pd = device->pd;
406836ac75b9SAlexey Marchuk 		srq_init_attr.stats = &poller->stat.qp_stats.recv;
4069696e8580SAlexey Marchuk 		srq_init_attr.srq_init_attr.attr.max_wr = poller->max_srq_depth;
4070696e8580SAlexey Marchuk 		srq_init_attr.srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE);
4071cf151d60SAlexey Marchuk 		poller->srq = spdk_rdma_provider_srq_create(&srq_init_attr);
4072ed0b611fSEvgeniy Kochetov 		if (!poller->srq) {
4073ed0b611fSEvgeniy Kochetov 			SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno);
4074a5283034Ssijie.sun 			return -1;
4075ed0b611fSEvgeniy Kochetov 		}
4076b25751d9SBen Walker 
40770d3fcd10SSeth Howell 		opts.qp = poller->srq;
4078bf41b46cSAleksey Marchuk 		opts.map = device->map;
40790d3fcd10SSeth Howell 		opts.qpair = NULL;
40800d3fcd10SSeth Howell 		opts.shared = true;
40810d3fcd10SSeth Howell 		opts.max_queue_depth = poller->max_srq_depth;
4082a5283034Ssijie.sun 		opts.in_capsule_data_size = rtransport->transport.opts.in_capsule_data_size;
40830d3fcd10SSeth Howell 
40840d3fcd10SSeth Howell 		poller->resources = nvmf_rdma_resources_create(&opts);
4085b25751d9SBen Walker 		if (!poller->resources) {
4086b25751d9SBen Walker 			SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n");
4087a5283034Ssijie.sun 			return -1;
4088b25751d9SBen Walker 		}
408901201d3eSSeth Howell 	}
40907dd3cf44SSeth Howell 
40917dd3cf44SSeth Howell 	/*
40927dd3cf44SSeth Howell 	 * When using an SRQ, we can size the completion queue at startup.
40937dd3cf44SSeth Howell 	 * The following formula represents the calculation:
40947dd3cf44SSeth Howell 	 * num_cqe = num_recv + num_data_wr + num_send_wr,
40957dd3cf44SSeth Howell 	 * where num_recv = num_data_wr = num_send_wr = poller->max_srq_depth.
40967dd3cf44SSeth Howell 	 */
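	/* For example, assuming the default SRQ depth of 4096, this yields num_cqe = 4096 * 3 = 12288.
	 * The actual depth comes from rtransport->rdma_opts.max_srq_depth, capped above by the
	 * device's max_srq_wr.
	 */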
40977dd3cf44SSeth Howell 	if (poller->srq) {
40987dd3cf44SSeth Howell 		num_cqe = poller->max_srq_depth * 3;
40997dd3cf44SSeth Howell 	} else {
410097ef8701SMonica Kenguva 		num_cqe = rtransport->rdma_opts.num_cqe;
41017dd3cf44SSeth Howell 	}
41027dd3cf44SSeth Howell 
41037dd3cf44SSeth Howell 	poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0);
41047dd3cf44SSeth Howell 	if (!poller->cq) {
41057dd3cf44SSeth Howell 		SPDK_ERRLOG("Unable to create completion queue\n");
4106a5283034Ssijie.sun 		return -1;
4107a5283034Ssijie.sun 	}
4108a5283034Ssijie.sun 	poller->num_cqe = num_cqe;
4109a5283034Ssijie.sun 	return 0;
4110a5283034Ssijie.sun }
4111a5283034Ssijie.sun 
4112549be9adSsijie.sun static void
4113549be9adSsijie.sun _nvmf_rdma_register_poller_in_group(void *c)
4114549be9adSsijie.sun {
4115549be9adSsijie.sun 	struct spdk_nvmf_rdma_poller	*poller;
4116549be9adSsijie.sun 	struct poller_manage_ctx	*ctx = c;
4117549be9adSsijie.sun 	struct spdk_nvmf_rdma_device	*device;
4118549be9adSsijie.sun 	int				rc;
4119549be9adSsijie.sun 
4120549be9adSsijie.sun 	rc = nvmf_rdma_poller_create(ctx->rtransport, ctx->rgroup, ctx->device, &poller);
4121549be9adSsijie.sun 	if (rc < 0 && poller) {
4122549be9adSsijie.sun 		nvmf_rdma_poller_destroy(poller);
4123549be9adSsijie.sun 	}
4124549be9adSsijie.sun 
4125549be9adSsijie.sun 	device = ctx->device;
4126549be9adSsijie.sun 	if (nvmf_rdma_all_pollers_management_done(ctx)) {
4127549be9adSsijie.sun 		device->is_ready = true;
4128549be9adSsijie.sun 	}
4129549be9adSsijie.sun }
4130549be9adSsijie.sun 
4131a5283034Ssijie.sun static void nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group);
4132a5283034Ssijie.sun 
4133a5283034Ssijie.sun static struct spdk_nvmf_transport_poll_group *
4134a5283034Ssijie.sun nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport,
4135a5283034Ssijie.sun 			    struct spdk_nvmf_poll_group *group)
4136a5283034Ssijie.sun {
4137a5283034Ssijie.sun 	struct spdk_nvmf_rdma_transport		*rtransport;
4138a5283034Ssijie.sun 	struct spdk_nvmf_rdma_poll_group	*rgroup;
4139a5283034Ssijie.sun 	struct spdk_nvmf_rdma_poller		*poller;
4140a5283034Ssijie.sun 	struct spdk_nvmf_rdma_device		*device;
4141a5283034Ssijie.sun 	int					rc;
4142a5283034Ssijie.sun 
414354e1a03bSKrzysztof Goreczny 	if (spdk_interrupt_mode_is_enabled()) {
414454e1a03bSKrzysztof Goreczny 		SPDK_ERRLOG("RDMA transport does not support interrupt mode\n");
414554e1a03bSKrzysztof Goreczny 		return NULL;
414654e1a03bSKrzysztof Goreczny 	}
414754e1a03bSKrzysztof Goreczny 
4148a5283034Ssijie.sun 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
4149a5283034Ssijie.sun 
4150a5283034Ssijie.sun 	rgroup = calloc(1, sizeof(*rgroup));
4151a5283034Ssijie.sun 	if (!rgroup) {
4152a5283034Ssijie.sun 		return NULL;
4153a5283034Ssijie.sun 	}
4154a5283034Ssijie.sun 
4155a5283034Ssijie.sun 	TAILQ_INIT(&rgroup->pollers);
4156a5283034Ssijie.sun 
4157a5283034Ssijie.sun 	TAILQ_FOREACH(device, &rtransport->devices, link) {
4158a5283034Ssijie.sun 		rc = nvmf_rdma_poller_create(rtransport, rgroup, device, &poller);
4159a5283034Ssijie.sun 		if (rc < 0) {
416055d8d943SSeth Howell 			nvmf_rdma_poll_group_destroy(&rgroup->group);
41617dd3cf44SSeth Howell 			return NULL;
41627dd3cf44SSeth Howell 		}
41633ee93c32SBen Walker 	}
41643ee93c32SBen Walker 
4165645d5944SAlexey Marchuk 	TAILQ_INSERT_TAIL(&rtransport->poll_groups, rgroup, link);
4166645d5944SAlexey Marchuk 	if (rtransport->conn_sched.next_admin_pg == NULL) {
4167645d5944SAlexey Marchuk 		rtransport->conn_sched.next_admin_pg = rgroup;
4168645d5944SAlexey Marchuk 		rtransport->conn_sched.next_io_pg = rgroup;
4169645d5944SAlexey Marchuk 	}
4170645d5944SAlexey Marchuk 
4171baa936a1SBen Walker 	return &rgroup->group;
4172d7b8da3bSBen Walker }
4173d7b8da3bSBen Walker 
417430c8b17fSJim Harris static uint32_t
417530c8b17fSJim Harris nvmf_poll_group_get_io_qpair_count(struct spdk_nvmf_poll_group *pg)
417630c8b17fSJim Harris {
417730c8b17fSJim Harris 	uint32_t count;
417830c8b17fSJim Harris 
417930c8b17fSJim Harris 	/* Just assume that unassociated qpairs will eventually be io
418030c8b17fSJim Harris 	 * qpairs.  This is close enough for the use cases for this
418130c8b17fSJim Harris 	 * function.
418230c8b17fSJim Harris 	 */
418330c8b17fSJim Harris 	pthread_mutex_lock(&pg->mutex);
418430c8b17fSJim Harris 	count = pg->stat.current_io_qpairs + pg->current_unassociated_qpairs;
418530c8b17fSJim Harris 	pthread_mutex_unlock(&pg->mutex);
418630c8b17fSJim Harris 
418730c8b17fSJim Harris 	return count;
418830c8b17fSJim Harris }
418930c8b17fSJim Harris 
419073e87ed2SAlexey Marchuk static struct spdk_nvmf_transport_poll_group *
419155d8d943SSeth Howell nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)
419273e87ed2SAlexey Marchuk {
419373e87ed2SAlexey Marchuk 	struct spdk_nvmf_rdma_transport *rtransport;
419473e87ed2SAlexey Marchuk 	struct spdk_nvmf_rdma_poll_group **pg;
419573e87ed2SAlexey Marchuk 	struct spdk_nvmf_transport_poll_group *result;
419630020c2fSJim Harris 	uint32_t count;
419773e87ed2SAlexey Marchuk 
419873e87ed2SAlexey Marchuk 	rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
419973e87ed2SAlexey Marchuk 
420073e87ed2SAlexey Marchuk 	if (TAILQ_EMPTY(&rtransport->poll_groups)) {
420173e87ed2SAlexey Marchuk 		return NULL;
420273e87ed2SAlexey Marchuk 	}
420373e87ed2SAlexey Marchuk 
420473e87ed2SAlexey Marchuk 	if (qpair->qid == 0) {
420573e87ed2SAlexey Marchuk 		pg = &rtransport->conn_sched.next_admin_pg;
420673e87ed2SAlexey Marchuk 	} else {
4207cd1b7ab0Sliuqinfei 		struct spdk_nvmf_rdma_poll_group *pg_min, *pg_start, *pg_current;
4208cd1b7ab0Sliuqinfei 		uint32_t min_value;
4209cd1b7ab0Sliuqinfei 
421073e87ed2SAlexey Marchuk 		pg = &rtransport->conn_sched.next_io_pg;
4211cd1b7ab0Sliuqinfei 		pg_min = *pg;
4212cd1b7ab0Sliuqinfei 		pg_start = *pg;
4213cd1b7ab0Sliuqinfei 		pg_current = *pg;
421430c8b17fSJim Harris 		min_value = nvmf_poll_group_get_io_qpair_count(pg_current->group.group);
4215cd1b7ab0Sliuqinfei 
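		/* Walk the io poll groups in a circle starting from next_io_pg and pick the group
		 * that currently hosts the fewest io qpairs; an empty group ends the search early.
		 */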
4216dbadf72eSIgorVechriko 		while (1) {
4217dbadf72eSIgorVechriko 			count = nvmf_poll_group_get_io_qpair_count(pg_current->group.group);
4218dbadf72eSIgorVechriko 
421930020c2fSJim Harris 			if (count < min_value) {
422030020c2fSJim Harris 				min_value = count;
4221cd1b7ab0Sliuqinfei 				pg_min = pg_current;
4222cd1b7ab0Sliuqinfei 			}
4223cd1b7ab0Sliuqinfei 
42240441dce4SMichael Haeuptle 			pg_current = TAILQ_NEXT(pg_current, link);
42250441dce4SMichael Haeuptle 			if (pg_current == NULL) {
42260441dce4SMichael Haeuptle 				pg_current = TAILQ_FIRST(&rtransport->poll_groups);
42270441dce4SMichael Haeuptle 			}
42280441dce4SMichael Haeuptle 
4229dbadf72eSIgorVechriko 			if (pg_current == pg_start || min_value == 0) {
4230cd1b7ab0Sliuqinfei 				break;
4231cd1b7ab0Sliuqinfei 			}
4232cd1b7ab0Sliuqinfei 		}
4233cd1b7ab0Sliuqinfei 		*pg = pg_min;
423473e87ed2SAlexey Marchuk 	}
423573e87ed2SAlexey Marchuk 
423673e87ed2SAlexey Marchuk 	assert(*pg != NULL);
423773e87ed2SAlexey Marchuk 
423873e87ed2SAlexey Marchuk 	result = &(*pg)->group;
423973e87ed2SAlexey Marchuk 
424073e87ed2SAlexey Marchuk 	*pg = TAILQ_NEXT(*pg, link);
424173e87ed2SAlexey Marchuk 	if (*pg == NULL) {
424273e87ed2SAlexey Marchuk 		*pg = TAILQ_FIRST(&rtransport->poll_groups);
424373e87ed2SAlexey Marchuk 	}
424473e87ed2SAlexey Marchuk 
424573e87ed2SAlexey Marchuk 	return result;
424673e87ed2SAlexey Marchuk }
424773e87ed2SAlexey Marchuk 
4248d7b8da3bSBen Walker static void
4249a5283034Ssijie.sun nvmf_rdma_poller_destroy(struct spdk_nvmf_rdma_poller *poller)
4250d7b8da3bSBen Walker {
425154c394c4SSeth Howell 	struct spdk_nvmf_rdma_qpair	*qpair, *tmp_qpair;
42528ddc5cd4Ssijie.sun 	int				rc;
42538ddc5cd4Ssijie.sun 
42548ddc5cd4Ssijie.sun 	TAILQ_REMOVE(&poller->group->pollers, poller, link);
425552f7aeb7SShuhei Matsumoto 	RB_FOREACH_SAFE(qpair, qpairs_tree, &poller->qpairs, tmp_qpair) {
425655d8d943SSeth Howell 		nvmf_rdma_qpair_destroy(qpair);
4257fc43fbbaSyidong0635 	}
4258fc43fbbaSyidong0635 
4259ed0b611fSEvgeniy Kochetov 	if (poller->srq) {
42609d93c082Syidong0635 		if (poller->resources) {
426101201d3eSSeth Howell 			nvmf_rdma_resources_destroy(poller->resources);
42629d93c082Syidong0635 		}
4263cf151d60SAlexey Marchuk 		spdk_rdma_provider_srq_destroy(poller->srq);
42642172c432STomasz Zawadzki 		SPDK_DEBUGLOG(rdma, "Destroyed RDMA shared queue %p\n", poller->srq);
4265ed0b611fSEvgeniy Kochetov 	}
4266ed0b611fSEvgeniy Kochetov 
42672a0772e3SBen Walker 	if (poller->cq) {
42688ddc5cd4Ssijie.sun 		rc = ibv_destroy_cq(poller->cq);
42698ddc5cd4Ssijie.sun 		if (rc != 0) {
42708ddc5cd4Ssijie.sun 			SPDK_ERRLOG("Destroy cq return %d, error: %s\n", rc, strerror(errno));
42718ddc5cd4Ssijie.sun 		}
42728ddc5cd4Ssijie.sun 	}
42738ddc5cd4Ssijie.sun 
42748ddc5cd4Ssijie.sun 	if (poller->destroy_cb) {
42758ddc5cd4Ssijie.sun 		poller->destroy_cb(poller->destroy_cb_ctx);
42768ddc5cd4Ssijie.sun 		poller->destroy_cb = NULL;
42772a0772e3SBen Walker 	}
42782a0772e3SBen Walker 
42793ee93c32SBen Walker 	free(poller);
42803ee93c32SBen Walker }
42813ee93c32SBen Walker 
4282a5283034Ssijie.sun static void
4283a5283034Ssijie.sun nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
4284a5283034Ssijie.sun {
4285a5283034Ssijie.sun 	struct spdk_nvmf_rdma_poll_group	*rgroup, *next_rgroup;
4286a5283034Ssijie.sun 	struct spdk_nvmf_rdma_poller		*poller, *tmp;
4287a5283034Ssijie.sun 	struct spdk_nvmf_rdma_transport		*rtransport;
4288a5283034Ssijie.sun 
4289a5283034Ssijie.sun 	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
4290a5283034Ssijie.sun 	if (!rgroup) {
4291a5283034Ssijie.sun 		return;
4292a5283034Ssijie.sun 	}
4293a5283034Ssijie.sun 
4294a5283034Ssijie.sun 	TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
4295a5283034Ssijie.sun 		nvmf_rdma_poller_destroy(poller);
4296a5283034Ssijie.sun 	}
4297a5283034Ssijie.sun 
42984ea996ceSTomasz Zawadzki 	if (rgroup->group.transport == NULL) {
429955d8d943SSeth Howell 		/* Transport can be NULL when nvmf_rdma_poll_group_create()
43004ea996ceSTomasz Zawadzki 		 * calls this function directly in a failure path. */
43014ea996ceSTomasz Zawadzki 		free(rgroup);
43024ea996ceSTomasz Zawadzki 		return;
43034ea996ceSTomasz Zawadzki 	}
43044ea996ceSTomasz Zawadzki 
43054ea996ceSTomasz Zawadzki 	rtransport = SPDK_CONTAINEROF(rgroup->group.transport, struct spdk_nvmf_rdma_transport, transport);
43064ea996ceSTomasz Zawadzki 
4307645d5944SAlexey Marchuk 	next_rgroup = TAILQ_NEXT(rgroup, link);
4308645d5944SAlexey Marchuk 	TAILQ_REMOVE(&rtransport->poll_groups, rgroup, link);
4309645d5944SAlexey Marchuk 	if (next_rgroup == NULL) {
4310645d5944SAlexey Marchuk 		next_rgroup = TAILQ_FIRST(&rtransport->poll_groups);
4311645d5944SAlexey Marchuk 	}
4312645d5944SAlexey Marchuk 	if (rtransport->conn_sched.next_admin_pg == rgroup) {
4313645d5944SAlexey Marchuk 		rtransport->conn_sched.next_admin_pg = next_rgroup;
4314645d5944SAlexey Marchuk 	}
4315645d5944SAlexey Marchuk 	if (rtransport->conn_sched.next_io_pg == rgroup) {
4316645d5944SAlexey Marchuk 		rtransport->conn_sched.next_io_pg = next_rgroup;
4317645d5944SAlexey Marchuk 	}
4318645d5944SAlexey Marchuk 
4319baa936a1SBen Walker 	free(rgroup);
4320d7b8da3bSBen Walker }
4321d7b8da3bSBen Walker 
4322b9526681SSeth Howell static void
432355d8d943SSeth Howell nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair)
4324b9526681SSeth Howell {
4325b70e6984Sjiaqizho 	if (rqpair->cm_id != NULL) {
432655d8d943SSeth Howell 		nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
4327b70e6984Sjiaqizho 	}
4328b9526681SSeth Howell }
4329b9526681SSeth Howell 
433021c450e1SDaniel Verkamp static int
433155d8d943SSeth Howell nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
43321d304bc5SBen Walker 			 struct spdk_nvmf_qpair *qpair)
43338b487155SBen Walker {
4334baa936a1SBen Walker 	struct spdk_nvmf_rdma_poll_group	*rgroup;
43353ee93c32SBen Walker 	struct spdk_nvmf_rdma_qpair		*rqpair;
4336958c68f1SBen Walker 	struct spdk_nvmf_rdma_device		*device;
43373ee93c32SBen Walker 	struct spdk_nvmf_rdma_poller		*poller;
43388b79ef33SBen Walker 	int					rc;
43396fb90732SBen Walker 
4340baa936a1SBen Walker 	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
43413ee93c32SBen Walker 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
43428b487155SBen Walker 
43438209c8cfSSeth Howell 	device = rqpair->device;
43448b487155SBen Walker 
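	/* Each poll group has one poller per RDMA device, so find the poller that owns the device
	 * this qpair was accepted on.
	 */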
43453ee93c32SBen Walker 	TAILQ_FOREACH(poller, &rgroup->pollers, link) {
43463ee93c32SBen Walker 		if (poller->device == device) {
4347958c68f1SBen Walker 			break;
4348958c68f1SBen Walker 		}
4349958c68f1SBen Walker 	}
43503ee93c32SBen Walker 
43513ee93c32SBen Walker 	if (!poller) {
43523ee93c32SBen Walker 		SPDK_ERRLOG("No poller found for device.\n");
43533ee93c32SBen Walker 		return -1;
4354958c68f1SBen Walker 	}
4355958c68f1SBen Walker 
4356549be9adSsijie.sun 	if (poller->need_destroy) {
4357549be9adSsijie.sun 		SPDK_ERRLOG("Poller is being destroyed.\n");
4358549be9adSsijie.sun 		return -1;
4359549be9adSsijie.sun 	}
4360549be9adSsijie.sun 
43618b79ef33SBen Walker 	rqpair->poller = poller;
4362fa79f64aSSeth Howell 	rqpair->srq = rqpair->poller->srq;
43638b79ef33SBen Walker 
436455d8d943SSeth Howell 	rc = nvmf_rdma_qpair_initialize(qpair);
4365678fe328SZiye Yang 	if (rc < 0) {
4366678fe328SZiye Yang 		SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair);
43677613e3feSShuhei Matsumoto 		rqpair->poller = NULL;
43687613e3feSShuhei Matsumoto 		rqpair->srq = NULL;
4369678fe328SZiye Yang 		return -1;
4370678fe328SZiye Yang 	}
43718b79ef33SBen Walker 
437252f7aeb7SShuhei Matsumoto 	RB_INSERT(qpairs_tree, &poller->qpairs, rqpair);
43737613e3feSShuhei Matsumoto 
437455d8d943SSeth Howell 	rc = nvmf_rdma_event_accept(rqpair->cm_id, rqpair);
43758b79ef33SBen Walker 	if (rc) {
43768b79ef33SBen Walker 		/* Try to reject, but we probably can't */
437755d8d943SSeth Howell 		nvmf_rdma_qpair_reject_connection(rqpair);
43788b79ef33SBen Walker 		return -1;
43798b79ef33SBen Walker 	}
43808b487155SBen Walker 
43818b487155SBen Walker 	return 0;
43828b487155SBen Walker }
43838b487155SBen Walker 
43848b487155SBen Walker static int
43853d1d4fcfSAlexey Marchuk nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
43863d1d4fcfSAlexey Marchuk 			    struct spdk_nvmf_qpair *qpair)
43873d1d4fcfSAlexey Marchuk {
43883d1d4fcfSAlexey Marchuk 	struct spdk_nvmf_rdma_qpair		*rqpair;
43893d1d4fcfSAlexey Marchuk 
43903d1d4fcfSAlexey Marchuk 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
43913d1d4fcfSAlexey Marchuk 	assert(group->transport->tgt != NULL);
43923d1d4fcfSAlexey Marchuk 
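	/* Hold a reference to the target's io_channel so that it remains valid while the qpair
	 * completes its asynchronous destruction; the reference is dropped when the qpair is
	 * finally destroyed.
	 */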
43933d1d4fcfSAlexey Marchuk 	rqpair->destruct_channel = spdk_get_io_channel(group->transport->tgt);
43943d1d4fcfSAlexey Marchuk 
43953d1d4fcfSAlexey Marchuk 	if (!rqpair->destruct_channel) {
43963d1d4fcfSAlexey Marchuk 		SPDK_WARNLOG("failed to get io_channel, qpair %p\n", qpair);
43973d1d4fcfSAlexey Marchuk 		return 0;
43983d1d4fcfSAlexey Marchuk 	}
43993d1d4fcfSAlexey Marchuk 
44003d1d4fcfSAlexey Marchuk 	/* Sanity check that we get io_channel on the correct thread */
44013d1d4fcfSAlexey Marchuk 	if (qpair->group) {
44023d1d4fcfSAlexey Marchuk 		assert(qpair->group->thread == spdk_io_channel_get_thread(rqpair->destruct_channel));
44033d1d4fcfSAlexey Marchuk 	}
44043d1d4fcfSAlexey Marchuk 
44053d1d4fcfSAlexey Marchuk 	return 0;
44063d1d4fcfSAlexey Marchuk }
44073d1d4fcfSAlexey Marchuk 
44083d1d4fcfSAlexey Marchuk static int
440955d8d943SSeth Howell nvmf_rdma_request_free(struct spdk_nvmf_request *req)
4410388e3101SSeth Howell {
4411388e3101SSeth Howell 	struct spdk_nvmf_rdma_request	*rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
4412388e3101SSeth Howell 	struct spdk_nvmf_rdma_transport	*rtransport = SPDK_CONTAINEROF(req->qpair->transport,
4413388e3101SSeth Howell 			struct spdk_nvmf_rdma_transport, transport);
44146967fec6SAlexey Marchuk 	struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair,
44156967fec6SAlexey Marchuk 					      struct spdk_nvmf_rdma_qpair, qpair);
44166967fec6SAlexey Marchuk 
44176967fec6SAlexey Marchuk 	/*
44186967fec6SAlexey Marchuk 	 * AER requests are freed when a qpair is destroyed. The recv corresponding to that request
44196967fec6SAlexey Marchuk 	 * needs to be returned to the shared receive queue or the poll group will eventually be
44206967fec6SAlexey Marchuk 	 * starved of RECV structures.
44216967fec6SAlexey Marchuk 	 */
44226967fec6SAlexey Marchuk 	if (rqpair->srq && rdma_req->recv) {
44236967fec6SAlexey Marchuk 		int rc;
44246967fec6SAlexey Marchuk 		struct ibv_recv_wr *bad_recv_wr;
44256967fec6SAlexey Marchuk 
4426cf151d60SAlexey Marchuk 		spdk_rdma_provider_srq_queue_recv_wrs(rqpair->srq, &rdma_req->recv->wr);
4427cf151d60SAlexey Marchuk 		rc = spdk_rdma_provider_srq_flush_recv_wrs(rqpair->srq, &bad_recv_wr);
44286967fec6SAlexey Marchuk 		if (rc) {
44296967fec6SAlexey Marchuk 			SPDK_ERRLOG("Unable to re-post rx descriptor\n");
44306967fec6SAlexey Marchuk 		}
44316967fec6SAlexey Marchuk 	}
4432388e3101SSeth Howell 
443355d8d943SSeth Howell 	_nvmf_rdma_request_free(rdma_req, rtransport);
4434388e3101SSeth Howell 	return 0;
4435388e3101SSeth Howell }
4436388e3101SSeth Howell 
4437388e3101SSeth Howell static int
443855d8d943SSeth Howell nvmf_rdma_request_complete(struct spdk_nvmf_request *req)
44390f912a0eSDaniel Verkamp {
44403c423f40SBen Walker 	struct spdk_nvmf_rdma_transport	*rtransport = SPDK_CONTAINEROF(req->qpair->transport,
44413c423f40SBen Walker 			struct spdk_nvmf_rdma_transport, transport);
44424bfb557dSPhilipp Skadorov 	struct spdk_nvmf_rdma_request	*rdma_req = SPDK_CONTAINEROF(req,
44434bfb557dSPhilipp Skadorov 			struct spdk_nvmf_rdma_request, req);
44444bfb557dSPhilipp Skadorov 	struct spdk_nvmf_rdma_qpair     *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair,
44454bfb557dSPhilipp Skadorov 			struct spdk_nvmf_rdma_qpair, qpair);
4446cfafcc3eSBen Walker 
4447feeaa282SAlexey Marchuk 	if (spdk_unlikely(rqpair->ibv_in_error_state)) {
4448531fd76dSBen Walker 		/* The connection is dead. Move the request directly to the completed state. */
4449bfdc957cSSeth Howell 		rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
4450feeaa282SAlexey Marchuk 	} else {
4451feeaa282SAlexey Marchuk 		/* The connection is alive, so process the request as normal */
4452feeaa282SAlexey Marchuk 		rdma_req->state = RDMA_REQUEST_STATE_EXECUTED;
4453531fd76dSBen Walker 	}
4454531fd76dSBen Walker 
445555d8d943SSeth Howell 	nvmf_rdma_request_process(rtransport, rdma_req);
4456cfafcc3eSBen Walker 
44573c423f40SBen Walker 	return 0;
4458cc294653SBen Walker }
4459cc294653SBen Walker 
4460cc294653SBen Walker static void
4461ccd96eadSNaresh Gottumukkala nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair,
4462ccd96eadSNaresh Gottumukkala 		      spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
44635ade1c40SBen Walker {
4464e03aca3cSSeth Howell 	struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
4465e03aca3cSSeth Howell 
446685ff3fceSZiye Yang 	rqpair->to_close = true;
446785ff3fceSZiye Yang 
4468b9526681SSeth Howell 	if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) {
446955d8d943SSeth Howell 		nvmf_rdma_qpair_reject_connection(rqpair);
4470b9526681SSeth Howell 	}
44718421f839SAlexey Marchuk 	if (rqpair->rdma_qp) {
4472cf151d60SAlexey Marchuk 		spdk_rdma_provider_qp_disconnect(rqpair->rdma_qp);
447390b4bd6cSEvgeniy Kochetov 	}
447490b4bd6cSEvgeniy Kochetov 
447585ff3fceSZiye Yang 	nvmf_rdma_destroy_drained_qpair(rqpair);
4476ccd96eadSNaresh Gottumukkala 
4477ccd96eadSNaresh Gottumukkala 	if (cb_fn) {
4478ccd96eadSNaresh Gottumukkala 		cb_fn(cb_arg);
4479ccd96eadSNaresh Gottumukkala 	}
44805ade1c40SBen Walker }
44815ade1c40SBen Walker 
4482ed0b611fSEvgeniy Kochetov static struct spdk_nvmf_rdma_qpair *
4483ed0b611fSEvgeniy Kochetov get_rdma_qpair_from_wc(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_wc *wc)
4484ed0b611fSEvgeniy Kochetov {
448552f7aeb7SShuhei Matsumoto 	struct spdk_nvmf_rdma_qpair find;
448652f7aeb7SShuhei Matsumoto 
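	/* The poller's qpairs RB tree is keyed by qp_num, so the qp_num reported in the completion
	 * is enough to locate the owning qpair.
	 */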
448752f7aeb7SShuhei Matsumoto 	find.qp_num = wc->qp_num;
448852f7aeb7SShuhei Matsumoto 
448952f7aeb7SShuhei Matsumoto 	return RB_FIND(qpairs_tree, &rpoller->qpairs, &find);
4490ed0b611fSEvgeniy Kochetov }
4491ed0b611fSEvgeniy Kochetov 
4492fdec444aSPhilipp Skadorov #ifdef DEBUG
4493fdec444aSPhilipp Skadorov static int
449455d8d943SSeth Howell nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req)
4495fdec444aSPhilipp Skadorov {
4496fdec444aSPhilipp Skadorov 	return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST ||
4497fdec444aSPhilipp Skadorov 	       rdma_req->state == RDMA_REQUEST_STATE_COMPLETING;
4498fdec444aSPhilipp Skadorov }
4499fdec444aSPhilipp Skadorov #endif
4500fdec444aSPhilipp Skadorov 
45019d63933bSSeth Howell static void
4502c3884f94SSeth Howell _poller_reset_failed_recvs(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_recv_wr *bad_recv_wr,
4503c3884f94SSeth Howell 			   int rc)
4504c3884f94SSeth Howell {
4505c3884f94SSeth Howell 	struct spdk_nvmf_rdma_recv	*rdma_recv;
4506c3884f94SSeth Howell 	struct spdk_nvmf_rdma_wr	*bad_rdma_wr;
4507c3884f94SSeth Howell 
4508c3884f94SSeth Howell 	SPDK_ERRLOG("Failed to post a recv for the poller %p with errno %d\n", rpoller, -rc);
4509c3884f94SSeth Howell 	while (bad_recv_wr != NULL) {
4510c3884f94SSeth Howell 		bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_recv_wr->wr_id;
4511c3884f94SSeth Howell 		rdma_recv = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr);
4512c3884f94SSeth Howell 
4513c3884f94SSeth Howell 		rdma_recv->qpair->current_recv_depth++;
4514c3884f94SSeth Howell 		bad_recv_wr = bad_recv_wr->next;
4515c3884f94SSeth Howell 		SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rdma_recv->qpair, -rc);
4516608b54a2SKonrad Sztyber 		spdk_nvmf_qpair_disconnect(&rdma_recv->qpair->qpair);
4517c3884f94SSeth Howell 	}
4518c3884f94SSeth Howell }
4519c3884f94SSeth Howell 
4520c3884f94SSeth Howell static void
4521c3884f94SSeth Howell _qp_reset_failed_recvs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *bad_recv_wr, int rc)
4522c3884f94SSeth Howell {
4523c3884f94SSeth Howell 	SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rqpair, -rc);
4524c3884f94SSeth Howell 	while (bad_recv_wr != NULL) {
4525c3884f94SSeth Howell 		bad_recv_wr = bad_recv_wr->next;
4526c3884f94SSeth Howell 		rqpair->current_recv_depth++;
4527c3884f94SSeth Howell 	}
4528608b54a2SKonrad Sztyber 	spdk_nvmf_qpair_disconnect(&rqpair->qpair);
4529c3884f94SSeth Howell }
4530c3884f94SSeth Howell 
4531c3884f94SSeth Howell static void
4532c3884f94SSeth Howell _poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport,
4533c3884f94SSeth Howell 		     struct spdk_nvmf_rdma_poller *rpoller)
4534c3884f94SSeth Howell {
4535c3884f94SSeth Howell 	struct spdk_nvmf_rdma_qpair	*rqpair;
4536c3884f94SSeth Howell 	struct ibv_recv_wr		*bad_recv_wr;
4537c3884f94SSeth Howell 	int				rc;
4538c3884f94SSeth Howell 
4539c3884f94SSeth Howell 	if (rpoller->srq) {
4540cf151d60SAlexey Marchuk 		rc = spdk_rdma_provider_srq_flush_recv_wrs(rpoller->srq, &bad_recv_wr);
4541e718d8caSAlexey Marchuk 		if (spdk_unlikely(rc)) {
4542c3884f94SSeth Howell 			_poller_reset_failed_recvs(rpoller, bad_recv_wr, rc);
4543c3884f94SSeth Howell 		}
4544c3884f94SSeth Howell 	} else {
454514777890SSeth Howell 		while (!STAILQ_EMPTY(&rpoller->qpairs_pending_recv)) {
454614777890SSeth Howell 			rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_recv);
4547cf151d60SAlexey Marchuk 			rc = spdk_rdma_provider_qp_flush_recv_wrs(rqpair->rdma_qp, &bad_recv_wr);
4548e718d8caSAlexey Marchuk 			if (spdk_unlikely(rc)) {
4549c3884f94SSeth Howell 				_qp_reset_failed_recvs(rqpair, bad_recv_wr, rc);
4550c3884f94SSeth Howell 			}
455114777890SSeth Howell 			STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_recv, recv_link);
4552c3884f94SSeth Howell 		}
4553c3884f94SSeth Howell 	}
4554c3884f94SSeth Howell }
4555c3884f94SSeth Howell 
4556c3884f94SSeth Howell static void
45579d63933bSSeth Howell _qp_reset_failed_sends(struct spdk_nvmf_rdma_transport *rtransport,
45589d63933bSSeth Howell 		       struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *bad_wr, int rc)
45599d63933bSSeth Howell {
45609d63933bSSeth Howell 	struct spdk_nvmf_rdma_wr	*bad_rdma_wr;
45619d63933bSSeth Howell 	struct spdk_nvmf_rdma_request	*prev_rdma_req = NULL, *cur_rdma_req = NULL;
45629d63933bSSeth Howell 
45639d63933bSSeth Howell 	SPDK_ERRLOG("Failed to post a send for the qpair %p with errno %d\n", rqpair, -rc);
45649d63933bSSeth Howell 	for (; bad_wr != NULL; bad_wr = bad_wr->next) {
45659d63933bSSeth Howell 		bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_wr->wr_id;
45669d63933bSSeth Howell 		assert(rqpair->current_send_depth > 0);
45679d63933bSSeth Howell 		rqpair->current_send_depth--;
45689d63933bSSeth Howell 		switch (bad_rdma_wr->type) {
45699d63933bSSeth Howell 		case RDMA_WR_TYPE_DATA:
45708288fcf9SAlexey Marchuk 			cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, data_wr);
45719d63933bSSeth Howell 			if (bad_wr->opcode == IBV_WR_RDMA_READ) {
45729d63933bSSeth Howell 				assert(rqpair->current_read_depth > 0);
45739d63933bSSeth Howell 				rqpair->current_read_depth--;
45749d63933bSSeth Howell 			}
45759d63933bSSeth Howell 			break;
45769d63933bSSeth Howell 		case RDMA_WR_TYPE_SEND:
45778288fcf9SAlexey Marchuk 			cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, rsp_wr);
45789d63933bSSeth Howell 			break;
45799d63933bSSeth Howell 		default:
45809d63933bSSeth Howell 			SPDK_ERRLOG("Found a RECV in the list of pending SEND requests for qpair %p\n", rqpair);
45819d63933bSSeth Howell 			prev_rdma_req = cur_rdma_req;
45829d63933bSSeth Howell 			continue;
45839d63933bSSeth Howell 		}
45849d63933bSSeth Howell 
45859d63933bSSeth Howell 		if (prev_rdma_req == cur_rdma_req) {
45869d63933bSSeth Howell 			/* This request was handled by an earlier WR, i.e. we were performing an NVMe read. */
45879d63933bSSeth Howell 			/* We only have to check against prev_rdma_req since each request's WRs are contiguous in this list. */
45889d63933bSSeth Howell 			continue;
45899d63933bSSeth Howell 		}
45909d63933bSSeth Howell 
45919d63933bSSeth Howell 		switch (cur_rdma_req->state) {
45929d63933bSSeth Howell 		case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
45939d63933bSSeth Howell 			cur_rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
459404cd8e47SAlexey Marchuk 			STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, cur_rdma_req, state_link);
459504cd8e47SAlexey Marchuk 			cur_rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
45969d63933bSSeth Howell 			break;
45979d63933bSSeth Howell 		case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST:
45989d63933bSSeth Howell 		case RDMA_REQUEST_STATE_COMPLETING:
45999d63933bSSeth Howell 			cur_rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
46009d63933bSSeth Howell 			break;
46019d63933bSSeth Howell 		default:
46029d63933bSSeth Howell 			SPDK_ERRLOG("Found a request in a bad state %d when draining pending SEND requests for qpair %p\n",
46039d63933bSSeth Howell 				    cur_rdma_req->state, rqpair);
46049d63933bSSeth Howell 			continue;
46059d63933bSSeth Howell 		}
46069d63933bSSeth Howell 
460755d8d943SSeth Howell 		nvmf_rdma_request_process(rtransport, cur_rdma_req);
46089d63933bSSeth Howell 		prev_rdma_req = cur_rdma_req;
46099d63933bSSeth Howell 	}
46109d63933bSSeth Howell 
46113caf2080SKonrad Sztyber 	if (spdk_nvmf_qpair_is_active(&rqpair->qpair)) {
46129d63933bSSeth Howell 		/* Disconnect the connection. */
4613608b54a2SKonrad Sztyber 		spdk_nvmf_qpair_disconnect(&rqpair->qpair);
46149d63933bSSeth Howell 	}
46159d63933bSSeth Howell 
46169d63933bSSeth Howell }
46179d63933bSSeth Howell 
46189d63933bSSeth Howell static void
46199d63933bSSeth Howell _poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
46209d63933bSSeth Howell 		     struct spdk_nvmf_rdma_poller *rpoller)
46219d63933bSSeth Howell {
46229d63933bSSeth Howell 	struct spdk_nvmf_rdma_qpair	*rqpair;
46239d63933bSSeth Howell 	struct ibv_send_wr		*bad_wr = NULL;
46249d63933bSSeth Howell 	int				rc;
46259d63933bSSeth Howell 
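	/* Flush the send WRs that were batched on each qpair during request processing. */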
4626b4dc10fbSSeth Howell 	while (!STAILQ_EMPTY(&rpoller->qpairs_pending_send)) {
4627b4dc10fbSSeth Howell 		rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_send);
4628cf151d60SAlexey Marchuk 		rc = spdk_rdma_provider_qp_flush_send_wrs(rqpair->rdma_qp, &bad_wr);
4629b4dc10fbSSeth Howell 
46309d63933bSSeth Howell 		/* bad wr always points to the first wr that failed. */
4631e718d8caSAlexey Marchuk 		if (spdk_unlikely(rc)) {
46329d63933bSSeth Howell 			_qp_reset_failed_sends(rtransport, rqpair, bad_wr, rc);
46339d63933bSSeth Howell 		}
4634b4dc10fbSSeth Howell 		STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_send, send_link);
46359d63933bSSeth Howell 	}
46369d63933bSSeth Howell }
46379d63933bSSeth Howell 
4638db09de98SAlexey Marchuk static const char *
4639db09de98SAlexey Marchuk nvmf_rdma_wr_type_str(enum spdk_nvmf_rdma_wr_type wr_type)
4640db09de98SAlexey Marchuk {
4641db09de98SAlexey Marchuk 	switch (wr_type) {
4642db09de98SAlexey Marchuk 	case RDMA_WR_TYPE_RECV:
4643db09de98SAlexey Marchuk 		return "RECV";
4644db09de98SAlexey Marchuk 	case RDMA_WR_TYPE_SEND:
4645db09de98SAlexey Marchuk 		return "SEND";
4646db09de98SAlexey Marchuk 	case RDMA_WR_TYPE_DATA:
4647db09de98SAlexey Marchuk 		return "DATA";
4648db09de98SAlexey Marchuk 	default:
4649db09de98SAlexey Marchuk 		SPDK_ERRLOG("Unknown WR type %d\n", wr_type);
4650db09de98SAlexey Marchuk 		SPDK_UNREACHABLE();
4651db09de98SAlexey Marchuk 	}
4652db09de98SAlexey Marchuk }
4653db09de98SAlexey Marchuk 
4654db09de98SAlexey Marchuk static inline void
4655db09de98SAlexey Marchuk nvmf_rdma_log_wc_status(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_wc *wc)
4656db09de98SAlexey Marchuk {
4657db09de98SAlexey Marchuk 	enum spdk_nvmf_rdma_wr_type wr_type = ((struct spdk_nvmf_rdma_wr *)wc->wr_id)->type;
4658db09de98SAlexey Marchuk 
4659db09de98SAlexey Marchuk 	if (wc->status == IBV_WC_WR_FLUSH_ERR) {
4660db09de98SAlexey Marchuk 		/* If qpair is in ERR state, we will receive completions for all posted and not completed
4661db09de98SAlexey Marchuk 		 * Work Requests with IBV_WC_WR_FLUSH_ERR status. Don't log an error in that case */
46622172c432STomasz Zawadzki 		SPDK_DEBUGLOG(rdma,
4663feeaa282SAlexey Marchuk 			      "Error on CQ %p, (qp state %d, in_error %d) request 0x%lu, type %s, status: (%d): %s\n",
4664feeaa282SAlexey Marchuk 			      rqpair->poller->cq, rqpair->qpair.state, rqpair->ibv_in_error_state, wc->wr_id,
4665db09de98SAlexey Marchuk 			      nvmf_rdma_wr_type_str(wr_type), wc->status, ibv_wc_status_str(wc->status));
4666db09de98SAlexey Marchuk 	} else {
4667feeaa282SAlexey Marchuk 		SPDK_ERRLOG("Error on CQ %p, (qp state %d, in_error %d) request 0x%lu, type %s, status: (%d): %s\n",
4668feeaa282SAlexey Marchuk 			    rqpair->poller->cq, rqpair->qpair.state, rqpair->ibv_in_error_state, wc->wr_id,
4669db09de98SAlexey Marchuk 			    nvmf_rdma_wr_type_str(wr_type), wc->status, ibv_wc_status_str(wc->status));
4670db09de98SAlexey Marchuk 	}
4671db09de98SAlexey Marchuk }
4672db09de98SAlexey Marchuk 
46731db3a037SBen Walker static int
467455d8d943SSeth Howell nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
46752a0772e3SBen Walker 		      struct spdk_nvmf_rdma_poller *rpoller)
46761db3a037SBen Walker {
46771db3a037SBen Walker 	struct ibv_wc wc[32];
46785941ab03SBen Walker 	struct spdk_nvmf_rdma_wr	*rdma_wr;
46791db3a037SBen Walker 	struct spdk_nvmf_rdma_request	*rdma_req;
46801db3a037SBen Walker 	struct spdk_nvmf_rdma_recv	*rdma_recv;
46818ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_qpair	*rqpair, *tmp_rqpair;
46823c423f40SBen Walker 	int reaped, i;
46831db3a037SBen Walker 	int count = 0;
4684ca59dd5dSAlexey Marchuk 	int rc;
46851db3a037SBen Walker 	bool error = false;
4686fbe8f804SEvgeniy Kochetov 	uint64_t poll_tsc = spdk_get_ticks();
46871db3a037SBen Walker 
46888ddc5cd4Ssijie.sun 	if (spdk_unlikely(rpoller->need_destroy)) {
46898ddc5cd4Ssijie.sun 		/* If a qpair is closed before the poller is destroyed, nvmf_rdma_destroy_drained_qpair may not
46908ddc5cd4Ssijie.sun 		 * be called because we can no longer poll anything from the cq. Call it here to force
46918ddc5cd4Ssijie.sun 		 * destruction of the qpair once to_close has turned true.
46928ddc5cd4Ssijie.sun 		 */
46938ddc5cd4Ssijie.sun 		RB_FOREACH_SAFE(rqpair, qpairs_tree, &rpoller->qpairs, tmp_rqpair) {
46948ddc5cd4Ssijie.sun 			nvmf_rdma_destroy_drained_qpair(rqpair);
46958ddc5cd4Ssijie.sun 		}
46968ddc5cd4Ssijie.sun 		return 0;
46978ddc5cd4Ssijie.sun 	}
46988ddc5cd4Ssijie.sun 
46991db3a037SBen Walker 	/* Poll for completing operations. */
47002a0772e3SBen Walker 	reaped = ibv_poll_cq(rpoller->cq, 32, wc);
4701e718d8caSAlexey Marchuk 	if (spdk_unlikely(reaped < 0)) {
47021db3a037SBen Walker 		SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
4703891c12a6SPawel Wodkowski 			    errno, spdk_strerror(errno));
47041db3a037SBen Walker 		return -1;
47053caf2e71SAlexey Marchuk 	} else if (reaped == 0) {
47063caf2e71SAlexey Marchuk 		rpoller->stat.idle_polls++;
47071db3a037SBen Walker 	}
47081db3a037SBen Walker 
470938ab383aSEvgeniy Kochetov 	rpoller->stat.polls++;
471038ab383aSEvgeniy Kochetov 	rpoller->stat.completions += reaped;
471138ab383aSEvgeniy Kochetov 
47121db3a037SBen Walker 	for (i = 0; i < reaped; i++) {
47135941ab03SBen Walker 
47145941ab03SBen Walker 		rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id;
47155941ab03SBen Walker 
471650a438d3SBen Walker 		switch (rdma_wr->type) {
471750a438d3SBen Walker 		case RDMA_WR_TYPE_SEND:
47188288fcf9SAlexey Marchuk 			rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp_wr);
47190d7d3a04SBen Walker 			rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
47200d7d3a04SBen Walker 
4721e718d8caSAlexey Marchuk 			if (spdk_likely(!wc[i].status)) {
4722ab79560eSSeth Howell 				count++;
4723ab79560eSSeth Howell 				assert(wc[i].opcode == IBV_WC_SEND);
472455d8d943SSeth Howell 				assert(nvmf_rdma_req_is_completing(rdma_req));
4725ab79560eSSeth Howell 			}
4726ab79560eSSeth Howell 
4727bfdc957cSSeth Howell 			rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
4728ce6b8a13SAlexey Marchuk 			/* RDMA_WRITE operation completed. +1 since it was chained with rsp WR */
4729a681f8d5SAlexey Marchuk 			assert(rqpair->current_send_depth >= (uint32_t)rdma_req->num_outstanding_data_wr + 1);
473053777de8SAlexey Marchuk 			rqpair->current_send_depth -= rdma_req->num_outstanding_data_wr + 1;
473153777de8SAlexey Marchuk 			rdma_req->num_outstanding_data_wr = 0;
473292f5548aSSeth Howell 
473355d8d943SSeth Howell 			nvmf_rdma_request_process(rtransport, rdma_req);
47340d7d3a04SBen Walker 			break;
473550a438d3SBen Walker 		case RDMA_WR_TYPE_RECV:
47366cc18a64SSeth Howell 			/* rdma_recv->qpair will be invalid if using an SRQ.  In that case we have to get the qpair from the wc. */
47375941ab03SBen Walker 			rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr);
47386cc18a64SSeth Howell 			if (rpoller->srq != NULL) {
4739ed0b611fSEvgeniy Kochetov 				rdma_recv->qpair = get_rdma_qpair_from_wc(rpoller, &wc[i]);
474087ebcb08SEvgeniy Kochetov 				/* It is possible that there are still some completions for destroyed QP
474187ebcb08SEvgeniy Kochetov 				 * associated with SRQ. We just ignore these late completions and re-post
474287ebcb08SEvgeniy Kochetov 				 * receive WRs back to SRQ.
474387ebcb08SEvgeniy Kochetov 				 */
474487ebcb08SEvgeniy Kochetov 				if (spdk_unlikely(NULL == rdma_recv->qpair)) {
474587ebcb08SEvgeniy Kochetov 					struct ibv_recv_wr *bad_wr;
474687ebcb08SEvgeniy Kochetov 
474787ebcb08SEvgeniy Kochetov 					rdma_recv->wr.next = NULL;
4748cf151d60SAlexey Marchuk 					spdk_rdma_provider_srq_queue_recv_wrs(rpoller->srq, &rdma_recv->wr);
4749cf151d60SAlexey Marchuk 					rc = spdk_rdma_provider_srq_flush_recv_wrs(rpoller->srq, &bad_wr);
475087ebcb08SEvgeniy Kochetov 					if (rc) {
475187ebcb08SEvgeniy Kochetov 						SPDK_ERRLOG("Failed to re-post recv WR to SRQ, err %d\n", rc);
475287ebcb08SEvgeniy Kochetov 					}
475387ebcb08SEvgeniy Kochetov 					continue;
475487ebcb08SEvgeniy Kochetov 				}
475501201d3eSSeth Howell 			}
47560d7d3a04SBen Walker 			rqpair = rdma_recv->qpair;
47570d7d3a04SBen Walker 
475801201d3eSSeth Howell 			assert(rqpair != NULL);
4759e718d8caSAlexey Marchuk 			if (spdk_likely(!wc[i].status)) {
4760ab79560eSSeth Howell 				assert(wc[i].opcode == IBV_WC_RECV);
4761ab79560eSSeth Howell 				if (rqpair->current_recv_depth >= rqpair->max_queue_depth) {
4762608b54a2SKonrad Sztyber 					spdk_nvmf_qpair_disconnect(&rqpair->qpair);
4763ab79560eSSeth Howell 					break;
4764ab79560eSSeth Howell 				}
4765ab79560eSSeth Howell 			}
476601201d3eSSeth Howell 
4767c3884f94SSeth Howell 			rdma_recv->wr.next = NULL;
4768158dc947SSeth Howell 			rqpair->current_recv_depth++;
4769fbe8f804SEvgeniy Kochetov 			rdma_recv->receive_tsc = poll_tsc;
4770fbe8f804SEvgeniy Kochetov 			rpoller->stat.requests++;
47715edb8edcSOr Gerlitz 			STAILQ_INSERT_HEAD(&rqpair->resources->incoming_queue, rdma_recv, link);
477246d7b94fSAtul Malakar 			rqpair->qpair.queue_depth++;
4773e06896b9SBen Walker 			break;
477450a438d3SBen Walker 		case RDMA_WR_TYPE_DATA:
47758288fcf9SAlexey Marchuk 			rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data_wr);
4776ab79560eSSeth Howell 			rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
4777ab79560eSSeth Howell 
4778ab79560eSSeth Howell 			assert(rdma_req->num_outstanding_data_wr > 0);
4779ab79560eSSeth Howell 
4780ab79560eSSeth Howell 			rqpair->current_send_depth--;
4781ab79560eSSeth Howell 			rdma_req->num_outstanding_data_wr--;
4782e718d8caSAlexey Marchuk 			if (spdk_likely(!wc[i].status)) {
478353777de8SAlexey Marchuk 				assert(wc[i].opcode == IBV_WC_RDMA_READ);
47845e2101ceSAlexey Marchuk 				rqpair->current_read_depth--;
4785ab79560eSSeth Howell 				/* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */
4786ab79560eSSeth Howell 				if (rdma_req->num_outstanding_data_wr == 0) {
47875b333e40SAlexey Marchuk 					if (rdma_req->num_remaining_data_wr) {
4788ca59dd5dSAlexey Marchuk 						/* Only part of RDMA_READ operations was submitted, process the rest */
47898307ab43SAlexey Marchuk 						nvmf_rdma_request_reset_transfer_in(rdma_req, rtransport);
4790ca59dd5dSAlexey Marchuk 						rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING;
4791ca59dd5dSAlexey Marchuk 						nvmf_rdma_request_process(rtransport, rdma_req);
4792ca59dd5dSAlexey Marchuk 						break;
4793ca59dd5dSAlexey Marchuk 					}
4794ab79560eSSeth Howell 					rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
479555d8d943SSeth Howell 					nvmf_rdma_request_process(rtransport, rdma_req);
4796ab79560eSSeth Howell 				}
4797ab79560eSSeth Howell 			} else {
47984d5f288cSBen Walker 				/* If the data transfer fails, we still force the queue into the error state.
47995e2101ceSAlexey Marchuk 				 * If we were performing an RDMA_READ, we need to force the request into a
48005e2101ceSAlexey Marchuk 				 * completed state since it wasn't linked to a send. However, in the RDMA_WRITE
48015e2101ceSAlexey Marchuk 				 * case, we should wait for the SEND to complete. */
48025e2101ceSAlexey Marchuk 				if (rdma_req->data.wr.opcode == IBV_WR_RDMA_READ) {
48035e2101ceSAlexey Marchuk 					rqpair->current_read_depth--;
48041f9ac117SSeth Howell 					if (rdma_req->num_outstanding_data_wr == 0) {
48054e45c563SAlexey Marchuk 						if (rdma_req->num_remaining_data_wr) {
48064e45c563SAlexey Marchuk 							/* Partially sent request is still in the pending_rdma_read_queue,
48074e45c563SAlexey Marchuk 							 * remove it now before completing */
48084e45c563SAlexey Marchuk 							rdma_req->num_remaining_data_wr = 0;
48094e45c563SAlexey Marchuk 							STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
48104e45c563SAlexey Marchuk 						}
4811bfdc957cSSeth Howell 						rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
48124e45c563SAlexey Marchuk 						nvmf_rdma_request_process(rtransport, rdma_req);
4813212fd219SSeth Howell 					}
48141f9ac117SSeth Howell 				}
48155e2101ceSAlexey Marchuk 			}
48164d5f288cSBen Walker 			break;
48170d7d3a04SBen Walker 		default:
48180d7d3a04SBen Walker 			SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode);
48190d7d3a04SBen Walker 			continue;
48200d7d3a04SBen Walker 		}
48210d7d3a04SBen Walker 
4822ab79560eSSeth Howell 		/* Handle error conditions */
4823e718d8caSAlexey Marchuk 		if (spdk_unlikely(wc[i].status)) {
4824feeaa282SAlexey Marchuk 			rqpair->ibv_in_error_state = true;
4825db09de98SAlexey Marchuk 			nvmf_rdma_log_wc_status(rqpair, &wc[i]);
4826ab79560eSSeth Howell 
4827ab79560eSSeth Howell 			error = true;
4828ab79560eSSeth Howell 
48293caf2080SKonrad Sztyber 			if (spdk_nvmf_qpair_is_active(&rqpair->qpair)) {
48308e729503SBen Walker 				/* Disconnect the connection. */
4831608b54a2SKonrad Sztyber 				spdk_nvmf_qpair_disconnect(&rqpair->qpair);
4832bb3e4413SSeth Howell 			} else {
483333668b22SSeth Howell 				nvmf_rdma_destroy_drained_qpair(rqpair);
48348e729503SBen Walker 			}
48351db3a037SBen Walker 			continue;
48361db3a037SBen Walker 		}
48371db3a037SBen Walker 
483855d8d943SSeth Howell 		nvmf_rdma_qpair_process_pending(rtransport, rqpair, false);
4839bb3e4413SSeth Howell 
48403caf2080SKonrad Sztyber 		if (spdk_unlikely(!spdk_nvmf_qpair_is_active(&rqpair->qpair))) {
484133668b22SSeth Howell 			nvmf_rdma_destroy_drained_qpair(rqpair);
4842bb3e4413SSeth Howell 		}
48436e5f700bSDaniel Verkamp 	}
48442d75d67aSDaniel Verkamp 
4845e718d8caSAlexey Marchuk 	if (spdk_unlikely(error == true)) {
4846fcdb601eSGangCao 		return -1;
4847fcdb601eSGangCao 	}
4848fcdb601eSGangCao 
48498e8f0434SAlexey Marchuk 	if (reaped == 0) {
48508e8f0434SAlexey Marchuk 		/* In some cases we may not receive any CQE but we still may have pending IO requests waiting for
48518e8f0434SAlexey Marchuk 		 * a resource (e.g. a WR from the data_wr_pool).
48528e8f0434SAlexey Marchuk 		 * We need to start processing such requests if no CQEs were reaped. */
48538e8f0434SAlexey Marchuk 		nvmf_rdma_poller_process_pending_buf_queue(rtransport, rpoller);
48548e8f0434SAlexey Marchuk 	}
48558e8f0434SAlexey Marchuk 
48569d63933bSSeth Howell 	/* submit outstanding work requests. */
4857c3884f94SSeth Howell 	_poller_submit_recvs(rtransport, rpoller);
48589d63933bSSeth Howell 	_poller_submit_sends(rtransport, rpoller);
48599d63933bSSeth Howell 
486004a0ac72SBen Walker 	return count;
48616e5f700bSDaniel Verkamp }
48626e5f700bSDaniel Verkamp 
48638ddc5cd4Ssijie.sun static void
48648ddc5cd4Ssijie.sun _nvmf_rdma_remove_destroyed_device(void *c)
48658ddc5cd4Ssijie.sun {
48668ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_transport	*rtransport = c;
48678ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_device	*device, *device_tmp;
48688ddc5cd4Ssijie.sun 	int				rc;
48698ddc5cd4Ssijie.sun 
48708ddc5cd4Ssijie.sun 	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) {
48718ddc5cd4Ssijie.sun 		if (device->ready_to_destroy) {
48728ddc5cd4Ssijie.sun 			destroy_ib_device(rtransport, device);
48738ddc5cd4Ssijie.sun 		}
48748ddc5cd4Ssijie.sun 	}
48758ddc5cd4Ssijie.sun 
48768ddc5cd4Ssijie.sun 	free_poll_fds(rtransport);
48778ddc5cd4Ssijie.sun 	rc = generate_poll_fds(rtransport);
48788ddc5cd4Ssijie.sun 	/* cannot handle fd allocation error here */
48798ddc5cd4Ssijie.sun 	if (rc != 0) {
48808ddc5cd4Ssijie.sun 		SPDK_ERRLOG("Failed to generate poll fds after removing ib device.\n");
48818ddc5cd4Ssijie.sun 	}
48828ddc5cd4Ssijie.sun }
48838ddc5cd4Ssijie.sun 
48848ddc5cd4Ssijie.sun static void
48858ddc5cd4Ssijie.sun _nvmf_rdma_remove_poller_in_group_cb(void *c)
48868ddc5cd4Ssijie.sun {
48878ddc5cd4Ssijie.sun 	struct poller_manage_ctx	*ctx = c;
48888ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_transport	*rtransport = ctx->rtransport;
48898ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_device	*device = ctx->device;
48908ddc5cd4Ssijie.sun 	struct spdk_thread		*thread = ctx->thread;
48918ddc5cd4Ssijie.sun 
4892549be9adSsijie.sun 	if (nvmf_rdma_all_pollers_management_done(c)) {
48938ddc5cd4Ssijie.sun 		/* destroy device when last poller is destroyed */
48948ddc5cd4Ssijie.sun 		device->ready_to_destroy = true;
48958ddc5cd4Ssijie.sun 		spdk_thread_send_msg(thread, _nvmf_rdma_remove_destroyed_device, rtransport);
48968ddc5cd4Ssijie.sun 	}
48978ddc5cd4Ssijie.sun }
48988ddc5cd4Ssijie.sun 
48998ddc5cd4Ssijie.sun static void
49008ddc5cd4Ssijie.sun _nvmf_rdma_remove_poller_in_group(void *c)
49018ddc5cd4Ssijie.sun {
49028ddc5cd4Ssijie.sun 	struct poller_manage_ctx		*ctx = c;
49038ddc5cd4Ssijie.sun 
49048ddc5cd4Ssijie.sun 	ctx->rpoller->need_destroy = true;
49058ddc5cd4Ssijie.sun 	ctx->rpoller->destroy_cb_ctx = ctx;
49068ddc5cd4Ssijie.sun 	ctx->rpoller->destroy_cb = _nvmf_rdma_remove_poller_in_group_cb;
49078ddc5cd4Ssijie.sun 
4908549be9adSsijie.sun 	/* qp will be disconnected after receiving a RDMA_CM_EVENT_DEVICE_REMOVAL event. */
49098ddc5cd4Ssijie.sun 	if (RB_EMPTY(&ctx->rpoller->qpairs)) {
49108ddc5cd4Ssijie.sun 		nvmf_rdma_poller_destroy(ctx->rpoller);
49118ddc5cd4Ssijie.sun 	}
49128ddc5cd4Ssijie.sun }
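/*
 * Device-removal teardown, as implemented by the three functions above (a
 * summary; the exact trigger lives in the hot-unplug handling elsewhere in
 * this file):
 *
 * 1. _nvmf_rdma_remove_poller_in_group() marks the poller with need_destroy
 *    and records _nvmf_rdma_remove_poller_in_group_cb() as its destroy_cb.
 *    If the poller has no qpairs left it is destroyed immediately; otherwise
 *    it goes away once its remaining qpairs are disconnected and drained.
 * 2. The destroy callback fires per poller; once the device's last poller is
 *    gone, the device is flagged ready_to_destroy and a message is sent to
 *    the transport thread.
 * 3. _nvmf_rdma_remove_destroyed_device() then frees every device flagged
 *    ready_to_destroy and regenerates the transport's poll fds.
 */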
49138ddc5cd4Ssijie.sun 
4914d5ce9cffSBen Walker static int
491555d8d943SSeth Howell nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
4916d5ce9cffSBen Walker {
4917d5ce9cffSBen Walker 	struct spdk_nvmf_rdma_transport *rtransport;
4918d5ce9cffSBen Walker 	struct spdk_nvmf_rdma_poll_group *rgroup;
49198ddc5cd4Ssijie.sun 	struct spdk_nvmf_rdma_poller	*rpoller, *tmp;
4920596f8a4aSAlexey Marchuk 	int				count = 0, rc, rc2 = 0;
4921d5ce9cffSBen Walker 
4922d5ce9cffSBen Walker 	rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport);
4923d5ce9cffSBen Walker 	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
4924d5ce9cffSBen Walker 
49258ddc5cd4Ssijie.sun 	TAILQ_FOREACH_SAFE(rpoller, &rgroup->pollers, link, tmp) {
492655d8d943SSeth Howell 		rc = nvmf_rdma_poller_poll(rtransport, rpoller);
4927e718d8caSAlexey Marchuk 		if (spdk_unlikely(rc < 0)) {
4928596f8a4aSAlexey Marchuk 			if (rc2 == 0) {
4929596f8a4aSAlexey Marchuk 				rc2 = rc;
4930596f8a4aSAlexey Marchuk 			}
4931596f8a4aSAlexey Marchuk 			continue;
4932d5ce9cffSBen Walker 		}
4933d5ce9cffSBen Walker 		count += rc;
4934d5ce9cffSBen Walker 	}
4935d5ce9cffSBen Walker 
4936596f8a4aSAlexey Marchuk 	return rc2 ? rc2 : count;
4937d5ce9cffSBen Walker }
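/*
 * Usage note for the poll routine above, plus a minimal sketch (an assumption
 * about the caller, not code from the generic nvmf layer) of how its return
 * value is meant to be consumed: a failing poller does not stop the loop, and
 * the first error seen is reported in preference to the completion count.
 *
 *	int rc = nvmf_rdma_poll_group_poll(group);
 *
 *	if (rc < 0) {
 *		// At least one poller hit a fatal CQ failure; rc is the first error seen.
 *	} else {
 *		// rc is the total number of completions processed across all pollers.
 *	}
 */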
4938d5ce9cffSBen Walker 
49398f64db18SBen Walker static int
494055d8d943SSeth Howell nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id,
4941311ce0e2SBen Walker 			  struct spdk_nvme_transport_id *trid,
4942311ce0e2SBen Walker 			  bool peer)
49438f64db18SBen Walker {
49448f64db18SBen Walker 	struct sockaddr *saddr;
4945683c70c2SBen Walker 	uint16_t port;
49468f64db18SBen Walker 
49477ed0904bSSeth Howell 	spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_RDMA);
49488f64db18SBen Walker 
4949311ce0e2SBen Walker 	if (peer) {
4950311ce0e2SBen Walker 		saddr = rdma_get_peer_addr(id);
4951311ce0e2SBen Walker 	} else {
4952311ce0e2SBen Walker 		saddr = rdma_get_local_addr(id);
4953311ce0e2SBen Walker 	}
49548f64db18SBen Walker 	switch (saddr->sa_family) {
49558f64db18SBen Walker 	case AF_INET: {
49568f64db18SBen Walker 		struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr;
49578f64db18SBen Walker 
49588f64db18SBen Walker 		trid->adrfam = SPDK_NVMF_ADRFAM_IPV4;
49598f64db18SBen Walker 		inet_ntop(AF_INET, &saddr_in->sin_addr,
49608f64db18SBen Walker 			  trid->traddr, sizeof(trid->traddr));
4961311ce0e2SBen Walker 		if (peer) {
4962311ce0e2SBen Walker 			port = ntohs(rdma_get_dst_port(id));
4963311ce0e2SBen Walker 		} else {
4964311ce0e2SBen Walker 			port = ntohs(rdma_get_src_port(id));
4965311ce0e2SBen Walker 		}
4966683c70c2SBen Walker 		snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port);
49678f64db18SBen Walker 		break;
49688f64db18SBen Walker 	}
49698f64db18SBen Walker 	case AF_INET6: {
49708f64db18SBen Walker 		struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr;
49718f64db18SBen Walker 		trid->adrfam = SPDK_NVMF_ADRFAM_IPV6;
49728f64db18SBen Walker 		inet_ntop(AF_INET6, &saddr_in->sin6_addr,
49738f64db18SBen Walker 			  trid->traddr, sizeof(trid->traddr));
4974311ce0e2SBen Walker 		if (peer) {
4975311ce0e2SBen Walker 			port = ntohs(rdma_get_dst_port(id));
4976311ce0e2SBen Walker 		} else {
4977311ce0e2SBen Walker 			port = ntohs(rdma_get_src_port(id));
4978311ce0e2SBen Walker 		}
4979683c70c2SBen Walker 		snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port);
49808f64db18SBen Walker 		break;
49818f64db18SBen Walker 	}
49828f64db18SBen Walker 	default:
49838f64db18SBen Walker 		return -1;
49848f64db18SBen Walker 
49858f64db18SBen Walker 	}
49868f64db18SBen Walker 
49878f64db18SBen Walker 	return 0;
49888f64db18SBen Walker }
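/*
 * Illustrative sketch (hypothetical addresses, not values from this file) of
 * how the helper above fills in a transport ID for either end of a cm_id:
 *
 *	struct spdk_nvme_transport_id peer = {}, local = {};
 *
 *	if (nvmf_rdma_trid_from_cm_id(id, &peer, true) == 0 &&
 *	    nvmf_rdma_trid_from_cm_id(id, &local, false) == 0) {
 *		// e.g. peer.traddr  == "192.168.1.10", peer.trsvcid  == "4420"
 *		//      local.traddr == "192.168.1.1",  local.trsvcid == "4420"
 *	}
 *
 * The three qpair_get_*_trid callbacks below are thin wrappers around exactly
 * this call, differing only in which cm_id and which peer flag they pass.
 */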
49898f64db18SBen Walker 
4990311ce0e2SBen Walker static int
499155d8d943SSeth Howell nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
4992311ce0e2SBen Walker 			      struct spdk_nvme_transport_id *trid)
4993311ce0e2SBen Walker {
4994311ce0e2SBen Walker 	struct spdk_nvmf_rdma_qpair	*rqpair;
4995311ce0e2SBen Walker 
4996311ce0e2SBen Walker 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
4997311ce0e2SBen Walker 
499855d8d943SSeth Howell 	return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true);
4999311ce0e2SBen Walker }
5000311ce0e2SBen Walker 
5001311ce0e2SBen Walker static int
500255d8d943SSeth Howell nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
5003f10a91edSBen Walker 			       struct spdk_nvme_transport_id *trid)
5004f10a91edSBen Walker {
5005f10a91edSBen Walker 	struct spdk_nvmf_rdma_qpair	*rqpair;
5006f10a91edSBen Walker 
5007f10a91edSBen Walker 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
5008f10a91edSBen Walker 
500955d8d943SSeth Howell 	return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false);
5010f10a91edSBen Walker }
5011f10a91edSBen Walker 
5012f10a91edSBen Walker static int
501355d8d943SSeth Howell nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
5014311ce0e2SBen Walker 				struct spdk_nvme_transport_id *trid)
5015311ce0e2SBen Walker {
5016311ce0e2SBen Walker 	struct spdk_nvmf_rdma_qpair	*rqpair;
5017311ce0e2SBen Walker 
5018311ce0e2SBen Walker 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
5019311ce0e2SBen Walker 
502055d8d943SSeth Howell 	return nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false);
5021311ce0e2SBen Walker }
5022311ce0e2SBen Walker 
50238e2f0cdbSzkhatami88 void
50248e2f0cdbSzkhatami88 spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks)
50258e2f0cdbSzkhatami88 {
50268e2f0cdbSzkhatami88 	g_nvmf_hooks = *hooks;
50278e2f0cdbSzkhatami88 }
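/*
 * Sketch of how an application might register custom RDMA hooks. This is an
 * assumption for illustration only: the authoritative member list of
 * struct spdk_nvme_rdma_hooks lives in include/spdk/nvme.h, and my_get_ibv_pd
 * and g_app_pd are hypothetical application symbols.
 *
 *	static struct ibv_pd *g_app_pd;
 *
 *	static struct ibv_pd *
 *	my_get_ibv_pd(const struct spdk_nvme_transport_id *trid, struct ibv_context *verbs)
 *	{
 *		return g_app_pd;	// hand the transport an application-owned PD
 *	}
 *
 *	static struct spdk_nvme_rdma_hooks my_hooks = {
 *		.get_ibv_pd = my_get_ibv_pd,
 *	};
 *
 *	spdk_nvmf_rdma_init_hooks(&my_hooks);
 *
 * Since g_nvmf_hooks is copied by value, the hooks should be registered before
 * the RDMA transport is created.
 */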
50288e2f0cdbSzkhatami88 
5029604b4503SShuhei Matsumoto static void
5030c1305e71SShuhei Matsumoto nvmf_rdma_request_set_abort_status(struct spdk_nvmf_request *req,
503104cd8e47SAlexey Marchuk 				   struct spdk_nvmf_rdma_request *rdma_req_to_abort,
503204cd8e47SAlexey Marchuk 				   struct spdk_nvmf_rdma_qpair *rqpair)
5033c1305e71SShuhei Matsumoto {
5034c1305e71SShuhei Matsumoto 	rdma_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
5035c1305e71SShuhei Matsumoto 	rdma_req_to_abort->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
5036c1305e71SShuhei Matsumoto 
503704cd8e47SAlexey Marchuk 	STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req_to_abort, state_link);
503804cd8e47SAlexey Marchuk 	rdma_req_to_abort->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
5039c1305e71SShuhei Matsumoto 
5040c1305e71SShuhei Matsumoto 	req->rsp->nvme_cpl.cdw0 &= ~1U;	/* Command was successfully aborted. */
5041c1305e71SShuhei Matsumoto }
5042c1305e71SShuhei Matsumoto 
50433e1ab5eaSShuhei Matsumoto static int
50443e1ab5eaSShuhei Matsumoto _nvmf_rdma_qpair_abort_request(void *ctx)
50453e1ab5eaSShuhei Matsumoto {
50463e1ab5eaSShuhei Matsumoto 	struct spdk_nvmf_request *req = ctx;
50473e1ab5eaSShuhei Matsumoto 	struct spdk_nvmf_rdma_request *rdma_req_to_abort = SPDK_CONTAINEROF(
50483e1ab5eaSShuhei Matsumoto 				req->req_to_abort, struct spdk_nvmf_rdma_request, req);
50493e1ab5eaSShuhei Matsumoto 	struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair,
50503e1ab5eaSShuhei Matsumoto 					      struct spdk_nvmf_rdma_qpair, qpair);
50513e1ab5eaSShuhei Matsumoto 	int rc;
50523e1ab5eaSShuhei Matsumoto 
50533e1ab5eaSShuhei Matsumoto 	spdk_poller_unregister(&req->poller);
50543e1ab5eaSShuhei Matsumoto 
50553e1ab5eaSShuhei Matsumoto 	switch (rdma_req_to_abort->state) {
50563e1ab5eaSShuhei Matsumoto 	case RDMA_REQUEST_STATE_EXECUTING:
505781437ff6SShuhei Matsumoto 		rc = nvmf_ctrlr_abort_request(req);
50583e1ab5eaSShuhei Matsumoto 		if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) {
50593e1ab5eaSShuhei Matsumoto 			return SPDK_POLLER_BUSY;
50603e1ab5eaSShuhei Matsumoto 		}
50613e1ab5eaSShuhei Matsumoto 		break;
50623e1ab5eaSShuhei Matsumoto 
50633e1ab5eaSShuhei Matsumoto 	case RDMA_REQUEST_STATE_NEED_BUFFER:
50643e1ab5eaSShuhei Matsumoto 		STAILQ_REMOVE(&rqpair->poller->group->group.pending_buf_queue,
50653e1ab5eaSShuhei Matsumoto 			      &rdma_req_to_abort->req, spdk_nvmf_request, buf_link);
50663e1ab5eaSShuhei Matsumoto 
506704cd8e47SAlexey Marchuk 		nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair);
50683e1ab5eaSShuhei Matsumoto 		break;
50693e1ab5eaSShuhei Matsumoto 
50703e1ab5eaSShuhei Matsumoto 	case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING:
50713e1ab5eaSShuhei Matsumoto 		STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req_to_abort,
50723e1ab5eaSShuhei Matsumoto 			      spdk_nvmf_rdma_request, state_link);
50733e1ab5eaSShuhei Matsumoto 
507404cd8e47SAlexey Marchuk 		nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair);
50753e1ab5eaSShuhei Matsumoto 		break;
50763e1ab5eaSShuhei Matsumoto 
50773e1ab5eaSShuhei Matsumoto 	case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
50783e1ab5eaSShuhei Matsumoto 		STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req_to_abort,
50793e1ab5eaSShuhei Matsumoto 			      spdk_nvmf_rdma_request, state_link);
50803e1ab5eaSShuhei Matsumoto 
508104cd8e47SAlexey Marchuk 		nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair);
508204cd8e47SAlexey Marchuk 		break;
508304cd8e47SAlexey Marchuk 
508404cd8e47SAlexey Marchuk 	case RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING:
508504cd8e47SAlexey Marchuk 		/* Remove the request from the list here so the common abort helper can be reused. */
508604cd8e47SAlexey Marchuk 		STAILQ_REMOVE(&rqpair->pending_rdma_send_queue, rdma_req_to_abort,
508704cd8e47SAlexey Marchuk 			      spdk_nvmf_rdma_request, state_link);
508804cd8e47SAlexey Marchuk 
508904cd8e47SAlexey Marchuk 		nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair);
50903e1ab5eaSShuhei Matsumoto 		break;
50913e1ab5eaSShuhei Matsumoto 
50923e1ab5eaSShuhei Matsumoto 	case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
50933e1ab5eaSShuhei Matsumoto 		if (spdk_get_ticks() < req->timeout_tsc) {
50943e1ab5eaSShuhei Matsumoto 			req->poller = SPDK_POLLER_REGISTER(_nvmf_rdma_qpair_abort_request, req, 0);
50953e1ab5eaSShuhei Matsumoto 			return SPDK_POLLER_BUSY;
50963e1ab5eaSShuhei Matsumoto 		}
50973e1ab5eaSShuhei Matsumoto 		break;
50983e1ab5eaSShuhei Matsumoto 
50993e1ab5eaSShuhei Matsumoto 	default:
51003e1ab5eaSShuhei Matsumoto 		break;
51013e1ab5eaSShuhei Matsumoto 	}
51023e1ab5eaSShuhei Matsumoto 
51033e1ab5eaSShuhei Matsumoto 	spdk_nvmf_request_complete(req);
51043e1ab5eaSShuhei Matsumoto 	return SPDK_POLLER_BUSY;
51053e1ab5eaSShuhei Matsumoto }
51063e1ab5eaSShuhei Matsumoto 
5107c1305e71SShuhei Matsumoto static void
5108604b4503SShuhei Matsumoto nvmf_rdma_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
5109604b4503SShuhei Matsumoto 			      struct spdk_nvmf_request *req)
5110604b4503SShuhei Matsumoto {
5111deec1fc7SShuhei Matsumoto 	struct spdk_nvmf_rdma_qpair *rqpair;
511226e0ef9aSShuhei Matsumoto 	struct spdk_nvmf_rdma_transport *rtransport;
511326e0ef9aSShuhei Matsumoto 	struct spdk_nvmf_transport *transport;
5114deec1fc7SShuhei Matsumoto 	uint16_t cid;
5115137866e5SAlexey Marchuk 	uint32_t i, max_req_count;
5116137866e5SAlexey Marchuk 	struct spdk_nvmf_rdma_request *rdma_req_to_abort = NULL, *rdma_req;
5117deec1fc7SShuhei Matsumoto 
5118deec1fc7SShuhei Matsumoto 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
511926e0ef9aSShuhei Matsumoto 	rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
512026e0ef9aSShuhei Matsumoto 	transport = &rtransport->transport;
512126e0ef9aSShuhei Matsumoto 
5122deec1fc7SShuhei Matsumoto 	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
5123137866e5SAlexey Marchuk 	max_req_count = rqpair->srq == NULL ? rqpair->max_queue_depth : rqpair->poller->max_srq_depth;
5124deec1fc7SShuhei Matsumoto 
5125137866e5SAlexey Marchuk 	for (i = 0; i < max_req_count; i++) {
5126137866e5SAlexey Marchuk 		rdma_req = &rqpair->resources->reqs[i];
5127137866e5SAlexey Marchuk 		/* When SRQ == NULL, the rqpair owns its requests and req.qpair always points to this qpair.
5128137866e5SAlexey Marchuk 		 * When SRQ != NULL, all rqpairs share a common request pool and req.qpair is assigned when we
5129137866e5SAlexey Marchuk 		 * start to process a request. In both cases every request not in the FREE state has a valid qpair pointer. */
5130137866e5SAlexey Marchuk 		if (rdma_req->state != RDMA_REQUEST_STATE_FREE && rdma_req->req.cmd->nvme_cmd.cid == cid &&
5131137866e5SAlexey Marchuk 		    rdma_req->req.qpair == qpair) {
5132137866e5SAlexey Marchuk 			rdma_req_to_abort = rdma_req;
5133deec1fc7SShuhei Matsumoto 			break;
5134deec1fc7SShuhei Matsumoto 		}
5135deec1fc7SShuhei Matsumoto 	}
5136deec1fc7SShuhei Matsumoto 
5137deec1fc7SShuhei Matsumoto 	if (rdma_req_to_abort == NULL) {
51383e1ab5eaSShuhei Matsumoto 		spdk_nvmf_request_complete(req);
5139deec1fc7SShuhei Matsumoto 		return;
5140deec1fc7SShuhei Matsumoto 	}
5141c1305e71SShuhei Matsumoto 
51423e1ab5eaSShuhei Matsumoto 	req->req_to_abort = &rdma_req_to_abort->req;
514326e0ef9aSShuhei Matsumoto 	req->timeout_tsc = spdk_get_ticks() +
514426e0ef9aSShuhei Matsumoto 			   transport->opts.abort_timeout_sec * spdk_get_ticks_hz();
51453e1ab5eaSShuhei Matsumoto 	req->poller = NULL;
5146c1305e71SShuhei Matsumoto 
51473e1ab5eaSShuhei Matsumoto 	_nvmf_rdma_qpair_abort_request(req);
5148604b4503SShuhei Matsumoto }
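/*
 * Timing note for the abort path above (a worked example assuming
 * abort_timeout_sec == 1): the deadline is expressed in CPU ticks,
 *
 *	req->timeout_tsc = spdk_get_ticks() + 1 * spdk_get_ticks_hz();
 *
 * and _nvmf_rdma_qpair_abort_request() keeps re-registering itself as a poller
 * while the target command is still TRANSFERRING_HOST_TO_CONTROLLER, until the
 * transfer finishes or this deadline passes, at which point the Abort command
 * itself is completed.
 */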
5149604b4503SShuhei Matsumoto 
51508dfa1067SMaciej Szulik static void
51518dfa1067SMaciej Szulik nvmf_rdma_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group,
51528dfa1067SMaciej Szulik 			       struct spdk_json_write_ctx *w)
51538dfa1067SMaciej Szulik {
51548dfa1067SMaciej Szulik 	struct spdk_nvmf_rdma_poll_group *rgroup;
51558dfa1067SMaciej Szulik 	struct spdk_nvmf_rdma_poller *rpoller;
51568dfa1067SMaciej Szulik 
51578dfa1067SMaciej Szulik 	assert(w != NULL);
51588dfa1067SMaciej Szulik 
51598dfa1067SMaciej Szulik 	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
51608dfa1067SMaciej Szulik 
51618dfa1067SMaciej Szulik 	spdk_json_write_named_uint64(w, "pending_data_buffer", rgroup->stat.pending_data_buffer);
51628dfa1067SMaciej Szulik 
51638dfa1067SMaciej Szulik 	spdk_json_write_named_array_begin(w, "devices");
51648dfa1067SMaciej Szulik 
51658dfa1067SMaciej Szulik 	TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
51668dfa1067SMaciej Szulik 		spdk_json_write_object_begin(w);
51678dfa1067SMaciej Szulik 		spdk_json_write_named_string(w, "name",
51688dfa1067SMaciej Szulik 					     ibv_get_device_name(rpoller->device->context->device));
51698dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "polls",
51708dfa1067SMaciej Szulik 					     rpoller->stat.polls);
51718dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "idle_polls",
51728dfa1067SMaciej Szulik 					     rpoller->stat.idle_polls);
51738dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "completions",
51748dfa1067SMaciej Szulik 					     rpoller->stat.completions);
51758dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "requests",
51768dfa1067SMaciej Szulik 					     rpoller->stat.requests);
51778dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "request_latency",
51788dfa1067SMaciej Szulik 					     rpoller->stat.request_latency);
51798dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "pending_free_request",
51808dfa1067SMaciej Szulik 					     rpoller->stat.pending_free_request);
51818dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "pending_rdma_read",
51828dfa1067SMaciej Szulik 					     rpoller->stat.pending_rdma_read);
51838dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "pending_rdma_write",
51848dfa1067SMaciej Szulik 					     rpoller->stat.pending_rdma_write);
518504cd8e47SAlexey Marchuk 		spdk_json_write_named_uint64(w, "pending_rdma_send",
518604cd8e47SAlexey Marchuk 					     rpoller->stat.pending_rdma_send);
51878dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "total_send_wrs",
51888dfa1067SMaciej Szulik 					     rpoller->stat.qp_stats.send.num_submitted_wrs);
51898dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "send_doorbell_updates",
51908dfa1067SMaciej Szulik 					     rpoller->stat.qp_stats.send.doorbell_updates);
51918dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "total_recv_wrs",
51928dfa1067SMaciej Szulik 					     rpoller->stat.qp_stats.recv.num_submitted_wrs);
51938dfa1067SMaciej Szulik 		spdk_json_write_named_uint64(w, "recv_doorbell_updates",
51948dfa1067SMaciej Szulik 					     rpoller->stat.qp_stats.recv.doorbell_updates);
51958dfa1067SMaciej Szulik 		spdk_json_write_object_end(w);
51968dfa1067SMaciej Szulik 	}
51978dfa1067SMaciej Szulik 
51988dfa1067SMaciej Szulik 	spdk_json_write_array_end(w);
51998dfa1067SMaciej Szulik }
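/*
 * Shape of the JSON emitted above (illustrative values only; the keys mirror
 * the spdk_json_write_named_* calls in nvmf_rdma_poll_group_dump_stat()):
 *
 *	"pending_data_buffer": 0,
 *	"devices": [
 *		{
 *			"name": "mlx5_0",
 *			"polls": 12345,
 *			"idle_polls": 12000,
 *			"completions": 345,
 *			"requests": 345,
 *			"request_latency": 678901,
 *			"pending_free_request": 0,
 *			"pending_rdma_read": 0,
 *			"pending_rdma_write": 0,
 *			"pending_rdma_send": 0,
 *			"total_send_wrs": 690,
 *			"send_doorbell_updates": 345,
 *			"total_recv_wrs": 690,
 *			"recv_doorbell_updates": 345
 *		}
 *	]
 */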
52008dfa1067SMaciej Szulik 
520131d033f9SBen Walker const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = {
52025b3e6cd1SSeth Howell 	.name = "RDMA",
520329f6172aSBen Walker 	.type = SPDK_NVME_TRANSPORT_RDMA,
520455d8d943SSeth Howell 	.opts_init = nvmf_rdma_opts_init,
520555d8d943SSeth Howell 	.create = nvmf_rdma_create,
5206f766d1e4SDarek Stojaczyk 	.dump_opts = nvmf_rdma_dump_opts,
520755d8d943SSeth Howell 	.destroy = nvmf_rdma_destroy,
52084c6e4d49SDaniel Verkamp 
520955d8d943SSeth Howell 	.listen = nvmf_rdma_listen,
521055d8d943SSeth Howell 	.stop_listen = nvmf_rdma_stop_listen,
5211000e6f5bSJacek Kalwas 	.cdata_init = nvmf_rdma_cdata_init,
521221c450e1SDaniel Verkamp 
521355d8d943SSeth Howell 	.listener_discover = nvmf_rdma_discover,
52142641c31aSChangpeng Liu 
521555d8d943SSeth Howell 	.poll_group_create = nvmf_rdma_poll_group_create,
521655d8d943SSeth Howell 	.get_optimal_poll_group = nvmf_rdma_get_optimal_poll_group,
521755d8d943SSeth Howell 	.poll_group_destroy = nvmf_rdma_poll_group_destroy,
521855d8d943SSeth Howell 	.poll_group_add = nvmf_rdma_poll_group_add,
52193d1d4fcfSAlexey Marchuk 	.poll_group_remove = nvmf_rdma_poll_group_remove,
522055d8d943SSeth Howell 	.poll_group_poll = nvmf_rdma_poll_group_poll,
5221d7b8da3bSBen Walker 
522255d8d943SSeth Howell 	.req_free = nvmf_rdma_request_free,
522355d8d943SSeth Howell 	.req_complete = nvmf_rdma_request_complete,
522421c450e1SDaniel Verkamp 
522555d8d943SSeth Howell 	.qpair_fini = nvmf_rdma_close_qpair,
522655d8d943SSeth Howell 	.qpair_get_peer_trid = nvmf_rdma_qpair_get_peer_trid,
522755d8d943SSeth Howell 	.qpair_get_local_trid = nvmf_rdma_qpair_get_local_trid,
522855d8d943SSeth Howell 	.qpair_get_listen_trid = nvmf_rdma_qpair_get_listen_trid,
5229604b4503SShuhei Matsumoto 	.qpair_abort_request = nvmf_rdma_qpair_abort_request,
52302641c31aSChangpeng Liu 
52318dfa1067SMaciej Szulik 	.poll_group_dump_stat = nvmf_rdma_poll_group_dump_stat,
523221c450e1SDaniel Verkamp };
523321c450e1SDaniel Verkamp 
5234f038354eSSeth Howell SPDK_NVMF_TRANSPORT_REGISTER(rdma, &spdk_nvmf_transport_rdma);
52352172c432STomasz Zawadzki SPDK_LOG_REGISTER_COMPONENT(rdma)
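/*
 * Usage note (an assumption about tooling outside this file): because the ops
 * table is registered under the name "RDMA", the transport can be created at
 * runtime through the generic nvmf RPCs, e.g.
 *
 *	scripts/rpc.py nvmf_create_transport -t RDMA
 *
 * which reaches this file through the .opts_init (nvmf_rdma_opts_init) and
 * .create (nvmf_rdma_create) callbacks. SPDK_LOG_REGISTER_COMPONENT(rdma)
 * additionally makes "rdma" a selectable log flag, e.g.
 * "scripts/rpc.py log_set_flag rdma".
 */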
5236