xref: /spdk/lib/nvme/nvme_tcp.c (revision ba23cec1820104cc710ad776f0127e1cf82033aa)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * NVMe/TCP transport
36  */
37 
38 #include "nvme_internal.h"
39 
40 #include "spdk/endian.h"
41 #include "spdk/likely.h"
42 #include "spdk/string.h"
43 #include "spdk/stdinc.h"
44 #include "spdk/crc32.h"
45 #include "spdk/endian.h"
46 #include "spdk/assert.h"
47 #include "spdk/string.h"
48 #include "spdk/thread.h"
49 #include "spdk/trace.h"
50 #include "spdk/util.h"
51 
52 #include "spdk_internal/nvme_tcp.h"
53 
54 #define NVME_TCP_RW_BUFFER_SIZE 131072
55 #define NVME_TCP_TIME_OUT_IN_SECONDS 2
56 
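/*
 * Protocol defaults used during connection setup: HPDA (host PDU data alignment)
 * and the number of R2Ts allowed per request, plus the minimum MAXH2CDATA value
 * this initiator will accept and the cap applied to in-capsule data for fabrics
 * and admin commands.
 */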
57 #define NVME_TCP_HPDA_DEFAULT			0
58 #define NVME_TCP_MAX_R2T_DEFAULT		1
59 #define NVME_TCP_PDU_H2C_MIN_DATA_SIZE		4096
60 #define NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE	8192
61 
62 /* NVMe TCP transport extensions for spdk_nvme_ctrlr */
63 struct nvme_tcp_ctrlr {
64 	struct spdk_nvme_ctrlr			ctrlr;
65 };
66 
67 struct nvme_tcp_poll_group {
68 	struct spdk_nvme_transport_poll_group group;
69 	struct spdk_sock_group *sock_group;
70 	uint32_t completions_per_qpair;
71 	int64_t num_completions;
72 };
73 
74 /* NVMe TCP qpair extensions for spdk_nvme_qpair */
75 struct nvme_tcp_qpair {
76 	struct spdk_nvme_qpair			qpair;
77 	struct spdk_sock			*sock;
78 
79 	TAILQ_HEAD(, nvme_tcp_req)		free_reqs;
80 	TAILQ_HEAD(, nvme_tcp_req)		outstanding_reqs;
81 
82 	TAILQ_HEAD(, nvme_tcp_pdu)		send_queue;
83 	struct nvme_tcp_pdu			recv_pdu;
84 	struct nvme_tcp_pdu			send_pdu; /* used only for the termination (error) PDU and the ICReq PDU */
85 	enum nvme_tcp_pdu_recv_state		recv_state;
86 
87 	struct nvme_tcp_req			*tcp_reqs;
88 
89 	uint16_t				num_entries;
90 
91 	bool					host_hdgst_enable;
92 	bool					host_ddgst_enable;
93 
94 	/** Specifies the maximum number of PDU-Data bytes per H2C Data Transfer PDU */
95 	uint32_t				maxh2cdata;
96 
97 	uint32_t				maxr2t;
98 
99 	/* 0-based value (CPDA) used to compute the required in-capsule data padding */
100 	uint8_t					cpda;
101 
102 	enum nvme_tcp_qpair_state		state;
103 };
104 
105 enum nvme_tcp_req_state {
106 	NVME_TCP_REQ_FREE,
107 	NVME_TCP_REQ_ACTIVE,
108 	NVME_TCP_REQ_ACTIVE_R2T,
109 };
110 
111 struct nvme_tcp_req {
112 	struct nvme_request			*req;
113 	enum nvme_tcp_req_state			state;
114 	uint16_t				cid;
115 	uint16_t				ttag;
116 	uint32_t				datao;
117 	uint32_t				r2tl_remain;
118 	uint32_t				active_r2ts;
119 	bool					in_capsule_data;
120 	struct nvme_tcp_pdu			send_pdu;
121 	struct iovec				iov[NVME_TCP_MAX_SGL_DESCRIPTORS];
122 	uint32_t				iovcnt;
123 	TAILQ_ENTRY(nvme_tcp_req)		link;
124 };
125 
126 static void nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req);
127 
128 static inline struct nvme_tcp_qpair *
129 nvme_tcp_qpair(struct spdk_nvme_qpair *qpair)
130 {
131 	assert(qpair->trtype == SPDK_NVME_TRANSPORT_TCP);
132 	return SPDK_CONTAINEROF(qpair, struct nvme_tcp_qpair, qpair);
133 }
134 
135 static inline struct nvme_tcp_poll_group *
136 nvme_tcp_poll_group(struct spdk_nvme_transport_poll_group *group)
137 {
138 	return SPDK_CONTAINEROF(group, struct nvme_tcp_poll_group, group);
139 }
140 
141 static inline struct nvme_tcp_ctrlr *
142 nvme_tcp_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
143 {
144 	assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP);
145 	return SPDK_CONTAINEROF(ctrlr, struct nvme_tcp_ctrlr, ctrlr);
146 }
147 
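/*
 * Take a request slot from the qpair's free list, reset its per-command state and
 * move it to the outstanding list. Returns NULL when no slots remain, in which
 * case the submission path reports -EAGAIN to the caller.
 */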
148 static struct nvme_tcp_req *
149 nvme_tcp_req_get(struct nvme_tcp_qpair *tqpair)
150 {
151 	struct nvme_tcp_req *tcp_req;
152 
153 	tcp_req = TAILQ_FIRST(&tqpair->free_reqs);
154 	if (!tcp_req) {
155 		return NULL;
156 	}
157 
158 	assert(tcp_req->state == NVME_TCP_REQ_FREE);
159 	tcp_req->state = NVME_TCP_REQ_ACTIVE;
160 	TAILQ_REMOVE(&tqpair->free_reqs, tcp_req, link);
161 	tcp_req->datao = 0;
162 	tcp_req->req = NULL;
163 	tcp_req->in_capsule_data = false;
164 	tcp_req->r2tl_remain = 0;
165 	tcp_req->active_r2ts = 0;
166 	tcp_req->iovcnt = 0;
167 	memset(&tcp_req->send_pdu, 0, sizeof(tcp_req->send_pdu));
168 	TAILQ_INSERT_TAIL(&tqpair->outstanding_reqs, tcp_req, link);
169 
170 	return tcp_req;
171 }
172 
173 static void
174 nvme_tcp_req_put(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req)
175 {
176 	assert(tcp_req->state != NVME_TCP_REQ_FREE);
177 	tcp_req->state = NVME_TCP_REQ_FREE;
178 	TAILQ_REMOVE(&tqpair->outstanding_reqs, tcp_req, link);
179 	TAILQ_INSERT_TAIL(&tqpair->free_reqs, tcp_req, link);
180 }
181 
182 static int
183 nvme_tcp_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service)
184 {
185 	struct addrinfo *res;
186 	struct addrinfo hints;
187 	int ret;
188 
189 	memset(&hints, 0, sizeof(hints));
190 	hints.ai_family = family;
191 	hints.ai_socktype = SOCK_STREAM;
192 	hints.ai_protocol = 0;
193 
194 	ret = getaddrinfo(addr, service, &hints, &res);
195 	if (ret) {
196 		SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret);
197 		return ret;
198 	}
199 
200 	if (res->ai_addrlen > sizeof(*sa)) {
201 		SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen);
202 		ret = EINVAL;
203 	} else {
204 		memcpy(sa, res->ai_addr, res->ai_addrlen);
205 	}
206 
207 	freeaddrinfo(res);
208 	return ret;
209 }
210 
211 static void
212 nvme_tcp_free_reqs(struct nvme_tcp_qpair *tqpair)
213 {
214 	free(tqpair->tcp_reqs);
215 	tqpair->tcp_reqs = NULL;
216 }
217 
218 static int
219 nvme_tcp_alloc_reqs(struct nvme_tcp_qpair *tqpair)
220 {
221 	uint16_t i;
222 	struct nvme_tcp_req	*tcp_req;
223 
224 	tqpair->tcp_reqs = calloc(tqpair->num_entries, sizeof(struct nvme_tcp_req));
225 	if (tqpair->tcp_reqs == NULL) {
226 		SPDK_ERRLOG("Failed to allocate tcp_reqs\n");
227 		goto fail;
228 	}
229 
230 	TAILQ_INIT(&tqpair->send_queue);
231 	TAILQ_INIT(&tqpair->free_reqs);
232 	TAILQ_INIT(&tqpair->outstanding_reqs);
233 	for (i = 0; i < tqpair->num_entries; i++) {
234 		tcp_req = &tqpair->tcp_reqs[i];
235 		tcp_req->cid = i;
236 		TAILQ_INSERT_TAIL(&tqpair->free_reqs, tcp_req, link);
237 	}
238 
239 	return 0;
240 fail:
241 	nvme_tcp_free_reqs(tqpair);
242 	return -ENOMEM;
243 }
244 
245 static void
246 nvme_tcp_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
247 {
248 	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
249 	struct nvme_tcp_pdu *pdu;
250 
251 	spdk_sock_close(&tqpair->sock);
252 
253 	/* clear the send_queue */
254 	while (!TAILQ_EMPTY(&tqpair->send_queue)) {
255 		pdu = TAILQ_FIRST(&tqpair->send_queue);
256 		/* Remove the pdu from the send_queue so a stale PDU is not
257 		 * sent out on the next connection attempt.
258 		 */
259 		TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
260 	}
261 }
262 
263 static void nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
264 
265 static int
266 nvme_tcp_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
267 {
268 	struct nvme_tcp_qpair *tqpair;
269 
270 	if (!qpair) {
271 		return -1;
272 	}
273 
274 	nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
275 	nvme_tcp_qpair_abort_reqs(qpair, 1);
276 	nvme_qpair_deinit(qpair);
277 	tqpair = nvme_tcp_qpair(qpair);
278 	nvme_tcp_free_reqs(tqpair);
279 	free(tqpair);
280 
281 	return 0;
282 }
283 
284 static int
285 nvme_tcp_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
286 {
287 	return 0;
288 }
289 
290 static int
291 nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
292 {
293 	struct nvme_tcp_ctrlr *tctrlr = nvme_tcp_ctrlr(ctrlr);
294 
295 	if (ctrlr->adminq) {
296 		nvme_tcp_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq);
297 	}
298 
299 	nvme_ctrlr_destruct_finish(ctrlr);
300 
301 	free(tctrlr);
302 
303 	return 0;
304 }
305 
306 static void
307 _pdu_write_done(void *cb_arg, int err)
308 {
309 	struct nvme_tcp_pdu *pdu = cb_arg;
310 	struct nvme_tcp_qpair *tqpair = pdu->qpair;
311 
312 	TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
313 
314 	if (err != 0) {
315 		nvme_transport_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair);
316 		return;
317 	}
318 
319 	assert(pdu->cb_fn != NULL);
320 	pdu->cb_fn(pdu->cb_arg);
321 }
322 
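/*
 * Queue a PDU for transmission: compute the header and data digests if they were
 * negotiated, build the iovec covering header, padding, payload and digest, then
 * hand the PDU to the socket layer for an asynchronous writev. The caller's
 * callback is invoked from _pdu_write_done() once the write completes.
 */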
323 static int
324 nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair,
325 			 struct nvme_tcp_pdu *pdu,
326 			 nvme_tcp_qpair_xfer_complete_cb cb_fn,
327 			 void *cb_arg)
328 {
329 	int hlen;
330 	uint32_t crc32c;
331 	uint32_t mapped_length = 0;
332 
333 	hlen = pdu->hdr.common.hlen;
334 
335 	/* Header Digest */
336 	if (g_nvme_tcp_hdgst[pdu->hdr.common.pdu_type] && tqpair->host_hdgst_enable) {
337 		crc32c = nvme_tcp_pdu_calc_header_digest(pdu);
338 		MAKE_DIGEST_WORD((uint8_t *)pdu->hdr.raw + hlen, crc32c);
339 	}
340 
341 	/* Data Digest */
342 	if (pdu->data_len > 0 && g_nvme_tcp_ddgst[pdu->hdr.common.pdu_type] && tqpair->host_ddgst_enable) {
343 		crc32c = nvme_tcp_pdu_calc_data_digest(pdu);
344 		MAKE_DIGEST_WORD(pdu->data_digest, crc32c);
345 	}
346 
347 	pdu->cb_fn = cb_fn;
348 	pdu->cb_arg = cb_arg;
349 
350 	pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, NVME_TCP_MAX_SGL_DESCRIPTORS, pdu,
351 			       tqpair->host_hdgst_enable, tqpair->host_ddgst_enable,
352 			       &mapped_length);
353 	pdu->qpair = tqpair;
354 	pdu->sock_req.cb_fn = _pdu_write_done;
355 	pdu->sock_req.cb_arg = pdu;
356 	TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq);
357 	spdk_sock_writev_async(tqpair->sock, &pdu->sock_req);
358 
359 	return 0;
360 }
361 
362 /*
363  * Build SGL describing contiguous payload buffer.
364  */
365 static int
366 nvme_tcp_build_contig_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req)
367 {
368 	struct nvme_request *req = tcp_req->req;
369 
370 	tcp_req->iov[0].iov_base = req->payload.contig_or_cb_arg + req->payload_offset;
371 	tcp_req->iov[0].iov_len = req->payload_size;
372 	tcp_req->iovcnt = 1;
373 
374 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
375 
376 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
377 
378 	return 0;
379 }
380 
381 /*
382  * Build SGL describing scattered payload buffer.
383  */
384 static int
385 nvme_tcp_build_sgl_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req)
386 {
387 	int rc;
388 	uint32_t length, remaining_size, iovcnt = 0, max_num_sgl;
389 	struct nvme_request *req = tcp_req->req;
390 
391 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
392 
393 	assert(req->payload_size != 0);
394 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
395 	assert(req->payload.reset_sgl_fn != NULL);
396 	assert(req->payload.next_sge_fn != NULL);
397 	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
398 
399 	max_num_sgl = spdk_min(req->qpair->ctrlr->max_sges, NVME_TCP_MAX_SGL_DESCRIPTORS);
400 	remaining_size = req->payload_size;
401 
402 	do {
403 		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &tcp_req->iov[iovcnt].iov_base,
404 					      &length);
405 		if (rc) {
406 			return -1;
407 		}
408 
409 		length = spdk_min(length, remaining_size);
410 		tcp_req->iov[iovcnt].iov_len = length;
411 		remaining_size -= length;
412 		iovcnt++;
413 	} while (remaining_size > 0 && iovcnt < max_num_sgl);
414 
415 
416 	/* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */
417 	if (remaining_size > 0) {
418 		SPDK_ERRLOG("Failed to construct tcp_req=%p: iovcnt=%u, remaining_size=%u\n",
419 			    tcp_req, iovcnt, remaining_size);
420 		return -1;
421 	}
422 
423 	tcp_req->iovcnt = iovcnt;
424 
425 	return 0;
426 }
427 
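/*
 * Fill out the command's transport SGL and build the iovec for its payload.
 * Host-to-controller transfers that fit within ioccsz (capped at
 * NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE for fabrics and admin commands) are marked
 * as in-capsule data and carried inside the command capsule itself.
 */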
428 static int
429 nvme_tcp_req_init(struct nvme_tcp_qpair *tqpair, struct nvme_request *req,
430 		  struct nvme_tcp_req *tcp_req)
431 {
432 	struct spdk_nvme_ctrlr *ctrlr = tqpair->qpair.ctrlr;
433 	int rc = 0;
434 	enum spdk_nvme_data_transfer xfer;
435 	uint32_t max_incapsule_data_size;
436 
437 	tcp_req->req = req;
438 	req->cmd.cid = tcp_req->cid;
439 	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
440 	req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK;
441 	req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_TRANSPORT;
442 	req->cmd.dptr.sgl1.unkeyed.length = req->payload_size;
443 
444 	if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
445 		rc = nvme_tcp_build_contig_request(tqpair, tcp_req);
446 	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
447 		rc = nvme_tcp_build_sgl_request(tqpair, tcp_req);
448 	} else {
449 		rc = -1;
450 	}
451 
452 	if (rc) {
453 		return rc;
454 	}
455 
456 	if (req->cmd.opc == SPDK_NVME_OPC_FABRIC) {
457 		struct spdk_nvmf_capsule_cmd *nvmf_cmd = (struct spdk_nvmf_capsule_cmd *)&req->cmd;
458 
459 		xfer = spdk_nvme_opc_get_data_transfer(nvmf_cmd->fctype);
460 	} else {
461 		xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc);
462 	}
463 	if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
464 		max_incapsule_data_size = ctrlr->ioccsz_bytes;
465 		if ((req->cmd.opc == SPDK_NVME_OPC_FABRIC) || nvme_qpair_is_admin_queue(&tqpair->qpair)) {
466 			max_incapsule_data_size = spdk_min(max_incapsule_data_size, NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE);
467 		}
468 
469 		if (req->payload_size <= max_incapsule_data_size) {
470 			req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
471 			req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
472 			req->cmd.dptr.sgl1.address = 0;
473 			tcp_req->in_capsule_data = true;
474 		}
475 	}
476 
477 	return 0;
478 }
479 
480 static void
481 nvme_tcp_qpair_cmd_send_complete(void *cb_arg)
482 {
483 }
484 
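/*
 * Build and send a CapsuleCmd PDU. When the controller advertised a non-zero
 * CPDA, in-capsule data is padded to a (cpda + 1) * 4 byte boundary; for
 * example, cpda = 3 yields a 16-byte aligned PDO (a worked example of the
 * alignment computation below).
 */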
485 static int
486 nvme_tcp_qpair_capsule_cmd_send(struct nvme_tcp_qpair *tqpair,
487 				struct nvme_tcp_req *tcp_req)
488 {
489 	struct nvme_tcp_pdu *pdu;
490 	struct spdk_nvme_tcp_cmd *capsule_cmd;
491 	uint32_t plen = 0, alignment;
492 	uint8_t pdo;
493 
494 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
495 	pdu = &tcp_req->send_pdu;
496 
497 	capsule_cmd = &pdu->hdr.capsule_cmd;
498 	capsule_cmd->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD;
499 	plen = capsule_cmd->common.hlen = sizeof(*capsule_cmd);
500 	capsule_cmd->ccsqe = tcp_req->req->cmd;
501 
502 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "capsule_cmd cid=%u on tqpair(%p)\n", tcp_req->req->cmd.cid, tqpair);
503 
504 	if (tqpair->host_hdgst_enable) {
505 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "Header digest is enabled for capsule command on tcp_req=%p\n",
506 			      tcp_req);
507 		capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF;
508 		plen += SPDK_NVME_TCP_DIGEST_LEN;
509 	}
510 
511 	if ((tcp_req->req->payload_size == 0) || !tcp_req->in_capsule_data) {
512 		goto end;
513 	}
514 
515 	pdo = plen;
516 	pdu->padding_len = 0;
517 	if (tqpair->cpda) {
518 		alignment = (tqpair->cpda + 1) << 2;
519 		if (alignment > plen) {
520 			pdu->padding_len = alignment - plen;
521 			pdo = alignment;
522 			plen = alignment;
523 		}
524 	}
525 
526 	capsule_cmd->common.pdo = pdo;
527 	plen += tcp_req->req->payload_size;
528 	if (tqpair->host_ddgst_enable) {
529 		capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF;
530 		plen += SPDK_NVME_TCP_DIGEST_LEN;
531 	}
532 
533 	tcp_req->datao = 0;
534 	nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt,
535 				  0, tcp_req->req->payload_size);
536 end:
537 	capsule_cmd->common.plen = plen;
538 	return nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_qpair_cmd_send_complete, NULL);
539 
540 }
541 
542 static int
543 nvme_tcp_qpair_submit_request(struct spdk_nvme_qpair *qpair,
544 			      struct nvme_request *req)
545 {
546 	struct nvme_tcp_qpair *tqpair;
547 	struct nvme_tcp_req *tcp_req;
548 
549 	tqpair = nvme_tcp_qpair(qpair);
550 	assert(tqpair != NULL);
551 	assert(req != NULL);
552 
553 	tcp_req = nvme_tcp_req_get(tqpair);
554 	if (!tcp_req) {
555 		/* Inform the upper layer to try again later. */
556 		return -EAGAIN;
557 	}
558 
559 	if (nvme_tcp_req_init(tqpair, req, tcp_req)) {
560 		SPDK_ERRLOG("nvme_tcp_req_init() failed\n");
561 		nvme_tcp_req_put(tqpair, tcp_req);
562 		return -1;
563 	}
564 
565 	return nvme_tcp_qpair_capsule_cmd_send(tqpair, tcp_req);
566 }
567 
568 static int
569 nvme_tcp_qpair_reset(struct spdk_nvme_qpair *qpair)
570 {
571 	return 0;
572 }
573 
574 static void
575 nvme_tcp_req_complete(struct nvme_request *req,
576 		      struct spdk_nvme_cpl *rsp)
577 {
578 	nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp);
579 	nvme_free_request(req);
580 }
581 
582 static void
583 nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
584 {
585 	struct nvme_tcp_req *tcp_req, *tmp;
586 	struct nvme_request *req;
587 	struct spdk_nvme_cpl cpl;
588 	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
589 
590 	cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
591 	cpl.status.sct = SPDK_NVME_SCT_GENERIC;
592 	cpl.status.dnr = dnr;
593 
594 	TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) {
595 		assert(tcp_req->req != NULL);
596 		req = tcp_req->req;
597 
598 		nvme_tcp_req_complete(req, &cpl);
599 		nvme_tcp_req_put(tqpair, tcp_req);
600 	}
601 }
602 
603 static void
604 nvme_tcp_qpair_set_recv_state(struct nvme_tcp_qpair *tqpair,
605 			      enum nvme_tcp_pdu_recv_state state)
606 {
607 	if (tqpair->recv_state == state) {
608 		SPDK_ERRLOG("The recv state of tqpair=%p is already the requested state (%d)\n",
609 			    tqpair, state);
610 		return;
611 	}
612 
613 	tqpair->recv_state = state;
614 	switch (state) {
615 	case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY:
616 	case NVME_TCP_PDU_RECV_STATE_ERROR:
617 		memset(&tqpair->recv_pdu, 0, sizeof(struct nvme_tcp_pdu));
618 		break;
619 	case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH:
620 	case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH:
621 	case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
622 	default:
623 		break;
624 	}
625 }
626 
627 static void
628 nvme_tcp_qpair_send_h2c_term_req_complete(void *cb_arg)
629 {
630 	struct nvme_tcp_qpair *tqpair = cb_arg;
631 
632 	tqpair->state = NVME_TCP_QPAIR_STATE_EXITING;
633 }
634 
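/*
 * Build an H2CTermReq PDU for a fatal protocol error: record the Fatal Error
 * Status (and, for header-field errors, the offset of the offending field),
 * copy up to SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE bytes of the bad PDU
 * header as error data, and put the receive path into the error state.
 */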
635 static void
636 nvme_tcp_qpair_send_h2c_term_req(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu,
637 				 enum spdk_nvme_tcp_term_req_fes fes, uint32_t error_offset)
638 {
639 	struct nvme_tcp_pdu *rsp_pdu;
640 	struct spdk_nvme_tcp_term_req_hdr *h2c_term_req;
641 	uint32_t h2c_term_req_hdr_len = sizeof(*h2c_term_req);
642 	uint8_t copy_len;
643 
644 	rsp_pdu = &tqpair->send_pdu;
645 	memset(rsp_pdu, 0, sizeof(*rsp_pdu));
646 	h2c_term_req = &rsp_pdu->hdr.term_req;
647 	h2c_term_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
648 	h2c_term_req->common.hlen = h2c_term_req_hdr_len;
649 
650 	if ((fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) ||
651 	    (fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) {
652 		DSET32(&h2c_term_req->fei, error_offset);
653 	}
654 
655 	copy_len = pdu->hdr.common.hlen;
656 	if (copy_len > SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE) {
657 		copy_len = SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE;
658 	}
659 
660 	/* Copy the error info into the buffer */
661 	memcpy((uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, pdu->hdr.raw, copy_len);
662 	nvme_tcp_pdu_set_data(rsp_pdu, (uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, copy_len);
663 
664 	/* plen includes the header of the offending PDU that was copied above */
665 	h2c_term_req->common.plen = h2c_term_req->common.hlen + copy_len;
666 	nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
667 	nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_send_h2c_term_req_complete, NULL);
668 
669 }
670 
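/*
 * Validate the common header of a received PDU. IC_RESP is only legal while the
 * connection is still being negotiated; every other PDU type requires a running
 * qpair. hlen and plen are checked against the sizes expected for the given PDU
 * type before the PDU-specific header is read.
 */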
671 static void
672 nvme_tcp_pdu_ch_handle(struct nvme_tcp_qpair *tqpair)
673 {
674 	struct nvme_tcp_pdu *pdu;
675 	uint32_t error_offset = 0;
676 	enum spdk_nvme_tcp_term_req_fes fes;
677 	uint32_t expected_hlen, hd_len = 0;
678 	bool plen_error = false;
679 
680 	pdu = &tqpair->recv_pdu;
681 
682 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "pdu type = %d\n", pdu->hdr.common.pdu_type);
683 	if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_RESP) {
684 		if (tqpair->state != NVME_TCP_QPAIR_STATE_INVALID) {
685 			SPDK_ERRLOG("Already received an IC_RESP PDU; rejecting this pdu=%p\n", pdu);
686 			fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR;
687 			goto err;
688 		}
689 		expected_hlen = sizeof(struct spdk_nvme_tcp_ic_resp);
690 		if (pdu->hdr.common.plen != expected_hlen) {
691 			plen_error = true;
692 		}
693 	} else {
694 		if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) {
695 			SPDK_ERRLOG("The TCP/IP tqpair connection is not negotiated\n");
696 			fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR;
697 			goto err;
698 		}
699 
700 		switch (pdu->hdr.common.pdu_type) {
701 		case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP:
702 			expected_hlen = sizeof(struct spdk_nvme_tcp_rsp);
703 			if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) {
704 				hd_len = SPDK_NVME_TCP_DIGEST_LEN;
705 			}
706 
707 			if (pdu->hdr.common.plen != (expected_hlen + hd_len)) {
708 				plen_error = true;
709 			}
710 			break;
711 		case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA:
712 			expected_hlen = sizeof(struct spdk_nvme_tcp_c2h_data_hdr);
713 			if (pdu->hdr.common.plen < pdu->hdr.common.pdo) {
714 				plen_error = true;
715 			}
716 			break;
717 		case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
718 			expected_hlen = sizeof(struct spdk_nvme_tcp_term_req_hdr);
719 			if ((pdu->hdr.common.plen <= expected_hlen) ||
720 			    (pdu->hdr.common.plen > SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE)) {
721 				plen_error = true;
722 			}
723 			break;
724 		case SPDK_NVME_TCP_PDU_TYPE_R2T:
725 			expected_hlen = sizeof(struct spdk_nvme_tcp_r2t_hdr);
726 			if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) {
727 				hd_len = SPDK_NVME_TCP_DIGEST_LEN;
728 			}
729 
730 			if (pdu->hdr.common.plen != (expected_hlen + hd_len)) {
731 				plen_error = true;
732 			}
733 			break;
734 
735 		default:
736 			SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu.hdr.common.pdu_type);
737 			fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
738 			error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdu_type);
739 			goto err;
740 		}
741 	}
742 
743 	if (pdu->hdr.common.hlen != expected_hlen) {
744 		SPDK_ERRLOG("Expected PDU header length %u, got %u\n",
745 			    expected_hlen, pdu->hdr.common.hlen);
746 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
747 		error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, hlen);
748 		goto err;
749 
750 	} else if (plen_error) {
751 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
752 		error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen);
753 		goto err;
754 	} else {
755 		nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH);
756 		nvme_tcp_pdu_calc_psh_len(&tqpair->recv_pdu, tqpair->host_hdgst_enable);
757 		return;
758 	}
759 err:
760 	nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
761 }
762 
763 static struct nvme_tcp_req *
764 get_nvme_active_req_by_cid(struct nvme_tcp_qpair *tqpair, uint32_t cid)
765 {
766 	assert(tqpair != NULL);
767 	if ((cid >= tqpair->num_entries) || (tqpair->tcp_reqs[cid].state == NVME_TCP_REQ_FREE)) {
768 		return NULL;
769 	}
770 
771 	return &tqpair->tcp_reqs[cid];
772 }
773 
774 static void
775 nvme_tcp_c2h_data_payload_handle(struct nvme_tcp_qpair *tqpair,
776 				 struct nvme_tcp_pdu *pdu, uint32_t *reaped)
777 {
778 	struct nvme_tcp_req *tcp_req;
779 	struct spdk_nvme_tcp_c2h_data_hdr *c2h_data;
780 	struct spdk_nvme_cpl cpl = {};
781 	uint8_t flags;
782 
783 	tcp_req = pdu->req;
784 	assert(tcp_req != NULL);
785 
786 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
787 	c2h_data = &pdu->hdr.c2h_data;
788 	tcp_req->datao += pdu->data_len;
789 	flags = c2h_data->common.flags;
790 
791 	nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
792 	if (flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS) {
793 		if (tcp_req->datao == tcp_req->req->payload_size) {
794 			cpl.status.p = 0;
795 		} else {
796 			cpl.status.p = 1;
797 		}
798 
799 		cpl.cid = tcp_req->cid;
800 		cpl.sqid = tqpair->qpair.id;
801 		nvme_tcp_req_complete(tcp_req->req, &cpl);
802 		nvme_tcp_req_put(tqpair, tcp_req);
803 		(*reaped)++;
804 	}
805 }
806 
807 static const char *spdk_nvme_tcp_term_req_fes_str[] = {
808 	"Invalid PDU Header Field",
809 	"PDU Sequence Error",
810 	"Header Digest Error",
811 	"Data Transfer Out of Range",
812 	"Data Transfer Limit Exceeded",
813 	"Unsupported parameter",
814 };
815 
816 static void
817 nvme_tcp_c2h_term_req_dump(struct spdk_nvme_tcp_term_req_hdr *c2h_term_req)
818 {
819 	SPDK_ERRLOG("Error info of pdu(%p): %s\n", c2h_term_req,
820 		    spdk_nvme_tcp_term_req_fes_str[c2h_term_req->fes]);
821 	if ((c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) ||
822 	    (c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) {
823 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "The offset from the start of the PDU header is %u\n",
824 			      DGET32(c2h_term_req->fei));
825 	}
826 	/* we may also need to dump some other info here */
827 }
828 
829 static void
830 nvme_tcp_c2h_term_req_payload_handle(struct nvme_tcp_qpair *tqpair,
831 				     struct nvme_tcp_pdu *pdu)
832 {
833 	nvme_tcp_c2h_term_req_dump(&pdu->hdr.term_req);
834 	nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
835 }
836 
837 static void
838 nvme_tcp_pdu_payload_handle(struct nvme_tcp_qpair *tqpair,
839 			    uint32_t *reaped)
840 {
841 	int rc = 0;
842 	struct nvme_tcp_pdu *pdu;
843 	uint32_t crc32c, error_offset = 0;
844 	enum spdk_nvme_tcp_term_req_fes fes;
845 
846 	assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
847 	pdu = &tqpair->recv_pdu;
848 
849 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
850 
851 	/* Check the data digest if it is enabled */
852 	if (pdu->ddgst_enable) {
853 		crc32c = nvme_tcp_pdu_calc_data_digest(pdu);
854 		rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c);
855 		if (rc == 0) {
856 			SPDK_ERRLOG("data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu);
857 			fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR;
858 			nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
859 			return;
860 		}
861 	}
862 
863 	switch (pdu->hdr.common.pdu_type) {
864 	case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA:
865 		nvme_tcp_c2h_data_payload_handle(tqpair, pdu, reaped);
866 		break;
867 
868 	case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
869 		nvme_tcp_c2h_term_req_payload_handle(tqpair, pdu);
870 		break;
871 
872 	default:
873 		/* This code path should never be reached */
874 		SPDK_ERRLOG("Unexpected PDU type in payload handling\n");
875 		break;
876 	}
877 }
878 
879 static void
880 nvme_tcp_send_icreq_complete(void *cb_arg)
881 {
882 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "Complete the icreq send for tqpair=%p\n",
883 		      (struct nvme_tcp_qpair *)cb_arg);
884 }
885 
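/*
 * Process the controller's ICResp: validate pfv, maxh2cdata and cpda, latch the
 * negotiated digest settings, then size the socket receive buffer for four
 * incoming 4 KiB reads (4 KiB of data plus a capsule command header and, when
 * negotiated, one digest word each for header and data, all multiplied by four).
 */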
886 static void
887 nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair,
888 		       struct nvme_tcp_pdu *pdu)
889 {
890 	struct spdk_nvme_tcp_ic_resp *ic_resp = &pdu->hdr.ic_resp;
891 	uint32_t error_offset = 0;
892 	enum spdk_nvme_tcp_term_req_fes fes;
893 	int recv_buf_size;
894 
895 	/* Only PFV 0 is defined currently */
896 	if (ic_resp->pfv != 0) {
897 		SPDK_ERRLOG("Expected ICResp PFV %u, got %u\n", 0u, ic_resp->pfv);
898 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
899 		error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, pfv);
900 		goto end;
901 	}
902 
903 	if (ic_resp->maxh2cdata < NVME_TCP_PDU_H2C_MIN_DATA_SIZE) {
904 		SPDK_ERRLOG("Expected ICResp maxh2cdata >=%u, got %u\n", NVME_TCP_PDU_H2C_MIN_DATA_SIZE,
905 			    ic_resp->maxh2cdata);
906 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
907 		error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, maxh2cdata);
908 		goto end;
909 	}
910 	tqpair->maxh2cdata = ic_resp->maxh2cdata;
911 
912 	if (ic_resp->cpda > SPDK_NVME_TCP_CPDA_MAX) {
913 		SPDK_ERRLOG("Expected ICResp cpda <=%u, got %u\n", SPDK_NVME_TCP_CPDA_MAX, ic_resp->cpda);
914 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
915 		error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, cpda);
916 		goto end;
917 	}
918 	tqpair->cpda = ic_resp->cpda;
919 
920 	tqpair->host_hdgst_enable = ic_resp->dgst.bits.hdgst_enable ? true : false;
921 	tqpair->host_ddgst_enable = ic_resp->dgst.bits.ddgst_enable ? true : false;
922 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_hdgst_enable: %u\n", tqpair->host_hdgst_enable);
923 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_ddgst_enable: %u\n", tqpair->host_ddgst_enable);
924 
925 	/* Now that we know whether digests are enabled, properly size the receive buffer to
926 	 * handle 4 incoming 4K read commands. */
927 	recv_buf_size = 0x1000 + sizeof(struct spdk_nvme_tcp_cmd);
928 
929 	if (tqpair->host_hdgst_enable) {
930 		recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN;
931 	}
932 
933 	if (tqpair->host_ddgst_enable) {
934 		recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN;
935 	}
936 
937 	if (spdk_sock_set_recvbuf(tqpair->sock, recv_buf_size * 4) < 0) {
938 		SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n",
939 			     tqpair,
940 			     recv_buf_size);
941 		/* Not fatal. */
942 	}
943 
944 	tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING;
945 	nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
946 	return;
947 end:
948 	nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
949 	return;
950 }
951 
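/*
 * A CapsuleResp PDU carries the completion queue entry for a previously submitted
 * command; look up the outstanding request by CID and complete it.
 */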
952 static void
953 nvme_tcp_capsule_resp_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu,
954 				 uint32_t *reaped)
955 {
956 	struct nvme_tcp_req *tcp_req;
957 	struct spdk_nvme_tcp_rsp *capsule_resp = &pdu->hdr.capsule_resp;
958 	uint32_t cid, error_offset = 0;
959 	enum spdk_nvme_tcp_term_req_fes fes;
960 	struct spdk_nvme_cpl cpl;
961 
962 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
963 	cpl = capsule_resp->rccqe;
964 	cid = cpl.cid;
965 
966 	/* Recv the pdu again */
967 	nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
968 
969 	tcp_req = get_nvme_active_req_by_cid(tqpair, cid);
970 	if (!tcp_req) {
971 		SPDK_ERRLOG("no tcp_req found with cid=%u for tqpair=%p\n", cid, tqpair);
972 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
973 		error_offset = offsetof(struct spdk_nvme_tcp_rsp, rccqe);
974 		goto end;
975 
976 	}
977 
978 	assert(tcp_req->req != NULL);
979 	assert(tcp_req->state == NVME_TCP_REQ_ACTIVE);
980 	nvme_tcp_req_complete(tcp_req->req, &cpl);
981 	nvme_tcp_req_put(tqpair, tcp_req);
982 	(*reaped)++;
983 
984 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "complete tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair);
985 
986 	return;
987 
988 end:
989 	nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
990 	return;
991 }
992 
993 static void
994 nvme_tcp_c2h_term_req_hdr_handle(struct nvme_tcp_qpair *tqpair,
995 				 struct nvme_tcp_pdu *pdu)
996 {
997 	struct spdk_nvme_tcp_term_req_hdr *c2h_term_req = &pdu->hdr.term_req;
998 	uint32_t error_offset = 0;
999 	enum spdk_nvme_tcp_term_req_fes fes;
1000 
1001 	if (c2h_term_req->fes > SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER) {
1002 		SPDK_ERRLOG("Fatal Error Status (FES) is unknown for c2h_term_req pdu=%p\n", pdu);
1003 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
1004 		error_offset = offsetof(struct spdk_nvme_tcp_term_req_hdr, fes);
1005 		goto end;
1006 	}
1007 
1008 	/* set the data buffer */
1009 	nvme_tcp_pdu_set_data(pdu, (uint8_t *)pdu->hdr.raw + c2h_term_req->common.hlen,
1010 			      c2h_term_req->common.plen - c2h_term_req->common.hlen);
1011 	nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
1012 	return;
1013 end:
1014 	nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
1015 	return;
1016 }
1017 
1018 static void
1019 nvme_tcp_c2h_data_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu)
1020 {
1021 	struct nvme_tcp_req *tcp_req;
1022 	struct spdk_nvme_tcp_c2h_data_hdr *c2h_data = &pdu->hdr.c2h_data;
1023 	uint32_t error_offset = 0;
1024 	enum spdk_nvme_tcp_term_req_fes fes;
1025 
1026 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
1027 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "c2h_data info on tqpair(%p): datao=%u, datal=%u, cccid=%d\n",
1028 		      tqpair, c2h_data->datao, c2h_data->datal, c2h_data->cccid);
1029 	tcp_req = get_nvme_active_req_by_cid(tqpair, c2h_data->cccid);
1030 	if (!tcp_req) {
1031 		SPDK_ERRLOG("no tcp_req found for c2hdata cid=%d\n", c2h_data->cccid);
1032 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
1033 		error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, cccid);
1034 		goto end;
1035 
1036 	}
1037 
1038 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "tcp_req(%p) on tqpair(%p): datao=%u, payload_size=%u\n",
1039 		      tcp_req, tqpair, tcp_req->datao, tcp_req->req->payload_size);
1040 
1041 	if (c2h_data->datal > tcp_req->req->payload_size) {
1042 		SPDK_ERRLOG("Invalid datal for tcp_req(%p), datal(%u) exceeds payload_size(%u)\n",
1043 			    tcp_req, c2h_data->datal, tcp_req->req->payload_size);
1044 		fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
1045 		goto end;
1046 	}
1047 
1048 	if (tcp_req->datao != c2h_data->datao) {
1049 		SPDK_ERRLOG("Invalid datao for tcp_req(%p): received datao(%u) != expected datao(%u)\n",
1050 			    tcp_req, c2h_data->datao, tcp_req->datao);
1051 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
1052 		error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datao);
1053 		goto end;
1054 	}
1055 
1056 	if ((c2h_data->datao + c2h_data->datal) > tcp_req->req->payload_size) {
1057 		SPDK_ERRLOG("Invalid data range for tcp_req(%p): (datao(%u) + datal(%u)) exceeds payload_size(%u)\n",
1058 			    tcp_req, c2h_data->datao, c2h_data->datal, tcp_req->req->payload_size);
1059 		fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
1060 		error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datal);
1061 		goto end;
1062 
1063 	}
1064 
1065 	nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt,
1066 				  c2h_data->datao, c2h_data->datal);
1067 	pdu->req = tcp_req;
1068 
1069 	nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
1070 	return;
1071 
1072 end:
1073 	nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
1074 	return;
1075 }
1076 
1077 static void
1078 nvme_tcp_qpair_h2c_data_send_complete(void *cb_arg)
1079 {
1080 	struct nvme_tcp_req *tcp_req = cb_arg;
1081 
1082 	assert(tcp_req != NULL);
1083 
1084 	if (tcp_req->r2tl_remain) {
1085 		nvme_tcp_send_h2c_data(tcp_req);
1086 	} else {
1087 		assert(tcp_req->active_r2ts > 0);
1088 		tcp_req->active_r2ts--;
1089 		tcp_req->state = NVME_TCP_REQ_ACTIVE;
1090 	}
1091 }
1092 
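/*
 * Send one H2CData PDU for the current R2T: transfer at most maxh2cdata bytes
 * starting at the request's current data offset, applying CPDA padding just as
 * for in-capsule data. LAST_PDU is set once the remaining R2T length reaches
 * zero; otherwise the send-complete callback issues the next H2CData PDU.
 */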
1093 static void
1094 nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req)
1095 {
1096 	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(tcp_req->req->qpair);
1097 	struct nvme_tcp_pdu *rsp_pdu;
1098 	struct spdk_nvme_tcp_h2c_data_hdr *h2c_data;
1099 	uint32_t plen, pdo, alignment;
1100 
1101 	rsp_pdu = &tcp_req->send_pdu;
1102 	memset(rsp_pdu, 0, sizeof(*rsp_pdu));
1103 	h2c_data = &rsp_pdu->hdr.h2c_data;
1104 
1105 	h2c_data->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_DATA;
1106 	plen = h2c_data->common.hlen = sizeof(*h2c_data);
1107 	h2c_data->cccid = tcp_req->cid;
1108 	h2c_data->ttag = tcp_req->ttag;
1109 	h2c_data->datao = tcp_req->datao;
1110 
1111 	h2c_data->datal = spdk_min(tcp_req->r2tl_remain, tqpair->maxh2cdata);
1112 	nvme_tcp_pdu_set_data_buf(rsp_pdu, tcp_req->iov, tcp_req->iovcnt,
1113 				  h2c_data->datao, h2c_data->datal);
1114 	tcp_req->r2tl_remain -= h2c_data->datal;
1115 
1116 	if (tqpair->host_hdgst_enable) {
1117 		h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF;
1118 		plen += SPDK_NVME_TCP_DIGEST_LEN;
1119 	}
1120 
1121 	rsp_pdu->padding_len = 0;
1122 	pdo = plen;
1123 	if (tqpair->cpda) {
1124 		alignment = (tqpair->cpda + 1) << 2;
1125 		if (alignment > plen) {
1126 			rsp_pdu->padding_len = alignment - plen;
1127 			pdo = plen = alignment;
1128 		}
1129 	}
1130 
1131 	h2c_data->common.pdo = pdo;
1132 	plen += h2c_data->datal;
1133 	if (tqpair->host_ddgst_enable) {
1134 		h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF;
1135 		plen += SPDK_NVME_TCP_DIGEST_LEN;
1136 	}
1137 
1138 	h2c_data->common.plen = plen;
1139 	tcp_req->datao += h2c_data->datal;
1140 	if (!tcp_req->r2tl_remain) {
1141 		h2c_data->common.flags |= SPDK_NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
1142 	}
1143 
1144 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "h2c_data info: datao=%u, datal=%u, pdu_len=%u for tqpair=%p\n",
1145 		      h2c_data->datao, h2c_data->datal, h2c_data->common.plen, tqpair);
1146 
1147 	nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_h2c_data_send_complete, tcp_req);
1148 }
1149 
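/*
 * Handle an R2T from the controller: verify that the targeted request exists,
 * that the number of active R2Ts stays within the negotiated maxr2t, and that
 * the requested offset/length fit the payload, then start sending H2CData PDUs.
 */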
1150 static void
1151 nvme_tcp_r2t_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu)
1152 {
1153 	struct nvme_tcp_req *tcp_req;
1154 	struct spdk_nvme_tcp_r2t_hdr *r2t = &pdu->hdr.r2t;
1155 	uint32_t cid, error_offset = 0;
1156 	enum spdk_nvme_tcp_term_req_fes fes;
1157 
1158 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
1159 	cid = r2t->cccid;
1160 	tcp_req = get_nvme_active_req_by_cid(tqpair, cid);
1161 	if (!tcp_req) {
1162 		SPDK_ERRLOG("Cannot find tcp_req for cid=%u on tqpair=%p\n", cid, tqpair);
1163 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
1164 		error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, cccid);
1165 		goto end;
1166 	}
1167 
1168 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "r2t info: r2to=%u, r2tl=%u for tqpair=%p\n", r2t->r2to, r2t->r2tl,
1169 		      tqpair);
1170 
1171 	if (tcp_req->state == NVME_TCP_REQ_ACTIVE) {
1172 		assert(tcp_req->active_r2ts == 0);
1173 		tcp_req->state = NVME_TCP_REQ_ACTIVE_R2T;
1174 	}
1175 
1176 	tcp_req->active_r2ts++;
1177 	if (tcp_req->active_r2ts > tqpair->maxr2t) {
1178 		fes = SPDK_NVME_TCP_TERM_REQ_FES_R2T_LIMIT_EXCEEDED;
1179 		SPDK_ERRLOG("Invalid R2T: active R2Ts exceed the maximum of %u for tqpair=%p\n", tqpair->maxr2t, tqpair);
1180 		goto end;
1181 	}
1182 
1183 	if (tcp_req->datao != r2t->r2to) {
1184 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
1185 		error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2to);
1186 		goto end;
1187 
1188 	}
1189 
1190 	if ((r2t->r2tl + r2t->r2to) > tcp_req->req->payload_size) {
1191 		SPDK_ERRLOG("Invalid R2T info for tcp_req=%p: (r2to(%u) + r2tl(%u)) exceeds payload_size(%u)\n",
1192 			    tcp_req, r2t->r2to, r2t->r2tl, tcp_req->req->payload_size);
1193 		fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
1194 		error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2tl);
1195 		goto end;
1196 
1197 	}
1198 
1199 	tcp_req->ttag = r2t->ttag;
1200 	tcp_req->r2tl_remain = r2t->r2tl;
1201 	nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
1202 
1203 	nvme_tcp_send_h2c_data(tcp_req);
1204 	return;
1205 
1206 end:
1207 	nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
1208 	return;
1209 
1210 }
1211 
1212 static void
1213 nvme_tcp_pdu_psh_handle(struct nvme_tcp_qpair *tqpair, uint32_t *reaped)
1214 {
1215 	struct nvme_tcp_pdu *pdu;
1216 	int rc;
1217 	uint32_t crc32c, error_offset = 0;
1218 	enum spdk_nvme_tcp_term_req_fes fes;
1219 
1220 	assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH);
1221 	pdu = &tqpair->recv_pdu;
1222 
1223 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter: pdu type =%u\n", pdu->hdr.common.pdu_type);
1224 	/* check header digest if needed */
1225 	if (pdu->has_hdgst) {
1226 		crc32c = nvme_tcp_pdu_calc_header_digest(pdu);
1227 		rc = MATCH_DIGEST_WORD((uint8_t *)pdu->hdr.raw + pdu->hdr.common.hlen, crc32c);
1228 		if (rc == 0) {
1229 			SPDK_ERRLOG("header digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu);
1230 			fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR;
1231 			nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
1232 			return;
1233 
1234 		}
1235 	}
1236 
1237 	switch (pdu->hdr.common.pdu_type) {
1238 	case SPDK_NVME_TCP_PDU_TYPE_IC_RESP:
1239 		nvme_tcp_icresp_handle(tqpair, pdu);
1240 		break;
1241 	case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP:
1242 		nvme_tcp_capsule_resp_hdr_handle(tqpair, pdu, reaped);
1243 		break;
1244 	case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA:
1245 		nvme_tcp_c2h_data_hdr_handle(tqpair, pdu);
1246 		break;
1247 
1248 	case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
1249 		nvme_tcp_c2h_term_req_hdr_handle(tqpair, pdu);
1250 		break;
1251 	case SPDK_NVME_TCP_PDU_TYPE_R2T:
1252 		nvme_tcp_r2t_hdr_handle(tqpair, pdu);
1253 		break;
1254 
1255 	default:
1256 		SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu.hdr.common.pdu_type);
1257 		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
1258 		error_offset = 1;
1259 		nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
1260 		break;
1261 	}
1262 
1263 }
1264 
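/*
 * Receive-side state machine: READY -> CH (common header) -> PSH (PDU-specific
 * header) -> PAYLOAD. Each step reads from the socket and returns
 * NVME_TCP_PDU_IN_PROGRESS while the current PDU is only partially buffered.
 */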
1265 static int
1266 nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped)
1267 {
1268 	int rc = 0;
1269 	struct nvme_tcp_pdu *pdu;
1270 	uint32_t data_len;
1271 	enum nvme_tcp_pdu_recv_state prev_state;
1272 
1273 	/* The loop here is to allow for several back-to-back state changes. */
1274 	do {
1275 		prev_state = tqpair->recv_state;
1276 		switch (tqpair->recv_state) {
1277 		/* If in a new state */
1278 		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY:
1279 			nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH);
1280 			break;
1281 		/* common header */
1282 		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH:
1283 			pdu = &tqpair->recv_pdu;
1284 			if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) {
1285 				rc = nvme_tcp_read_data(tqpair->sock,
1286 							sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes,
1287 							(uint8_t *)&pdu->hdr.common + pdu->ch_valid_bytes);
1288 				if (rc < 0) {
1289 					nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
1290 					break;
1291 				}
1292 				pdu->ch_valid_bytes += rc;
1293 				if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) {
1294 					return NVME_TCP_PDU_IN_PROGRESS;
1295 				}
1296 			}
1297 
1298 			/* The common header of this PDU has now been read from the socket. */
1299 			nvme_tcp_pdu_ch_handle(tqpair);
1300 			break;
1301 		/* Wait for the PDU-specific header */
1302 		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH:
1303 			pdu = &tqpair->recv_pdu;
1304 			rc = nvme_tcp_read_data(tqpair->sock,
1305 						pdu->psh_len - pdu->psh_valid_bytes,
1306 						(uint8_t *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes);
1307 			if (rc < 0) {
1308 				nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
1309 				break;
1310 			}
1311 
1312 			pdu->psh_valid_bytes += rc;
1313 			if (pdu->psh_valid_bytes < pdu->psh_len) {
1314 				return NVME_TCP_PDU_IN_PROGRESS;
1315 			}
1316 
1317 			/* The entire header (CH, PSH, header digest) of this PDU has now been read from the socket. */
1318 			nvme_tcp_pdu_psh_handle(tqpair, reaped);
1319 			break;
1320 		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
1321 			pdu = &tqpair->recv_pdu;
1322 			/* If the PDU carries no data payload, there is nothing to read; keep waiting */
1323 			if (!pdu->data_len) {
1324 				return NVME_TCP_PDU_IN_PROGRESS;
1325 			}
1326 
1327 			data_len = pdu->data_len;
1328 			/* data digest */
1329 			if (spdk_unlikely((pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_C2H_DATA) &&
1330 					  tqpair->host_ddgst_enable)) {
1331 				data_len += SPDK_NVME_TCP_DIGEST_LEN;
1332 				pdu->ddgst_enable = true;
1333 			}
1334 
1335 			rc = nvme_tcp_read_payload_data(tqpair->sock, pdu);
1336 			if (rc < 0) {
1337 				nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
1338 				break;
1339 			}
1340 
1341 			pdu->readv_offset += rc;
1342 			if (pdu->readv_offset < data_len) {
1343 				return NVME_TCP_PDU_IN_PROGRESS;
1344 			}
1345 
1346 			assert(pdu->readv_offset == data_len);
1347 			/* All of this PDU has now been read from the socket. */
1348 			nvme_tcp_pdu_payload_handle(tqpair, reaped);
1349 			break;
1350 		case NVME_TCP_PDU_RECV_STATE_ERROR:
1351 			rc = NVME_TCP_PDU_FATAL;
1352 			break;
1353 		default:
1354 			assert(0);
1355 			break;
1356 		}
1357 	} while (prev_state != tqpair->recv_state);
1358 
1359 	return rc;
1360 }
1361 
1362 static void
1363 nvme_tcp_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
1364 {
1365 	uint64_t t02;
1366 	struct nvme_tcp_req *tcp_req, *tmp;
1367 	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
1368 	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
1369 	struct spdk_nvme_ctrlr_process *active_proc;
1370 
1371 	/* Don't check timeouts during controller initialization. */
1372 	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
1373 		return;
1374 	}
1375 
1376 	if (nvme_qpair_is_admin_queue(qpair)) {
1377 		active_proc = nvme_ctrlr_get_current_process(ctrlr);
1378 	} else {
1379 		active_proc = qpair->active_proc;
1380 	}
1381 
1382 	/* Only check timeouts if the current process has a timeout callback. */
1383 	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
1384 		return;
1385 	}
1386 
1387 	t02 = spdk_get_ticks();
1388 	TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) {
1389 		assert(tcp_req->req != NULL);
1390 
1391 		if (nvme_request_check_timeout(tcp_req->req, tcp_req->cid, active_proc, t02)) {
1392 			/*
1393 			 * The requests are in order, so as soon as one has not timed out,
1394 			 * stop iterating.
1395 			 */
1396 			break;
1397 		}
1398 	}
1399 }
1400 
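/*
 * Flush any queued PDUs to the socket, then read and process incoming PDUs until
 * max_completions completions have been reaped or only a partial PDU remains
 * buffered. On a fatal error the qpair is marked failed and disconnected.
 */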
1401 static int
1402 nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
1403 {
1404 	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
1405 	uint32_t reaped;
1406 	int rc;
1407 
1408 	rc = spdk_sock_flush(tqpair->sock);
1409 	if (rc < 0) {
1410 		return rc;
1411 	}
1412 
1413 	if (max_completions == 0) {
1414 		max_completions = tqpair->num_entries;
1415 	} else {
1416 		max_completions = spdk_min(max_completions, tqpair->num_entries);
1417 	}
1418 
1419 	reaped = 0;
1420 	do {
1421 		rc = nvme_tcp_read_pdu(tqpair, &reaped);
1422 		if (rc < 0) {
1423 			SPDK_DEBUGLOG(SPDK_LOG_NVME, "Error polling CQ! (%d): %s\n",
1424 				      errno, spdk_strerror(errno));
1425 			goto fail;
1426 		} else if (rc == 0) {
1427 			/* Partial PDU is read */
1428 			break;
1429 		}
1430 
1431 	} while (reaped < max_completions);
1432 
1433 	if (spdk_unlikely(tqpair->qpair.ctrlr->timeout_enabled)) {
1434 		nvme_tcp_qpair_check_timeout(qpair);
1435 	}
1436 
1437 	return reaped;
1438 fail:
1439 
1440 	/*
1441 	 * Since admin queues take the ctrlr_lock before entering this function,
1442 	 * we can call nvme_transport_ctrlr_disconnect_qpair. For other qpairs we need
1443 	 * to call the generic function which will take the lock for us.
1444 	 */
1445 	qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN;
1446 
1447 	if (nvme_qpair_is_admin_queue(qpair)) {
1448 		nvme_transport_ctrlr_disconnect_qpair(qpair->ctrlr, qpair);
1449 	} else {
1450 		nvme_ctrlr_disconnect_qpair(qpair);
1451 	}
1452 	return -ENXIO;
1453 }
1454 
1455 static void
1456 nvme_tcp_qpair_sock_cb(void *ctx, struct spdk_sock_group *group, struct spdk_sock *sock)
1457 {
1458 	struct spdk_nvme_qpair *qpair = ctx;
1459 	struct nvme_tcp_poll_group *pgroup = nvme_tcp_poll_group(qpair->poll_group);
1460 	int32_t num_completions;
1461 
1462 	num_completions = spdk_nvme_qpair_process_completions(qpair, pgroup->completions_per_qpair);
1463 
1464 	if (pgroup->num_completions >= 0 && num_completions >= 0) {
1465 		pgroup->num_completions += num_completions;
1466 	} else {
1467 		pgroup->num_completions = -ENXIO;
1468 	}
1469 }
1470 
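/*
 * Send the ICReq PDU and poll for the ICResp, giving up after
 * NVME_TCP_TIME_OUT_IN_SECONDS. The maxr2t field is written as
 * NVME_TCP_MAX_R2T_DEFAULT - 1 on the assumption that the wire value is
 * 0's-based.
 */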
1471 static int
1472 nvme_tcp_qpair_icreq_send(struct nvme_tcp_qpair *tqpair)
1473 {
1474 	struct spdk_nvme_tcp_ic_req *ic_req;
1475 	struct nvme_tcp_pdu *pdu;
1476 	uint64_t icreq_timeout_tsc;
1477 	int rc;
1478 
1479 	pdu = &tqpair->send_pdu;
1480 	memset(&tqpair->send_pdu, 0, sizeof(tqpair->send_pdu));
1481 	ic_req = &pdu->hdr.ic_req;
1482 
1483 	ic_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_REQ;
1484 	ic_req->common.hlen = ic_req->common.plen = sizeof(*ic_req);
1485 	ic_req->pfv = 0;
1486 	ic_req->maxr2t = NVME_TCP_MAX_R2T_DEFAULT - 1;
1487 	ic_req->hpda = NVME_TCP_HPDA_DEFAULT;
1488 
1489 	ic_req->dgst.bits.hdgst_enable = tqpair->qpair.ctrlr->opts.header_digest;
1490 	ic_req->dgst.bits.ddgst_enable = tqpair->qpair.ctrlr->opts.data_digest;
1491 
1492 	nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_send_icreq_complete, tqpair);
1493 
1494 	icreq_timeout_tsc = spdk_get_ticks() + (NVME_TCP_TIME_OUT_IN_SECONDS * spdk_get_ticks_hz());
1495 	do {
1496 		rc = nvme_tcp_qpair_process_completions(&tqpair->qpair, 0);
1497 	} while ((tqpair->state == NVME_TCP_QPAIR_STATE_INVALID) &&
1498 		 (rc == 0) && (spdk_get_ticks() <= icreq_timeout_tsc));
1499 
1500 	if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) {
1501 		SPDK_ERRLOG("Failed to establish the tqpair=%p: no valid ICResp received\n", tqpair);
1502 		return -1;
1503 	}
1504 
1505 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "Successfully established the tqpair=%p via a valid ICResp\n", tqpair);
1506 
1507 	return 0;
1508 }
1509 
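/*
 * Connect a qpair: resolve the destination (and optional source) address, open
 * the TCP socket, run the ICReq/ICResp exchange, and finally issue the NVMe-oF
 * Fabrics CONNECT command over the new queue.
 */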
1510 static int
1511 nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1512 {
1513 	struct sockaddr_storage dst_addr;
1514 	struct sockaddr_storage src_addr;
1515 	int rc;
1516 	struct nvme_tcp_qpair *tqpair;
1517 	int family;
1518 	long int port;
1519 	struct spdk_sock_opts opts;
1520 
1521 	tqpair = nvme_tcp_qpair(qpair);
1522 
1523 	switch (ctrlr->trid.adrfam) {
1524 	case SPDK_NVMF_ADRFAM_IPV4:
1525 		family = AF_INET;
1526 		break;
1527 	case SPDK_NVMF_ADRFAM_IPV6:
1528 		family = AF_INET6;
1529 		break;
1530 	default:
1531 		SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam);
1532 		return -1;
1533 	}
1534 
1535 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family);
1536 
1537 	memset(&dst_addr, 0, sizeof(dst_addr));
1538 
1539 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid);
1540 	rc = nvme_tcp_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid);
1541 	if (rc != 0) {
1542 		SPDK_ERRLOG("dst_addr nvme_tcp_parse_addr() failed\n");
1543 		return -1;
1544 	}
1545 
1546 	if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) {
1547 		memset(&src_addr, 0, sizeof(src_addr));
1548 		rc = nvme_tcp_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid);
1549 		if (rc != 0) {
1550 			SPDK_ERRLOG("src_addr nvme_tcp_parse_addr() failed\n");
1551 			return -1;
1552 		}
1553 	}
1554 
1555 	port = spdk_strtol(ctrlr->trid.trsvcid, 10);
1556 	if (port <= 0 || port >= INT_MAX) {
1557 		SPDK_ERRLOG("Invalid port: %s\n", ctrlr->trid.trsvcid);
1558 		return -1;
1559 	}
1560 
1561 	opts.opts_size = sizeof(opts);
1562 	spdk_sock_get_default_opts(&opts);
1563 	opts.priority = ctrlr->trid.priority;
1564 	tqpair->sock = spdk_sock_connect_ext(ctrlr->trid.traddr, port, NULL, &opts);
1565 	if (!tqpair->sock) {
1566 		SPDK_ERRLOG("sock connection error of tqpair=%p with addr=%s, port=%ld\n",
1567 			    tqpair, ctrlr->trid.traddr, port);
1568 		return -1;
1569 	}
1570 
1571 	tqpair->maxr2t = NVME_TCP_MAX_R2T_DEFAULT;
1572 	/* Explicitly set the state and recv_state of tqpair */
1573 	tqpair->state = NVME_TCP_QPAIR_STATE_INVALID;
1574 	if (tqpair->recv_state != NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY) {
1575 		nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
1576 	}
1577 	rc = nvme_tcp_qpair_icreq_send(tqpair);
1578 	if (rc != 0) {
1579 		SPDK_ERRLOG("Unable to connect the tqpair\n");
1580 		return -1;
1581 	}
1582 
1583 	rc = nvme_fabric_qpair_connect(&tqpair->qpair, tqpair->num_entries);
1584 	if (rc < 0) {
1585 		SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n");
1586 		return -1;
1587 	}
1588 
1589 	return 0;
1590 }
1591 
1592 static struct spdk_nvme_qpair *
1593 nvme_tcp_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr,
1594 			    uint16_t qid, uint32_t qsize,
1595 			    enum spdk_nvme_qprio qprio,
1596 			    uint32_t num_requests)
1597 {
1598 	struct nvme_tcp_qpair *tqpair;
1599 	struct spdk_nvme_qpair *qpair;
1600 	int rc;
1601 
1602 	tqpair = calloc(1, sizeof(struct nvme_tcp_qpair));
1603 	if (!tqpair) {
1604 		SPDK_ERRLOG("failed to allocate tqpair\n");
1605 		return NULL;
1606 	}
1607 
1608 	tqpair->num_entries = qsize;
1609 	qpair = &tqpair->qpair;
1610 	rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests);
1611 	if (rc != 0) {
1612 		free(tqpair);
1613 		return NULL;
1614 	}
1615 
1616 	rc = nvme_tcp_alloc_reqs(tqpair);
1617 	if (rc) {
1618 		nvme_tcp_ctrlr_delete_io_qpair(ctrlr, qpair);
1619 		return NULL;
1620 	}
1621 
1622 	return qpair;
1623 }
1624 
1625 static struct spdk_nvme_qpair *
1626 nvme_tcp_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
1627 			       const struct spdk_nvme_io_qpair_opts *opts)
1628 {
1629 	return nvme_tcp_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio,
1630 					   opts->io_queue_requests);
1631 }
1632 
1633 static struct spdk_nvme_ctrlr *nvme_tcp_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
1634 		const struct spdk_nvme_ctrlr_opts *opts,
1635 		void *devhandle)
1636 {
1637 	struct nvme_tcp_ctrlr *tctrlr;
1638 	union spdk_nvme_cap_register cap;
1639 	union spdk_nvme_vs_register vs;
1640 	int rc;
1641 
1642 	tctrlr = calloc(1, sizeof(*tctrlr));
1643 	if (tctrlr == NULL) {
1644 		SPDK_ERRLOG("could not allocate ctrlr\n");
1645 		return NULL;
1646 	}
1647 
1648 	tctrlr->ctrlr.opts = *opts;
1649 	tctrlr->ctrlr.trid = *trid;
1650 
1651 	rc = nvme_ctrlr_construct(&tctrlr->ctrlr);
1652 	if (rc != 0) {
1653 		free(tctrlr);
1654 		return NULL;
1655 	}
1656 
1657 	tctrlr->ctrlr.adminq = nvme_tcp_ctrlr_create_qpair(&tctrlr->ctrlr, 0,
1658 			       tctrlr->ctrlr.opts.admin_queue_size, 0,
1659 			       tctrlr->ctrlr.opts.admin_queue_size);
1660 	if (!tctrlr->ctrlr.adminq) {
1661 		SPDK_ERRLOG("failed to create admin qpair\n");
1662 		nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr);
1663 		return NULL;
1664 	}
1665 
1666 	rc = nvme_transport_ctrlr_connect_qpair(&tctrlr->ctrlr, tctrlr->ctrlr.adminq);
1667 	if (rc < 0) {
1668 		SPDK_ERRLOG("failed to connect admin qpair\n");
1669 		nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr);
1670 		return NULL;
1671 	}
1672 
1673 	if (nvme_ctrlr_get_cap(&tctrlr->ctrlr, &cap)) {
1674 		SPDK_ERRLOG("get_cap() failed\n");
1675 		nvme_ctrlr_destruct(&tctrlr->ctrlr);
1676 		return NULL;
1677 	}
1678 
1679 	if (nvme_ctrlr_get_vs(&tctrlr->ctrlr, &vs)) {
1680 		SPDK_ERRLOG("get_vs() failed\n");
1681 		nvme_ctrlr_destruct(&tctrlr->ctrlr);
1682 		return NULL;
1683 	}
1684 
1685 	if (nvme_ctrlr_add_process(&tctrlr->ctrlr, 0) != 0) {
1686 		SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n");
1687 		nvme_ctrlr_destruct(&tctrlr->ctrlr);
1688 		return NULL;
1689 	}
1690 
1691 	nvme_ctrlr_init_cap(&tctrlr->ctrlr, &cap, &vs);
1692 
1693 	return &tctrlr->ctrlr;
1694 }
1695 
1696 static uint32_t
1697 nvme_tcp_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
1698 {
1699 	/* The TCP transport doesn't limit the maximum I/O transfer size. */
1700 	return UINT32_MAX;
1701 }
1702 
1703 static uint16_t
1704 nvme_tcp_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
1705 {
1706 	/*
1707 	 * We do not support >1 SGE in the initiator currently,
1708 	 *  so we can only return 1 here.  Once that support is
1709 	 *  added, this should return ctrlr->cdata.nvmf_specific.msdbd
1710 	 *  instead.
1711 	 */
1712 	return 1;
1713 }
1714 
1715 static void
1716 nvme_tcp_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
1717 {
1718 	struct nvme_tcp_req *tcp_req, *tmp;
1719 	struct nvme_request *req;
1720 	struct spdk_nvme_cpl cpl;
1721 	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
1722 
1723 	cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
1724 	cpl.status.sct = SPDK_NVME_SCT_GENERIC;
1725 
1726 	TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) {
1727 		assert(tcp_req->req != NULL);
1728 		req = tcp_req->req;
1729 		if (req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
1730 			continue;
1731 		}
1732 
1733 		nvme_tcp_req_complete(req, &cpl);
1734 		nvme_tcp_req_put(tqpair, tcp_req);
1735 	}
1736 }
1737 
1738 static struct spdk_nvme_transport_poll_group *
1739 nvme_tcp_poll_group_create(void)
1740 {
1741 	struct nvme_tcp_poll_group *group = calloc(1, sizeof(*group));
1742 
1743 	if (group == NULL) {
1744 		SPDK_ERRLOG("Unable to allocate poll group.\n");
1745 		return NULL;
1746 	}
1747 
1748 	group->sock_group = spdk_sock_group_create(group);
1749 	if (group->sock_group == NULL) {
1750 		free(group);
1751 		SPDK_ERRLOG("Unable to allocate sock group.\n");
1752 		return NULL;
1753 	}
1754 
1755 	return &group->group;
1756 }
1757 
1758 static int
1759 nvme_tcp_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
1760 {
1761 	struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group);
1762 	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
1763 
1764 	if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) {
1765 		return -EPROTO;
1766 	}
1767 	return 0;
1768 }
1769 
1770 static int
1771 nvme_tcp_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
1772 {
1773 	struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group);
1774 	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
1775 
1776 	if (tqpair->sock && group->sock_group) {
1777 		if (spdk_sock_group_remove_sock(group->sock_group, tqpair->sock)) {
1778 			return -EPROTO;
1779 		}
1780 	}
1781 	return 0;
1782 }
1783 
1784 static int
1785 nvme_tcp_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
1786 			struct spdk_nvme_qpair *qpair)
1787 {
1788 	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
1789 	struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup);
1790 
1791 	/* disconnected qpairs won't have a sock to add. */
1792 	if (nvme_qpair_get_state(qpair) >= NVME_QPAIR_CONNECTED) {
1793 		if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) {
1794 			return -EPROTO;
1795 		}
1796 	}
1797 
1798 	return 0;
1799 }
1800 
1801 static int
1802 nvme_tcp_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
1803 			   struct spdk_nvme_qpair *qpair)
1804 {
1805 	if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) {
1806 		return nvme_poll_group_disconnect_qpair(qpair);
1807 	}
1808 
1809 	return 0;
1810 }
1811 
1812 static int64_t
1813 nvme_tcp_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
1814 					uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
1815 {
1816 	struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup);
1817 	struct spdk_nvme_qpair *qpair, *tmp_qpair;
1818 
1819 	group->completions_per_qpair = completions_per_qpair;
1820 	group->num_completions = 0;
1821 
1822 	spdk_sock_group_poll(group->sock_group);
1823 
1824 	STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
1825 		disconnected_qpair_cb(qpair, tgroup->group->ctx);
1826 	}
1827 
1828 	return group->num_completions;
1829 }
1830 
1831 static int
1832 nvme_tcp_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
1833 {
1834 	int rc;
1835 	struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup);
1836 
1837 	if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
1838 		return -EBUSY;
1839 	}
1840 
1841 	rc = spdk_sock_group_close(&group->sock_group);
1842 	if (rc != 0) {
1843 		SPDK_ERRLOG("Failed to close the sock group for a tcp poll group.\n");
1844 		assert(false);
1845 	}
1846 
1847 	free(tgroup);
1848 
1849 	return 0;
1850 }
1851 
1852 const struct spdk_nvme_transport_ops tcp_ops = {
1853 	.name = "TCP",
1854 	.type = SPDK_NVME_TRANSPORT_TCP,
1855 	.ctrlr_construct = nvme_tcp_ctrlr_construct,
1856 	.ctrlr_scan = nvme_fabric_ctrlr_scan,
1857 	.ctrlr_destruct = nvme_tcp_ctrlr_destruct,
1858 	.ctrlr_enable = nvme_tcp_ctrlr_enable,
1859 
1860 	.ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4,
1861 	.ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8,
1862 	.ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4,
1863 	.ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8,
1864 
1865 	.ctrlr_get_max_xfer_size = nvme_tcp_ctrlr_get_max_xfer_size,
1866 	.ctrlr_get_max_sges = nvme_tcp_ctrlr_get_max_sges,
1867 
1868 	.ctrlr_create_io_qpair = nvme_tcp_ctrlr_create_io_qpair,
1869 	.ctrlr_delete_io_qpair = nvme_tcp_ctrlr_delete_io_qpair,
1870 	.ctrlr_connect_qpair = nvme_tcp_ctrlr_connect_qpair,
1871 	.ctrlr_disconnect_qpair = nvme_tcp_ctrlr_disconnect_qpair,
1872 
1873 	.qpair_abort_reqs = nvme_tcp_qpair_abort_reqs,
1874 	.qpair_reset = nvme_tcp_qpair_reset,
1875 	.qpair_submit_request = nvme_tcp_qpair_submit_request,
1876 	.qpair_process_completions = nvme_tcp_qpair_process_completions,
1877 	.admin_qpair_abort_aers = nvme_tcp_admin_qpair_abort_aers,
1878 
1879 	.poll_group_create = nvme_tcp_poll_group_create,
1880 	.poll_group_connect_qpair = nvme_tcp_poll_group_connect_qpair,
1881 	.poll_group_disconnect_qpair = nvme_tcp_poll_group_disconnect_qpair,
1882 	.poll_group_add = nvme_tcp_poll_group_add,
1883 	.poll_group_remove = nvme_tcp_poll_group_remove,
1884 	.poll_group_process_completions = nvme_tcp_poll_group_process_completions,
1885 	.poll_group_destroy = nvme_tcp_poll_group_destroy,
1886 };
1887 
1888 SPDK_NVME_TRANSPORT_REGISTER(tcp, &tcp_ops);
1889