1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2018 Intel Corporation. All rights reserved. 3 * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 /* 8 * NVMe/TCP transport 9 */ 10 11 #include "nvme_internal.h" 12 13 #include "spdk/endian.h" 14 #include "spdk/likely.h" 15 #include "spdk/string.h" 16 #include "spdk/stdinc.h" 17 #include "spdk/crc32.h" 18 #include "spdk/assert.h" 19 #include "spdk/trace.h" 20 #include "spdk/util.h" 21 #include "spdk/nvmf.h" 22 #include "spdk/dma.h" 23 24 #include "spdk_internal/nvme_tcp.h" 25 #include "spdk_internal/trace_defs.h" 26 27 #define NVME_TCP_RW_BUFFER_SIZE 131072 28 29 /* For async connect workloads, allow more time since we are more likely 30 * to be processing lots ICREQs at once. 31 */ 32 #define ICREQ_TIMEOUT_SYNC 2 /* in seconds */ 33 #define ICREQ_TIMEOUT_ASYNC 10 /* in seconds */ 34 35 #define NVME_TCP_HPDA_DEFAULT 0 36 #define NVME_TCP_MAX_R2T_DEFAULT 1 37 #define NVME_TCP_PDU_H2C_MIN_DATA_SIZE 4096 38 39 /* 40 * Maximum value of transport_ack_timeout used by TCP controller 41 */ 42 #define NVME_TCP_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT 31 43 44 enum nvme_tcp_qpair_state { 45 NVME_TCP_QPAIR_STATE_INVALID = 0, 46 NVME_TCP_QPAIR_STATE_INITIALIZING = 1, 47 NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_SEND = 2, 48 NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL = 3, 49 NVME_TCP_QPAIR_STATE_AUTHENTICATING = 4, 50 NVME_TCP_QPAIR_STATE_RUNNING = 5, 51 NVME_TCP_QPAIR_STATE_EXITING = 6, 52 NVME_TCP_QPAIR_STATE_EXITED = 7, 53 }; 54 55 /* NVMe TCP transport extensions for spdk_nvme_ctrlr */ 56 struct nvme_tcp_ctrlr { 57 struct spdk_nvme_ctrlr ctrlr; 58 char psk_identity[NVMF_PSK_IDENTITY_LEN]; 59 uint8_t psk[SPDK_TLS_PSK_MAX_LEN]; 60 int psk_size; 61 char *tls_cipher_suite; 62 }; 63 64 struct nvme_tcp_poll_group { 65 struct spdk_nvme_transport_poll_group group; 66 struct spdk_sock_group *sock_group; 67 uint32_t completions_per_qpair; 68 int64_t num_completions; 69 70 TAILQ_HEAD(, nvme_tcp_qpair) needs_poll; 71 struct spdk_nvme_tcp_stat stats; 72 }; 73 74 /* NVMe TCP qpair extensions for spdk_nvme_qpair */ 75 struct nvme_tcp_qpair { 76 struct spdk_nvme_qpair qpair; 77 struct spdk_sock *sock; 78 79 TAILQ_HEAD(, nvme_tcp_req) free_reqs; 80 TAILQ_HEAD(, nvme_tcp_req) outstanding_reqs; 81 82 TAILQ_HEAD(, nvme_tcp_pdu) send_queue; 83 struct nvme_tcp_pdu *recv_pdu; 84 struct nvme_tcp_pdu *send_pdu; /* only for error pdu and init pdu */ 85 struct nvme_tcp_pdu *send_pdus; /* Used by tcp_reqs */ 86 enum nvme_tcp_pdu_recv_state recv_state; 87 struct nvme_tcp_req *tcp_reqs; 88 struct spdk_nvme_tcp_stat *stats; 89 90 uint16_t num_entries; 91 uint16_t async_complete; 92 93 struct { 94 uint16_t host_hdgst_enable: 1; 95 uint16_t host_ddgst_enable: 1; 96 uint16_t icreq_send_ack: 1; 97 uint16_t in_connect_poll: 1; 98 uint16_t reserved: 12; 99 } flags; 100 101 /** Specifies the maximum number of PDU-Data bytes per H2C Data Transfer PDU */ 102 uint32_t maxh2cdata; 103 104 uint32_t maxr2t; 105 106 /* 0 based value, which is used to guide the padding */ 107 uint8_t cpda; 108 109 enum nvme_tcp_qpair_state state; 110 111 TAILQ_ENTRY(nvme_tcp_qpair) link; 112 bool needs_poll; 113 114 uint64_t icreq_timeout_tsc; 115 116 bool shared_stats; 117 }; 118 119 enum nvme_tcp_req_state { 120 NVME_TCP_REQ_FREE, 121 NVME_TCP_REQ_ACTIVE, 122 NVME_TCP_REQ_ACTIVE_R2T, 123 }; 124 125 struct nvme_tcp_req { 126 struct nvme_request *req; 127 enum nvme_tcp_req_state state; 128 uint16_t cid; 129 uint16_t ttag; 130 uint32_t datao; 131 uint32_t expected_datao; 132 uint32_t r2tl_remain; 133 uint32_t active_r2ts; 134 /* Used to hold a value received from subsequent R2T while we are still 135 * waiting for H2C complete */ 136 uint16_t ttag_r2t_next; 137 bool in_capsule_data; 138 /* It is used to track whether the req can be safely freed */ 139 union { 140 uint8_t raw; 141 struct { 142 /* The last send operation completed - kernel released send buffer */ 143 uint8_t send_ack : 1; 144 /* Data transfer completed - target send resp or last data bit */ 145 uint8_t data_recv : 1; 146 /* tcp_req is waiting for completion of the previous send operation (buffer reclaim notification 147 * from kernel) to send H2C */ 148 uint8_t h2c_send_waiting_ack : 1; 149 /* tcp_req received subsequent r2t while it is still waiting for send_ack. 150 * Rare case, actual when dealing with target that can send several R2T requests. 151 * SPDK TCP target sends 1 R2T for the whole data buffer */ 152 uint8_t r2t_waiting_h2c_complete : 1; 153 /* Accel operation is in progress */ 154 uint8_t in_progress_accel : 1; 155 uint8_t domain_in_use: 1; 156 uint8_t reserved : 2; 157 } bits; 158 } ordering; 159 struct nvme_tcp_pdu *pdu; 160 struct iovec iov[NVME_TCP_MAX_SGL_DESCRIPTORS]; 161 uint32_t iovcnt; 162 /* Used to hold a value received from subsequent R2T while we are still 163 * waiting for H2C ack */ 164 uint32_t r2tl_remain_next; 165 struct nvme_tcp_qpair *tqpair; 166 TAILQ_ENTRY(nvme_tcp_req) link; 167 struct spdk_nvme_cpl rsp; 168 uint8_t rsvd1[32]; 169 }; 170 SPDK_STATIC_ASSERT(sizeof(struct nvme_tcp_req) % SPDK_CACHE_LINE_SIZE == 0, "unaligned size"); 171 172 static struct spdk_nvme_tcp_stat g_dummy_stats = {}; 173 174 static void nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req); 175 static int64_t nvme_tcp_poll_group_process_completions(struct spdk_nvme_transport_poll_group 176 *tgroup, uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb); 177 static void nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu); 178 static void nvme_tcp_req_complete(struct nvme_tcp_req *tcp_req, struct nvme_tcp_qpair *tqpair, 179 struct spdk_nvme_cpl *rsp, bool print_on_error); 180 181 static inline struct nvme_tcp_qpair * 182 nvme_tcp_qpair(struct spdk_nvme_qpair *qpair) 183 { 184 assert(qpair->trtype == SPDK_NVME_TRANSPORT_TCP); 185 return SPDK_CONTAINEROF(qpair, struct nvme_tcp_qpair, qpair); 186 } 187 188 static inline struct nvme_tcp_poll_group * 189 nvme_tcp_poll_group(struct spdk_nvme_transport_poll_group *group) 190 { 191 return SPDK_CONTAINEROF(group, struct nvme_tcp_poll_group, group); 192 } 193 194 static inline struct nvme_tcp_ctrlr * 195 nvme_tcp_ctrlr(struct spdk_nvme_ctrlr *ctrlr) 196 { 197 assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP); 198 return SPDK_CONTAINEROF(ctrlr, struct nvme_tcp_ctrlr, ctrlr); 199 } 200 201 static struct nvme_tcp_req * 202 nvme_tcp_req_get(struct nvme_tcp_qpair *tqpair) 203 { 204 struct nvme_tcp_req *tcp_req; 205 206 tcp_req = TAILQ_FIRST(&tqpair->free_reqs); 207 if (!tcp_req) { 208 return NULL; 209 } 210 211 assert(tcp_req->state == NVME_TCP_REQ_FREE); 212 tcp_req->state = NVME_TCP_REQ_ACTIVE; 213 TAILQ_REMOVE(&tqpair->free_reqs, tcp_req, link); 214 tcp_req->datao = 0; 215 tcp_req->expected_datao = 0; 216 tcp_req->req = NULL; 217 tcp_req->in_capsule_data = false; 218 tcp_req->r2tl_remain = 0; 219 tcp_req->r2tl_remain_next = 0; 220 tcp_req->active_r2ts = 0; 221 tcp_req->iovcnt = 0; 222 tcp_req->ordering.raw = 0; 223 memset(tcp_req->pdu, 0, sizeof(struct nvme_tcp_pdu)); 224 memset(&tcp_req->rsp, 0, sizeof(struct spdk_nvme_cpl)); 225 226 return tcp_req; 227 } 228 229 static void 230 nvme_tcp_req_put(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) 231 { 232 assert(tcp_req->state != NVME_TCP_REQ_FREE); 233 tcp_req->state = NVME_TCP_REQ_FREE; 234 TAILQ_INSERT_HEAD(&tqpair->free_reqs, tcp_req, link); 235 } 236 237 static inline void 238 nvme_tcp_accel_finish_sequence(struct nvme_tcp_poll_group *tgroup, struct nvme_tcp_req *treq, 239 void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 240 { 241 struct spdk_nvme_poll_group *pg = tgroup->group.group; 242 243 treq->ordering.bits.in_progress_accel = 1; 244 pg->accel_fn_table.finish_sequence(seq, cb_fn, cb_arg); 245 } 246 247 static inline void 248 nvme_tcp_accel_reverse_sequence(struct nvme_tcp_poll_group *tgroup, void *seq) 249 { 250 struct spdk_nvme_poll_group *pg = tgroup->group.group; 251 252 pg->accel_fn_table.reverse_sequence(seq); 253 } 254 255 static inline int 256 nvme_tcp_accel_append_crc32c(struct nvme_tcp_poll_group *tgroup, void **seq, uint32_t *dst, 257 struct iovec *iovs, uint32_t iovcnt, uint32_t seed, 258 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 259 { 260 struct spdk_nvme_poll_group *pg = tgroup->group.group; 261 262 return pg->accel_fn_table.append_crc32c(pg->ctx, seq, dst, iovs, iovcnt, NULL, NULL, 263 seed, cb_fn, cb_arg); 264 } 265 266 static void 267 nvme_tcp_free_reqs(struct nvme_tcp_qpair *tqpair) 268 { 269 free(tqpair->tcp_reqs); 270 tqpair->tcp_reqs = NULL; 271 272 spdk_free(tqpair->send_pdus); 273 tqpair->send_pdus = NULL; 274 } 275 276 static int 277 nvme_tcp_alloc_reqs(struct nvme_tcp_qpair *tqpair) 278 { 279 uint16_t i; 280 struct nvme_tcp_req *tcp_req; 281 282 tqpair->tcp_reqs = aligned_alloc(SPDK_CACHE_LINE_SIZE, 283 tqpair->num_entries * sizeof(*tcp_req)); 284 if (tqpair->tcp_reqs == NULL) { 285 SPDK_ERRLOG("Failed to allocate tcp_reqs on tqpair=%p\n", tqpair); 286 goto fail; 287 } 288 289 /* Add additional 2 member for the send_pdu, recv_pdu owned by the tqpair */ 290 tqpair->send_pdus = spdk_zmalloc((tqpair->num_entries + 2) * sizeof(struct nvme_tcp_pdu), 291 0x1000, NULL, 292 SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA); 293 294 if (tqpair->send_pdus == NULL) { 295 SPDK_ERRLOG("Failed to allocate send_pdus on tqpair=%p\n", tqpair); 296 goto fail; 297 } 298 299 memset(tqpair->tcp_reqs, 0, tqpair->num_entries * sizeof(*tcp_req)); 300 TAILQ_INIT(&tqpair->send_queue); 301 TAILQ_INIT(&tqpair->free_reqs); 302 TAILQ_INIT(&tqpair->outstanding_reqs); 303 tqpair->qpair.queue_depth = 0; 304 for (i = 0; i < tqpair->num_entries; i++) { 305 tcp_req = &tqpair->tcp_reqs[i]; 306 tcp_req->cid = i; 307 tcp_req->tqpair = tqpair; 308 tcp_req->pdu = &tqpair->send_pdus[i]; 309 TAILQ_INSERT_TAIL(&tqpair->free_reqs, tcp_req, link); 310 } 311 312 tqpair->send_pdu = &tqpair->send_pdus[i]; 313 tqpair->recv_pdu = &tqpair->send_pdus[i + 1]; 314 315 return 0; 316 fail: 317 nvme_tcp_free_reqs(tqpair); 318 return -ENOMEM; 319 } 320 321 static inline void 322 nvme_tcp_qpair_set_recv_state(struct nvme_tcp_qpair *tqpair, 323 enum nvme_tcp_pdu_recv_state state) 324 { 325 if (tqpair->recv_state == state) { 326 SPDK_ERRLOG("The recv state of tqpair=%p is same with the state(%d) to be set\n", 327 tqpair, state); 328 return; 329 } 330 331 if (state == NVME_TCP_PDU_RECV_STATE_ERROR) { 332 assert(TAILQ_EMPTY(&tqpair->outstanding_reqs)); 333 } 334 335 tqpair->recv_state = state; 336 } 337 338 static void nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); 339 340 static void 341 nvme_tcp_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 342 { 343 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 344 struct nvme_tcp_pdu *pdu; 345 int rc; 346 struct nvme_tcp_poll_group *group; 347 348 if (tqpair->needs_poll) { 349 group = nvme_tcp_poll_group(qpair->poll_group); 350 TAILQ_REMOVE(&group->needs_poll, tqpair, link); 351 tqpair->needs_poll = false; 352 } 353 354 rc = spdk_sock_close(&tqpair->sock); 355 356 if (tqpair->sock != NULL) { 357 SPDK_ERRLOG("tqpair=%p, errno=%d, rc=%d\n", tqpair, errno, rc); 358 /* Set it to NULL manually */ 359 tqpair->sock = NULL; 360 } 361 362 /* clear the send_queue */ 363 while (!TAILQ_EMPTY(&tqpair->send_queue)) { 364 pdu = TAILQ_FIRST(&tqpair->send_queue); 365 /* Remove the pdu from the send_queue to prevent the wrong sending out 366 * in the next round connection 367 */ 368 TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); 369 } 370 371 nvme_tcp_qpair_abort_reqs(qpair, qpair->abort_dnr); 372 373 /* If the qpair is marked as asynchronous, let it go through the process_completions() to 374 * let any outstanding requests (e.g. those with outstanding accel operations) complete. 375 * Otherwise, there's no way of waiting for them, so tqpair->outstanding_reqs has to be 376 * empty. 377 */ 378 if (qpair->async) { 379 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING); 380 } else { 381 assert(TAILQ_EMPTY(&tqpair->outstanding_reqs)); 382 nvme_transport_ctrlr_disconnect_qpair_done(qpair); 383 } 384 } 385 386 static int 387 nvme_tcp_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 388 { 389 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 390 391 assert(qpair != NULL); 392 nvme_tcp_qpair_abort_reqs(qpair, qpair->abort_dnr); 393 assert(TAILQ_EMPTY(&tqpair->outstanding_reqs)); 394 395 nvme_qpair_deinit(qpair); 396 nvme_tcp_free_reqs(tqpair); 397 if (!tqpair->shared_stats) { 398 free(tqpair->stats); 399 } 400 free(tqpair); 401 402 return 0; 403 } 404 405 static int 406 nvme_tcp_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) 407 { 408 return 0; 409 } 410 411 static int 412 nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) 413 { 414 struct nvme_tcp_ctrlr *tctrlr = nvme_tcp_ctrlr(ctrlr); 415 416 if (ctrlr->adminq) { 417 nvme_tcp_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq); 418 } 419 420 nvme_ctrlr_destruct_finish(ctrlr); 421 422 free(tctrlr); 423 424 return 0; 425 } 426 427 /* If there are queued requests, we assume they are queued because they are waiting 428 * for resources to be released. Those resources are almost certainly released in 429 * response to a PDU completing. However, to attempt to make forward progress 430 * the qpair needs to be polled and we can't rely on another network event to make 431 * that happen. Add it to a list of qpairs to poll regardless of network activity. 432 * 433 * Besides, when tqpair state is NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL or 434 * NVME_TCP_QPAIR_STATE_INITIALIZING, need to add it to needs_poll list too to make 435 * forward progress in case that the resources are released after icreq's or CONNECT's 436 * resp is processed. */ 437 static void 438 nvme_tcp_cond_schedule_qpair_polling(struct nvme_tcp_qpair *tqpair) 439 { 440 struct nvme_tcp_poll_group *pgroup; 441 442 if (tqpair->needs_poll || !tqpair->qpair.poll_group) { 443 return; 444 } 445 446 if (STAILQ_EMPTY(&tqpair->qpair.queued_req) && 447 spdk_likely(tqpair->state != NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL && 448 tqpair->state != NVME_TCP_QPAIR_STATE_INITIALIZING)) { 449 return; 450 } 451 452 pgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 453 TAILQ_INSERT_TAIL(&pgroup->needs_poll, tqpair, link); 454 tqpair->needs_poll = true; 455 } 456 457 static void 458 pdu_write_done(void *cb_arg, int err) 459 { 460 struct nvme_tcp_pdu *pdu = cb_arg; 461 struct nvme_tcp_qpair *tqpair = pdu->qpair; 462 463 nvme_tcp_cond_schedule_qpair_polling(tqpair); 464 TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); 465 466 if (err != 0) { 467 nvme_transport_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair); 468 return; 469 } 470 471 assert(pdu->cb_fn != NULL); 472 pdu->cb_fn(pdu->cb_arg); 473 } 474 475 static void 476 pdu_write_fail(struct nvme_tcp_pdu *pdu, int status) 477 { 478 struct nvme_tcp_qpair *tqpair = pdu->qpair; 479 480 /* This function is similar to pdu_write_done(), but it should be called before a PDU is 481 * sent over the socket */ 482 TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq); 483 pdu_write_done(pdu, status); 484 } 485 486 static void 487 pdu_seq_fail(struct nvme_tcp_pdu *pdu, int status) 488 { 489 struct nvme_tcp_req *treq = pdu->req; 490 491 SPDK_ERRLOG("Failed to execute accel sequence: %d\n", status); 492 nvme_tcp_cond_schedule_qpair_polling(pdu->qpair); 493 treq->rsp.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 494 nvme_tcp_req_complete(treq, treq->tqpair, &treq->rsp, true); 495 } 496 497 static void 498 _tcp_write_pdu(struct nvme_tcp_pdu *pdu) 499 { 500 uint32_t mapped_length = 0; 501 struct nvme_tcp_qpair *tqpair = pdu->qpair; 502 503 pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, SPDK_COUNTOF(pdu->iov), pdu, 504 (bool)tqpair->flags.host_hdgst_enable, (bool)tqpair->flags.host_ddgst_enable, 505 &mapped_length); 506 TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq); 507 if (spdk_unlikely(mapped_length < pdu->data_len)) { 508 SPDK_ERRLOG("could not map the whole %u bytes (mapped only %u bytes)\n", pdu->data_len, 509 mapped_length); 510 pdu_write_done(pdu, -EINVAL); 511 return; 512 } 513 pdu->sock_req.cb_fn = pdu_write_done; 514 pdu->sock_req.cb_arg = pdu; 515 tqpair->stats->submitted_requests++; 516 spdk_sock_writev_async(tqpair->sock, &pdu->sock_req); 517 } 518 519 static void 520 tcp_write_pdu_seq_cb(void *ctx, int status) 521 { 522 struct nvme_tcp_pdu *pdu = ctx; 523 struct nvme_tcp_req *treq = pdu->req; 524 struct nvme_request *req = treq->req; 525 526 assert(treq->ordering.bits.in_progress_accel); 527 treq->ordering.bits.in_progress_accel = 0; 528 529 req->accel_sequence = NULL; 530 if (spdk_unlikely(status != 0)) { 531 pdu_seq_fail(pdu, status); 532 return; 533 } 534 535 _tcp_write_pdu(pdu); 536 } 537 538 static void 539 tcp_write_pdu(struct nvme_tcp_pdu *pdu) 540 { 541 struct nvme_tcp_req *treq = pdu->req; 542 struct nvme_tcp_qpair *tqpair = pdu->qpair; 543 struct nvme_tcp_poll_group *tgroup; 544 struct nvme_request *req; 545 546 if (spdk_likely(treq != NULL)) { 547 req = treq->req; 548 if (req->accel_sequence != NULL && 549 spdk_nvme_opc_get_data_transfer(req->cmd.opc) == SPDK_NVME_DATA_HOST_TO_CONTROLLER && 550 pdu->data_len > 0) { 551 assert(tqpair->qpair.poll_group != NULL); 552 tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 553 nvme_tcp_accel_finish_sequence(tgroup, treq, req->accel_sequence, 554 tcp_write_pdu_seq_cb, pdu); 555 return; 556 } 557 } 558 559 _tcp_write_pdu(pdu); 560 } 561 562 static void 563 pdu_accel_seq_compute_crc32_done(void *cb_arg) 564 { 565 struct nvme_tcp_pdu *pdu = cb_arg; 566 567 pdu->data_digest_crc32 ^= SPDK_CRC32C_XOR; 568 MAKE_DIGEST_WORD(pdu->data_digest, pdu->data_digest_crc32); 569 } 570 571 static bool 572 pdu_accel_compute_crc32(struct nvme_tcp_pdu *pdu) 573 { 574 struct nvme_tcp_qpair *tqpair = pdu->qpair; 575 struct nvme_tcp_poll_group *tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 576 struct nvme_request *req = ((struct nvme_tcp_req *)pdu->req)->req; 577 int rc; 578 579 /* Only support this limited case for the first step */ 580 if (spdk_unlikely(nvme_qpair_get_state(&tqpair->qpair) < NVME_QPAIR_CONNECTED || 581 pdu->dif_ctx != NULL || 582 pdu->data_len % SPDK_NVME_TCP_DIGEST_ALIGNMENT != 0)) { 583 return false; 584 } 585 586 if (tqpair->qpair.poll_group == NULL || 587 tgroup->group.group->accel_fn_table.append_crc32c == NULL) { 588 return false; 589 } 590 591 rc = nvme_tcp_accel_append_crc32c(tgroup, &req->accel_sequence, 592 &pdu->data_digest_crc32, 593 pdu->data_iov, pdu->data_iovcnt, 0, 594 pdu_accel_seq_compute_crc32_done, pdu); 595 if (spdk_unlikely(rc != 0)) { 596 /* If accel is out of resources, fall back to non-accelerated crc32 */ 597 if (rc == -ENOMEM) { 598 return false; 599 } 600 601 SPDK_ERRLOG("Failed to append crc32c operation: %d\n", rc); 602 pdu_write_fail(pdu, rc); 603 return true; 604 } 605 606 tcp_write_pdu(pdu); 607 608 return true; 609 } 610 611 static void 612 pdu_compute_crc32_seq_cb(void *cb_arg, int status) 613 { 614 struct nvme_tcp_pdu *pdu = cb_arg; 615 struct nvme_tcp_req *treq = pdu->req; 616 struct nvme_request *req = treq->req; 617 uint32_t crc32c; 618 619 assert(treq->ordering.bits.in_progress_accel); 620 treq->ordering.bits.in_progress_accel = 0; 621 622 req->accel_sequence = NULL; 623 if (spdk_unlikely(status != 0)) { 624 pdu_seq_fail(pdu, status); 625 return; 626 } 627 628 crc32c = nvme_tcp_pdu_calc_data_digest(pdu); 629 crc32c = crc32c ^ SPDK_CRC32C_XOR; 630 MAKE_DIGEST_WORD(pdu->data_digest, crc32c); 631 632 _tcp_write_pdu(pdu); 633 } 634 635 static void 636 pdu_compute_crc32(struct nvme_tcp_pdu *pdu) 637 { 638 struct nvme_tcp_qpair *tqpair = pdu->qpair; 639 struct nvme_tcp_poll_group *tgroup; 640 struct nvme_request *req; 641 uint32_t crc32c; 642 643 /* Data Digest */ 644 if (pdu->data_len > 0 && g_nvme_tcp_ddgst[pdu->hdr.common.pdu_type] && 645 tqpair->flags.host_ddgst_enable) { 646 if (pdu_accel_compute_crc32(pdu)) { 647 return; 648 } 649 650 req = ((struct nvme_tcp_req *)pdu->req)->req; 651 if (req->accel_sequence != NULL) { 652 tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 653 nvme_tcp_accel_finish_sequence(tgroup, pdu->req, req->accel_sequence, 654 pdu_compute_crc32_seq_cb, pdu); 655 return; 656 } 657 658 crc32c = nvme_tcp_pdu_calc_data_digest(pdu); 659 crc32c = crc32c ^ SPDK_CRC32C_XOR; 660 MAKE_DIGEST_WORD(pdu->data_digest, crc32c); 661 } 662 663 tcp_write_pdu(pdu); 664 } 665 666 static int 667 nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair, 668 struct nvme_tcp_pdu *pdu, 669 nvme_tcp_qpair_xfer_complete_cb cb_fn, 670 void *cb_arg) 671 { 672 int hlen; 673 uint32_t crc32c; 674 675 hlen = pdu->hdr.common.hlen; 676 pdu->cb_fn = cb_fn; 677 pdu->cb_arg = cb_arg; 678 pdu->qpair = tqpair; 679 680 /* Header Digest */ 681 if (g_nvme_tcp_hdgst[pdu->hdr.common.pdu_type] && tqpair->flags.host_hdgst_enable) { 682 crc32c = nvme_tcp_pdu_calc_header_digest(pdu); 683 MAKE_DIGEST_WORD((uint8_t *)&pdu->hdr.raw[hlen], crc32c); 684 } 685 686 pdu_compute_crc32(pdu); 687 688 return 0; 689 } 690 691 static int 692 nvme_tcp_try_memory_translation(struct nvme_tcp_req *tcp_req, void **addr, uint32_t length) 693 { 694 struct nvme_request *req = tcp_req->req; 695 struct spdk_memory_domain_translation_result translation = { 696 .iov_count = 0, 697 .size = sizeof(translation) 698 }; 699 int rc; 700 701 if (!tcp_req->ordering.bits.domain_in_use) { 702 return 0; 703 } 704 705 rc = spdk_memory_domain_translate_data(req->payload.opts->memory_domain, 706 req->payload.opts->memory_domain_ctx, spdk_memory_domain_get_system_domain(), NULL, *addr, length, 707 &translation); 708 if (spdk_unlikely(rc || translation.iov_count != 1)) { 709 SPDK_ERRLOG("DMA memory translation failed, rc %d, iov_count %u\n", rc, translation.iov_count); 710 return -EFAULT; 711 } 712 713 assert(length == translation.iov.iov_len); 714 *addr = translation.iov.iov_base; 715 return 0; 716 } 717 718 /* 719 * Build SGL describing contiguous payload buffer. 720 */ 721 static int 722 nvme_tcp_build_contig_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) 723 { 724 struct nvme_request *req = tcp_req->req; 725 726 /* ubsan complains about applying zero offset to null pointer if contig_or_cb_arg is NULL, 727 * so just double cast it to make it go away */ 728 void *addr = (void *)((uintptr_t)req->payload.contig_or_cb_arg + req->payload_offset); 729 size_t length = req->payload_size; 730 int rc; 731 732 SPDK_DEBUGLOG(nvme, "enter\n"); 733 734 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); 735 rc = nvme_tcp_try_memory_translation(tcp_req, &addr, length); 736 if (spdk_unlikely(rc)) { 737 return rc; 738 } 739 740 tcp_req->iov[0].iov_base = addr; 741 tcp_req->iov[0].iov_len = length; 742 tcp_req->iovcnt = 1; 743 return 0; 744 } 745 746 /* 747 * Build SGL describing scattered payload buffer. 748 */ 749 static int 750 nvme_tcp_build_sgl_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) 751 { 752 int rc; 753 uint32_t length, remaining_size, iovcnt = 0, max_num_sgl; 754 struct nvme_request *req = tcp_req->req; 755 756 SPDK_DEBUGLOG(nvme, "enter\n"); 757 758 assert(req->payload_size != 0); 759 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); 760 assert(req->payload.reset_sgl_fn != NULL); 761 assert(req->payload.next_sge_fn != NULL); 762 req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); 763 764 max_num_sgl = spdk_min(req->qpair->ctrlr->max_sges, NVME_TCP_MAX_SGL_DESCRIPTORS); 765 remaining_size = req->payload_size; 766 767 do { 768 void *addr; 769 770 rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &addr, &length); 771 if (rc) { 772 return -1; 773 } 774 775 rc = nvme_tcp_try_memory_translation(tcp_req, &addr, length); 776 if (spdk_unlikely(rc)) { 777 return rc; 778 } 779 780 length = spdk_min(length, remaining_size); 781 tcp_req->iov[iovcnt].iov_base = addr; 782 tcp_req->iov[iovcnt].iov_len = length; 783 remaining_size -= length; 784 iovcnt++; 785 } while (remaining_size > 0 && iovcnt < max_num_sgl); 786 787 788 /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */ 789 if (remaining_size > 0) { 790 SPDK_ERRLOG("Failed to construct tcp_req=%p, and the iovcnt=%u, remaining_size=%u\n", 791 tcp_req, iovcnt, remaining_size); 792 return -1; 793 } 794 795 tcp_req->iovcnt = iovcnt; 796 797 return 0; 798 } 799 800 static int 801 nvme_tcp_req_init(struct nvme_tcp_qpair *tqpair, struct nvme_request *req, 802 struct nvme_tcp_req *tcp_req) 803 { 804 struct spdk_nvme_ctrlr *ctrlr = tqpair->qpair.ctrlr; 805 int rc = 0; 806 enum spdk_nvme_data_transfer xfer; 807 uint32_t max_in_capsule_data_size; 808 809 tcp_req->req = req; 810 tcp_req->ordering.bits.domain_in_use = (req->payload.opts && req->payload.opts->memory_domain); 811 812 req->cmd.cid = tcp_req->cid; 813 req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; 814 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK; 815 req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_TRANSPORT; 816 req->cmd.dptr.sgl1.unkeyed.length = req->payload_size; 817 818 if (spdk_unlikely(req->cmd.opc == SPDK_NVME_OPC_FABRIC)) { 819 struct spdk_nvmf_capsule_cmd *nvmf_cmd = (struct spdk_nvmf_capsule_cmd *)&req->cmd; 820 821 xfer = spdk_nvme_opc_get_data_transfer(nvmf_cmd->fctype); 822 } else { 823 xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc); 824 } 825 826 /* For c2h delay filling in the iov until the data arrives. 827 * For h2c some delay is also possible if data doesn't fit into cmd capsule (not implemented). */ 828 if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { 829 if (xfer != SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 830 rc = nvme_tcp_build_contig_request(tqpair, tcp_req); 831 } 832 } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) { 833 if (xfer != SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 834 rc = nvme_tcp_build_sgl_request(tqpair, tcp_req); 835 } 836 } else { 837 rc = -1; 838 } 839 840 if (rc) { 841 return rc; 842 } 843 844 if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { 845 max_in_capsule_data_size = ctrlr->ioccsz_bytes; 846 if (spdk_unlikely((req->cmd.opc == SPDK_NVME_OPC_FABRIC) || 847 nvme_qpair_is_admin_queue(&tqpair->qpair))) { 848 max_in_capsule_data_size = SPDK_NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE; 849 } 850 851 if (req->payload_size <= max_in_capsule_data_size) { 852 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; 853 req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; 854 req->cmd.dptr.sgl1.address = 0; 855 tcp_req->in_capsule_data = true; 856 } 857 } 858 859 return 0; 860 } 861 862 static inline bool 863 nvme_tcp_req_complete_safe(struct nvme_tcp_req *tcp_req) 864 { 865 if (!(tcp_req->ordering.bits.send_ack && tcp_req->ordering.bits.data_recv && 866 !tcp_req->ordering.bits.in_progress_accel)) { 867 return false; 868 } 869 870 assert(tcp_req->state == NVME_TCP_REQ_ACTIVE); 871 assert(tcp_req->tqpair != NULL); 872 assert(tcp_req->req != NULL); 873 874 nvme_tcp_req_complete(tcp_req, tcp_req->tqpair, &tcp_req->rsp, true); 875 return true; 876 } 877 878 static void 879 nvme_tcp_qpair_cmd_send_complete(void *cb_arg) 880 { 881 struct nvme_tcp_req *tcp_req = cb_arg; 882 883 SPDK_DEBUGLOG(nvme, "tcp req %p, cid %u, qid %u\n", tcp_req, tcp_req->cid, 884 tcp_req->tqpair->qpair.id); 885 tcp_req->ordering.bits.send_ack = 1; 886 /* Handle the r2t case */ 887 if (spdk_unlikely(tcp_req->ordering.bits.h2c_send_waiting_ack)) { 888 SPDK_DEBUGLOG(nvme, "tcp req %p, send H2C data\n", tcp_req); 889 nvme_tcp_send_h2c_data(tcp_req); 890 } else { 891 if (tcp_req->in_capsule_data && tcp_req->ordering.bits.domain_in_use) { 892 spdk_memory_domain_invalidate_data(tcp_req->req->payload.opts->memory_domain, 893 tcp_req->req->payload.opts->memory_domain_ctx, tcp_req->iov, tcp_req->iovcnt); 894 } 895 896 nvme_tcp_req_complete_safe(tcp_req); 897 } 898 } 899 900 static int 901 nvme_tcp_qpair_capsule_cmd_send(struct nvme_tcp_qpair *tqpair, 902 struct nvme_tcp_req *tcp_req) 903 { 904 struct nvme_tcp_pdu *pdu; 905 struct spdk_nvme_tcp_cmd *capsule_cmd; 906 uint32_t plen = 0, alignment; 907 uint8_t pdo; 908 909 SPDK_DEBUGLOG(nvme, "enter\n"); 910 pdu = tcp_req->pdu; 911 pdu->req = tcp_req; 912 913 capsule_cmd = &pdu->hdr.capsule_cmd; 914 capsule_cmd->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD; 915 plen = capsule_cmd->common.hlen = sizeof(*capsule_cmd); 916 capsule_cmd->ccsqe = tcp_req->req->cmd; 917 918 SPDK_DEBUGLOG(nvme, "capsule_cmd cid=%u on tqpair(%p)\n", tcp_req->req->cmd.cid, tqpair); 919 920 if (tqpair->flags.host_hdgst_enable) { 921 SPDK_DEBUGLOG(nvme, "Header digest is enabled for capsule command on tcp_req=%p\n", 922 tcp_req); 923 capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; 924 plen += SPDK_NVME_TCP_DIGEST_LEN; 925 } 926 927 if ((tcp_req->req->payload_size == 0) || !tcp_req->in_capsule_data) { 928 goto end; 929 } 930 931 pdo = plen; 932 pdu->padding_len = 0; 933 if (tqpair->cpda) { 934 alignment = (tqpair->cpda + 1) << 2; 935 if (alignment > plen) { 936 pdu->padding_len = alignment - plen; 937 pdo = alignment; 938 plen = alignment; 939 } 940 } 941 942 capsule_cmd->common.pdo = pdo; 943 plen += tcp_req->req->payload_size; 944 if (tqpair->flags.host_ddgst_enable) { 945 capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; 946 plen += SPDK_NVME_TCP_DIGEST_LEN; 947 } 948 949 tcp_req->datao = 0; 950 nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt, 951 0, tcp_req->req->payload_size); 952 end: 953 capsule_cmd->common.plen = plen; 954 return nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_qpair_cmd_send_complete, tcp_req); 955 956 } 957 958 static int 959 nvme_tcp_qpair_submit_request(struct spdk_nvme_qpair *qpair, 960 struct nvme_request *req) 961 { 962 struct nvme_tcp_qpair *tqpair; 963 struct nvme_tcp_req *tcp_req; 964 965 tqpair = nvme_tcp_qpair(qpair); 966 assert(tqpair != NULL); 967 assert(req != NULL); 968 969 tcp_req = nvme_tcp_req_get(tqpair); 970 if (!tcp_req) { 971 tqpair->stats->queued_requests++; 972 /* Inform the upper layer to try again later. */ 973 return -EAGAIN; 974 } 975 976 if (spdk_unlikely(nvme_tcp_req_init(tqpair, req, tcp_req))) { 977 SPDK_ERRLOG("nvme_tcp_req_init() failed\n"); 978 nvme_tcp_req_put(tqpair, tcp_req); 979 return -1; 980 } 981 982 tqpair->qpair.queue_depth++; 983 spdk_trace_record(TRACE_NVME_TCP_SUBMIT, qpair->id, 0, (uintptr_t)tcp_req->pdu, req->cb_arg, 984 (uint32_t)req->cmd.cid, (uint32_t)req->cmd.opc, 985 req->cmd.cdw10, req->cmd.cdw11, req->cmd.cdw12, tqpair->qpair.queue_depth); 986 TAILQ_INSERT_TAIL(&tqpair->outstanding_reqs, tcp_req, link); 987 return nvme_tcp_qpair_capsule_cmd_send(tqpair, tcp_req); 988 } 989 990 static int 991 nvme_tcp_qpair_reset(struct spdk_nvme_qpair *qpair) 992 { 993 return 0; 994 } 995 996 static void 997 nvme_tcp_req_complete(struct nvme_tcp_req *tcp_req, 998 struct nvme_tcp_qpair *tqpair, 999 struct spdk_nvme_cpl *rsp, 1000 bool print_on_error) 1001 { 1002 struct spdk_nvme_cpl cpl; 1003 struct spdk_nvme_qpair *qpair; 1004 struct nvme_request *req; 1005 bool print_error; 1006 1007 assert(tcp_req->req != NULL); 1008 req = tcp_req->req; 1009 qpair = req->qpair; 1010 1011 SPDK_DEBUGLOG(nvme, "complete tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair); 1012 1013 if (!tcp_req->tqpair->qpair.in_completion_context) { 1014 tcp_req->tqpair->async_complete++; 1015 } 1016 1017 /* Cache arguments to be passed to nvme_complete_request since tcp_req can be zeroed when released */ 1018 memcpy(&cpl, rsp, sizeof(cpl)); 1019 1020 if (spdk_unlikely(spdk_nvme_cpl_is_error(rsp))) { 1021 print_error = print_on_error && !qpair->ctrlr->opts.disable_error_logging; 1022 1023 if (print_error) { 1024 spdk_nvme_qpair_print_command(qpair, &req->cmd); 1025 } 1026 1027 if (print_error || SPDK_DEBUGLOG_FLAG_ENABLED("nvme")) { 1028 spdk_nvme_qpair_print_completion(qpair, rsp); 1029 } 1030 } 1031 1032 tqpair->qpair.queue_depth--; 1033 spdk_trace_record(TRACE_NVME_TCP_COMPLETE, qpair->id, 0, (uintptr_t)tcp_req->pdu, req->cb_arg, 1034 (uint32_t)req->cmd.cid, (uint32_t)cpl.status_raw, tqpair->qpair.queue_depth); 1035 TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link); 1036 nvme_tcp_req_put(tqpair, tcp_req); 1037 nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, &cpl); 1038 } 1039 1040 static void 1041 nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) 1042 { 1043 struct nvme_tcp_req *tcp_req, *tmp; 1044 struct spdk_nvme_cpl cpl = {}; 1045 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 1046 1047 cpl.sqid = qpair->id; 1048 cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; 1049 cpl.status.sct = SPDK_NVME_SCT_GENERIC; 1050 cpl.status.dnr = dnr; 1051 1052 TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { 1053 /* We cannot abort requests with accel operations in progress */ 1054 if (tcp_req->ordering.bits.in_progress_accel) { 1055 continue; 1056 } 1057 1058 nvme_tcp_req_complete(tcp_req, tqpair, &cpl, true); 1059 } 1060 } 1061 1062 static void 1063 nvme_tcp_qpair_send_h2c_term_req_complete(void *cb_arg) 1064 { 1065 struct nvme_tcp_qpair *tqpair = cb_arg; 1066 1067 tqpair->state = NVME_TCP_QPAIR_STATE_EXITING; 1068 } 1069 1070 static void 1071 nvme_tcp_qpair_send_h2c_term_req(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, 1072 enum spdk_nvme_tcp_term_req_fes fes, uint32_t error_offset) 1073 { 1074 struct nvme_tcp_pdu *rsp_pdu; 1075 struct spdk_nvme_tcp_term_req_hdr *h2c_term_req; 1076 uint32_t h2c_term_req_hdr_len = sizeof(*h2c_term_req); 1077 uint8_t copy_len; 1078 1079 rsp_pdu = tqpair->send_pdu; 1080 memset(rsp_pdu, 0, sizeof(*rsp_pdu)); 1081 h2c_term_req = &rsp_pdu->hdr.term_req; 1082 h2c_term_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ; 1083 h2c_term_req->common.hlen = h2c_term_req_hdr_len; 1084 1085 if ((fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || 1086 (fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { 1087 DSET32(&h2c_term_req->fei, error_offset); 1088 } 1089 1090 copy_len = pdu->hdr.common.hlen; 1091 if (copy_len > SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE) { 1092 copy_len = SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE; 1093 } 1094 1095 /* Copy the error info into the buffer */ 1096 memcpy((uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, pdu->hdr.raw, copy_len); 1097 nvme_tcp_pdu_set_data(rsp_pdu, (uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, copy_len); 1098 1099 /* Contain the header len of the wrong received pdu */ 1100 h2c_term_req->common.plen = h2c_term_req->common.hlen + copy_len; 1101 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING); 1102 nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_send_h2c_term_req_complete, tqpair); 1103 } 1104 1105 static bool 1106 nvme_tcp_qpair_recv_state_valid(struct nvme_tcp_qpair *tqpair) 1107 { 1108 switch (tqpair->state) { 1109 case NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_SEND: 1110 case NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL: 1111 case NVME_TCP_QPAIR_STATE_AUTHENTICATING: 1112 case NVME_TCP_QPAIR_STATE_RUNNING: 1113 return true; 1114 default: 1115 return false; 1116 } 1117 } 1118 1119 static void 1120 nvme_tcp_pdu_ch_handle(struct nvme_tcp_qpair *tqpair) 1121 { 1122 struct nvme_tcp_pdu *pdu; 1123 uint32_t error_offset = 0; 1124 enum spdk_nvme_tcp_term_req_fes fes; 1125 uint32_t expected_hlen, hd_len = 0; 1126 bool plen_error = false; 1127 1128 pdu = tqpair->recv_pdu; 1129 1130 SPDK_DEBUGLOG(nvme, "pdu type = %d\n", pdu->hdr.common.pdu_type); 1131 if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_RESP) { 1132 if (tqpair->state != NVME_TCP_QPAIR_STATE_INVALID) { 1133 SPDK_ERRLOG("Already received IC_RESP PDU, and we should reject this pdu=%p\n", pdu); 1134 fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; 1135 goto err; 1136 } 1137 expected_hlen = sizeof(struct spdk_nvme_tcp_ic_resp); 1138 if (pdu->hdr.common.plen != expected_hlen) { 1139 plen_error = true; 1140 } 1141 } else { 1142 if (spdk_unlikely(!nvme_tcp_qpair_recv_state_valid(tqpair))) { 1143 SPDK_ERRLOG("The TCP/IP tqpair connection is not negotiated\n"); 1144 fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; 1145 goto err; 1146 } 1147 1148 switch (pdu->hdr.common.pdu_type) { 1149 case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP: 1150 expected_hlen = sizeof(struct spdk_nvme_tcp_rsp); 1151 if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) { 1152 hd_len = SPDK_NVME_TCP_DIGEST_LEN; 1153 } 1154 1155 if (pdu->hdr.common.plen != (expected_hlen + hd_len)) { 1156 plen_error = true; 1157 } 1158 break; 1159 case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: 1160 expected_hlen = sizeof(struct spdk_nvme_tcp_c2h_data_hdr); 1161 if (pdu->hdr.common.plen < pdu->hdr.common.pdo) { 1162 plen_error = true; 1163 } 1164 break; 1165 case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: 1166 expected_hlen = sizeof(struct spdk_nvme_tcp_term_req_hdr); 1167 if ((pdu->hdr.common.plen <= expected_hlen) || 1168 (pdu->hdr.common.plen > SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE)) { 1169 plen_error = true; 1170 } 1171 break; 1172 case SPDK_NVME_TCP_PDU_TYPE_R2T: 1173 expected_hlen = sizeof(struct spdk_nvme_tcp_r2t_hdr); 1174 if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) { 1175 hd_len = SPDK_NVME_TCP_DIGEST_LEN; 1176 } 1177 1178 if (pdu->hdr.common.plen != (expected_hlen + hd_len)) { 1179 plen_error = true; 1180 } 1181 break; 1182 1183 default: 1184 SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu->hdr.common.pdu_type); 1185 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1186 error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdu_type); 1187 goto err; 1188 } 1189 } 1190 1191 if (pdu->hdr.common.hlen != expected_hlen) { 1192 SPDK_ERRLOG("Expected PDU header length %u, got %u\n", 1193 expected_hlen, pdu->hdr.common.hlen); 1194 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1195 error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, hlen); 1196 goto err; 1197 1198 } else if (plen_error) { 1199 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1200 error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen); 1201 goto err; 1202 } else { 1203 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); 1204 nvme_tcp_pdu_calc_psh_len(tqpair->recv_pdu, tqpair->flags.host_hdgst_enable); 1205 return; 1206 } 1207 err: 1208 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1209 } 1210 1211 static struct nvme_tcp_req * 1212 get_nvme_active_req_by_cid(struct nvme_tcp_qpair *tqpair, uint32_t cid) 1213 { 1214 assert(tqpair != NULL); 1215 if ((cid >= tqpair->num_entries) || (tqpair->tcp_reqs[cid].state == NVME_TCP_REQ_FREE)) { 1216 return NULL; 1217 } 1218 1219 return &tqpair->tcp_reqs[cid]; 1220 } 1221 1222 static void 1223 nvme_tcp_recv_payload_seq_cb(void *cb_arg, int status) 1224 { 1225 struct nvme_tcp_req *treq = cb_arg; 1226 struct nvme_request *req = treq->req; 1227 struct nvme_tcp_qpair *tqpair = treq->tqpair; 1228 1229 assert(treq->ordering.bits.in_progress_accel); 1230 treq->ordering.bits.in_progress_accel = 0; 1231 1232 nvme_tcp_cond_schedule_qpair_polling(tqpair); 1233 1234 req->accel_sequence = NULL; 1235 if (spdk_unlikely(status != 0)) { 1236 pdu_seq_fail(treq->pdu, status); 1237 return; 1238 } 1239 1240 nvme_tcp_req_complete_safe(treq); 1241 } 1242 1243 static void 1244 nvme_tcp_c2h_data_payload_handle(struct nvme_tcp_qpair *tqpair, 1245 struct nvme_tcp_pdu *pdu, uint32_t *reaped) 1246 { 1247 struct nvme_tcp_req *tcp_req; 1248 struct nvme_tcp_poll_group *tgroup; 1249 struct spdk_nvme_tcp_c2h_data_hdr *c2h_data; 1250 uint8_t flags; 1251 1252 tcp_req = pdu->req; 1253 assert(tcp_req != NULL); 1254 1255 SPDK_DEBUGLOG(nvme, "enter\n"); 1256 c2h_data = &pdu->hdr.c2h_data; 1257 tcp_req->datao += pdu->data_len; 1258 flags = c2h_data->common.flags; 1259 1260 if (flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) { 1261 if (tcp_req->datao == tcp_req->req->payload_size) { 1262 tcp_req->rsp.status.p = 0; 1263 } else { 1264 tcp_req->rsp.status.p = 1; 1265 } 1266 1267 tcp_req->rsp.cid = tcp_req->cid; 1268 tcp_req->rsp.sqid = tqpair->qpair.id; 1269 if (flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS) { 1270 tcp_req->ordering.bits.data_recv = 1; 1271 if (tcp_req->req->accel_sequence != NULL) { 1272 tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 1273 nvme_tcp_accel_reverse_sequence(tgroup, tcp_req->req->accel_sequence); 1274 nvme_tcp_accel_finish_sequence(tgroup, tcp_req, 1275 tcp_req->req->accel_sequence, 1276 nvme_tcp_recv_payload_seq_cb, 1277 tcp_req); 1278 return; 1279 } 1280 1281 if (nvme_tcp_req_complete_safe(tcp_req)) { 1282 (*reaped)++; 1283 } 1284 } 1285 } 1286 } 1287 1288 static const char *spdk_nvme_tcp_term_req_fes_str[] = { 1289 "Invalid PDU Header Field", 1290 "PDU Sequence Error", 1291 "Header Digest Error", 1292 "Data Transfer Out of Range", 1293 "Data Transfer Limit Exceeded", 1294 "Unsupported parameter", 1295 }; 1296 1297 static void 1298 nvme_tcp_c2h_term_req_dump(struct spdk_nvme_tcp_term_req_hdr *c2h_term_req) 1299 { 1300 SPDK_ERRLOG("Error info of pdu(%p): %s\n", c2h_term_req, 1301 spdk_nvme_tcp_term_req_fes_str[c2h_term_req->fes]); 1302 if ((c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || 1303 (c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { 1304 SPDK_DEBUGLOG(nvme, "The offset from the start of the PDU header is %u\n", 1305 DGET32(c2h_term_req->fei)); 1306 } 1307 /* we may also need to dump some other info here */ 1308 } 1309 1310 static void 1311 nvme_tcp_c2h_term_req_payload_handle(struct nvme_tcp_qpair *tqpair, 1312 struct nvme_tcp_pdu *pdu) 1313 { 1314 nvme_tcp_c2h_term_req_dump(&pdu->hdr.term_req); 1315 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING); 1316 } 1317 1318 static void 1319 _nvme_tcp_pdu_payload_handle(struct nvme_tcp_qpair *tqpair, uint32_t *reaped) 1320 { 1321 struct nvme_tcp_pdu *pdu; 1322 1323 assert(tqpair != NULL); 1324 pdu = tqpair->recv_pdu; 1325 1326 switch (pdu->hdr.common.pdu_type) { 1327 case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: 1328 nvme_tcp_c2h_data_payload_handle(tqpair, pdu, reaped); 1329 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1330 break; 1331 1332 case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: 1333 nvme_tcp_c2h_term_req_payload_handle(tqpair, pdu); 1334 break; 1335 1336 default: 1337 /* The code should not go to here */ 1338 SPDK_ERRLOG("The code should not go to here\n"); 1339 break; 1340 } 1341 } 1342 1343 static void 1344 nvme_tcp_req_copy_pdu(struct nvme_tcp_req *treq, struct nvme_tcp_pdu *pdu) 1345 { 1346 treq->pdu->hdr = pdu->hdr; 1347 treq->pdu->req = treq; 1348 memcpy(treq->pdu->data_digest, pdu->data_digest, sizeof(pdu->data_digest)); 1349 memcpy(treq->pdu->data_iov, pdu->data_iov, sizeof(pdu->data_iov[0]) * pdu->data_iovcnt); 1350 treq->pdu->data_iovcnt = pdu->data_iovcnt; 1351 treq->pdu->data_len = pdu->data_len; 1352 } 1353 1354 static void 1355 nvme_tcp_accel_seq_recv_compute_crc32_done(void *cb_arg) 1356 { 1357 struct nvme_tcp_req *treq = cb_arg; 1358 struct nvme_tcp_qpair *tqpair = treq->tqpair; 1359 struct nvme_tcp_pdu *pdu = treq->pdu; 1360 bool result; 1361 1362 pdu->data_digest_crc32 ^= SPDK_CRC32C_XOR; 1363 result = MATCH_DIGEST_WORD(pdu->data_digest, pdu->data_digest_crc32); 1364 if (spdk_unlikely(!result)) { 1365 SPDK_ERRLOG("data digest error on tqpair=(%p)\n", tqpair); 1366 treq->rsp.status.sc = SPDK_NVME_SC_COMMAND_TRANSIENT_TRANSPORT_ERROR; 1367 } 1368 } 1369 1370 static bool 1371 nvme_tcp_accel_recv_compute_crc32(struct nvme_tcp_req *treq, struct nvme_tcp_pdu *pdu) 1372 { 1373 struct nvme_tcp_qpair *tqpair = treq->tqpair; 1374 struct nvme_tcp_poll_group *tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 1375 struct nvme_request *req = treq->req; 1376 int rc, dummy = 0; 1377 1378 /* Only support this limited case that the request has only one c2h pdu */ 1379 if (spdk_unlikely(nvme_qpair_get_state(&tqpair->qpair) < NVME_QPAIR_CONNECTED || 1380 tqpair->qpair.poll_group == NULL || pdu->dif_ctx != NULL || 1381 pdu->data_len % SPDK_NVME_TCP_DIGEST_ALIGNMENT != 0 || 1382 pdu->data_len != req->payload_size)) { 1383 return false; 1384 } 1385 1386 if (tgroup->group.group->accel_fn_table.append_crc32c == NULL) { 1387 return false; 1388 } 1389 1390 nvme_tcp_req_copy_pdu(treq, pdu); 1391 rc = nvme_tcp_accel_append_crc32c(tgroup, &req->accel_sequence, 1392 &treq->pdu->data_digest_crc32, 1393 treq->pdu->data_iov, treq->pdu->data_iovcnt, 0, 1394 nvme_tcp_accel_seq_recv_compute_crc32_done, treq); 1395 if (spdk_unlikely(rc != 0)) { 1396 /* If accel is out of resources, fall back to non-accelerated crc32 */ 1397 if (rc == -ENOMEM) { 1398 return false; 1399 } 1400 1401 SPDK_ERRLOG("Failed to append crc32c operation: %d\n", rc); 1402 treq->rsp.status.sc = SPDK_NVME_SC_COMMAND_TRANSIENT_TRANSPORT_ERROR; 1403 } 1404 1405 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1406 nvme_tcp_c2h_data_payload_handle(tqpair, treq->pdu, &dummy); 1407 1408 return true; 1409 } 1410 1411 static void 1412 nvme_tcp_pdu_payload_handle(struct nvme_tcp_qpair *tqpair, 1413 uint32_t *reaped) 1414 { 1415 int rc = 0; 1416 struct nvme_tcp_pdu *pdu = tqpair->recv_pdu; 1417 uint32_t crc32c; 1418 struct nvme_tcp_req *tcp_req = pdu->req; 1419 1420 assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); 1421 SPDK_DEBUGLOG(nvme, "enter\n"); 1422 1423 /* The request can be NULL, e.g. in case of C2HTermReq */ 1424 if (spdk_likely(tcp_req != NULL)) { 1425 tcp_req->expected_datao += pdu->data_len; 1426 } 1427 1428 /* check data digest if need */ 1429 if (pdu->ddgst_enable) { 1430 /* But if the data digest is enabled, tcp_req cannot be NULL */ 1431 assert(tcp_req != NULL); 1432 if (nvme_tcp_accel_recv_compute_crc32(tcp_req, pdu)) { 1433 return; 1434 } 1435 1436 crc32c = nvme_tcp_pdu_calc_data_digest(pdu); 1437 crc32c = crc32c ^ SPDK_CRC32C_XOR; 1438 rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c); 1439 if (rc == 0) { 1440 SPDK_ERRLOG("data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); 1441 tcp_req = pdu->req; 1442 assert(tcp_req != NULL); 1443 tcp_req->rsp.status.sc = SPDK_NVME_SC_COMMAND_TRANSIENT_TRANSPORT_ERROR; 1444 } 1445 } 1446 1447 _nvme_tcp_pdu_payload_handle(tqpair, reaped); 1448 } 1449 1450 static void 1451 nvme_tcp_send_icreq_complete(void *cb_arg) 1452 { 1453 struct nvme_tcp_qpair *tqpair = cb_arg; 1454 1455 SPDK_DEBUGLOG(nvme, "Complete the icreq send for tqpair=%p %u\n", tqpair, tqpair->qpair.id); 1456 1457 tqpair->flags.icreq_send_ack = true; 1458 1459 if (tqpair->state == NVME_TCP_QPAIR_STATE_INITIALIZING) { 1460 SPDK_DEBUGLOG(nvme, "tqpair %p %u, finalize icresp\n", tqpair, tqpair->qpair.id); 1461 tqpair->state = NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_SEND; 1462 } 1463 } 1464 1465 static void 1466 nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair, 1467 struct nvme_tcp_pdu *pdu) 1468 { 1469 struct spdk_nvme_tcp_ic_resp *ic_resp = &pdu->hdr.ic_resp; 1470 uint32_t error_offset = 0; 1471 enum spdk_nvme_tcp_term_req_fes fes; 1472 int recv_buf_size; 1473 1474 /* Only PFV 0 is defined currently */ 1475 if (ic_resp->pfv != 0) { 1476 SPDK_ERRLOG("Expected ICResp PFV %u, got %u\n", 0u, ic_resp->pfv); 1477 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1478 error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, pfv); 1479 goto end; 1480 } 1481 1482 if (ic_resp->maxh2cdata < NVME_TCP_PDU_H2C_MIN_DATA_SIZE) { 1483 SPDK_ERRLOG("Expected ICResp maxh2cdata >=%u, got %u\n", NVME_TCP_PDU_H2C_MIN_DATA_SIZE, 1484 ic_resp->maxh2cdata); 1485 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1486 error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, maxh2cdata); 1487 goto end; 1488 } 1489 tqpair->maxh2cdata = ic_resp->maxh2cdata; 1490 1491 if (ic_resp->cpda > SPDK_NVME_TCP_CPDA_MAX) { 1492 SPDK_ERRLOG("Expected ICResp cpda <=%u, got %u\n", SPDK_NVME_TCP_CPDA_MAX, ic_resp->cpda); 1493 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1494 error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, cpda); 1495 goto end; 1496 } 1497 tqpair->cpda = ic_resp->cpda; 1498 1499 tqpair->flags.host_hdgst_enable = ic_resp->dgst.bits.hdgst_enable ? true : false; 1500 tqpair->flags.host_ddgst_enable = ic_resp->dgst.bits.ddgst_enable ? true : false; 1501 SPDK_DEBUGLOG(nvme, "host_hdgst_enable: %u\n", tqpair->flags.host_hdgst_enable); 1502 SPDK_DEBUGLOG(nvme, "host_ddgst_enable: %u\n", tqpair->flags.host_ddgst_enable); 1503 1504 /* Now that we know whether digests are enabled, properly size the receive buffer to 1505 * handle several incoming 4K read commands according to SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR 1506 * parameter. */ 1507 recv_buf_size = 0x1000 + sizeof(struct spdk_nvme_tcp_c2h_data_hdr); 1508 1509 if (tqpair->flags.host_hdgst_enable) { 1510 recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; 1511 } 1512 1513 if (tqpair->flags.host_ddgst_enable) { 1514 recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; 1515 } 1516 1517 if (spdk_sock_set_recvbuf(tqpair->sock, recv_buf_size * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR) < 0) { 1518 SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n", 1519 tqpair, 1520 recv_buf_size); 1521 /* Not fatal. */ 1522 } 1523 1524 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1525 1526 if (!tqpair->flags.icreq_send_ack) { 1527 tqpair->state = NVME_TCP_QPAIR_STATE_INITIALIZING; 1528 SPDK_DEBUGLOG(nvme, "tqpair %p %u, waiting icreq ack\n", tqpair, tqpair->qpair.id); 1529 return; 1530 } 1531 1532 tqpair->state = NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_SEND; 1533 return; 1534 end: 1535 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1536 } 1537 1538 static void 1539 nvme_tcp_capsule_resp_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, 1540 uint32_t *reaped) 1541 { 1542 struct nvme_tcp_req *tcp_req; 1543 struct nvme_tcp_poll_group *tgroup; 1544 struct spdk_nvme_tcp_rsp *capsule_resp = &pdu->hdr.capsule_resp; 1545 uint32_t cid, error_offset = 0; 1546 enum spdk_nvme_tcp_term_req_fes fes; 1547 1548 SPDK_DEBUGLOG(nvme, "enter\n"); 1549 cid = capsule_resp->rccqe.cid; 1550 tcp_req = get_nvme_active_req_by_cid(tqpair, cid); 1551 1552 if (!tcp_req) { 1553 SPDK_ERRLOG("no tcp_req is found with cid=%u for tqpair=%p\n", cid, tqpair); 1554 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1555 error_offset = offsetof(struct spdk_nvme_tcp_rsp, rccqe); 1556 goto end; 1557 } 1558 1559 assert(tcp_req->req != NULL); 1560 1561 tcp_req->rsp = capsule_resp->rccqe; 1562 tcp_req->ordering.bits.data_recv = 1; 1563 1564 /* Recv the pdu again */ 1565 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1566 1567 if (tcp_req->req->accel_sequence != NULL) { 1568 tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 1569 nvme_tcp_accel_reverse_sequence(tgroup, tcp_req->req->accel_sequence); 1570 nvme_tcp_accel_finish_sequence(tgroup, tcp_req, tcp_req->req->accel_sequence, 1571 nvme_tcp_recv_payload_seq_cb, tcp_req); 1572 return; 1573 } 1574 1575 if (nvme_tcp_req_complete_safe(tcp_req)) { 1576 (*reaped)++; 1577 } 1578 1579 return; 1580 1581 end: 1582 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1583 } 1584 1585 static void 1586 nvme_tcp_c2h_term_req_hdr_handle(struct nvme_tcp_qpair *tqpair, 1587 struct nvme_tcp_pdu *pdu) 1588 { 1589 struct spdk_nvme_tcp_term_req_hdr *c2h_term_req = &pdu->hdr.term_req; 1590 uint32_t error_offset = 0; 1591 enum spdk_nvme_tcp_term_req_fes fes; 1592 1593 if (c2h_term_req->fes > SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER) { 1594 SPDK_ERRLOG("Fatal Error Status(FES) is unknown for c2h_term_req pdu=%p\n", pdu); 1595 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1596 error_offset = offsetof(struct spdk_nvme_tcp_term_req_hdr, fes); 1597 goto end; 1598 } 1599 1600 /* set the data buffer */ 1601 nvme_tcp_pdu_set_data(pdu, (uint8_t *)pdu->hdr.raw + c2h_term_req->common.hlen, 1602 c2h_term_req->common.plen - c2h_term_req->common.hlen); 1603 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); 1604 return; 1605 end: 1606 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1607 } 1608 1609 static void 1610 nvme_tcp_c2h_data_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu) 1611 { 1612 struct nvme_tcp_req *tcp_req; 1613 struct spdk_nvme_tcp_c2h_data_hdr *c2h_data = &pdu->hdr.c2h_data; 1614 uint32_t error_offset = 0; 1615 enum spdk_nvme_tcp_term_req_fes fes; 1616 int flags = c2h_data->common.flags; 1617 int rc; 1618 1619 SPDK_DEBUGLOG(nvme, "enter\n"); 1620 SPDK_DEBUGLOG(nvme, "c2h_data info on tqpair(%p): datao=%u, datal=%u, cccid=%d\n", 1621 tqpair, c2h_data->datao, c2h_data->datal, c2h_data->cccid); 1622 tcp_req = get_nvme_active_req_by_cid(tqpair, c2h_data->cccid); 1623 if (!tcp_req) { 1624 SPDK_ERRLOG("no tcp_req found for c2hdata cid=%d\n", c2h_data->cccid); 1625 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1626 error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, cccid); 1627 goto end; 1628 1629 } 1630 1631 SPDK_DEBUGLOG(nvme, "tcp_req(%p) on tqpair(%p): expected_datao=%u, payload_size=%u\n", 1632 tcp_req, tqpair, tcp_req->expected_datao, tcp_req->req->payload_size); 1633 1634 if (spdk_unlikely((flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS) && 1635 !(flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_LAST_PDU))) { 1636 SPDK_ERRLOG("Invalid flag flags=%d in c2h_data=%p\n", flags, c2h_data); 1637 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1638 error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, common); 1639 goto end; 1640 } 1641 1642 if (c2h_data->datal > tcp_req->req->payload_size) { 1643 SPDK_ERRLOG("Invalid datal for tcp_req(%p), datal(%u) exceeds payload_size(%u)\n", 1644 tcp_req, c2h_data->datal, tcp_req->req->payload_size); 1645 fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; 1646 goto end; 1647 } 1648 1649 if (tcp_req->expected_datao != c2h_data->datao) { 1650 SPDK_ERRLOG("Invalid datao for tcp_req(%p), received datal(%u) != expected datao(%u) in tcp_req\n", 1651 tcp_req, c2h_data->datao, tcp_req->expected_datao); 1652 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1653 error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datao); 1654 goto end; 1655 } 1656 1657 if ((c2h_data->datao + c2h_data->datal) > tcp_req->req->payload_size) { 1658 SPDK_ERRLOG("Invalid data range for tcp_req(%p), received (datao(%u) + datal(%u)) > datao(%u) in tcp_req\n", 1659 tcp_req, c2h_data->datao, c2h_data->datal, tcp_req->req->payload_size); 1660 fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; 1661 error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datal); 1662 goto end; 1663 1664 } 1665 1666 if (nvme_payload_type(&tcp_req->req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { 1667 rc = nvme_tcp_build_contig_request(tqpair, tcp_req); 1668 } else { 1669 assert(nvme_payload_type(&tcp_req->req->payload) == NVME_PAYLOAD_TYPE_SGL); 1670 rc = nvme_tcp_build_sgl_request(tqpair, tcp_req); 1671 } 1672 1673 if (rc) { 1674 /* Not the right error message but at least it handles the failure. */ 1675 fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED; 1676 goto end; 1677 } 1678 1679 nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt, 1680 c2h_data->datao, c2h_data->datal); 1681 pdu->req = tcp_req; 1682 1683 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); 1684 return; 1685 1686 end: 1687 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1688 } 1689 1690 static void 1691 nvme_tcp_qpair_h2c_data_send_complete(void *cb_arg) 1692 { 1693 struct nvme_tcp_req *tcp_req = cb_arg; 1694 1695 assert(tcp_req != NULL); 1696 1697 tcp_req->ordering.bits.send_ack = 1; 1698 if (tcp_req->r2tl_remain) { 1699 nvme_tcp_send_h2c_data(tcp_req); 1700 } else { 1701 assert(tcp_req->active_r2ts > 0); 1702 tcp_req->active_r2ts--; 1703 tcp_req->state = NVME_TCP_REQ_ACTIVE; 1704 1705 if (tcp_req->ordering.bits.r2t_waiting_h2c_complete) { 1706 tcp_req->ordering.bits.r2t_waiting_h2c_complete = 0; 1707 SPDK_DEBUGLOG(nvme, "tcp_req %p: continue r2t\n", tcp_req); 1708 assert(tcp_req->active_r2ts > 0); 1709 tcp_req->ttag = tcp_req->ttag_r2t_next; 1710 tcp_req->r2tl_remain = tcp_req->r2tl_remain_next; 1711 tcp_req->state = NVME_TCP_REQ_ACTIVE_R2T; 1712 nvme_tcp_send_h2c_data(tcp_req); 1713 return; 1714 } 1715 1716 if (tcp_req->ordering.bits.domain_in_use) { 1717 spdk_memory_domain_invalidate_data(tcp_req->req->payload.opts->memory_domain, 1718 tcp_req->req->payload.opts->memory_domain_ctx, tcp_req->iov, tcp_req->iovcnt); 1719 } 1720 1721 /* Need also call this function to free the resource */ 1722 nvme_tcp_req_complete_safe(tcp_req); 1723 } 1724 } 1725 1726 static void 1727 nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req) 1728 { 1729 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(tcp_req->req->qpair); 1730 struct nvme_tcp_pdu *rsp_pdu; 1731 struct spdk_nvme_tcp_h2c_data_hdr *h2c_data; 1732 uint32_t plen, pdo, alignment; 1733 1734 /* Reinit the send_ack and h2c_send_waiting_ack bits */ 1735 tcp_req->ordering.bits.send_ack = 0; 1736 tcp_req->ordering.bits.h2c_send_waiting_ack = 0; 1737 rsp_pdu = tcp_req->pdu; 1738 memset(rsp_pdu, 0, sizeof(*rsp_pdu)); 1739 rsp_pdu->req = tcp_req; 1740 h2c_data = &rsp_pdu->hdr.h2c_data; 1741 1742 h2c_data->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_DATA; 1743 plen = h2c_data->common.hlen = sizeof(*h2c_data); 1744 h2c_data->cccid = tcp_req->cid; 1745 h2c_data->ttag = tcp_req->ttag; 1746 h2c_data->datao = tcp_req->datao; 1747 1748 h2c_data->datal = spdk_min(tcp_req->r2tl_remain, tqpair->maxh2cdata); 1749 nvme_tcp_pdu_set_data_buf(rsp_pdu, tcp_req->iov, tcp_req->iovcnt, 1750 h2c_data->datao, h2c_data->datal); 1751 tcp_req->r2tl_remain -= h2c_data->datal; 1752 1753 if (tqpair->flags.host_hdgst_enable) { 1754 h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; 1755 plen += SPDK_NVME_TCP_DIGEST_LEN; 1756 } 1757 1758 rsp_pdu->padding_len = 0; 1759 pdo = plen; 1760 if (tqpair->cpda) { 1761 alignment = (tqpair->cpda + 1) << 2; 1762 if (alignment > plen) { 1763 rsp_pdu->padding_len = alignment - plen; 1764 pdo = plen = alignment; 1765 } 1766 } 1767 1768 h2c_data->common.pdo = pdo; 1769 plen += h2c_data->datal; 1770 if (tqpair->flags.host_ddgst_enable) { 1771 h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; 1772 plen += SPDK_NVME_TCP_DIGEST_LEN; 1773 } 1774 1775 h2c_data->common.plen = plen; 1776 tcp_req->datao += h2c_data->datal; 1777 if (!tcp_req->r2tl_remain) { 1778 h2c_data->common.flags |= SPDK_NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; 1779 } 1780 1781 SPDK_DEBUGLOG(nvme, "h2c_data info: datao=%u, datal=%u, pdu_len=%u for tqpair=%p\n", 1782 h2c_data->datao, h2c_data->datal, h2c_data->common.plen, tqpair); 1783 1784 nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_h2c_data_send_complete, tcp_req); 1785 } 1786 1787 static void 1788 nvme_tcp_r2t_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu) 1789 { 1790 struct nvme_tcp_req *tcp_req; 1791 struct spdk_nvme_tcp_r2t_hdr *r2t = &pdu->hdr.r2t; 1792 uint32_t cid, error_offset = 0; 1793 enum spdk_nvme_tcp_term_req_fes fes; 1794 1795 SPDK_DEBUGLOG(nvme, "enter\n"); 1796 cid = r2t->cccid; 1797 tcp_req = get_nvme_active_req_by_cid(tqpair, cid); 1798 if (!tcp_req) { 1799 SPDK_ERRLOG("Cannot find tcp_req for tqpair=%p\n", tqpair); 1800 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1801 error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, cccid); 1802 goto end; 1803 } 1804 1805 SPDK_DEBUGLOG(nvme, "r2t info: r2to=%u, r2tl=%u for tqpair=%p\n", r2t->r2to, r2t->r2tl, 1806 tqpair); 1807 1808 if (tcp_req->state == NVME_TCP_REQ_ACTIVE) { 1809 assert(tcp_req->active_r2ts == 0); 1810 tcp_req->state = NVME_TCP_REQ_ACTIVE_R2T; 1811 } 1812 1813 if (tcp_req->datao != r2t->r2to) { 1814 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1815 error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2to); 1816 goto end; 1817 1818 } 1819 1820 if ((r2t->r2tl + r2t->r2to) > tcp_req->req->payload_size) { 1821 SPDK_ERRLOG("Invalid R2T info for tcp_req=%p: (r2to(%u) + r2tl(%u)) exceeds payload_size(%u)\n", 1822 tcp_req, r2t->r2to, r2t->r2tl, tqpair->maxh2cdata); 1823 fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; 1824 error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2tl); 1825 goto end; 1826 } 1827 1828 tcp_req->active_r2ts++; 1829 if (spdk_unlikely(tcp_req->active_r2ts > tqpair->maxr2t)) { 1830 if (tcp_req->state == NVME_TCP_REQ_ACTIVE_R2T && !tcp_req->ordering.bits.send_ack) { 1831 /* We receive a subsequent R2T while we are waiting for H2C transfer to complete */ 1832 SPDK_DEBUGLOG(nvme, "received a subsequent R2T\n"); 1833 assert(tcp_req->active_r2ts == tqpair->maxr2t + 1); 1834 tcp_req->ttag_r2t_next = r2t->ttag; 1835 tcp_req->r2tl_remain_next = r2t->r2tl; 1836 tcp_req->ordering.bits.r2t_waiting_h2c_complete = 1; 1837 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1838 return; 1839 } else { 1840 fes = SPDK_NVME_TCP_TERM_REQ_FES_R2T_LIMIT_EXCEEDED; 1841 SPDK_ERRLOG("Invalid R2T: Maximum number of R2T exceeded! Max: %u for tqpair=%p\n", tqpair->maxr2t, 1842 tqpair); 1843 goto end; 1844 } 1845 } 1846 1847 tcp_req->ttag = r2t->ttag; 1848 tcp_req->r2tl_remain = r2t->r2tl; 1849 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1850 1851 if (spdk_likely(tcp_req->ordering.bits.send_ack)) { 1852 nvme_tcp_send_h2c_data(tcp_req); 1853 } else { 1854 tcp_req->ordering.bits.h2c_send_waiting_ack = 1; 1855 } 1856 1857 return; 1858 1859 end: 1860 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1861 1862 } 1863 1864 static void 1865 nvme_tcp_pdu_psh_handle(struct nvme_tcp_qpair *tqpair, uint32_t *reaped) 1866 { 1867 struct nvme_tcp_pdu *pdu; 1868 int rc; 1869 uint32_t crc32c, error_offset = 0; 1870 enum spdk_nvme_tcp_term_req_fes fes; 1871 1872 assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); 1873 pdu = tqpair->recv_pdu; 1874 1875 SPDK_DEBUGLOG(nvme, "enter: pdu type =%u\n", pdu->hdr.common.pdu_type); 1876 /* check header digest if needed */ 1877 if (pdu->has_hdgst) { 1878 crc32c = nvme_tcp_pdu_calc_header_digest(pdu); 1879 rc = MATCH_DIGEST_WORD((uint8_t *)pdu->hdr.raw + pdu->hdr.common.hlen, crc32c); 1880 if (rc == 0) { 1881 SPDK_ERRLOG("header digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); 1882 fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; 1883 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1884 return; 1885 1886 } 1887 } 1888 1889 switch (pdu->hdr.common.pdu_type) { 1890 case SPDK_NVME_TCP_PDU_TYPE_IC_RESP: 1891 nvme_tcp_icresp_handle(tqpair, pdu); 1892 break; 1893 case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP: 1894 nvme_tcp_capsule_resp_hdr_handle(tqpair, pdu, reaped); 1895 break; 1896 case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: 1897 nvme_tcp_c2h_data_hdr_handle(tqpair, pdu); 1898 break; 1899 1900 case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: 1901 nvme_tcp_c2h_term_req_hdr_handle(tqpair, pdu); 1902 break; 1903 case SPDK_NVME_TCP_PDU_TYPE_R2T: 1904 nvme_tcp_r2t_hdr_handle(tqpair, pdu); 1905 break; 1906 1907 default: 1908 SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu->hdr.common.pdu_type); 1909 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1910 error_offset = 1; 1911 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1912 break; 1913 } 1914 1915 } 1916 1917 static int 1918 nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped, uint32_t max_completions) 1919 { 1920 int rc = 0; 1921 struct nvme_tcp_pdu *pdu; 1922 uint32_t data_len; 1923 enum nvme_tcp_pdu_recv_state prev_state; 1924 1925 *reaped = tqpair->async_complete; 1926 tqpair->async_complete = 0; 1927 1928 /* The loop here is to allow for several back-to-back state changes. */ 1929 do { 1930 if (*reaped >= max_completions) { 1931 break; 1932 } 1933 1934 prev_state = tqpair->recv_state; 1935 pdu = tqpair->recv_pdu; 1936 switch (tqpair->recv_state) { 1937 /* If in a new state */ 1938 case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: 1939 memset(pdu, 0, sizeof(struct nvme_tcp_pdu)); 1940 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH); 1941 break; 1942 /* Wait for the pdu common header */ 1943 case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: 1944 assert(pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)); 1945 rc = nvme_tcp_read_data(tqpair->sock, 1946 sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes, 1947 (uint8_t *)&pdu->hdr.common + pdu->ch_valid_bytes); 1948 if (rc < 0) { 1949 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING); 1950 break; 1951 } 1952 pdu->ch_valid_bytes += rc; 1953 if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) { 1954 return NVME_TCP_PDU_IN_PROGRESS; 1955 } 1956 1957 /* The command header of this PDU has now been read from the socket. */ 1958 nvme_tcp_pdu_ch_handle(tqpair); 1959 break; 1960 /* Wait for the pdu specific header */ 1961 case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: 1962 assert(pdu->psh_valid_bytes < pdu->psh_len); 1963 rc = nvme_tcp_read_data(tqpair->sock, 1964 pdu->psh_len - pdu->psh_valid_bytes, 1965 (uint8_t *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes); 1966 if (rc < 0) { 1967 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING); 1968 break; 1969 } 1970 1971 pdu->psh_valid_bytes += rc; 1972 if (pdu->psh_valid_bytes < pdu->psh_len) { 1973 return NVME_TCP_PDU_IN_PROGRESS; 1974 } 1975 1976 /* All header(ch, psh, head digits) of this PDU has now been read from the socket. */ 1977 nvme_tcp_pdu_psh_handle(tqpair, reaped); 1978 break; 1979 case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: 1980 /* check whether the data is valid, if not we just return */ 1981 if (!pdu->data_len) { 1982 return NVME_TCP_PDU_IN_PROGRESS; 1983 } 1984 1985 data_len = pdu->data_len; 1986 /* data digest */ 1987 if (spdk_unlikely((pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_C2H_DATA) && 1988 tqpair->flags.host_ddgst_enable)) { 1989 data_len += SPDK_NVME_TCP_DIGEST_LEN; 1990 pdu->ddgst_enable = true; 1991 } 1992 1993 rc = nvme_tcp_read_payload_data(tqpair->sock, pdu); 1994 if (rc < 0) { 1995 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING); 1996 break; 1997 } 1998 1999 pdu->rw_offset += rc; 2000 if (pdu->rw_offset < data_len) { 2001 return NVME_TCP_PDU_IN_PROGRESS; 2002 } 2003 2004 assert(pdu->rw_offset == data_len); 2005 /* All of this PDU has now been read from the socket. */ 2006 nvme_tcp_pdu_payload_handle(tqpair, reaped); 2007 break; 2008 case NVME_TCP_PDU_RECV_STATE_QUIESCING: 2009 if (TAILQ_EMPTY(&tqpair->outstanding_reqs)) { 2010 if (nvme_qpair_get_state(&tqpair->qpair) == NVME_QPAIR_DISCONNECTING) { 2011 nvme_transport_ctrlr_disconnect_qpair_done(&tqpair->qpair); 2012 } 2013 2014 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); 2015 } 2016 break; 2017 case NVME_TCP_PDU_RECV_STATE_ERROR: 2018 memset(pdu, 0, sizeof(struct nvme_tcp_pdu)); 2019 return NVME_TCP_PDU_FATAL; 2020 default: 2021 assert(0); 2022 break; 2023 } 2024 } while (prev_state != tqpair->recv_state); 2025 2026 return rc > 0 ? 0 : rc; 2027 } 2028 2029 static void 2030 nvme_tcp_qpair_check_timeout(struct spdk_nvme_qpair *qpair) 2031 { 2032 uint64_t t02; 2033 struct nvme_tcp_req *tcp_req, *tmp; 2034 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2035 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 2036 struct spdk_nvme_ctrlr_process *active_proc; 2037 2038 /* Don't check timeouts during controller initialization. */ 2039 if (ctrlr->state != NVME_CTRLR_STATE_READY) { 2040 return; 2041 } 2042 2043 if (nvme_qpair_is_admin_queue(qpair)) { 2044 active_proc = nvme_ctrlr_get_current_process(ctrlr); 2045 } else { 2046 active_proc = qpair->active_proc; 2047 } 2048 2049 /* Only check timeouts if the current process has a timeout callback. */ 2050 if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { 2051 return; 2052 } 2053 2054 t02 = spdk_get_ticks(); 2055 TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { 2056 if (ctrlr->is_failed) { 2057 /* The controller state may be changed to failed in one of the nvme_request_check_timeout callbacks. */ 2058 return; 2059 } 2060 assert(tcp_req->req != NULL); 2061 2062 if (nvme_request_check_timeout(tcp_req->req, tcp_req->cid, active_proc, t02)) { 2063 /* 2064 * The requests are in order, so as soon as one has not timed out, 2065 * stop iterating. 2066 */ 2067 break; 2068 } 2069 } 2070 } 2071 2072 static int nvme_tcp_ctrlr_connect_qpair_poll(struct spdk_nvme_ctrlr *ctrlr, 2073 struct spdk_nvme_qpair *qpair); 2074 2075 static int 2076 nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) 2077 { 2078 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2079 uint32_t reaped; 2080 int rc; 2081 2082 if (qpair->poll_group == NULL) { 2083 rc = spdk_sock_flush(tqpair->sock); 2084 if (rc < 0 && errno != EAGAIN) { 2085 SPDK_ERRLOG("Failed to flush tqpair=%p (%d): %s\n", tqpair, 2086 errno, spdk_strerror(errno)); 2087 if (spdk_unlikely(tqpair->qpair.ctrlr->timeout_enabled)) { 2088 nvme_tcp_qpair_check_timeout(qpair); 2089 } 2090 2091 if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING) { 2092 if (TAILQ_EMPTY(&tqpair->outstanding_reqs)) { 2093 nvme_transport_ctrlr_disconnect_qpair_done(qpair); 2094 } 2095 2096 /* Don't return errors until the qpair gets disconnected */ 2097 return 0; 2098 } 2099 2100 goto fail; 2101 } 2102 } 2103 2104 if (max_completions == 0) { 2105 max_completions = spdk_max(tqpair->num_entries, 1); 2106 } else { 2107 max_completions = spdk_min(max_completions, tqpair->num_entries); 2108 } 2109 2110 reaped = 0; 2111 rc = nvme_tcp_read_pdu(tqpair, &reaped, max_completions); 2112 if (rc < 0) { 2113 SPDK_DEBUGLOG(nvme, "Error polling CQ! (%d): %s\n", 2114 errno, spdk_strerror(errno)); 2115 goto fail; 2116 } 2117 2118 if (spdk_unlikely(tqpair->qpair.ctrlr->timeout_enabled)) { 2119 nvme_tcp_qpair_check_timeout(qpair); 2120 } 2121 2122 if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING)) { 2123 rc = nvme_tcp_ctrlr_connect_qpair_poll(qpair->ctrlr, qpair); 2124 if (rc != 0 && rc != -EAGAIN) { 2125 SPDK_ERRLOG("Failed to connect tqpair=%p\n", tqpair); 2126 goto fail; 2127 } else if (rc == 0) { 2128 /* Once the connection is completed, we can submit queued requests */ 2129 nvme_qpair_resubmit_requests(qpair, tqpair->num_entries); 2130 } 2131 } 2132 2133 return reaped; 2134 fail: 2135 2136 /* 2137 * Since admin queues take the ctrlr_lock before entering this function, 2138 * we can call nvme_transport_ctrlr_disconnect_qpair. For other qpairs we need 2139 * to call the generic function which will take the lock for us. 2140 */ 2141 qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; 2142 2143 if (nvme_qpair_is_admin_queue(qpair)) { 2144 enum nvme_qpair_state state_prev = nvme_qpair_get_state(qpair); 2145 2146 nvme_transport_ctrlr_disconnect_qpair(qpair->ctrlr, qpair); 2147 2148 if (state_prev == NVME_QPAIR_CONNECTING && qpair->poll_status != NULL) { 2149 /* Needed to free the poll_status */ 2150 nvme_tcp_ctrlr_connect_qpair_poll(qpair->ctrlr, qpair); 2151 } 2152 } else { 2153 nvme_ctrlr_disconnect_qpair(qpair); 2154 } 2155 return -ENXIO; 2156 } 2157 2158 static void 2159 nvme_tcp_qpair_sock_cb(void *ctx, struct spdk_sock_group *group, struct spdk_sock *sock) 2160 { 2161 struct spdk_nvme_qpair *qpair = ctx; 2162 struct nvme_tcp_poll_group *pgroup = nvme_tcp_poll_group(qpair->poll_group); 2163 int32_t num_completions; 2164 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2165 2166 if (tqpair->needs_poll) { 2167 TAILQ_REMOVE(&pgroup->needs_poll, tqpair, link); 2168 tqpair->needs_poll = false; 2169 } 2170 2171 num_completions = spdk_nvme_qpair_process_completions(qpair, pgroup->completions_per_qpair); 2172 2173 if (pgroup->num_completions >= 0 && num_completions >= 0) { 2174 pgroup->num_completions += num_completions; 2175 pgroup->stats.nvme_completions += num_completions; 2176 } else { 2177 pgroup->num_completions = -ENXIO; 2178 } 2179 } 2180 2181 static int 2182 nvme_tcp_qpair_icreq_send(struct nvme_tcp_qpair *tqpair) 2183 { 2184 struct spdk_nvme_tcp_ic_req *ic_req; 2185 struct nvme_tcp_pdu *pdu; 2186 uint32_t timeout_in_sec; 2187 2188 pdu = tqpair->send_pdu; 2189 memset(tqpair->send_pdu, 0, sizeof(*tqpair->send_pdu)); 2190 ic_req = &pdu->hdr.ic_req; 2191 2192 ic_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_REQ; 2193 ic_req->common.hlen = ic_req->common.plen = sizeof(*ic_req); 2194 ic_req->pfv = 0; 2195 ic_req->maxr2t = NVME_TCP_MAX_R2T_DEFAULT - 1; 2196 ic_req->hpda = NVME_TCP_HPDA_DEFAULT; 2197 2198 ic_req->dgst.bits.hdgst_enable = tqpair->qpair.ctrlr->opts.header_digest; 2199 ic_req->dgst.bits.ddgst_enable = tqpair->qpair.ctrlr->opts.data_digest; 2200 2201 nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_send_icreq_complete, tqpair); 2202 2203 timeout_in_sec = tqpair->qpair.async ? ICREQ_TIMEOUT_ASYNC : ICREQ_TIMEOUT_SYNC; 2204 tqpair->icreq_timeout_tsc = spdk_get_ticks() + (timeout_in_sec * spdk_get_ticks_hz()); 2205 return 0; 2206 } 2207 2208 static int 2209 nvme_tcp_qpair_connect_sock(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 2210 { 2211 struct sockaddr_storage dst_addr; 2212 struct sockaddr_storage src_addr; 2213 int rc; 2214 struct nvme_tcp_qpair *tqpair; 2215 int family; 2216 long int port, src_port = 0; 2217 char *sock_impl_name; 2218 struct spdk_sock_impl_opts impl_opts = {}; 2219 size_t impl_opts_size = sizeof(impl_opts); 2220 struct spdk_sock_opts opts; 2221 struct nvme_tcp_ctrlr *tcp_ctrlr; 2222 2223 tqpair = nvme_tcp_qpair(qpair); 2224 2225 switch (ctrlr->trid.adrfam) { 2226 case SPDK_NVMF_ADRFAM_IPV4: 2227 family = AF_INET; 2228 break; 2229 case SPDK_NVMF_ADRFAM_IPV6: 2230 family = AF_INET6; 2231 break; 2232 default: 2233 SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam); 2234 rc = -1; 2235 return rc; 2236 } 2237 2238 SPDK_DEBUGLOG(nvme, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family); 2239 2240 memset(&dst_addr, 0, sizeof(dst_addr)); 2241 2242 SPDK_DEBUGLOG(nvme, "trsvcid is %s\n", ctrlr->trid.trsvcid); 2243 rc = nvme_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid, &port); 2244 if (rc != 0) { 2245 SPDK_ERRLOG("dst_addr nvme_parse_addr() failed\n"); 2246 return rc; 2247 } 2248 2249 if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) { 2250 memset(&src_addr, 0, sizeof(src_addr)); 2251 rc = nvme_parse_addr(&src_addr, family, 2252 ctrlr->opts.src_addr[0] ? ctrlr->opts.src_addr : NULL, 2253 ctrlr->opts.src_svcid[0] ? ctrlr->opts.src_svcid : NULL, 2254 &src_port); 2255 if (rc != 0) { 2256 SPDK_ERRLOG("src_addr nvme_parse_addr() failed\n"); 2257 return rc; 2258 } 2259 } 2260 2261 tcp_ctrlr = SPDK_CONTAINEROF(ctrlr, struct nvme_tcp_ctrlr, ctrlr); 2262 sock_impl_name = tcp_ctrlr->psk[0] ? "ssl" : NULL; 2263 SPDK_DEBUGLOG(nvme, "sock_impl_name is %s\n", sock_impl_name); 2264 2265 if (sock_impl_name) { 2266 spdk_sock_impl_get_opts(sock_impl_name, &impl_opts, &impl_opts_size); 2267 impl_opts.tls_version = SPDK_TLS_VERSION_1_3; 2268 impl_opts.psk_identity = tcp_ctrlr->psk_identity; 2269 impl_opts.psk_key = tcp_ctrlr->psk; 2270 impl_opts.psk_key_size = tcp_ctrlr->psk_size; 2271 impl_opts.tls_cipher_suites = tcp_ctrlr->tls_cipher_suite; 2272 } 2273 opts.opts_size = sizeof(opts); 2274 spdk_sock_get_default_opts(&opts); 2275 opts.priority = ctrlr->trid.priority; 2276 opts.zcopy = !nvme_qpair_is_admin_queue(qpair); 2277 opts.src_addr = ctrlr->opts.src_addr[0] ? ctrlr->opts.src_addr : NULL; 2278 opts.src_port = src_port; 2279 if (ctrlr->opts.transport_ack_timeout) { 2280 opts.ack_timeout = 1ULL << ctrlr->opts.transport_ack_timeout; 2281 } 2282 if (sock_impl_name) { 2283 opts.impl_opts = &impl_opts; 2284 opts.impl_opts_size = sizeof(impl_opts); 2285 } 2286 tqpair->sock = spdk_sock_connect_ext(ctrlr->trid.traddr, port, sock_impl_name, &opts); 2287 if (!tqpair->sock) { 2288 SPDK_ERRLOG("sock connection error of tqpair=%p with addr=%s, port=%ld\n", 2289 tqpair, ctrlr->trid.traddr, port); 2290 rc = -1; 2291 return rc; 2292 } 2293 2294 return 0; 2295 } 2296 2297 static int 2298 nvme_tcp_ctrlr_connect_qpair_poll(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 2299 { 2300 struct nvme_tcp_qpair *tqpair; 2301 int rc; 2302 2303 tqpair = nvme_tcp_qpair(qpair); 2304 2305 /* Prevent this function from being called recursively, as it could lead to issues with 2306 * nvme_fabric_qpair_connect_poll() if the connect response is received in the recursive 2307 * call. 2308 */ 2309 if (tqpair->flags.in_connect_poll) { 2310 return -EAGAIN; 2311 } 2312 2313 tqpair->flags.in_connect_poll = 1; 2314 2315 switch (tqpair->state) { 2316 case NVME_TCP_QPAIR_STATE_INVALID: 2317 case NVME_TCP_QPAIR_STATE_INITIALIZING: 2318 if (spdk_get_ticks() > tqpair->icreq_timeout_tsc) { 2319 SPDK_ERRLOG("Failed to construct the tqpair=%p via correct icresp\n", tqpair); 2320 rc = -ETIMEDOUT; 2321 break; 2322 } 2323 rc = -EAGAIN; 2324 break; 2325 case NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_SEND: 2326 rc = nvme_fabric_qpair_connect_async(&tqpair->qpair, tqpair->num_entries + 1); 2327 if (rc < 0) { 2328 SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); 2329 break; 2330 } 2331 tqpair->state = NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL; 2332 rc = -EAGAIN; 2333 break; 2334 case NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL: 2335 rc = nvme_fabric_qpair_connect_poll(&tqpair->qpair); 2336 if (rc == 0) { 2337 if (nvme_fabric_qpair_auth_required(qpair)) { 2338 rc = nvme_fabric_qpair_authenticate_async(qpair); 2339 if (rc == 0) { 2340 tqpair->state = NVME_TCP_QPAIR_STATE_AUTHENTICATING; 2341 rc = -EAGAIN; 2342 } 2343 } else { 2344 tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING; 2345 nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED); 2346 } 2347 } else if (rc != -EAGAIN) { 2348 SPDK_ERRLOG("Failed to poll NVMe-oF Fabric CONNECT command\n"); 2349 } 2350 break; 2351 case NVME_TCP_QPAIR_STATE_AUTHENTICATING: 2352 rc = nvme_fabric_qpair_authenticate_poll(qpair); 2353 if (rc == 0) { 2354 tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING; 2355 nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED); 2356 } 2357 break; 2358 case NVME_TCP_QPAIR_STATE_RUNNING: 2359 rc = 0; 2360 break; 2361 default: 2362 assert(false); 2363 rc = -EINVAL; 2364 break; 2365 } 2366 2367 tqpair->flags.in_connect_poll = 0; 2368 return rc; 2369 } 2370 2371 static int 2372 nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 2373 { 2374 int rc = 0; 2375 struct nvme_tcp_qpair *tqpair; 2376 struct nvme_tcp_poll_group *tgroup; 2377 2378 tqpair = nvme_tcp_qpair(qpair); 2379 2380 if (!tqpair->sock) { 2381 rc = nvme_tcp_qpair_connect_sock(ctrlr, qpair); 2382 if (rc < 0) { 2383 return rc; 2384 } 2385 } 2386 2387 if (qpair->poll_group) { 2388 rc = nvme_poll_group_connect_qpair(qpair); 2389 if (rc) { 2390 SPDK_ERRLOG("Unable to activate the tcp qpair.\n"); 2391 return rc; 2392 } 2393 tgroup = nvme_tcp_poll_group(qpair->poll_group); 2394 tqpair->stats = &tgroup->stats; 2395 tqpair->shared_stats = true; 2396 } else { 2397 /* When resetting a controller, we disconnect adminq and then reconnect. The stats 2398 * is not freed when disconnecting. So when reconnecting, don't allocate memory 2399 * again. 2400 */ 2401 if (tqpair->stats == NULL) { 2402 tqpair->stats = calloc(1, sizeof(*tqpair->stats)); 2403 if (!tqpair->stats) { 2404 SPDK_ERRLOG("tcp stats memory allocation failed\n"); 2405 return -ENOMEM; 2406 } 2407 } 2408 } 2409 2410 tqpair->maxr2t = NVME_TCP_MAX_R2T_DEFAULT; 2411 /* Explicitly set the state and recv_state of tqpair */ 2412 tqpair->state = NVME_TCP_QPAIR_STATE_INVALID; 2413 if (tqpair->recv_state != NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY) { 2414 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 2415 } 2416 rc = nvme_tcp_qpair_icreq_send(tqpair); 2417 if (rc != 0) { 2418 SPDK_ERRLOG("Unable to connect the tqpair\n"); 2419 return rc; 2420 } 2421 2422 return rc; 2423 } 2424 2425 static struct spdk_nvme_qpair * 2426 nvme_tcp_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr, 2427 uint16_t qid, uint32_t qsize, 2428 enum spdk_nvme_qprio qprio, 2429 uint32_t num_requests, bool async) 2430 { 2431 struct nvme_tcp_qpair *tqpair; 2432 struct spdk_nvme_qpair *qpair; 2433 int rc; 2434 2435 if (qsize < SPDK_NVME_QUEUE_MIN_ENTRIES) { 2436 SPDK_ERRLOG("Failed to create qpair with size %u. Minimum queue size is %d.\n", 2437 qsize, SPDK_NVME_QUEUE_MIN_ENTRIES); 2438 return NULL; 2439 } 2440 2441 tqpair = calloc(1, sizeof(struct nvme_tcp_qpair)); 2442 if (!tqpair) { 2443 SPDK_ERRLOG("failed to get create tqpair\n"); 2444 return NULL; 2445 } 2446 2447 /* Set num_entries one less than queue size. According to NVMe 2448 * and NVMe-oF specs we can not submit queue size requests, 2449 * one slot shall always remain empty. 2450 */ 2451 tqpair->num_entries = qsize - 1; 2452 qpair = &tqpair->qpair; 2453 rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests, async); 2454 if (rc != 0) { 2455 free(tqpair); 2456 return NULL; 2457 } 2458 2459 rc = nvme_tcp_alloc_reqs(tqpair); 2460 if (rc) { 2461 nvme_tcp_ctrlr_delete_io_qpair(ctrlr, qpair); 2462 return NULL; 2463 } 2464 2465 /* spdk_nvme_qpair_get_optimal_poll_group needs socket information. 2466 * So create the socket first when creating a qpair. */ 2467 rc = nvme_tcp_qpair_connect_sock(ctrlr, qpair); 2468 if (rc) { 2469 nvme_tcp_ctrlr_delete_io_qpair(ctrlr, qpair); 2470 return NULL; 2471 } 2472 2473 return qpair; 2474 } 2475 2476 static struct spdk_nvme_qpair * 2477 nvme_tcp_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, 2478 const struct spdk_nvme_io_qpair_opts *opts) 2479 { 2480 return nvme_tcp_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio, 2481 opts->io_queue_requests, opts->async_mode); 2482 } 2483 2484 static int 2485 nvme_tcp_generate_tls_credentials(struct nvme_tcp_ctrlr *tctrlr) 2486 { 2487 struct spdk_nvme_ctrlr *ctrlr = &tctrlr->ctrlr; 2488 int rc; 2489 uint8_t psk_retained[SPDK_TLS_PSK_MAX_LEN] = {}; 2490 uint8_t psk_configured[SPDK_TLS_PSK_MAX_LEN] = {}; 2491 uint8_t pskbuf[SPDK_TLS_PSK_MAX_LEN + 1] = {}; 2492 uint8_t tls_cipher_suite; 2493 uint8_t psk_retained_hash; 2494 uint64_t psk_configured_size; 2495 2496 rc = spdk_key_get_key(ctrlr->opts.tls_psk, pskbuf, SPDK_TLS_PSK_MAX_LEN); 2497 if (rc < 0) { 2498 SPDK_ERRLOG("Failed to obtain key '%s': %s\n", 2499 spdk_key_get_name(ctrlr->opts.tls_psk), spdk_strerror(-rc)); 2500 goto finish; 2501 } 2502 2503 rc = nvme_tcp_parse_interchange_psk(pskbuf, psk_configured, sizeof(psk_configured), 2504 &psk_configured_size, &psk_retained_hash); 2505 if (rc < 0) { 2506 SPDK_ERRLOG("Failed to parse PSK interchange!\n"); 2507 goto finish; 2508 } 2509 2510 /* The Base64 string encodes the configured PSK (32 or 48 bytes binary). 2511 * This check also ensures that psk_configured_size is smaller than 2512 * psk_retained buffer size. */ 2513 if (psk_configured_size == SHA256_DIGEST_LENGTH) { 2514 tls_cipher_suite = NVME_TCP_CIPHER_AES_128_GCM_SHA256; 2515 tctrlr->tls_cipher_suite = "TLS_AES_128_GCM_SHA256"; 2516 } else if (psk_configured_size == SHA384_DIGEST_LENGTH) { 2517 tls_cipher_suite = NVME_TCP_CIPHER_AES_256_GCM_SHA384; 2518 tctrlr->tls_cipher_suite = "TLS_AES_256_GCM_SHA384"; 2519 } else { 2520 SPDK_ERRLOG("Unrecognized cipher suite!\n"); 2521 rc = -ENOTSUP; 2522 goto finish; 2523 } 2524 2525 rc = nvme_tcp_generate_psk_identity(tctrlr->psk_identity, sizeof(tctrlr->psk_identity), 2526 ctrlr->opts.hostnqn, ctrlr->trid.subnqn, 2527 tls_cipher_suite); 2528 if (rc) { 2529 SPDK_ERRLOG("could not generate PSK identity\n"); 2530 goto finish; 2531 } 2532 2533 /* No hash indicates that Configured PSK must be used as Retained PSK. */ 2534 if (psk_retained_hash == NVME_TCP_HASH_ALGORITHM_NONE) { 2535 assert(psk_configured_size < sizeof(psk_retained)); 2536 memcpy(psk_retained, psk_configured, psk_configured_size); 2537 rc = psk_configured_size; 2538 } else { 2539 /* Derive retained PSK. */ 2540 rc = nvme_tcp_derive_retained_psk(psk_configured, psk_configured_size, ctrlr->opts.hostnqn, 2541 psk_retained, sizeof(psk_retained), psk_retained_hash); 2542 if (rc < 0) { 2543 SPDK_ERRLOG("Unable to derive retained PSK!\n"); 2544 goto finish; 2545 } 2546 } 2547 2548 rc = nvme_tcp_derive_tls_psk(psk_retained, rc, tctrlr->psk_identity, tctrlr->psk, 2549 sizeof(tctrlr->psk), tls_cipher_suite); 2550 if (rc < 0) { 2551 SPDK_ERRLOG("Could not generate TLS PSK!\n"); 2552 goto finish; 2553 } 2554 2555 tctrlr->psk_size = rc; 2556 rc = 0; 2557 finish: 2558 spdk_memset_s(psk_configured, sizeof(psk_configured), 0, sizeof(psk_configured)); 2559 spdk_memset_s(pskbuf, sizeof(pskbuf), 0, sizeof(pskbuf)); 2560 2561 return rc; 2562 } 2563 2564 /* We have to use the typedef in the function declaration to appease astyle. */ 2565 typedef struct spdk_nvme_ctrlr spdk_nvme_ctrlr_t; 2566 2567 static spdk_nvme_ctrlr_t * 2568 nvme_tcp_ctrlr_construct(const struct spdk_nvme_transport_id *trid, 2569 const struct spdk_nvme_ctrlr_opts *opts, 2570 void *devhandle) 2571 { 2572 struct nvme_tcp_ctrlr *tctrlr; 2573 struct nvme_tcp_qpair *tqpair; 2574 int rc; 2575 2576 tctrlr = calloc(1, sizeof(*tctrlr)); 2577 if (tctrlr == NULL) { 2578 SPDK_ERRLOG("could not allocate ctrlr\n"); 2579 return NULL; 2580 } 2581 2582 tctrlr->ctrlr.opts = *opts; 2583 tctrlr->ctrlr.trid = *trid; 2584 2585 if (opts->tls_psk != NULL) { 2586 rc = nvme_tcp_generate_tls_credentials(tctrlr); 2587 if (rc != 0) { 2588 free(tctrlr); 2589 return NULL; 2590 } 2591 } 2592 2593 if (opts->transport_ack_timeout > NVME_TCP_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT) { 2594 SPDK_NOTICELOG("transport_ack_timeout exceeds max value %d, use max value\n", 2595 NVME_TCP_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT); 2596 tctrlr->ctrlr.opts.transport_ack_timeout = NVME_TCP_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT; 2597 } 2598 2599 rc = nvme_ctrlr_construct(&tctrlr->ctrlr); 2600 if (rc != 0) { 2601 free(tctrlr); 2602 return NULL; 2603 } 2604 2605 /* Sequence might be used not only for data digest offload purposes but 2606 * to handle a potential COPY operation appended as the result of translation. */ 2607 tctrlr->ctrlr.flags |= SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 2608 tctrlr->ctrlr.adminq = nvme_tcp_ctrlr_create_qpair(&tctrlr->ctrlr, 0, 2609 tctrlr->ctrlr.opts.admin_queue_size, 0, 2610 tctrlr->ctrlr.opts.admin_queue_size, true); 2611 if (!tctrlr->ctrlr.adminq) { 2612 SPDK_ERRLOG("failed to create admin qpair\n"); 2613 nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr); 2614 return NULL; 2615 } 2616 2617 tqpair = nvme_tcp_qpair(tctrlr->ctrlr.adminq); 2618 tctrlr->ctrlr.numa.id_valid = 1; 2619 tctrlr->ctrlr.numa.id = spdk_sock_get_numa_id(tqpair->sock); 2620 2621 if (nvme_ctrlr_add_process(&tctrlr->ctrlr, 0) != 0) { 2622 SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n"); 2623 nvme_ctrlr_destruct(&tctrlr->ctrlr); 2624 return NULL; 2625 } 2626 2627 return &tctrlr->ctrlr; 2628 } 2629 2630 static uint32_t 2631 nvme_tcp_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) 2632 { 2633 /* TCP transport doesn't limit maximum IO transfer size. */ 2634 return UINT32_MAX; 2635 } 2636 2637 static uint16_t 2638 nvme_tcp_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) 2639 { 2640 return NVME_TCP_MAX_SGL_DESCRIPTORS; 2641 } 2642 2643 static int 2644 nvme_tcp_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, 2645 int (*iter_fn)(struct nvme_request *req, void *arg), 2646 void *arg) 2647 { 2648 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2649 struct nvme_tcp_req *tcp_req, *tmp; 2650 int rc; 2651 2652 assert(iter_fn != NULL); 2653 2654 TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { 2655 assert(tcp_req->req != NULL); 2656 2657 rc = iter_fn(tcp_req->req, arg); 2658 if (rc != 0) { 2659 return rc; 2660 } 2661 } 2662 2663 return 0; 2664 } 2665 2666 static int 2667 nvme_tcp_qpair_authenticate(struct spdk_nvme_qpair *qpair) 2668 { 2669 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2670 int rc; 2671 2672 /* If the qpair is still connecting, it'll be forced to authenticate later on */ 2673 if (tqpair->state < NVME_TCP_QPAIR_STATE_RUNNING) { 2674 return 0; 2675 } else if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) { 2676 return -ENOTCONN; 2677 } 2678 2679 rc = nvme_fabric_qpair_authenticate_async(qpair); 2680 if (rc == 0) { 2681 nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTING); 2682 tqpair->state = NVME_TCP_QPAIR_STATE_AUTHENTICATING; 2683 } 2684 2685 return rc; 2686 } 2687 2688 static void 2689 nvme_tcp_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) 2690 { 2691 struct nvme_tcp_req *tcp_req, *tmp; 2692 struct spdk_nvme_cpl cpl = {}; 2693 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2694 2695 cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; 2696 cpl.status.sct = SPDK_NVME_SCT_GENERIC; 2697 2698 TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { 2699 assert(tcp_req->req != NULL); 2700 if (tcp_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { 2701 continue; 2702 } 2703 2704 nvme_tcp_req_complete(tcp_req, tqpair, &cpl, false); 2705 } 2706 } 2707 2708 static struct spdk_nvme_transport_poll_group * 2709 nvme_tcp_poll_group_create(void) 2710 { 2711 struct nvme_tcp_poll_group *group = calloc(1, sizeof(*group)); 2712 2713 if (group == NULL) { 2714 SPDK_ERRLOG("Unable to allocate poll group.\n"); 2715 return NULL; 2716 } 2717 2718 TAILQ_INIT(&group->needs_poll); 2719 2720 group->sock_group = spdk_sock_group_create(group); 2721 if (group->sock_group == NULL) { 2722 free(group); 2723 SPDK_ERRLOG("Unable to allocate sock group.\n"); 2724 return NULL; 2725 } 2726 2727 return &group->group; 2728 } 2729 2730 static struct spdk_nvme_transport_poll_group * 2731 nvme_tcp_qpair_get_optimal_poll_group(struct spdk_nvme_qpair *qpair) 2732 { 2733 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2734 struct spdk_sock_group *group = NULL; 2735 int rc; 2736 2737 rc = spdk_sock_get_optimal_sock_group(tqpair->sock, &group, NULL); 2738 if (!rc && group != NULL) { 2739 return spdk_sock_group_get_ctx(group); 2740 } 2741 2742 return NULL; 2743 } 2744 2745 static int 2746 nvme_tcp_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) 2747 { 2748 struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group); 2749 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2750 2751 if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) { 2752 return -EPROTO; 2753 } 2754 return 0; 2755 } 2756 2757 static int 2758 nvme_tcp_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) 2759 { 2760 struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group); 2761 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2762 2763 if (tqpair->needs_poll) { 2764 TAILQ_REMOVE(&group->needs_poll, tqpair, link); 2765 tqpair->needs_poll = false; 2766 } 2767 2768 if (tqpair->sock && group->sock_group) { 2769 if (spdk_sock_group_remove_sock(group->sock_group, tqpair->sock)) { 2770 return -EPROTO; 2771 } 2772 } 2773 return 0; 2774 } 2775 2776 static int 2777 nvme_tcp_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, 2778 struct spdk_nvme_qpair *qpair) 2779 { 2780 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2781 struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); 2782 2783 /* disconnected qpairs won't have a sock to add. */ 2784 if (nvme_qpair_get_state(qpair) >= NVME_QPAIR_CONNECTED) { 2785 if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) { 2786 return -EPROTO; 2787 } 2788 } 2789 2790 return 0; 2791 } 2792 2793 static int 2794 nvme_tcp_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, 2795 struct spdk_nvme_qpair *qpair) 2796 { 2797 struct nvme_tcp_qpair *tqpair; 2798 struct nvme_tcp_poll_group *group; 2799 2800 assert(qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs); 2801 2802 tqpair = nvme_tcp_qpair(qpair); 2803 group = nvme_tcp_poll_group(tgroup); 2804 2805 assert(tqpair->shared_stats == true); 2806 tqpair->stats = &g_dummy_stats; 2807 2808 if (tqpair->needs_poll) { 2809 TAILQ_REMOVE(&group->needs_poll, tqpair, link); 2810 tqpair->needs_poll = false; 2811 } 2812 2813 return 0; 2814 } 2815 2816 static int64_t 2817 nvme_tcp_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, 2818 uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) 2819 { 2820 struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); 2821 struct spdk_nvme_qpair *qpair, *tmp_qpair; 2822 struct nvme_tcp_qpair *tqpair, *tmp_tqpair; 2823 int num_events; 2824 2825 group->completions_per_qpair = completions_per_qpair; 2826 group->num_completions = 0; 2827 group->stats.polls++; 2828 2829 num_events = spdk_sock_group_poll(group->sock_group); 2830 2831 STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { 2832 tqpair = nvme_tcp_qpair(qpair); 2833 if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING) { 2834 if (TAILQ_EMPTY(&tqpair->outstanding_reqs)) { 2835 nvme_transport_ctrlr_disconnect_qpair_done(qpair); 2836 } 2837 } 2838 /* Wait until the qpair transitions to the DISCONNECTED state, otherwise user might 2839 * want to free it from disconnect_qpair_cb, while it's not fully disconnected (and 2840 * might still have outstanding requests) */ 2841 if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTED) { 2842 disconnected_qpair_cb(qpair, tgroup->group->ctx); 2843 } 2844 } 2845 2846 /* If any qpairs were marked as needing to be polled due to an asynchronous write completion 2847 * and they weren't polled as a consequence of calling spdk_sock_group_poll above, poll them now. */ 2848 TAILQ_FOREACH_SAFE(tqpair, &group->needs_poll, link, tmp_tqpair) { 2849 nvme_tcp_qpair_sock_cb(&tqpair->qpair, group->sock_group, tqpair->sock); 2850 } 2851 2852 if (spdk_unlikely(num_events < 0)) { 2853 return num_events; 2854 } 2855 2856 group->stats.idle_polls += !num_events; 2857 group->stats.socket_completions += num_events; 2858 2859 return group->num_completions; 2860 } 2861 2862 static int 2863 nvme_tcp_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) 2864 { 2865 int rc; 2866 struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); 2867 2868 if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { 2869 return -EBUSY; 2870 } 2871 2872 rc = spdk_sock_group_close(&group->sock_group); 2873 if (rc != 0) { 2874 SPDK_ERRLOG("Failed to close the sock group for a tcp poll group.\n"); 2875 assert(false); 2876 } 2877 2878 free(tgroup); 2879 2880 return 0; 2881 } 2882 2883 static int 2884 nvme_tcp_poll_group_get_stats(struct spdk_nvme_transport_poll_group *tgroup, 2885 struct spdk_nvme_transport_poll_group_stat **_stats) 2886 { 2887 struct nvme_tcp_poll_group *group; 2888 struct spdk_nvme_transport_poll_group_stat *stats; 2889 2890 if (tgroup == NULL || _stats == NULL) { 2891 SPDK_ERRLOG("Invalid stats or group pointer\n"); 2892 return -EINVAL; 2893 } 2894 2895 group = nvme_tcp_poll_group(tgroup); 2896 2897 stats = calloc(1, sizeof(*stats)); 2898 if (!stats) { 2899 SPDK_ERRLOG("Can't allocate memory for TCP stats\n"); 2900 return -ENOMEM; 2901 } 2902 stats->trtype = SPDK_NVME_TRANSPORT_TCP; 2903 memcpy(&stats->tcp, &group->stats, sizeof(group->stats)); 2904 2905 *_stats = stats; 2906 2907 return 0; 2908 } 2909 2910 static void 2911 nvme_tcp_poll_group_free_stats(struct spdk_nvme_transport_poll_group *tgroup, 2912 struct spdk_nvme_transport_poll_group_stat *stats) 2913 { 2914 free(stats); 2915 } 2916 2917 static int 2918 nvme_tcp_ctrlr_get_memory_domains(const struct spdk_nvme_ctrlr *ctrlr, 2919 struct spdk_memory_domain **domains, int array_size) 2920 { 2921 if (domains && array_size > 0) { 2922 domains[0] = spdk_memory_domain_get_system_domain(); 2923 } 2924 2925 return 1; 2926 } 2927 2928 const struct spdk_nvme_transport_ops tcp_ops = { 2929 .name = "TCP", 2930 .type = SPDK_NVME_TRANSPORT_TCP, 2931 .ctrlr_construct = nvme_tcp_ctrlr_construct, 2932 .ctrlr_scan = nvme_fabric_ctrlr_scan, 2933 .ctrlr_destruct = nvme_tcp_ctrlr_destruct, 2934 .ctrlr_enable = nvme_tcp_ctrlr_enable, 2935 2936 .ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4, 2937 .ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8, 2938 .ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4, 2939 .ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8, 2940 .ctrlr_set_reg_4_async = nvme_fabric_ctrlr_set_reg_4_async, 2941 .ctrlr_set_reg_8_async = nvme_fabric_ctrlr_set_reg_8_async, 2942 .ctrlr_get_reg_4_async = nvme_fabric_ctrlr_get_reg_4_async, 2943 .ctrlr_get_reg_8_async = nvme_fabric_ctrlr_get_reg_8_async, 2944 2945 .ctrlr_get_max_xfer_size = nvme_tcp_ctrlr_get_max_xfer_size, 2946 .ctrlr_get_max_sges = nvme_tcp_ctrlr_get_max_sges, 2947 2948 .ctrlr_create_io_qpair = nvme_tcp_ctrlr_create_io_qpair, 2949 .ctrlr_delete_io_qpair = nvme_tcp_ctrlr_delete_io_qpair, 2950 .ctrlr_connect_qpair = nvme_tcp_ctrlr_connect_qpair, 2951 .ctrlr_disconnect_qpair = nvme_tcp_ctrlr_disconnect_qpair, 2952 2953 .ctrlr_get_memory_domains = nvme_tcp_ctrlr_get_memory_domains, 2954 2955 .qpair_abort_reqs = nvme_tcp_qpair_abort_reqs, 2956 .qpair_reset = nvme_tcp_qpair_reset, 2957 .qpair_submit_request = nvme_tcp_qpair_submit_request, 2958 .qpair_process_completions = nvme_tcp_qpair_process_completions, 2959 .qpair_iterate_requests = nvme_tcp_qpair_iterate_requests, 2960 .qpair_authenticate = nvme_tcp_qpair_authenticate, 2961 .admin_qpair_abort_aers = nvme_tcp_admin_qpair_abort_aers, 2962 2963 .poll_group_create = nvme_tcp_poll_group_create, 2964 .qpair_get_optimal_poll_group = nvme_tcp_qpair_get_optimal_poll_group, 2965 .poll_group_connect_qpair = nvme_tcp_poll_group_connect_qpair, 2966 .poll_group_disconnect_qpair = nvme_tcp_poll_group_disconnect_qpair, 2967 .poll_group_add = nvme_tcp_poll_group_add, 2968 .poll_group_remove = nvme_tcp_poll_group_remove, 2969 .poll_group_process_completions = nvme_tcp_poll_group_process_completions, 2970 .poll_group_destroy = nvme_tcp_poll_group_destroy, 2971 .poll_group_get_stats = nvme_tcp_poll_group_get_stats, 2972 .poll_group_free_stats = nvme_tcp_poll_group_free_stats, 2973 }; 2974 2975 SPDK_NVME_TRANSPORT_REGISTER(tcp, &tcp_ops); 2976 2977 static void 2978 nvme_tcp_trace(void) 2979 { 2980 struct spdk_trace_tpoint_opts opts[] = { 2981 { 2982 "NVME_TCP_SUBMIT", TRACE_NVME_TCP_SUBMIT, 2983 OWNER_TYPE_NVME_TCP_QP, OBJECT_NVME_TCP_REQ, 1, 2984 { { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 2985 { "cid", SPDK_TRACE_ARG_TYPE_INT, 4 }, 2986 { "opc", SPDK_TRACE_ARG_TYPE_INT, 4 }, 2987 { "dw10", SPDK_TRACE_ARG_TYPE_PTR, 4 }, 2988 { "dw11", SPDK_TRACE_ARG_TYPE_PTR, 4 }, 2989 { "dw12", SPDK_TRACE_ARG_TYPE_PTR, 4 }, 2990 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 2991 } 2992 }, 2993 { 2994 "NVME_TCP_COMPLETE", TRACE_NVME_TCP_COMPLETE, 2995 OWNER_TYPE_NVME_TCP_QP, OBJECT_NVME_TCP_REQ, 0, 2996 { { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 2997 { "cid", SPDK_TRACE_ARG_TYPE_INT, 4 }, 2998 { "cpl", SPDK_TRACE_ARG_TYPE_PTR, 4 }, 2999 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 3000 } 3001 }, 3002 }; 3003 3004 spdk_trace_register_object(OBJECT_NVME_TCP_REQ, 'p'); 3005 spdk_trace_register_owner_type(OWNER_TYPE_NVME_TCP_QP, 'q'); 3006 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 3007 3008 spdk_trace_tpoint_register_relation(TRACE_SOCK_REQ_QUEUE, OBJECT_NVME_TCP_REQ, 0); 3009 spdk_trace_tpoint_register_relation(TRACE_SOCK_REQ_PEND, OBJECT_NVME_TCP_REQ, 0); 3010 spdk_trace_tpoint_register_relation(TRACE_SOCK_REQ_COMPLETE, OBJECT_NVME_TCP_REQ, 0); 3011 } 3012 SPDK_TRACE_REGISTER_FN(nvme_tcp_trace, "nvme_tcp", TRACE_GROUP_NVME_TCP) 3013