/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2018 Intel Corporation. All rights reserved.
 * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

/*
 * NVMe/TCP transport
 */

#include "nvme_internal.h"

#include "spdk/endian.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "spdk/stdinc.h"
#include "spdk/crc32.h"
#include "spdk/assert.h"
#include "spdk/trace.h"
#include "spdk/util.h"
#include "spdk/nvmf.h"
#include "spdk/dma.h"

#include "spdk_internal/nvme_tcp.h"
#include "spdk_internal/trace_defs.h"

#define NVME_TCP_RW_BUFFER_SIZE 131072

/* For async connect workloads, allow more time since we are more likely
 * to be processing lots of ICREQs at once.
 */
#define ICREQ_TIMEOUT_SYNC 2 /* in seconds */
#define ICREQ_TIMEOUT_ASYNC 10 /* in seconds */

#define NVME_TCP_HPDA_DEFAULT 0
#define NVME_TCP_MAX_R2T_DEFAULT 1
#define NVME_TCP_PDU_H2C_MIN_DATA_SIZE 4096

/*
 * Maximum value of transport_ack_timeout used by TCP controller
 */
#define NVME_TCP_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT 31

/* NVMe TCP transport extensions for spdk_nvme_ctrlr */
struct nvme_tcp_ctrlr {
	struct spdk_nvme_ctrlr ctrlr;
	char psk_identity[NVMF_PSK_IDENTITY_LEN];
	uint8_t psk[SPDK_TLS_PSK_MAX_LEN];
	int psk_size;
	char *tls_cipher_suite;
};

struct nvme_tcp_poll_group {
	struct spdk_nvme_transport_poll_group group;
	struct spdk_sock_group *sock_group;
	uint32_t completions_per_qpair;
	int64_t num_completions;

	TAILQ_HEAD(, nvme_tcp_qpair) needs_poll;
	struct spdk_nvme_tcp_stat stats;
};

/* NVMe TCP qpair extensions for spdk_nvme_qpair */
struct nvme_tcp_qpair {
	struct spdk_nvme_qpair qpair;
	struct spdk_sock *sock;

	TAILQ_HEAD(, nvme_tcp_req) free_reqs;
	TAILQ_HEAD(, nvme_tcp_req) outstanding_reqs;

	TAILQ_HEAD(, nvme_tcp_pdu) send_queue;
	struct nvme_tcp_pdu *recv_pdu;
	struct nvme_tcp_pdu *send_pdu; /* only for error pdu and init pdu */
	struct nvme_tcp_pdu *send_pdus; /* Used by tcp_reqs */
	enum nvme_tcp_pdu_recv_state recv_state;
	struct nvme_tcp_req *tcp_reqs;
	struct spdk_nvme_tcp_stat *stats;

	uint16_t num_entries;
	uint16_t async_complete;

	struct {
		uint16_t host_hdgst_enable: 1;
		uint16_t host_ddgst_enable: 1;
		uint16_t icreq_send_ack: 1;
		uint16_t in_connect_poll: 1;
		uint16_t reserved: 12;
	} flags;

	/** Specifies the maximum number of PDU-Data bytes per H2C Data Transfer PDU */
	uint32_t maxh2cdata;

	uint32_t maxr2t;

	/* 0 based value, which is used to guide the padding */
	uint8_t cpda;

	enum nvme_tcp_qpair_state state;

	TAILQ_ENTRY(nvme_tcp_qpair) link;
	bool needs_poll;

	uint64_t icreq_timeout_tsc;

	bool shared_stats;
};

enum nvme_tcp_req_state {
	NVME_TCP_REQ_FREE,
	NVME_TCP_REQ_ACTIVE,
	NVME_TCP_REQ_ACTIVE_R2T,
};

struct nvme_tcp_req {
	struct nvme_request *req;
	enum nvme_tcp_req_state state;
	uint16_t cid;
	uint16_t ttag;
	uint32_t datao;
	uint32_t expected_datao;
	uint32_t r2tl_remain;
	uint32_t active_r2ts;
	/* Used to hold a value received from a subsequent R2T while we are still
	 * waiting for the H2C transfer to complete */
	uint16_t ttag_r2t_next;
	bool in_capsule_data;
	/* Used to track whether the req can be safely freed */
	union {
		uint8_t raw;
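		/* 'raw' aliases the bitfield below so that all of the ordering flags
		 * can be cleared with a single store (see nvme_tcp_req_get()). */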
		struct {
			/* The last send operation completed - kernel released send buffer */
			uint8_t send_ack : 1;
			/* Data transfer completed - target sent a resp or the last data bit */
			uint8_t data_recv : 1;
			/* tcp_req is waiting for completion of the previous send operation (buffer reclaim notification
			 * from kernel) to send H2C */
			uint8_t h2c_send_waiting_ack : 1;
			/* tcp_req received a subsequent R2T while it is still waiting for send_ack.
			 * Rare case; occurs when dealing with a target that can send several R2T requests.
			 * The SPDK TCP target sends a single R2T for the whole data buffer */
			uint8_t r2t_waiting_h2c_complete : 1;
			/* Accel operation is in progress */
			uint8_t in_progress_accel : 1;
			uint8_t domain_in_use: 1;
			uint8_t reserved : 2;
		} bits;
	} ordering;
	struct nvme_tcp_pdu *pdu;
	struct iovec iov[NVME_TCP_MAX_SGL_DESCRIPTORS];
	uint32_t iovcnt;
	/* Used to hold a value received from a subsequent R2T while we are still
	 * waiting for the H2C ack */
	uint32_t r2tl_remain_next;
	struct nvme_tcp_qpair *tqpair;
	TAILQ_ENTRY(nvme_tcp_req) link;
	struct spdk_nvme_cpl rsp;
	uint8_t rsvd1[32];
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_tcp_req) % SPDK_CACHE_LINE_SIZE == 0, "unaligned size");

static struct spdk_nvme_tcp_stat g_dummy_stats = {};

static void nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req);
static int64_t nvme_tcp_poll_group_process_completions(struct spdk_nvme_transport_poll_group
		*tgroup, uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb);
static void nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu);
static void nvme_tcp_req_complete(struct nvme_tcp_req *tcp_req, struct nvme_tcp_qpair *tqpair,
				  struct spdk_nvme_cpl *rsp, bool print_on_error);

static inline struct nvme_tcp_qpair *
nvme_tcp_qpair(struct spdk_nvme_qpair *qpair)
{
	assert(qpair->trtype == SPDK_NVME_TRANSPORT_TCP);
	return SPDK_CONTAINEROF(qpair, struct nvme_tcp_qpair, qpair);
}

static inline struct nvme_tcp_poll_group *
nvme_tcp_poll_group(struct spdk_nvme_transport_poll_group *group)
{
	return SPDK_CONTAINEROF(group, struct nvme_tcp_poll_group, group);
}

static inline struct nvme_tcp_ctrlr *
nvme_tcp_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
	assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP);
	return SPDK_CONTAINEROF(ctrlr, struct nvme_tcp_ctrlr, ctrlr);
}

static struct nvme_tcp_req *
nvme_tcp_req_get(struct nvme_tcp_qpair *tqpair)
{
	struct nvme_tcp_req *tcp_req;

	tcp_req = TAILQ_FIRST(&tqpair->free_reqs);
	if (!tcp_req) {
		return NULL;
	}

	assert(tcp_req->state == NVME_TCP_REQ_FREE);
	tcp_req->state = NVME_TCP_REQ_ACTIVE;
	TAILQ_REMOVE(&tqpair->free_reqs, tcp_req, link);
	tcp_req->datao = 0;
	tcp_req->expected_datao = 0;
	tcp_req->req = NULL;
	tcp_req->in_capsule_data = false;
	tcp_req->r2tl_remain = 0;
	tcp_req->r2tl_remain_next = 0;
	tcp_req->active_r2ts = 0;
	tcp_req->iovcnt = 0;
	tcp_req->ordering.raw = 0;
	memset(tcp_req->pdu, 0, sizeof(struct nvme_tcp_pdu));
	memset(&tcp_req->rsp, 0, sizeof(struct spdk_nvme_cpl));

	return tcp_req;
}

static void
nvme_tcp_req_put(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req)
{
	assert(tcp_req->state != NVME_TCP_REQ_FREE);
	tcp_req->state = NVME_TCP_REQ_FREE;
	TAILQ_INSERT_HEAD(&tqpair->free_reqs,
			  tcp_req, link);
}

static inline void
nvme_tcp_accel_submit_crc32c(struct nvme_tcp_poll_group *tgroup, struct nvme_tcp_req *treq,
			     uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, uint32_t seed,
			     spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
{
	struct spdk_nvme_poll_group *pg = tgroup->group.group;

	treq->ordering.bits.in_progress_accel = 1;
	pg->accel_fn_table.submit_accel_crc32c(pg->ctx, dst, iovs, iovcnt, seed, cb_fn, cb_arg);
}

static inline void
nvme_tcp_accel_finish_sequence(struct nvme_tcp_poll_group *tgroup, struct nvme_tcp_req *treq,
			       void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
{
	struct spdk_nvme_poll_group *pg = tgroup->group.group;

	treq->ordering.bits.in_progress_accel = 1;
	pg->accel_fn_table.finish_sequence(seq, cb_fn, cb_arg);
}

static inline void
nvme_tcp_accel_reverse_sequence(struct nvme_tcp_poll_group *tgroup, void *seq)
{
	struct spdk_nvme_poll_group *pg = tgroup->group.group;

	pg->accel_fn_table.reverse_sequence(seq);
}

static inline int
nvme_tcp_accel_append_crc32c(struct nvme_tcp_poll_group *tgroup, void **seq, uint32_t *dst,
			     struct iovec *iovs, uint32_t iovcnt, uint32_t seed,
			     spdk_nvme_accel_step_cb cb_fn, void *cb_arg)
{
	struct spdk_nvme_poll_group *pg = tgroup->group.group;

	return pg->accel_fn_table.append_crc32c(pg->ctx, seq, dst, iovs, iovcnt, NULL, NULL,
						seed, cb_fn, cb_arg);
}

static void
nvme_tcp_free_reqs(struct nvme_tcp_qpair *tqpair)
{
	free(tqpair->tcp_reqs);
	tqpair->tcp_reqs = NULL;

	spdk_free(tqpair->send_pdus);
	tqpair->send_pdus = NULL;
}

static int
nvme_tcp_alloc_reqs(struct nvme_tcp_qpair *tqpair)
{
	uint16_t i;
	struct nvme_tcp_req *tcp_req;

	tqpair->tcp_reqs = aligned_alloc(SPDK_CACHE_LINE_SIZE,
					 tqpair->num_entries * sizeof(*tcp_req));
	if (tqpair->tcp_reqs == NULL) {
		SPDK_ERRLOG("Failed to allocate tcp_reqs on tqpair=%p\n", tqpair);
		goto fail;
	}

	/* Add 2 additional members for the send_pdu and recv_pdu owned by the tqpair */
	tqpair->send_pdus = spdk_zmalloc((tqpair->num_entries + 2) * sizeof(struct nvme_tcp_pdu),
					 0x1000, NULL,
					 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);

	if (tqpair->send_pdus == NULL) {
		SPDK_ERRLOG("Failed to allocate send_pdus on tqpair=%p\n", tqpair);
		goto fail;
	}

	memset(tqpair->tcp_reqs, 0, tqpair->num_entries * sizeof(*tcp_req));
	TAILQ_INIT(&tqpair->send_queue);
	TAILQ_INIT(&tqpair->free_reqs);
	TAILQ_INIT(&tqpair->outstanding_reqs);
	tqpair->qpair.queue_depth = 0;
	for (i = 0; i < tqpair->num_entries; i++) {
		tcp_req = &tqpair->tcp_reqs[i];
		tcp_req->cid = i;
		tcp_req->tqpair = tqpair;
		tcp_req->pdu = &tqpair->send_pdus[i];
		TAILQ_INSERT_TAIL(&tqpair->free_reqs, tcp_req, link);
	}

	tqpair->send_pdu = &tqpair->send_pdus[i];
	tqpair->recv_pdu = &tqpair->send_pdus[i + 1];

	return 0;
fail:
	nvme_tcp_free_reqs(tqpair);
	return -ENOMEM;
}

static inline void
nvme_tcp_qpair_set_recv_state(struct nvme_tcp_qpair *tqpair,
			      enum nvme_tcp_pdu_recv_state state)
{
	if (tqpair->recv_state == state) {
		SPDK_ERRLOG("The recv state of tqpair=%p is the same as the state (%d) to be set\n",
			    tqpair, state);
		return;
	}

	if (state == NVME_TCP_PDU_RECV_STATE_ERROR) {
		assert(TAILQ_EMPTY(&tqpair->outstanding_reqs));
	}

	tqpair->recv_state = state;
}

static void nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);

static void
nvme_tcp_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
	struct nvme_tcp_pdu *pdu;
	int rc;
	struct nvme_tcp_poll_group *group;

	if (tqpair->needs_poll) {
		group = nvme_tcp_poll_group(qpair->poll_group);
		TAILQ_REMOVE(&group->needs_poll, tqpair, link);
		tqpair->needs_poll = false;
	}

	rc = spdk_sock_close(&tqpair->sock);

	if (tqpair->sock != NULL) {
		SPDK_ERRLOG("tqpair=%p, errno=%d, rc=%d\n", tqpair, errno, rc);
		/* Set it to NULL manually */
		tqpair->sock = NULL;
	}

	/* clear the send_queue */
	while (!TAILQ_EMPTY(&tqpair->send_queue)) {
		pdu = TAILQ_FIRST(&tqpair->send_queue);
		/* Remove the pdu from the send_queue so that it is not erroneously
		 * sent out on the next connection attempt
		 */
		TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
	}

	nvme_tcp_qpair_abort_reqs(qpair, 0);

	/* If the qpair is marked as asynchronous, let it go through the process_completions() to
	 * let any outstanding requests (e.g. those with outstanding accel operations) complete.
	 * Otherwise, there's no way of waiting for them, so tqpair->outstanding_reqs has to be
	 * empty.
	 */
	if (qpair->async) {
		nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
	} else {
		assert(TAILQ_EMPTY(&tqpair->outstanding_reqs));
		nvme_transport_ctrlr_disconnect_qpair_done(qpair);
	}
}

static int
nvme_tcp_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);

	assert(qpair != NULL);
	nvme_tcp_qpair_abort_reqs(qpair, 0);
	assert(TAILQ_EMPTY(&tqpair->outstanding_reqs));

	nvme_qpair_deinit(qpair);
	nvme_tcp_free_reqs(tqpair);
	if (!tqpair->shared_stats) {
		free(tqpair->stats);
	}
	free(tqpair);

	return 0;
}

static int
nvme_tcp_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
{
	return 0;
}

static int
nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_tcp_ctrlr *tctrlr = nvme_tcp_ctrlr(ctrlr);

	if (ctrlr->adminq) {
		nvme_tcp_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq);
	}

	nvme_ctrlr_destruct_finish(ctrlr);

	free(tctrlr);

	return 0;
}

/* If there are queued requests, we assume they are queued because they are waiting
 * for resources to be released. Those resources are almost certainly released in
 * response to a PDU completing. However, to attempt to make forward progress
 * the qpair needs to be polled and we can't rely on another network event to make
 * that happen. Add it to a list of qpairs to poll regardless of network activity.
 *
 * In addition, when the tqpair state is NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL or
 * NVME_TCP_QPAIR_STATE_INITIALIZING, the qpair needs to be added to the needs_poll
 * list as well, to make forward progress in case the resources are released after
 * the icreq's or CONNECT's resp is processed.
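 *
 * For example, nvme_tcp_qpair_submit_request() returns -EAGAIN when no tcp_req is free;
 * such a request sits on qpair->queued_req and can only be retried when this qpair is
 * polled again, which is exactly what the needs_poll list ensures.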
 */
static void
nvme_tcp_cond_schedule_qpair_polling(struct nvme_tcp_qpair *tqpair)
{
	struct nvme_tcp_poll_group *pgroup;

	if (tqpair->needs_poll || !tqpair->qpair.poll_group) {
		return;
	}

	if (STAILQ_EMPTY(&tqpair->qpair.queued_req) &&
	    spdk_likely(tqpair->state != NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL &&
			tqpair->state != NVME_TCP_QPAIR_STATE_INITIALIZING)) {
		return;
	}

	pgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group);
	TAILQ_INSERT_TAIL(&pgroup->needs_poll, tqpair, link);
	tqpair->needs_poll = true;
}

static void
pdu_write_done(void *cb_arg, int err)
{
	struct nvme_tcp_pdu *pdu = cb_arg;
	struct nvme_tcp_qpair *tqpair = pdu->qpair;

	nvme_tcp_cond_schedule_qpair_polling(tqpair);
	TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);

	if (err != 0) {
		nvme_transport_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair);
		return;
	}

	assert(pdu->cb_fn != NULL);
	pdu->cb_fn(pdu->cb_arg);
}

static void
pdu_write_fail(struct nvme_tcp_pdu *pdu, int status)
{
	struct nvme_tcp_qpair *tqpair = pdu->qpair;

	/* This function is similar to pdu_write_done(), but it should be called before a PDU is
	 * sent over the socket */
	TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq);
	pdu_write_done(pdu, status);
}

static void
pdu_seq_fail(struct nvme_tcp_pdu *pdu, int status)
{
	struct nvme_tcp_req *treq = pdu->req;

	SPDK_ERRLOG("Failed to execute accel sequence: %d\n", status);
	nvme_tcp_cond_schedule_qpair_polling(pdu->qpair);
	treq->rsp.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	nvme_tcp_req_complete(treq, treq->tqpair, &treq->rsp, true);
}

static void
_tcp_write_pdu(struct nvme_tcp_pdu *pdu)
{
	uint32_t mapped_length = 0;
	struct nvme_tcp_qpair *tqpair = pdu->qpair;

	pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, SPDK_COUNTOF(pdu->iov), pdu,
			       (bool)tqpair->flags.host_hdgst_enable, (bool)tqpair->flags.host_ddgst_enable,
			       &mapped_length);
	TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq);
	if (spdk_unlikely(mapped_length < pdu->data_len)) {
		SPDK_ERRLOG("could not map the whole %u bytes (mapped only %u bytes)\n", pdu->data_len,
			    mapped_length);
		pdu_write_done(pdu, -EINVAL);
		return;
	}
	pdu->sock_req.cb_fn = pdu_write_done;
	pdu->sock_req.cb_arg = pdu;
	tqpair->stats->submitted_requests++;
	spdk_sock_writev_async(tqpair->sock, &pdu->sock_req);
}

static void
tcp_write_pdu_seq_cb(void *ctx, int status)
{
	struct nvme_tcp_pdu *pdu = ctx;
	struct nvme_tcp_req *treq = pdu->req;
	struct nvme_request *req = treq->req;

	assert(treq->ordering.bits.in_progress_accel);
	treq->ordering.bits.in_progress_accel = 0;

	req->accel_sequence = NULL;
	if (spdk_unlikely(status != 0)) {
		pdu_seq_fail(pdu, status);
		return;
	}

	_tcp_write_pdu(pdu);
}

static void
tcp_write_pdu(struct nvme_tcp_pdu *pdu)
{
	struct nvme_tcp_req *treq = pdu->req;
	struct nvme_tcp_qpair *tqpair = pdu->qpair;
	struct nvme_tcp_poll_group *tgroup;
	struct nvme_request *req;

	if (spdk_likely(treq != NULL)) {
		req = treq->req;
		if (req->accel_sequence != NULL &&
		    spdk_nvme_opc_get_data_transfer(req->cmd.opc) == SPDK_NVME_DATA_HOST_TO_CONTROLLER &&
		    pdu->data_len > 0) {
			assert(tqpair->qpair.poll_group != NULL);
			tgroup =
nvme_tcp_poll_group(tqpair->qpair.poll_group); 554 nvme_tcp_accel_finish_sequence(tgroup, treq, req->accel_sequence, 555 tcp_write_pdu_seq_cb, pdu); 556 return; 557 } 558 } 559 560 _tcp_write_pdu(pdu); 561 } 562 563 static void 564 pdu_accel_compute_crc32_done(void *cb_arg, int status) 565 { 566 struct nvme_tcp_pdu *pdu = cb_arg; 567 struct nvme_tcp_req *req = pdu->req; 568 569 assert(req->ordering.bits.in_progress_accel); 570 req->ordering.bits.in_progress_accel = 0; 571 572 if (spdk_unlikely(status)) { 573 SPDK_ERRLOG("Failed to compute the data digest for pdu =%p\n", pdu); 574 pdu_write_fail(pdu, status); 575 return; 576 } 577 578 pdu->data_digest_crc32 ^= SPDK_CRC32C_XOR; 579 MAKE_DIGEST_WORD(pdu->data_digest, pdu->data_digest_crc32); 580 581 _tcp_write_pdu(pdu); 582 } 583 584 static void 585 pdu_accel_compute_crc32_seq_cb(void *cb_arg, int status) 586 { 587 struct nvme_tcp_pdu *pdu = cb_arg; 588 struct nvme_tcp_qpair *tqpair = pdu->qpair; 589 struct nvme_tcp_poll_group *tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 590 struct nvme_tcp_req *treq = pdu->req; 591 struct nvme_request *req = treq->req; 592 593 assert(treq->ordering.bits.in_progress_accel); 594 treq->ordering.bits.in_progress_accel = 0; 595 596 req->accel_sequence = NULL; 597 if (spdk_unlikely(status != 0)) { 598 pdu_seq_fail(pdu, status); 599 return; 600 } 601 602 nvme_tcp_accel_submit_crc32c(tgroup, pdu->req, &pdu->data_digest_crc32, 603 pdu->data_iov, pdu->data_iovcnt, 0, 604 pdu_accel_compute_crc32_done, pdu); 605 } 606 607 static void 608 pdu_accel_seq_compute_crc32_done(void *cb_arg) 609 { 610 struct nvme_tcp_pdu *pdu = cb_arg; 611 612 pdu->data_digest_crc32 ^= SPDK_CRC32C_XOR; 613 MAKE_DIGEST_WORD(pdu->data_digest, pdu->data_digest_crc32); 614 } 615 616 static bool 617 pdu_accel_compute_crc32(struct nvme_tcp_pdu *pdu) 618 { 619 struct nvme_tcp_qpair *tqpair = pdu->qpair; 620 struct nvme_tcp_poll_group *tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 621 struct nvme_request *req = ((struct nvme_tcp_req *)pdu->req)->req; 622 int rc; 623 624 /* Only support this limited case for the first step */ 625 if (spdk_unlikely(nvme_qpair_get_state(&tqpair->qpair) < NVME_QPAIR_CONNECTED || 626 pdu->dif_ctx != NULL || 627 pdu->data_len % SPDK_NVME_TCP_DIGEST_ALIGNMENT != 0)) { 628 return false; 629 } 630 631 if (tqpair->qpair.poll_group == NULL) { 632 return false; 633 } 634 635 if (tgroup->group.group->accel_fn_table.append_crc32c != NULL) { 636 rc = nvme_tcp_accel_append_crc32c(tgroup, &req->accel_sequence, 637 &pdu->data_digest_crc32, 638 pdu->data_iov, pdu->data_iovcnt, 0, 639 pdu_accel_seq_compute_crc32_done, pdu); 640 if (spdk_unlikely(rc != 0)) { 641 /* If accel is out of resources, fall back to non-accelerated crc32 */ 642 if (rc == -ENOMEM) { 643 return false; 644 } 645 646 SPDK_ERRLOG("Failed to append crc32c operation: %d\n", rc); 647 pdu_write_fail(pdu, rc); 648 return true; 649 } 650 651 tcp_write_pdu(pdu); 652 return true; 653 } else if (tgroup->group.group->accel_fn_table.submit_accel_crc32c != NULL) { 654 if (req->accel_sequence != NULL) { 655 nvme_tcp_accel_finish_sequence(tgroup, pdu->req, req->accel_sequence, 656 pdu_accel_compute_crc32_seq_cb, pdu); 657 } else { 658 nvme_tcp_accel_submit_crc32c(tgroup, pdu->req, &pdu->data_digest_crc32, 659 pdu->data_iov, pdu->data_iovcnt, 0, 660 pdu_accel_compute_crc32_done, pdu); 661 } 662 663 return true; 664 } 665 666 return false; 667 } 668 669 static void 670 pdu_compute_crc32_seq_cb(void *cb_arg, int status) 671 { 672 struct nvme_tcp_pdu *pdu = cb_arg; 
673 struct nvme_tcp_req *treq = pdu->req; 674 struct nvme_request *req = treq->req; 675 uint32_t crc32c; 676 677 assert(treq->ordering.bits.in_progress_accel); 678 treq->ordering.bits.in_progress_accel = 0; 679 680 req->accel_sequence = NULL; 681 if (spdk_unlikely(status != 0)) { 682 pdu_seq_fail(pdu, status); 683 return; 684 } 685 686 crc32c = nvme_tcp_pdu_calc_data_digest(pdu); 687 crc32c = crc32c ^ SPDK_CRC32C_XOR; 688 MAKE_DIGEST_WORD(pdu->data_digest, crc32c); 689 690 _tcp_write_pdu(pdu); 691 } 692 693 static void 694 pdu_compute_crc32(struct nvme_tcp_pdu *pdu) 695 { 696 struct nvme_tcp_qpair *tqpair = pdu->qpair; 697 struct nvme_tcp_poll_group *tgroup; 698 struct nvme_request *req; 699 uint32_t crc32c; 700 701 /* Data Digest */ 702 if (pdu->data_len > 0 && g_nvme_tcp_ddgst[pdu->hdr.common.pdu_type] && 703 tqpair->flags.host_ddgst_enable) { 704 if (pdu_accel_compute_crc32(pdu)) { 705 return; 706 } 707 708 req = ((struct nvme_tcp_req *)pdu->req)->req; 709 if (req->accel_sequence != NULL) { 710 tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 711 nvme_tcp_accel_finish_sequence(tgroup, pdu->req, req->accel_sequence, 712 pdu_compute_crc32_seq_cb, pdu); 713 return; 714 } 715 716 crc32c = nvme_tcp_pdu_calc_data_digest(pdu); 717 crc32c = crc32c ^ SPDK_CRC32C_XOR; 718 MAKE_DIGEST_WORD(pdu->data_digest, crc32c); 719 } 720 721 tcp_write_pdu(pdu); 722 } 723 724 static int 725 nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair, 726 struct nvme_tcp_pdu *pdu, 727 nvme_tcp_qpair_xfer_complete_cb cb_fn, 728 void *cb_arg) 729 { 730 int hlen; 731 uint32_t crc32c; 732 733 hlen = pdu->hdr.common.hlen; 734 pdu->cb_fn = cb_fn; 735 pdu->cb_arg = cb_arg; 736 pdu->qpair = tqpair; 737 738 /* Header Digest */ 739 if (g_nvme_tcp_hdgst[pdu->hdr.common.pdu_type] && tqpair->flags.host_hdgst_enable) { 740 crc32c = nvme_tcp_pdu_calc_header_digest(pdu); 741 MAKE_DIGEST_WORD((uint8_t *)&pdu->hdr.raw[hlen], crc32c); 742 } 743 744 pdu_compute_crc32(pdu); 745 746 return 0; 747 } 748 749 static int 750 nvme_tcp_try_memory_translation(struct nvme_tcp_req *tcp_req, void **addr, uint32_t length) 751 { 752 struct nvme_request *req = tcp_req->req; 753 struct spdk_memory_domain_translation_result translation = { 754 .iov_count = 0, 755 .size = sizeof(translation) 756 }; 757 int rc; 758 759 if (!tcp_req->ordering.bits.domain_in_use) { 760 return 0; 761 } 762 763 rc = spdk_memory_domain_translate_data(req->payload.opts->memory_domain, 764 req->payload.opts->memory_domain_ctx, spdk_memory_domain_get_system_domain(), NULL, *addr, length, 765 &translation); 766 if (spdk_unlikely(rc || translation.iov_count != 1)) { 767 SPDK_ERRLOG("DMA memory translation failed, rc %d, iov_count %u\n", rc, translation.iov_count); 768 return -EFAULT; 769 } 770 771 assert(length == translation.iov.iov_len); 772 *addr = translation.iov.iov_base; 773 return 0; 774 } 775 776 /* 777 * Build SGL describing contiguous payload buffer. 
778 */ 779 static int 780 nvme_tcp_build_contig_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) 781 { 782 struct nvme_request *req = tcp_req->req; 783 784 /* ubsan complains about applying zero offset to null pointer if contig_or_cb_arg is NULL, 785 * so just double cast it to make it go away */ 786 void *addr = (void *)((uintptr_t)req->payload.contig_or_cb_arg + req->payload_offset); 787 size_t length = req->payload_size; 788 int rc; 789 790 SPDK_DEBUGLOG(nvme, "enter\n"); 791 792 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); 793 rc = nvme_tcp_try_memory_translation(tcp_req, &addr, length); 794 if (spdk_unlikely(rc)) { 795 return rc; 796 } 797 798 tcp_req->iov[0].iov_base = addr; 799 tcp_req->iov[0].iov_len = length; 800 tcp_req->iovcnt = 1; 801 return 0; 802 } 803 804 /* 805 * Build SGL describing scattered payload buffer. 806 */ 807 static int 808 nvme_tcp_build_sgl_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) 809 { 810 int rc; 811 uint32_t length, remaining_size, iovcnt = 0, max_num_sgl; 812 struct nvme_request *req = tcp_req->req; 813 814 SPDK_DEBUGLOG(nvme, "enter\n"); 815 816 assert(req->payload_size != 0); 817 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); 818 assert(req->payload.reset_sgl_fn != NULL); 819 assert(req->payload.next_sge_fn != NULL); 820 req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); 821 822 max_num_sgl = spdk_min(req->qpair->ctrlr->max_sges, NVME_TCP_MAX_SGL_DESCRIPTORS); 823 remaining_size = req->payload_size; 824 825 do { 826 void *addr; 827 828 rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &addr, &length); 829 if (rc) { 830 return -1; 831 } 832 833 rc = nvme_tcp_try_memory_translation(tcp_req, &addr, length); 834 if (spdk_unlikely(rc)) { 835 return rc; 836 } 837 838 length = spdk_min(length, remaining_size); 839 tcp_req->iov[iovcnt].iov_base = addr; 840 tcp_req->iov[iovcnt].iov_len = length; 841 remaining_size -= length; 842 iovcnt++; 843 } while (remaining_size > 0 && iovcnt < max_num_sgl); 844 845 846 /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */ 847 if (remaining_size > 0) { 848 SPDK_ERRLOG("Failed to construct tcp_req=%p, and the iovcnt=%u, remaining_size=%u\n", 849 tcp_req, iovcnt, remaining_size); 850 return -1; 851 } 852 853 tcp_req->iovcnt = iovcnt; 854 855 return 0; 856 } 857 858 static int 859 nvme_tcp_req_init(struct nvme_tcp_qpair *tqpair, struct nvme_request *req, 860 struct nvme_tcp_req *tcp_req) 861 { 862 struct spdk_nvme_ctrlr *ctrlr = tqpair->qpair.ctrlr; 863 int rc = 0; 864 enum spdk_nvme_data_transfer xfer; 865 uint32_t max_in_capsule_data_size; 866 867 tcp_req->req = req; 868 tcp_req->ordering.bits.domain_in_use = (req->payload.opts && req->payload.opts->memory_domain); 869 870 req->cmd.cid = tcp_req->cid; 871 req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; 872 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK; 873 req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_TRANSPORT; 874 req->cmd.dptr.sgl1.unkeyed.length = req->payload_size; 875 876 if (spdk_unlikely(req->cmd.opc == SPDK_NVME_OPC_FABRIC)) { 877 struct spdk_nvmf_capsule_cmd *nvmf_cmd = (struct spdk_nvmf_capsule_cmd *)&req->cmd; 878 879 xfer = spdk_nvme_opc_get_data_transfer(nvmf_cmd->fctype); 880 } else { 881 xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc); 882 } 883 884 /* For c2h delay filling in the iov until the data arrives. 
885 * For h2c some delay is also possible if data doesn't fit into cmd capsule (not implemented). */ 886 if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { 887 if (xfer != SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 888 rc = nvme_tcp_build_contig_request(tqpair, tcp_req); 889 } 890 } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) { 891 if (xfer != SPDK_NVME_DATA_CONTROLLER_TO_HOST) { 892 rc = nvme_tcp_build_sgl_request(tqpair, tcp_req); 893 } 894 } else { 895 rc = -1; 896 } 897 898 if (rc) { 899 return rc; 900 } 901 902 if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { 903 max_in_capsule_data_size = ctrlr->ioccsz_bytes; 904 if (spdk_unlikely((req->cmd.opc == SPDK_NVME_OPC_FABRIC) || 905 nvme_qpair_is_admin_queue(&tqpair->qpair))) { 906 max_in_capsule_data_size = SPDK_NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE; 907 } 908 909 if (req->payload_size <= max_in_capsule_data_size) { 910 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; 911 req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; 912 req->cmd.dptr.sgl1.address = 0; 913 tcp_req->in_capsule_data = true; 914 } 915 } 916 917 return 0; 918 } 919 920 static inline bool 921 nvme_tcp_req_complete_safe(struct nvme_tcp_req *tcp_req) 922 { 923 if (!(tcp_req->ordering.bits.send_ack && tcp_req->ordering.bits.data_recv && 924 !tcp_req->ordering.bits.in_progress_accel)) { 925 return false; 926 } 927 928 assert(tcp_req->state == NVME_TCP_REQ_ACTIVE); 929 assert(tcp_req->tqpair != NULL); 930 assert(tcp_req->req != NULL); 931 932 nvme_tcp_req_complete(tcp_req, tcp_req->tqpair, &tcp_req->rsp, true); 933 return true; 934 } 935 936 static void 937 nvme_tcp_qpair_cmd_send_complete(void *cb_arg) 938 { 939 struct nvme_tcp_req *tcp_req = cb_arg; 940 941 SPDK_DEBUGLOG(nvme, "tcp req %p, cid %u, qid %u\n", tcp_req, tcp_req->cid, 942 tcp_req->tqpair->qpair.id); 943 tcp_req->ordering.bits.send_ack = 1; 944 /* Handle the r2t case */ 945 if (spdk_unlikely(tcp_req->ordering.bits.h2c_send_waiting_ack)) { 946 SPDK_DEBUGLOG(nvme, "tcp req %p, send H2C data\n", tcp_req); 947 nvme_tcp_send_h2c_data(tcp_req); 948 } else { 949 if (tcp_req->in_capsule_data && tcp_req->ordering.bits.domain_in_use) { 950 spdk_memory_domain_invalidate_data(tcp_req->req->payload.opts->memory_domain, 951 tcp_req->req->payload.opts->memory_domain_ctx, tcp_req->iov, tcp_req->iovcnt); 952 } 953 954 nvme_tcp_req_complete_safe(tcp_req); 955 } 956 } 957 958 static int 959 nvme_tcp_qpair_capsule_cmd_send(struct nvme_tcp_qpair *tqpair, 960 struct nvme_tcp_req *tcp_req) 961 { 962 struct nvme_tcp_pdu *pdu; 963 struct spdk_nvme_tcp_cmd *capsule_cmd; 964 uint32_t plen = 0, alignment; 965 uint8_t pdo; 966 967 SPDK_DEBUGLOG(nvme, "enter\n"); 968 pdu = tcp_req->pdu; 969 pdu->req = tcp_req; 970 971 capsule_cmd = &pdu->hdr.capsule_cmd; 972 capsule_cmd->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD; 973 plen = capsule_cmd->common.hlen = sizeof(*capsule_cmd); 974 capsule_cmd->ccsqe = tcp_req->req->cmd; 975 976 SPDK_DEBUGLOG(nvme, "capsule_cmd cid=%u on tqpair(%p)\n", tcp_req->req->cmd.cid, tqpair); 977 978 if (tqpair->flags.host_hdgst_enable) { 979 SPDK_DEBUGLOG(nvme, "Header digest is enabled for capsule command on tcp_req=%p\n", 980 tcp_req); 981 capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; 982 plen += SPDK_NVME_TCP_DIGEST_LEN; 983 } 984 985 if ((tcp_req->req->payload_size == 0) || !tcp_req->in_capsule_data) { 986 goto end; 987 } 988 989 pdo = plen; 990 pdu->padding_len = 0; 991 if (tqpair->cpda) { 992 alignment = (tqpair->cpda + 1) << 2; 
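		/* CPDA is a 0's based value in units of dwords: e.g. cpda=3 requires the
		 * in-capsule data to start on a 16-byte boundary within the PDU, so the
		 * header is padded up to that alignment when needed. */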
993 if (alignment > plen) { 994 pdu->padding_len = alignment - plen; 995 pdo = alignment; 996 plen = alignment; 997 } 998 } 999 1000 capsule_cmd->common.pdo = pdo; 1001 plen += tcp_req->req->payload_size; 1002 if (tqpair->flags.host_ddgst_enable) { 1003 capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; 1004 plen += SPDK_NVME_TCP_DIGEST_LEN; 1005 } 1006 1007 tcp_req->datao = 0; 1008 nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt, 1009 0, tcp_req->req->payload_size); 1010 end: 1011 capsule_cmd->common.plen = plen; 1012 return nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_qpair_cmd_send_complete, tcp_req); 1013 1014 } 1015 1016 static int 1017 nvme_tcp_qpair_submit_request(struct spdk_nvme_qpair *qpair, 1018 struct nvme_request *req) 1019 { 1020 struct nvme_tcp_qpair *tqpair; 1021 struct nvme_tcp_req *tcp_req; 1022 1023 tqpair = nvme_tcp_qpair(qpair); 1024 assert(tqpair != NULL); 1025 assert(req != NULL); 1026 1027 tcp_req = nvme_tcp_req_get(tqpair); 1028 if (!tcp_req) { 1029 tqpair->stats->queued_requests++; 1030 /* Inform the upper layer to try again later. */ 1031 return -EAGAIN; 1032 } 1033 1034 if (spdk_unlikely(nvme_tcp_req_init(tqpair, req, tcp_req))) { 1035 SPDK_ERRLOG("nvme_tcp_req_init() failed\n"); 1036 nvme_tcp_req_put(tqpair, tcp_req); 1037 return -1; 1038 } 1039 1040 tqpair->qpair.queue_depth++; 1041 spdk_trace_record(TRACE_NVME_TCP_SUBMIT, qpair->id, 0, (uintptr_t)tcp_req->pdu, req->cb_arg, 1042 (uint32_t)req->cmd.cid, (uint32_t)req->cmd.opc, 1043 req->cmd.cdw10, req->cmd.cdw11, req->cmd.cdw12, tqpair->qpair.queue_depth); 1044 TAILQ_INSERT_TAIL(&tqpair->outstanding_reqs, tcp_req, link); 1045 return nvme_tcp_qpair_capsule_cmd_send(tqpair, tcp_req); 1046 } 1047 1048 static int 1049 nvme_tcp_qpair_reset(struct spdk_nvme_qpair *qpair) 1050 { 1051 return 0; 1052 } 1053 1054 static void 1055 nvme_tcp_req_complete(struct nvme_tcp_req *tcp_req, 1056 struct nvme_tcp_qpair *tqpair, 1057 struct spdk_nvme_cpl *rsp, 1058 bool print_on_error) 1059 { 1060 struct spdk_nvme_cpl cpl; 1061 struct spdk_nvme_qpair *qpair; 1062 struct nvme_request *req; 1063 bool print_error; 1064 1065 assert(tcp_req->req != NULL); 1066 req = tcp_req->req; 1067 qpair = req->qpair; 1068 1069 SPDK_DEBUGLOG(nvme, "complete tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair); 1070 1071 if (!tcp_req->tqpair->qpair.in_completion_context) { 1072 tcp_req->tqpair->async_complete++; 1073 } 1074 1075 /* Cache arguments to be passed to nvme_complete_request since tcp_req can be zeroed when released */ 1076 memcpy(&cpl, rsp, sizeof(cpl)); 1077 1078 if (spdk_unlikely(spdk_nvme_cpl_is_error(rsp))) { 1079 print_error = print_on_error && !qpair->ctrlr->opts.disable_error_logging; 1080 1081 if (print_error) { 1082 spdk_nvme_qpair_print_command(qpair, &req->cmd); 1083 } 1084 1085 if (print_error || SPDK_DEBUGLOG_FLAG_ENABLED("nvme")) { 1086 spdk_nvme_qpair_print_completion(qpair, rsp); 1087 } 1088 } 1089 1090 tqpair->qpair.queue_depth--; 1091 spdk_trace_record(TRACE_NVME_TCP_COMPLETE, qpair->id, 0, (uintptr_t)tcp_req->pdu, req->cb_arg, 1092 (uint32_t)req->cmd.cid, (uint32_t)cpl.status_raw, tqpair->qpair.queue_depth); 1093 TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link); 1094 nvme_tcp_req_put(tqpair, tcp_req); 1095 nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, &cpl); 1096 } 1097 1098 static void 1099 nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) 1100 { 1101 struct nvme_tcp_req *tcp_req, *tmp; 1102 struct spdk_nvme_cpl cpl = {}; 1103 struct nvme_tcp_qpair 
*tqpair = nvme_tcp_qpair(qpair); 1104 1105 cpl.sqid = qpair->id; 1106 cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; 1107 cpl.status.sct = SPDK_NVME_SCT_GENERIC; 1108 cpl.status.dnr = dnr; 1109 1110 TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { 1111 /* We cannot abort requests with accel operations in progress */ 1112 if (tcp_req->ordering.bits.in_progress_accel) { 1113 continue; 1114 } 1115 1116 nvme_tcp_req_complete(tcp_req, tqpair, &cpl, true); 1117 } 1118 } 1119 1120 static void 1121 nvme_tcp_qpair_send_h2c_term_req_complete(void *cb_arg) 1122 { 1123 struct nvme_tcp_qpair *tqpair = cb_arg; 1124 1125 tqpair->state = NVME_TCP_QPAIR_STATE_EXITING; 1126 } 1127 1128 static void 1129 nvme_tcp_qpair_send_h2c_term_req(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, 1130 enum spdk_nvme_tcp_term_req_fes fes, uint32_t error_offset) 1131 { 1132 struct nvme_tcp_pdu *rsp_pdu; 1133 struct spdk_nvme_tcp_term_req_hdr *h2c_term_req; 1134 uint32_t h2c_term_req_hdr_len = sizeof(*h2c_term_req); 1135 uint8_t copy_len; 1136 1137 rsp_pdu = tqpair->send_pdu; 1138 memset(rsp_pdu, 0, sizeof(*rsp_pdu)); 1139 h2c_term_req = &rsp_pdu->hdr.term_req; 1140 h2c_term_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ; 1141 h2c_term_req->common.hlen = h2c_term_req_hdr_len; 1142 1143 if ((fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || 1144 (fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { 1145 DSET32(&h2c_term_req->fei, error_offset); 1146 } 1147 1148 copy_len = pdu->hdr.common.hlen; 1149 if (copy_len > SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE) { 1150 copy_len = SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE; 1151 } 1152 1153 /* Copy the error info into the buffer */ 1154 memcpy((uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, pdu->hdr.raw, copy_len); 1155 nvme_tcp_pdu_set_data(rsp_pdu, (uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, copy_len); 1156 1157 /* Contain the header len of the wrong received pdu */ 1158 h2c_term_req->common.plen = h2c_term_req->common.hlen + copy_len; 1159 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING); 1160 nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_send_h2c_term_req_complete, tqpair); 1161 } 1162 1163 static bool 1164 nvme_tcp_qpair_recv_state_valid(struct nvme_tcp_qpair *tqpair) 1165 { 1166 switch (tqpair->state) { 1167 case NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_SEND: 1168 case NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL: 1169 case NVME_TCP_QPAIR_STATE_RUNNING: 1170 return true; 1171 default: 1172 return false; 1173 } 1174 } 1175 1176 static void 1177 nvme_tcp_pdu_ch_handle(struct nvme_tcp_qpair *tqpair) 1178 { 1179 struct nvme_tcp_pdu *pdu; 1180 uint32_t error_offset = 0; 1181 enum spdk_nvme_tcp_term_req_fes fes; 1182 uint32_t expected_hlen, hd_len = 0; 1183 bool plen_error = false; 1184 1185 pdu = tqpair->recv_pdu; 1186 1187 SPDK_DEBUGLOG(nvme, "pdu type = %d\n", pdu->hdr.common.pdu_type); 1188 if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_RESP) { 1189 if (tqpair->state != NVME_TCP_QPAIR_STATE_INVALID) { 1190 SPDK_ERRLOG("Already received IC_RESP PDU, and we should reject this pdu=%p\n", pdu); 1191 fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; 1192 goto err; 1193 } 1194 expected_hlen = sizeof(struct spdk_nvme_tcp_ic_resp); 1195 if (pdu->hdr.common.plen != expected_hlen) { 1196 plen_error = true; 1197 } 1198 } else { 1199 if (spdk_unlikely(!nvme_tcp_qpair_recv_state_valid(tqpair))) { 1200 SPDK_ERRLOG("The TCP/IP tqpair connection is not negotiated\n"); 1201 
fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; 1202 goto err; 1203 } 1204 1205 switch (pdu->hdr.common.pdu_type) { 1206 case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP: 1207 expected_hlen = sizeof(struct spdk_nvme_tcp_rsp); 1208 if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) { 1209 hd_len = SPDK_NVME_TCP_DIGEST_LEN; 1210 } 1211 1212 if (pdu->hdr.common.plen != (expected_hlen + hd_len)) { 1213 plen_error = true; 1214 } 1215 break; 1216 case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: 1217 expected_hlen = sizeof(struct spdk_nvme_tcp_c2h_data_hdr); 1218 if (pdu->hdr.common.plen < pdu->hdr.common.pdo) { 1219 plen_error = true; 1220 } 1221 break; 1222 case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: 1223 expected_hlen = sizeof(struct spdk_nvme_tcp_term_req_hdr); 1224 if ((pdu->hdr.common.plen <= expected_hlen) || 1225 (pdu->hdr.common.plen > SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE)) { 1226 plen_error = true; 1227 } 1228 break; 1229 case SPDK_NVME_TCP_PDU_TYPE_R2T: 1230 expected_hlen = sizeof(struct spdk_nvme_tcp_r2t_hdr); 1231 if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) { 1232 hd_len = SPDK_NVME_TCP_DIGEST_LEN; 1233 } 1234 1235 if (pdu->hdr.common.plen != (expected_hlen + hd_len)) { 1236 plen_error = true; 1237 } 1238 break; 1239 1240 default: 1241 SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu->hdr.common.pdu_type); 1242 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1243 error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdu_type); 1244 goto err; 1245 } 1246 } 1247 1248 if (pdu->hdr.common.hlen != expected_hlen) { 1249 SPDK_ERRLOG("Expected PDU header length %u, got %u\n", 1250 expected_hlen, pdu->hdr.common.hlen); 1251 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1252 error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, hlen); 1253 goto err; 1254 1255 } else if (plen_error) { 1256 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1257 error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen); 1258 goto err; 1259 } else { 1260 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); 1261 nvme_tcp_pdu_calc_psh_len(tqpair->recv_pdu, tqpair->flags.host_hdgst_enable); 1262 return; 1263 } 1264 err: 1265 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1266 } 1267 1268 static struct nvme_tcp_req * 1269 get_nvme_active_req_by_cid(struct nvme_tcp_qpair *tqpair, uint32_t cid) 1270 { 1271 assert(tqpair != NULL); 1272 if ((cid >= tqpair->num_entries) || (tqpair->tcp_reqs[cid].state == NVME_TCP_REQ_FREE)) { 1273 return NULL; 1274 } 1275 1276 return &tqpair->tcp_reqs[cid]; 1277 } 1278 1279 static void 1280 nvme_tcp_recv_payload_seq_cb(void *cb_arg, int status) 1281 { 1282 struct nvme_tcp_req *treq = cb_arg; 1283 struct nvme_request *req = treq->req; 1284 struct nvme_tcp_qpair *tqpair = treq->tqpair; 1285 1286 assert(treq->ordering.bits.in_progress_accel); 1287 treq->ordering.bits.in_progress_accel = 0; 1288 1289 nvme_tcp_cond_schedule_qpair_polling(tqpair); 1290 1291 req->accel_sequence = NULL; 1292 if (spdk_unlikely(status != 0)) { 1293 pdu_seq_fail(treq->pdu, status); 1294 return; 1295 } 1296 1297 nvme_tcp_req_complete_safe(treq); 1298 } 1299 1300 static void 1301 nvme_tcp_c2h_data_payload_handle(struct nvme_tcp_qpair *tqpair, 1302 struct nvme_tcp_pdu *pdu, uint32_t *reaped) 1303 { 1304 struct nvme_tcp_req *tcp_req; 1305 struct nvme_tcp_poll_group *tgroup; 1306 struct spdk_nvme_tcp_c2h_data_hdr *c2h_data; 1307 uint8_t flags; 1308 1309 tcp_req = pdu->req; 1310 assert(tcp_req != NULL); 1311 1312 
	SPDK_DEBUGLOG(nvme, "enter\n");
	c2h_data = &pdu->hdr.c2h_data;
	tcp_req->datao += pdu->data_len;
	flags = c2h_data->common.flags;

	if (flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) {
		if (tcp_req->datao == tcp_req->req->payload_size) {
			tcp_req->rsp.status.p = 0;
		} else {
			tcp_req->rsp.status.p = 1;
		}

		tcp_req->rsp.cid = tcp_req->cid;
		tcp_req->rsp.sqid = tqpair->qpair.id;
		if (flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS) {
			tcp_req->ordering.bits.data_recv = 1;
			if (tcp_req->req->accel_sequence != NULL) {
				tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group);
				nvme_tcp_accel_reverse_sequence(tgroup, tcp_req->req->accel_sequence);
				nvme_tcp_accel_finish_sequence(tgroup, tcp_req,
							       tcp_req->req->accel_sequence,
							       nvme_tcp_recv_payload_seq_cb,
							       tcp_req);
				return;
			}

			if (nvme_tcp_req_complete_safe(tcp_req)) {
				(*reaped)++;
			}
		}
	}
}

static const char *spdk_nvme_tcp_term_req_fes_str[] = {
	"Invalid PDU Header Field",
	"PDU Sequence Error",
	"Header Digest Error",
	"Data Transfer Out of Range",
	"Data Transfer Limit Exceeded",
	"Unsupported parameter",
};

static void
nvme_tcp_c2h_term_req_dump(struct spdk_nvme_tcp_term_req_hdr *c2h_term_req)
{
	SPDK_ERRLOG("Error info of pdu(%p): %s\n", c2h_term_req,
		    spdk_nvme_tcp_term_req_fes_str[c2h_term_req->fes]);
	if ((c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) ||
	    (c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) {
		SPDK_DEBUGLOG(nvme, "The offset from the start of the PDU header is %u\n",
			      DGET32(c2h_term_req->fei));
	}
	/* we may also need to dump some other info here */
}

static void
nvme_tcp_c2h_term_req_payload_handle(struct nvme_tcp_qpair *tqpair,
				     struct nvme_tcp_pdu *pdu)
{
	nvme_tcp_c2h_term_req_dump(&pdu->hdr.term_req);
	nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
}

static void
_nvme_tcp_pdu_payload_handle(struct nvme_tcp_qpair *tqpair, uint32_t *reaped)
{
	struct nvme_tcp_pdu *pdu;

	assert(tqpair != NULL);
	pdu = tqpair->recv_pdu;

	switch (pdu->hdr.common.pdu_type) {
	case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA:
		nvme_tcp_c2h_data_payload_handle(tqpair, pdu, reaped);
		nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
		break;

	case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
		nvme_tcp_c2h_term_req_payload_handle(tqpair, pdu);
		break;

	default:
		/* The code should never reach here */
		SPDK_ERRLOG("The code should never reach here\n");
		break;
	}
}

static void
nvme_tcp_accel_recv_compute_crc32_done(void *cb_arg, int status)
{
	struct nvme_tcp_req *tcp_req = cb_arg;
	struct nvme_tcp_pdu *pdu;
	struct nvme_tcp_qpair *tqpair;
	int rc;
	int dummy_reaped = 0;

	pdu = tcp_req->pdu;
	assert(pdu != NULL);

	tqpair = tcp_req->tqpair;
	assert(tqpair != NULL);

	assert(tcp_req->ordering.bits.in_progress_accel);
	tcp_req->ordering.bits.in_progress_accel = 0;

	nvme_tcp_cond_schedule_qpair_polling(tqpair);

	if (spdk_unlikely(status)) {
		SPDK_ERRLOG("Failed to compute the data digest for pdu=%p\n", pdu);
		tcp_req->rsp.status.sc = SPDK_NVME_SC_COMMAND_TRANSIENT_TRANSPORT_ERROR;
		goto
end; 1424 } 1425 1426 pdu->data_digest_crc32 ^= SPDK_CRC32C_XOR; 1427 rc = MATCH_DIGEST_WORD(pdu->data_digest, pdu->data_digest_crc32); 1428 if (rc == 0) { 1429 SPDK_ERRLOG("data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); 1430 tcp_req->rsp.status.sc = SPDK_NVME_SC_COMMAND_TRANSIENT_TRANSPORT_ERROR; 1431 } 1432 1433 end: 1434 nvme_tcp_c2h_data_payload_handle(tqpair, tcp_req->pdu, &dummy_reaped); 1435 } 1436 1437 static void 1438 nvme_tcp_req_copy_pdu(struct nvme_tcp_req *treq, struct nvme_tcp_pdu *pdu) 1439 { 1440 treq->pdu->hdr = pdu->hdr; 1441 treq->pdu->req = treq; 1442 memcpy(treq->pdu->data_digest, pdu->data_digest, sizeof(pdu->data_digest)); 1443 memcpy(treq->pdu->data_iov, pdu->data_iov, sizeof(pdu->data_iov[0]) * pdu->data_iovcnt); 1444 treq->pdu->data_iovcnt = pdu->data_iovcnt; 1445 treq->pdu->data_len = pdu->data_len; 1446 } 1447 1448 static void 1449 nvme_tcp_accel_seq_recv_compute_crc32_done(void *cb_arg) 1450 { 1451 struct nvme_tcp_req *treq = cb_arg; 1452 struct nvme_tcp_qpair *tqpair = treq->tqpair; 1453 struct nvme_tcp_pdu *pdu = treq->pdu; 1454 bool result; 1455 1456 pdu->data_digest_crc32 ^= SPDK_CRC32C_XOR; 1457 result = MATCH_DIGEST_WORD(pdu->data_digest, pdu->data_digest_crc32); 1458 if (spdk_unlikely(!result)) { 1459 SPDK_ERRLOG("data digest error on tqpair=(%p)\n", tqpair); 1460 treq->rsp.status.sc = SPDK_NVME_SC_COMMAND_TRANSIENT_TRANSPORT_ERROR; 1461 } 1462 } 1463 1464 static bool 1465 nvme_tcp_accel_recv_compute_crc32(struct nvme_tcp_req *treq, struct nvme_tcp_pdu *pdu) 1466 { 1467 struct nvme_tcp_qpair *tqpair = treq->tqpair; 1468 struct nvme_tcp_poll_group *tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 1469 struct nvme_request *req = treq->req; 1470 int rc, dummy = 0; 1471 1472 /* Only support this limited case that the request has only one c2h pdu */ 1473 if (spdk_unlikely(nvme_qpair_get_state(&tqpair->qpair) < NVME_QPAIR_CONNECTED || 1474 tqpair->qpair.poll_group == NULL || pdu->dif_ctx != NULL || 1475 pdu->data_len % SPDK_NVME_TCP_DIGEST_ALIGNMENT != 0 || 1476 pdu->data_len != req->payload_size)) { 1477 return false; 1478 } 1479 1480 if (tgroup->group.group->accel_fn_table.append_crc32c != NULL) { 1481 nvme_tcp_req_copy_pdu(treq, pdu); 1482 rc = nvme_tcp_accel_append_crc32c(tgroup, &req->accel_sequence, 1483 &treq->pdu->data_digest_crc32, 1484 treq->pdu->data_iov, treq->pdu->data_iovcnt, 0, 1485 nvme_tcp_accel_seq_recv_compute_crc32_done, treq); 1486 if (spdk_unlikely(rc != 0)) { 1487 /* If accel is out of resources, fall back to non-accelerated crc32 */ 1488 if (rc == -ENOMEM) { 1489 return false; 1490 } 1491 1492 SPDK_ERRLOG("Failed to append crc32c operation: %d\n", rc); 1493 treq->rsp.status.sc = SPDK_NVME_SC_COMMAND_TRANSIENT_TRANSPORT_ERROR; 1494 } 1495 1496 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1497 nvme_tcp_c2h_data_payload_handle(tqpair, treq->pdu, &dummy); 1498 return true; 1499 } else if (tgroup->group.group->accel_fn_table.submit_accel_crc32c != NULL) { 1500 nvme_tcp_req_copy_pdu(treq, pdu); 1501 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1502 nvme_tcp_accel_submit_crc32c(tgroup, treq, &treq->pdu->data_digest_crc32, 1503 treq->pdu->data_iov, treq->pdu->data_iovcnt, 0, 1504 nvme_tcp_accel_recv_compute_crc32_done, treq); 1505 return true; 1506 } 1507 1508 return false; 1509 } 1510 1511 static void 1512 nvme_tcp_pdu_payload_handle(struct nvme_tcp_qpair *tqpair, 1513 uint32_t *reaped) 1514 { 1515 int rc = 0; 1516 struct nvme_tcp_pdu *pdu = 
tqpair->recv_pdu; 1517 uint32_t crc32c; 1518 struct nvme_tcp_req *tcp_req = pdu->req; 1519 1520 assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); 1521 SPDK_DEBUGLOG(nvme, "enter\n"); 1522 1523 /* The request can be NULL, e.g. in case of C2HTermReq */ 1524 if (spdk_likely(tcp_req != NULL)) { 1525 tcp_req->expected_datao += pdu->data_len; 1526 } 1527 1528 /* check data digest if need */ 1529 if (pdu->ddgst_enable) { 1530 /* But if the data digest is enabled, tcp_req cannot be NULL */ 1531 assert(tcp_req != NULL); 1532 if (nvme_tcp_accel_recv_compute_crc32(tcp_req, pdu)) { 1533 return; 1534 } 1535 1536 crc32c = nvme_tcp_pdu_calc_data_digest(pdu); 1537 crc32c = crc32c ^ SPDK_CRC32C_XOR; 1538 rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c); 1539 if (rc == 0) { 1540 SPDK_ERRLOG("data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); 1541 tcp_req = pdu->req; 1542 assert(tcp_req != NULL); 1543 tcp_req->rsp.status.sc = SPDK_NVME_SC_COMMAND_TRANSIENT_TRANSPORT_ERROR; 1544 } 1545 } 1546 1547 _nvme_tcp_pdu_payload_handle(tqpair, reaped); 1548 } 1549 1550 static void 1551 nvme_tcp_send_icreq_complete(void *cb_arg) 1552 { 1553 struct nvme_tcp_qpair *tqpair = cb_arg; 1554 1555 SPDK_DEBUGLOG(nvme, "Complete the icreq send for tqpair=%p %u\n", tqpair, tqpair->qpair.id); 1556 1557 tqpair->flags.icreq_send_ack = true; 1558 1559 if (tqpair->state == NVME_TCP_QPAIR_STATE_INITIALIZING) { 1560 SPDK_DEBUGLOG(nvme, "tqpair %p %u, finalize icresp\n", tqpair, tqpair->qpair.id); 1561 tqpair->state = NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_SEND; 1562 } 1563 } 1564 1565 static void 1566 nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair, 1567 struct nvme_tcp_pdu *pdu) 1568 { 1569 struct spdk_nvme_tcp_ic_resp *ic_resp = &pdu->hdr.ic_resp; 1570 uint32_t error_offset = 0; 1571 enum spdk_nvme_tcp_term_req_fes fes; 1572 int recv_buf_size; 1573 1574 /* Only PFV 0 is defined currently */ 1575 if (ic_resp->pfv != 0) { 1576 SPDK_ERRLOG("Expected ICResp PFV %u, got %u\n", 0u, ic_resp->pfv); 1577 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1578 error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, pfv); 1579 goto end; 1580 } 1581 1582 if (ic_resp->maxh2cdata < NVME_TCP_PDU_H2C_MIN_DATA_SIZE) { 1583 SPDK_ERRLOG("Expected ICResp maxh2cdata >=%u, got %u\n", NVME_TCP_PDU_H2C_MIN_DATA_SIZE, 1584 ic_resp->maxh2cdata); 1585 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1586 error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, maxh2cdata); 1587 goto end; 1588 } 1589 tqpair->maxh2cdata = ic_resp->maxh2cdata; 1590 1591 if (ic_resp->cpda > SPDK_NVME_TCP_CPDA_MAX) { 1592 SPDK_ERRLOG("Expected ICResp cpda <=%u, got %u\n", SPDK_NVME_TCP_CPDA_MAX, ic_resp->cpda); 1593 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1594 error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, cpda); 1595 goto end; 1596 } 1597 tqpair->cpda = ic_resp->cpda; 1598 1599 tqpair->flags.host_hdgst_enable = ic_resp->dgst.bits.hdgst_enable ? true : false; 1600 tqpair->flags.host_ddgst_enable = ic_resp->dgst.bits.ddgst_enable ? true : false; 1601 SPDK_DEBUGLOG(nvme, "host_hdgst_enable: %u\n", tqpair->flags.host_hdgst_enable); 1602 SPDK_DEBUGLOG(nvme, "host_ddgst_enable: %u\n", tqpair->flags.host_ddgst_enable); 1603 1604 /* Now that we know whether digests are enabled, properly size the receive buffer to 1605 * handle several incoming 4K read commands according to SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR 1606 * parameter. 
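	 * For example, with both digests enabled each in-flight command is budgeted roughly
	 * 0x1000 bytes of payload plus the C2H data header and two 4-byte digests, and that
	 * per-command footprint is then scaled by SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR below.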
*/ 1607 recv_buf_size = 0x1000 + sizeof(struct spdk_nvme_tcp_c2h_data_hdr); 1608 1609 if (tqpair->flags.host_hdgst_enable) { 1610 recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; 1611 } 1612 1613 if (tqpair->flags.host_ddgst_enable) { 1614 recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; 1615 } 1616 1617 if (spdk_sock_set_recvbuf(tqpair->sock, recv_buf_size * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR) < 0) { 1618 SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n", 1619 tqpair, 1620 recv_buf_size); 1621 /* Not fatal. */ 1622 } 1623 1624 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1625 1626 if (!tqpair->flags.icreq_send_ack) { 1627 tqpair->state = NVME_TCP_QPAIR_STATE_INITIALIZING; 1628 SPDK_DEBUGLOG(nvme, "tqpair %p %u, waiting icreq ack\n", tqpair, tqpair->qpair.id); 1629 return; 1630 } 1631 1632 tqpair->state = NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_SEND; 1633 return; 1634 end: 1635 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1636 } 1637 1638 static void 1639 nvme_tcp_capsule_resp_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, 1640 uint32_t *reaped) 1641 { 1642 struct nvme_tcp_req *tcp_req; 1643 struct nvme_tcp_poll_group *tgroup; 1644 struct spdk_nvme_tcp_rsp *capsule_resp = &pdu->hdr.capsule_resp; 1645 uint32_t cid, error_offset = 0; 1646 enum spdk_nvme_tcp_term_req_fes fes; 1647 1648 SPDK_DEBUGLOG(nvme, "enter\n"); 1649 cid = capsule_resp->rccqe.cid; 1650 tcp_req = get_nvme_active_req_by_cid(tqpair, cid); 1651 1652 if (!tcp_req) { 1653 SPDK_ERRLOG("no tcp_req is found with cid=%u for tqpair=%p\n", cid, tqpair); 1654 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1655 error_offset = offsetof(struct spdk_nvme_tcp_rsp, rccqe); 1656 goto end; 1657 } 1658 1659 assert(tcp_req->req != NULL); 1660 1661 tcp_req->rsp = capsule_resp->rccqe; 1662 tcp_req->ordering.bits.data_recv = 1; 1663 1664 /* Recv the pdu again */ 1665 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1666 1667 if (tcp_req->req->accel_sequence != NULL) { 1668 tgroup = nvme_tcp_poll_group(tqpair->qpair.poll_group); 1669 nvme_tcp_accel_reverse_sequence(tgroup, tcp_req->req->accel_sequence); 1670 nvme_tcp_accel_finish_sequence(tgroup, tcp_req, tcp_req->req->accel_sequence, 1671 nvme_tcp_recv_payload_seq_cb, tcp_req); 1672 return; 1673 } 1674 1675 if (nvme_tcp_req_complete_safe(tcp_req)) { 1676 (*reaped)++; 1677 } 1678 1679 return; 1680 1681 end: 1682 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1683 } 1684 1685 static void 1686 nvme_tcp_c2h_term_req_hdr_handle(struct nvme_tcp_qpair *tqpair, 1687 struct nvme_tcp_pdu *pdu) 1688 { 1689 struct spdk_nvme_tcp_term_req_hdr *c2h_term_req = &pdu->hdr.term_req; 1690 uint32_t error_offset = 0; 1691 enum spdk_nvme_tcp_term_req_fes fes; 1692 1693 if (c2h_term_req->fes > SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER) { 1694 SPDK_ERRLOG("Fatal Error Status(FES) is unknown for c2h_term_req pdu=%p\n", pdu); 1695 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1696 error_offset = offsetof(struct spdk_nvme_tcp_term_req_hdr, fes); 1697 goto end; 1698 } 1699 1700 /* set the data buffer */ 1701 nvme_tcp_pdu_set_data(pdu, (uint8_t *)pdu->hdr.raw + c2h_term_req->common.hlen, 1702 c2h_term_req->common.plen - c2h_term_req->common.hlen); 1703 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); 1704 return; 1705 end: 1706 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1707 } 1708 
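/* Handle the header of a C2H DATA PDU: look up the request by cccid, validate the
 * SUCCESS/LAST_PDU flag combination and the datao/datal range against the request's
 * payload size, set up the destination iovec, and switch the qpair to
 * NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD. Any validation failure results in an
 * H2C termination request being sent to the controller. */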
1709 static void 1710 nvme_tcp_c2h_data_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu) 1711 { 1712 struct nvme_tcp_req *tcp_req; 1713 struct spdk_nvme_tcp_c2h_data_hdr *c2h_data = &pdu->hdr.c2h_data; 1714 uint32_t error_offset = 0; 1715 enum spdk_nvme_tcp_term_req_fes fes; 1716 int flags = c2h_data->common.flags; 1717 int rc; 1718 1719 SPDK_DEBUGLOG(nvme, "enter\n"); 1720 SPDK_DEBUGLOG(nvme, "c2h_data info on tqpair(%p): datao=%u, datal=%u, cccid=%d\n", 1721 tqpair, c2h_data->datao, c2h_data->datal, c2h_data->cccid); 1722 tcp_req = get_nvme_active_req_by_cid(tqpair, c2h_data->cccid); 1723 if (!tcp_req) { 1724 SPDK_ERRLOG("no tcp_req found for c2hdata cid=%d\n", c2h_data->cccid); 1725 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1726 error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, cccid); 1727 goto end; 1728 1729 } 1730 1731 SPDK_DEBUGLOG(nvme, "tcp_req(%p) on tqpair(%p): expected_datao=%u, payload_size=%u\n", 1732 tcp_req, tqpair, tcp_req->expected_datao, tcp_req->req->payload_size); 1733 1734 if (spdk_unlikely((flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS) && 1735 !(flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_LAST_PDU))) { 1736 SPDK_ERRLOG("Invalid flag flags=%d in c2h_data=%p\n", flags, c2h_data); 1737 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1738 error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, common); 1739 goto end; 1740 } 1741 1742 if (c2h_data->datal > tcp_req->req->payload_size) { 1743 SPDK_ERRLOG("Invalid datal for tcp_req(%p), datal(%u) exceeds payload_size(%u)\n", 1744 tcp_req, c2h_data->datal, tcp_req->req->payload_size); 1745 fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; 1746 goto end; 1747 } 1748 1749 if (tcp_req->expected_datao != c2h_data->datao) { 1750 SPDK_ERRLOG("Invalid datao for tcp_req(%p), received datal(%u) != expected datao(%u) in tcp_req\n", 1751 tcp_req, c2h_data->datao, tcp_req->expected_datao); 1752 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 1753 error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datao); 1754 goto end; 1755 } 1756 1757 if ((c2h_data->datao + c2h_data->datal) > tcp_req->req->payload_size) { 1758 SPDK_ERRLOG("Invalid data range for tcp_req(%p), received (datao(%u) + datal(%u)) > datao(%u) in tcp_req\n", 1759 tcp_req, c2h_data->datao, c2h_data->datal, tcp_req->req->payload_size); 1760 fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; 1761 error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datal); 1762 goto end; 1763 1764 } 1765 1766 if (nvme_payload_type(&tcp_req->req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { 1767 rc = nvme_tcp_build_contig_request(tqpair, tcp_req); 1768 } else { 1769 assert(nvme_payload_type(&tcp_req->req->payload) == NVME_PAYLOAD_TYPE_SGL); 1770 rc = nvme_tcp_build_sgl_request(tqpair, tcp_req); 1771 } 1772 1773 if (rc) { 1774 /* Not the right error message but at least it handles the failure. 
*/ 1775 fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED; 1776 goto end; 1777 } 1778 1779 nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt, 1780 c2h_data->datao, c2h_data->datal); 1781 pdu->req = tcp_req; 1782 1783 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); 1784 return; 1785 1786 end: 1787 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1788 } 1789 1790 static void 1791 nvme_tcp_qpair_h2c_data_send_complete(void *cb_arg) 1792 { 1793 struct nvme_tcp_req *tcp_req = cb_arg; 1794 1795 assert(tcp_req != NULL); 1796 1797 tcp_req->ordering.bits.send_ack = 1; 1798 if (tcp_req->r2tl_remain) { 1799 nvme_tcp_send_h2c_data(tcp_req); 1800 } else { 1801 assert(tcp_req->active_r2ts > 0); 1802 tcp_req->active_r2ts--; 1803 tcp_req->state = NVME_TCP_REQ_ACTIVE; 1804 1805 if (tcp_req->ordering.bits.r2t_waiting_h2c_complete) { 1806 tcp_req->ordering.bits.r2t_waiting_h2c_complete = 0; 1807 SPDK_DEBUGLOG(nvme, "tcp_req %p: continue r2t\n", tcp_req); 1808 assert(tcp_req->active_r2ts > 0); 1809 tcp_req->ttag = tcp_req->ttag_r2t_next; 1810 tcp_req->r2tl_remain = tcp_req->r2tl_remain_next; 1811 tcp_req->state = NVME_TCP_REQ_ACTIVE_R2T; 1812 nvme_tcp_send_h2c_data(tcp_req); 1813 return; 1814 } 1815 1816 if (tcp_req->ordering.bits.domain_in_use) { 1817 spdk_memory_domain_invalidate_data(tcp_req->req->payload.opts->memory_domain, 1818 tcp_req->req->payload.opts->memory_domain_ctx, tcp_req->iov, tcp_req->iovcnt); 1819 } 1820 1821 /* Need also call this function to free the resource */ 1822 nvme_tcp_req_complete_safe(tcp_req); 1823 } 1824 } 1825 1826 static void 1827 nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req) 1828 { 1829 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(tcp_req->req->qpair); 1830 struct nvme_tcp_pdu *rsp_pdu; 1831 struct spdk_nvme_tcp_h2c_data_hdr *h2c_data; 1832 uint32_t plen, pdo, alignment; 1833 1834 /* Reinit the send_ack and h2c_send_waiting_ack bits */ 1835 tcp_req->ordering.bits.send_ack = 0; 1836 tcp_req->ordering.bits.h2c_send_waiting_ack = 0; 1837 rsp_pdu = tcp_req->pdu; 1838 memset(rsp_pdu, 0, sizeof(*rsp_pdu)); 1839 rsp_pdu->req = tcp_req; 1840 h2c_data = &rsp_pdu->hdr.h2c_data; 1841 1842 h2c_data->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_DATA; 1843 plen = h2c_data->common.hlen = sizeof(*h2c_data); 1844 h2c_data->cccid = tcp_req->cid; 1845 h2c_data->ttag = tcp_req->ttag; 1846 h2c_data->datao = tcp_req->datao; 1847 1848 h2c_data->datal = spdk_min(tcp_req->r2tl_remain, tqpair->maxh2cdata); 1849 nvme_tcp_pdu_set_data_buf(rsp_pdu, tcp_req->iov, tcp_req->iovcnt, 1850 h2c_data->datao, h2c_data->datal); 1851 tcp_req->r2tl_remain -= h2c_data->datal; 1852 1853 if (tqpair->flags.host_hdgst_enable) { 1854 h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; 1855 plen += SPDK_NVME_TCP_DIGEST_LEN; 1856 } 1857 1858 rsp_pdu->padding_len = 0; 1859 pdo = plen; 1860 if (tqpair->cpda) { 1861 alignment = (tqpair->cpda + 1) << 2; 1862 if (alignment > plen) { 1863 rsp_pdu->padding_len = alignment - plen; 1864 pdo = plen = alignment; 1865 } 1866 } 1867 1868 h2c_data->common.pdo = pdo; 1869 plen += h2c_data->datal; 1870 if (tqpair->flags.host_ddgst_enable) { 1871 h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; 1872 plen += SPDK_NVME_TCP_DIGEST_LEN; 1873 } 1874 1875 h2c_data->common.plen = plen; 1876 tcp_req->datao += h2c_data->datal; 1877 if (!tcp_req->r2tl_remain) { 1878 h2c_data->common.flags |= SPDK_NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; 1879 } 1880 1881 SPDK_DEBUGLOG(nvme, "h2c_data info: datao=%u, datal=%u, 
pdu_len=%u for tqpair=%p\n",
		      h2c_data->datao, h2c_data->datal, h2c_data->common.plen, tqpair);

	nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_h2c_data_send_complete, tcp_req);
}

static void
nvme_tcp_r2t_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu)
{
	struct nvme_tcp_req *tcp_req;
	struct spdk_nvme_tcp_r2t_hdr *r2t = &pdu->hdr.r2t;
	uint32_t cid, error_offset = 0;
	enum spdk_nvme_tcp_term_req_fes fes;

	SPDK_DEBUGLOG(nvme, "enter\n");
	cid = r2t->cccid;
	tcp_req = get_nvme_active_req_by_cid(tqpair, cid);
	if (!tcp_req) {
		SPDK_ERRLOG("Cannot find tcp_req with cid=%u for tqpair=%p\n", cid, tqpair);
		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
		error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, cccid);
		goto end;
	}

	SPDK_DEBUGLOG(nvme, "r2t info: r2to=%u, r2tl=%u for tqpair=%p\n", r2t->r2to, r2t->r2tl,
		      tqpair);

	if (tcp_req->state == NVME_TCP_REQ_ACTIVE) {
		assert(tcp_req->active_r2ts == 0);
		tcp_req->state = NVME_TCP_REQ_ACTIVE_R2T;
	}

	if (tcp_req->datao != r2t->r2to) {
		fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
		error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2to);
		goto end;
	}

	if ((r2t->r2tl + r2t->r2to) > tcp_req->req->payload_size) {
		SPDK_ERRLOG("Invalid R2T info for tcp_req=%p: (r2to(%u) + r2tl(%u)) exceeds payload_size(%u)\n",
			    tcp_req, r2t->r2to, r2t->r2tl, tcp_req->req->payload_size);
		fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
		error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2tl);
		goto end;
	}

	tcp_req->active_r2ts++;
	if (spdk_unlikely(tcp_req->active_r2ts > tqpair->maxr2t)) {
		if (tcp_req->state == NVME_TCP_REQ_ACTIVE_R2T && !tcp_req->ordering.bits.send_ack) {
			/* We received a subsequent R2T while we are still waiting for the H2C transfer to complete */
			SPDK_DEBUGLOG(nvme, "received a subsequent R2T\n");
			assert(tcp_req->active_r2ts == tqpair->maxr2t + 1);
			tcp_req->ttag_r2t_next = r2t->ttag;
			tcp_req->r2tl_remain_next = r2t->r2tl;
			tcp_req->ordering.bits.r2t_waiting_h2c_complete = 1;
			nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
			return;
		} else {
			fes = SPDK_NVME_TCP_TERM_REQ_FES_R2T_LIMIT_EXCEEDED;
			SPDK_ERRLOG("Invalid R2T: Maximum number of R2T exceeded!
Max: %u for tqpair=%p\n", tqpair->maxr2t, 1942 tqpair); 1943 goto end; 1944 } 1945 } 1946 1947 tcp_req->ttag = r2t->ttag; 1948 tcp_req->r2tl_remain = r2t->r2tl; 1949 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 1950 1951 if (spdk_likely(tcp_req->ordering.bits.send_ack)) { 1952 nvme_tcp_send_h2c_data(tcp_req); 1953 } else { 1954 tcp_req->ordering.bits.h2c_send_waiting_ack = 1; 1955 } 1956 1957 return; 1958 1959 end: 1960 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1961 1962 } 1963 1964 static void 1965 nvme_tcp_pdu_psh_handle(struct nvme_tcp_qpair *tqpair, uint32_t *reaped) 1966 { 1967 struct nvme_tcp_pdu *pdu; 1968 int rc; 1969 uint32_t crc32c, error_offset = 0; 1970 enum spdk_nvme_tcp_term_req_fes fes; 1971 1972 assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); 1973 pdu = tqpair->recv_pdu; 1974 1975 SPDK_DEBUGLOG(nvme, "enter: pdu type =%u\n", pdu->hdr.common.pdu_type); 1976 /* check header digest if needed */ 1977 if (pdu->has_hdgst) { 1978 crc32c = nvme_tcp_pdu_calc_header_digest(pdu); 1979 rc = MATCH_DIGEST_WORD((uint8_t *)pdu->hdr.raw + pdu->hdr.common.hlen, crc32c); 1980 if (rc == 0) { 1981 SPDK_ERRLOG("header digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); 1982 fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; 1983 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 1984 return; 1985 1986 } 1987 } 1988 1989 switch (pdu->hdr.common.pdu_type) { 1990 case SPDK_NVME_TCP_PDU_TYPE_IC_RESP: 1991 nvme_tcp_icresp_handle(tqpair, pdu); 1992 break; 1993 case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP: 1994 nvme_tcp_capsule_resp_hdr_handle(tqpair, pdu, reaped); 1995 break; 1996 case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: 1997 nvme_tcp_c2h_data_hdr_handle(tqpair, pdu); 1998 break; 1999 2000 case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: 2001 nvme_tcp_c2h_term_req_hdr_handle(tqpair, pdu); 2002 break; 2003 case SPDK_NVME_TCP_PDU_TYPE_R2T: 2004 nvme_tcp_r2t_hdr_handle(tqpair, pdu); 2005 break; 2006 2007 default: 2008 SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu->hdr.common.pdu_type); 2009 fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; 2010 error_offset = 1; 2011 nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); 2012 break; 2013 } 2014 2015 } 2016 2017 static int 2018 nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped, uint32_t max_completions) 2019 { 2020 int rc = 0; 2021 struct nvme_tcp_pdu *pdu; 2022 uint32_t data_len; 2023 enum nvme_tcp_pdu_recv_state prev_state; 2024 2025 *reaped = tqpair->async_complete; 2026 tqpair->async_complete = 0; 2027 2028 /* The loop here is to allow for several back-to-back state changes. 
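	 * Each pass handles the current recv_state (common header, PDU-specific
	 * header, then payload), reads whatever bytes are available from the
	 * socket and advances the state; we keep looping until the state stops
	 * changing or max_completions has been reached.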
 */
	do {
		if (*reaped >= max_completions) {
			break;
		}

		prev_state = tqpair->recv_state;
		pdu = tqpair->recv_pdu;
		switch (tqpair->recv_state) {
		/* If in a new state */
		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY:
			memset(pdu, 0, sizeof(struct nvme_tcp_pdu));
			nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH);
			break;
		/* Wait for the pdu common header */
		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH:
			assert(pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr));
			rc = nvme_tcp_read_data(tqpair->sock,
						sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes,
						(uint8_t *)&pdu->hdr.common + pdu->ch_valid_bytes);
			if (rc < 0) {
				nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
				break;
			}
			pdu->ch_valid_bytes += rc;
			if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) {
				return NVME_TCP_PDU_IN_PROGRESS;
			}

			/* The common header of this PDU has now been read from the socket. */
			nvme_tcp_pdu_ch_handle(tqpair);
			break;
		/* Wait for the pdu specific header */
		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH:
			assert(pdu->psh_valid_bytes < pdu->psh_len);
			rc = nvme_tcp_read_data(tqpair->sock,
						pdu->psh_len - pdu->psh_valid_bytes,
						(uint8_t *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes);
			if (rc < 0) {
				nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
				break;
			}

			pdu->psh_valid_bytes += rc;
			if (pdu->psh_valid_bytes < pdu->psh_len) {
				return NVME_TCP_PDU_IN_PROGRESS;
			}

			/* The entire header (CH, PSH and header digest) of this PDU has now been read from the socket. */
			nvme_tcp_pdu_psh_handle(tqpair, reaped);
			break;
		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
			/* Check whether the data is valid; if not, just return. */
			if (!pdu->data_len) {
				return NVME_TCP_PDU_IN_PROGRESS;
			}

			data_len = pdu->data_len;
			/* data digest */
			if (spdk_unlikely((pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_C2H_DATA) &&
					  tqpair->flags.host_ddgst_enable)) {
				data_len += SPDK_NVME_TCP_DIGEST_LEN;
				pdu->ddgst_enable = true;
			}

			rc = nvme_tcp_read_payload_data(tqpair->sock, pdu);
			if (rc < 0) {
				nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
				break;
			}

			pdu->rw_offset += rc;
			if (pdu->rw_offset < data_len) {
				return NVME_TCP_PDU_IN_PROGRESS;
			}

			assert(pdu->rw_offset == data_len);
			/* All of this PDU has now been read from the socket. */
			nvme_tcp_pdu_payload_handle(tqpair, reaped);
			break;
		case NVME_TCP_PDU_RECV_STATE_QUIESCING:
			if (TAILQ_EMPTY(&tqpair->outstanding_reqs)) {
				if (nvme_qpair_get_state(&tqpair->qpair) == NVME_QPAIR_DISCONNECTING) {
					nvme_transport_ctrlr_disconnect_qpair_done(&tqpair->qpair);
				}

				nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
			}
			break;
		case NVME_TCP_PDU_RECV_STATE_ERROR:
			memset(pdu, 0, sizeof(struct nvme_tcp_pdu));
			return NVME_TCP_PDU_FATAL;
		default:
			assert(0);
			break;
		}
	} while (prev_state != tqpair->recv_state);

	return rc > 0 ?
0 : rc; 2127 } 2128 2129 static void 2130 nvme_tcp_qpair_check_timeout(struct spdk_nvme_qpair *qpair) 2131 { 2132 uint64_t t02; 2133 struct nvme_tcp_req *tcp_req, *tmp; 2134 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2135 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; 2136 struct spdk_nvme_ctrlr_process *active_proc; 2137 2138 /* Don't check timeouts during controller initialization. */ 2139 if (ctrlr->state != NVME_CTRLR_STATE_READY) { 2140 return; 2141 } 2142 2143 if (nvme_qpair_is_admin_queue(qpair)) { 2144 active_proc = nvme_ctrlr_get_current_process(ctrlr); 2145 } else { 2146 active_proc = qpair->active_proc; 2147 } 2148 2149 /* Only check timeouts if the current process has a timeout callback. */ 2150 if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { 2151 return; 2152 } 2153 2154 t02 = spdk_get_ticks(); 2155 TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { 2156 if (ctrlr->is_failed) { 2157 /* The controller state may be changed to failed in one of the nvme_request_check_timeout callbacks. */ 2158 return; 2159 } 2160 assert(tcp_req->req != NULL); 2161 2162 if (nvme_request_check_timeout(tcp_req->req, tcp_req->cid, active_proc, t02)) { 2163 /* 2164 * The requests are in order, so as soon as one has not timed out, 2165 * stop iterating. 2166 */ 2167 break; 2168 } 2169 } 2170 } 2171 2172 static int nvme_tcp_ctrlr_connect_qpair_poll(struct spdk_nvme_ctrlr *ctrlr, 2173 struct spdk_nvme_qpair *qpair); 2174 2175 static int 2176 nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) 2177 { 2178 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2179 uint32_t reaped; 2180 int rc; 2181 2182 if (qpair->poll_group == NULL) { 2183 rc = spdk_sock_flush(tqpair->sock); 2184 if (rc < 0 && errno != EAGAIN) { 2185 SPDK_ERRLOG("Failed to flush tqpair=%p (%d): %s\n", tqpair, 2186 errno, spdk_strerror(errno)); 2187 if (spdk_unlikely(tqpair->qpair.ctrlr->timeout_enabled)) { 2188 nvme_tcp_qpair_check_timeout(qpair); 2189 } 2190 2191 if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING) { 2192 if (TAILQ_EMPTY(&tqpair->outstanding_reqs)) { 2193 nvme_transport_ctrlr_disconnect_qpair_done(qpair); 2194 } 2195 2196 /* Don't return errors until the qpair gets disconnected */ 2197 return 0; 2198 } 2199 2200 goto fail; 2201 } 2202 } 2203 2204 if (max_completions == 0) { 2205 max_completions = spdk_max(tqpair->num_entries, 1); 2206 } else { 2207 max_completions = spdk_min(max_completions, tqpair->num_entries); 2208 } 2209 2210 reaped = 0; 2211 rc = nvme_tcp_read_pdu(tqpair, &reaped, max_completions); 2212 if (rc < 0) { 2213 SPDK_DEBUGLOG(nvme, "Error polling CQ! (%d): %s\n", 2214 errno, spdk_strerror(errno)); 2215 goto fail; 2216 } 2217 2218 if (spdk_unlikely(tqpair->qpair.ctrlr->timeout_enabled)) { 2219 nvme_tcp_qpair_check_timeout(qpair); 2220 } 2221 2222 if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING)) { 2223 rc = nvme_tcp_ctrlr_connect_qpair_poll(qpair->ctrlr, qpair); 2224 if (rc != 0 && rc != -EAGAIN) { 2225 SPDK_ERRLOG("Failed to connect tqpair=%p\n", tqpair); 2226 goto fail; 2227 } else if (rc == 0) { 2228 /* Once the connection is completed, we can submit queued requests */ 2229 nvme_qpair_resubmit_requests(qpair, tqpair->num_entries); 2230 } 2231 } 2232 2233 return reaped; 2234 fail: 2235 2236 /* 2237 * Since admin queues take the ctrlr_lock before entering this function, 2238 * we can call nvme_transport_ctrlr_disconnect_qpair. 
For other qpairs we need 2239 * to call the generic function which will take the lock for us. 2240 */ 2241 qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; 2242 2243 if (nvme_qpair_is_admin_queue(qpair)) { 2244 enum nvme_qpair_state state_prev = nvme_qpair_get_state(qpair); 2245 2246 nvme_transport_ctrlr_disconnect_qpair(qpair->ctrlr, qpair); 2247 2248 if (state_prev == NVME_QPAIR_CONNECTING && qpair->poll_status != NULL) { 2249 /* Needed to free the poll_status */ 2250 nvme_tcp_ctrlr_connect_qpair_poll(qpair->ctrlr, qpair); 2251 } 2252 } else { 2253 nvme_ctrlr_disconnect_qpair(qpair); 2254 } 2255 return -ENXIO; 2256 } 2257 2258 static void 2259 nvme_tcp_qpair_sock_cb(void *ctx, struct spdk_sock_group *group, struct spdk_sock *sock) 2260 { 2261 struct spdk_nvme_qpair *qpair = ctx; 2262 struct nvme_tcp_poll_group *pgroup = nvme_tcp_poll_group(qpair->poll_group); 2263 int32_t num_completions; 2264 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2265 2266 if (tqpair->needs_poll) { 2267 TAILQ_REMOVE(&pgroup->needs_poll, tqpair, link); 2268 tqpair->needs_poll = false; 2269 } 2270 2271 num_completions = spdk_nvme_qpair_process_completions(qpair, pgroup->completions_per_qpair); 2272 2273 if (pgroup->num_completions >= 0 && num_completions >= 0) { 2274 pgroup->num_completions += num_completions; 2275 pgroup->stats.nvme_completions += num_completions; 2276 } else { 2277 pgroup->num_completions = -ENXIO; 2278 } 2279 } 2280 2281 static int 2282 nvme_tcp_qpair_icreq_send(struct nvme_tcp_qpair *tqpair) 2283 { 2284 struct spdk_nvme_tcp_ic_req *ic_req; 2285 struct nvme_tcp_pdu *pdu; 2286 uint32_t timeout_in_sec; 2287 2288 pdu = tqpair->send_pdu; 2289 memset(tqpair->send_pdu, 0, sizeof(*tqpair->send_pdu)); 2290 ic_req = &pdu->hdr.ic_req; 2291 2292 ic_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_REQ; 2293 ic_req->common.hlen = ic_req->common.plen = sizeof(*ic_req); 2294 ic_req->pfv = 0; 2295 ic_req->maxr2t = NVME_TCP_MAX_R2T_DEFAULT - 1; 2296 ic_req->hpda = NVME_TCP_HPDA_DEFAULT; 2297 2298 ic_req->dgst.bits.hdgst_enable = tqpair->qpair.ctrlr->opts.header_digest; 2299 ic_req->dgst.bits.ddgst_enable = tqpair->qpair.ctrlr->opts.data_digest; 2300 2301 nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_send_icreq_complete, tqpair); 2302 2303 timeout_in_sec = tqpair->qpair.async ? 
ICREQ_TIMEOUT_ASYNC : ICREQ_TIMEOUT_SYNC; 2304 tqpair->icreq_timeout_tsc = spdk_get_ticks() + (timeout_in_sec * spdk_get_ticks_hz()); 2305 return 0; 2306 } 2307 2308 static int 2309 nvme_tcp_qpair_connect_sock(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 2310 { 2311 struct sockaddr_storage dst_addr; 2312 struct sockaddr_storage src_addr; 2313 int rc; 2314 struct nvme_tcp_qpair *tqpair; 2315 int family; 2316 long int port, src_port; 2317 char *sock_impl_name; 2318 struct spdk_sock_impl_opts impl_opts = {}; 2319 size_t impl_opts_size = sizeof(impl_opts); 2320 struct spdk_sock_opts opts; 2321 struct nvme_tcp_ctrlr *tcp_ctrlr; 2322 2323 tqpair = nvme_tcp_qpair(qpair); 2324 2325 switch (ctrlr->trid.adrfam) { 2326 case SPDK_NVMF_ADRFAM_IPV4: 2327 family = AF_INET; 2328 break; 2329 case SPDK_NVMF_ADRFAM_IPV6: 2330 family = AF_INET6; 2331 break; 2332 default: 2333 SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam); 2334 rc = -1; 2335 return rc; 2336 } 2337 2338 SPDK_DEBUGLOG(nvme, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family); 2339 2340 memset(&dst_addr, 0, sizeof(dst_addr)); 2341 2342 SPDK_DEBUGLOG(nvme, "trsvcid is %s\n", ctrlr->trid.trsvcid); 2343 rc = nvme_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid, &port); 2344 if (rc != 0) { 2345 SPDK_ERRLOG("dst_addr nvme_parse_addr() failed\n"); 2346 return rc; 2347 } 2348 2349 if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) { 2350 memset(&src_addr, 0, sizeof(src_addr)); 2351 rc = nvme_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid, &src_port); 2352 if (rc != 0) { 2353 SPDK_ERRLOG("src_addr nvme_parse_addr() failed\n"); 2354 return rc; 2355 } 2356 } 2357 2358 tcp_ctrlr = SPDK_CONTAINEROF(ctrlr, struct nvme_tcp_ctrlr, ctrlr); 2359 sock_impl_name = tcp_ctrlr->psk[0] ? "ssl" : NULL; 2360 SPDK_DEBUGLOG(nvme, "sock_impl_name is %s\n", sock_impl_name); 2361 2362 if (sock_impl_name) { 2363 spdk_sock_impl_get_opts(sock_impl_name, &impl_opts, &impl_opts_size); 2364 impl_opts.tls_version = SPDK_TLS_VERSION_1_3; 2365 impl_opts.psk_identity = tcp_ctrlr->psk_identity; 2366 impl_opts.psk_key = tcp_ctrlr->psk; 2367 impl_opts.psk_key_size = tcp_ctrlr->psk_size; 2368 impl_opts.tls_cipher_suites = tcp_ctrlr->tls_cipher_suite; 2369 } 2370 opts.opts_size = sizeof(opts); 2371 spdk_sock_get_default_opts(&opts); 2372 opts.priority = ctrlr->trid.priority; 2373 opts.zcopy = !nvme_qpair_is_admin_queue(qpair); 2374 if (ctrlr->opts.transport_ack_timeout) { 2375 opts.ack_timeout = 1ULL << ctrlr->opts.transport_ack_timeout; 2376 } 2377 if (sock_impl_name) { 2378 opts.impl_opts = &impl_opts; 2379 opts.impl_opts_size = sizeof(impl_opts); 2380 } 2381 tqpair->sock = spdk_sock_connect_ext(ctrlr->trid.traddr, port, sock_impl_name, &opts); 2382 if (!tqpair->sock) { 2383 SPDK_ERRLOG("sock connection error of tqpair=%p with addr=%s, port=%ld\n", 2384 tqpair, ctrlr->trid.traddr, port); 2385 rc = -1; 2386 return rc; 2387 } 2388 2389 return 0; 2390 } 2391 2392 static int 2393 nvme_tcp_ctrlr_connect_qpair_poll(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 2394 { 2395 struct nvme_tcp_qpair *tqpair; 2396 int rc; 2397 2398 tqpair = nvme_tcp_qpair(qpair); 2399 2400 /* Prevent this function from being called recursively, as it could lead to issues with 2401 * nvme_fabric_qpair_connect_poll() if the connect response is received in the recursive 2402 * call. 
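	 * The in_connect_poll flag below makes any nested invocation return -EAGAIN
	 * immediately instead of re-entering this state machine.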
2403 */ 2404 if (tqpair->flags.in_connect_poll) { 2405 return -EAGAIN; 2406 } 2407 2408 tqpair->flags.in_connect_poll = 1; 2409 2410 switch (tqpair->state) { 2411 case NVME_TCP_QPAIR_STATE_INVALID: 2412 case NVME_TCP_QPAIR_STATE_INITIALIZING: 2413 if (spdk_get_ticks() > tqpair->icreq_timeout_tsc) { 2414 SPDK_ERRLOG("Failed to construct the tqpair=%p via correct icresp\n", tqpair); 2415 rc = -ETIMEDOUT; 2416 break; 2417 } 2418 rc = -EAGAIN; 2419 break; 2420 case NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_SEND: 2421 rc = nvme_fabric_qpair_connect_async(&tqpair->qpair, tqpair->num_entries + 1); 2422 if (rc < 0) { 2423 SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); 2424 break; 2425 } 2426 tqpair->state = NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL; 2427 rc = -EAGAIN; 2428 break; 2429 case NVME_TCP_QPAIR_STATE_FABRIC_CONNECT_POLL: 2430 rc = nvme_fabric_qpair_connect_poll(&tqpair->qpair); 2431 if (rc == 0) { 2432 tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING; 2433 nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED); 2434 } else if (rc != -EAGAIN) { 2435 SPDK_ERRLOG("Failed to poll NVMe-oF Fabric CONNECT command\n"); 2436 } 2437 break; 2438 case NVME_TCP_QPAIR_STATE_RUNNING: 2439 rc = 0; 2440 break; 2441 default: 2442 assert(false); 2443 rc = -EINVAL; 2444 break; 2445 } 2446 2447 tqpair->flags.in_connect_poll = 0; 2448 return rc; 2449 } 2450 2451 static int 2452 nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) 2453 { 2454 int rc = 0; 2455 struct nvme_tcp_qpair *tqpair; 2456 struct nvme_tcp_poll_group *tgroup; 2457 2458 tqpair = nvme_tcp_qpair(qpair); 2459 2460 if (!tqpair->sock) { 2461 rc = nvme_tcp_qpair_connect_sock(ctrlr, qpair); 2462 if (rc < 0) { 2463 return rc; 2464 } 2465 } 2466 2467 if (qpair->poll_group) { 2468 rc = nvme_poll_group_connect_qpair(qpair); 2469 if (rc) { 2470 SPDK_ERRLOG("Unable to activate the tcp qpair.\n"); 2471 return rc; 2472 } 2473 tgroup = nvme_tcp_poll_group(qpair->poll_group); 2474 tqpair->stats = &tgroup->stats; 2475 tqpair->shared_stats = true; 2476 } else { 2477 /* When resetting a controller, we disconnect adminq and then reconnect. The stats 2478 * is not freed when disconnecting. So when reconnecting, don't allocate memory 2479 * again. 2480 */ 2481 if (tqpair->stats == NULL) { 2482 tqpair->stats = calloc(1, sizeof(*tqpair->stats)); 2483 if (!tqpair->stats) { 2484 SPDK_ERRLOG("tcp stats memory allocation failed\n"); 2485 return -ENOMEM; 2486 } 2487 } 2488 } 2489 2490 tqpair->maxr2t = NVME_TCP_MAX_R2T_DEFAULT; 2491 /* Explicitly set the state and recv_state of tqpair */ 2492 tqpair->state = NVME_TCP_QPAIR_STATE_INVALID; 2493 if (tqpair->recv_state != NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY) { 2494 nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); 2495 } 2496 rc = nvme_tcp_qpair_icreq_send(tqpair); 2497 if (rc != 0) { 2498 SPDK_ERRLOG("Unable to connect the tqpair\n"); 2499 return rc; 2500 } 2501 2502 return rc; 2503 } 2504 2505 static struct spdk_nvme_qpair * 2506 nvme_tcp_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr, 2507 uint16_t qid, uint32_t qsize, 2508 enum spdk_nvme_qprio qprio, 2509 uint32_t num_requests, bool async) 2510 { 2511 struct nvme_tcp_qpair *tqpair; 2512 struct spdk_nvme_qpair *qpair; 2513 int rc; 2514 2515 if (qsize < SPDK_NVME_QUEUE_MIN_ENTRIES) { 2516 SPDK_ERRLOG("Failed to create qpair with size %u. 
Minimum queue size is %d.\n",
			    qsize, SPDK_NVME_QUEUE_MIN_ENTRIES);
		return NULL;
	}

	tqpair = calloc(1, sizeof(struct nvme_tcp_qpair));
	if (!tqpair) {
		SPDK_ERRLOG("failed to create tqpair\n");
		return NULL;
	}

	/* Set num_entries to one less than the queue size. According to the NVMe
	 * and NVMe-oF specs we cannot submit queue-size requests;
	 * one slot shall always remain empty.
	 */
	tqpair->num_entries = qsize - 1;
	qpair = &tqpair->qpair;
	rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests, async);
	if (rc != 0) {
		free(tqpair);
		return NULL;
	}

	rc = nvme_tcp_alloc_reqs(tqpair);
	if (rc) {
		nvme_tcp_ctrlr_delete_io_qpair(ctrlr, qpair);
		return NULL;
	}

	/* spdk_nvme_qpair_get_optimal_poll_group needs socket information.
	 * So create the socket first when creating a qpair. */
	rc = nvme_tcp_qpair_connect_sock(ctrlr, qpair);
	if (rc) {
		nvme_tcp_ctrlr_delete_io_qpair(ctrlr, qpair);
		return NULL;
	}

	return qpair;
}

static struct spdk_nvme_qpair *
nvme_tcp_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
			       const struct spdk_nvme_io_qpair_opts *opts)
{
	return nvme_tcp_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio,
					   opts->io_queue_requests, opts->async_mode);
}

SPDK_LOG_DEPRECATION_REGISTER(nvme_ctrlr_psk, "spdk_nvme_ctrlr_opts.psk", "v24.09", 0);

static int
nvme_tcp_generate_tls_credentials(struct nvme_tcp_ctrlr *tctrlr)
{
	struct spdk_nvme_ctrlr *ctrlr = &tctrlr->ctrlr;
	int rc;
	uint8_t psk_retained[SPDK_TLS_PSK_MAX_LEN] = {};
	uint8_t psk_configured[SPDK_TLS_PSK_MAX_LEN] = {};
	uint8_t pskbuf[SPDK_TLS_PSK_MAX_LEN + 1] = {};
	uint8_t tls_cipher_suite;
	uint8_t psk_retained_hash;
	uint64_t psk_configured_size;
	uint8_t *psk;

	if (ctrlr->opts.tls_psk != NULL) {
		rc = spdk_key_get_key(ctrlr->opts.tls_psk, pskbuf, SPDK_TLS_PSK_MAX_LEN);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to obtain key '%s': %s\n",
				    spdk_key_get_name(ctrlr->opts.tls_psk), spdk_strerror(-rc));
			goto finish;
		}

		psk = pskbuf;
	} else {
		SPDK_LOG_DEPRECATED(nvme_ctrlr_psk);
		psk = ctrlr->opts.psk;
	}

	rc = nvme_tcp_parse_interchange_psk(psk, psk_configured, sizeof(psk_configured),
					    &psk_configured_size, &psk_retained_hash);
	if (rc < 0) {
		SPDK_ERRLOG("Failed to parse PSK interchange!\n");
		goto finish;
	}

	/* The Base64 string encodes the configured PSK (32 or 48 bytes binary).
	 * This check also ensures that psk_configured_size is smaller than
	 * the psk_retained buffer size.
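	 * A 32-byte configured PSK selects TLS_AES_128_GCM_SHA256 below, while a
	 * 48-byte configured PSK selects TLS_AES_256_GCM_SHA384; any other length
	 * is rejected with -ENOTSUP.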
*/ 2603 if (psk_configured_size == SHA256_DIGEST_LENGTH) { 2604 tls_cipher_suite = NVME_TCP_CIPHER_AES_128_GCM_SHA256; 2605 tctrlr->tls_cipher_suite = "TLS_AES_128_GCM_SHA256"; 2606 } else if (psk_configured_size == SHA384_DIGEST_LENGTH) { 2607 tls_cipher_suite = NVME_TCP_CIPHER_AES_256_GCM_SHA384; 2608 tctrlr->tls_cipher_suite = "TLS_AES_256_GCM_SHA384"; 2609 } else { 2610 SPDK_ERRLOG("Unrecognized cipher suite!\n"); 2611 rc = -ENOTSUP; 2612 goto finish; 2613 } 2614 2615 rc = nvme_tcp_generate_psk_identity(tctrlr->psk_identity, sizeof(tctrlr->psk_identity), 2616 ctrlr->opts.hostnqn, ctrlr->trid.subnqn, 2617 tls_cipher_suite); 2618 if (rc) { 2619 SPDK_ERRLOG("could not generate PSK identity\n"); 2620 goto finish; 2621 } 2622 2623 /* No hash indicates that Configured PSK must be used as Retained PSK. */ 2624 if (psk_retained_hash == NVME_TCP_HASH_ALGORITHM_NONE) { 2625 assert(psk_configured_size < sizeof(psk_retained)); 2626 memcpy(psk_retained, psk_configured, psk_configured_size); 2627 rc = psk_configured_size; 2628 } else { 2629 /* Derive retained PSK. */ 2630 rc = nvme_tcp_derive_retained_psk(psk_configured, psk_configured_size, ctrlr->opts.hostnqn, 2631 psk_retained, sizeof(psk_retained), psk_retained_hash); 2632 if (rc < 0) { 2633 SPDK_ERRLOG("Unable to derive retained PSK!\n"); 2634 goto finish; 2635 } 2636 } 2637 2638 rc = nvme_tcp_derive_tls_psk(psk_retained, rc, tctrlr->psk_identity, tctrlr->psk, 2639 sizeof(tctrlr->psk), tls_cipher_suite); 2640 if (rc < 0) { 2641 SPDK_ERRLOG("Could not generate TLS PSK!\n"); 2642 goto finish; 2643 } 2644 2645 tctrlr->psk_size = rc; 2646 rc = 0; 2647 finish: 2648 spdk_memset_s(psk_configured, sizeof(psk_configured), 0, sizeof(psk_configured)); 2649 spdk_memset_s(pskbuf, sizeof(pskbuf), 0, sizeof(pskbuf)); 2650 2651 return rc; 2652 } 2653 2654 /* We have to use the typedef in the function declaration to appease astyle. 
 */
typedef struct spdk_nvme_ctrlr spdk_nvme_ctrlr_t;

static spdk_nvme_ctrlr_t *
nvme_tcp_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
			 const struct spdk_nvme_ctrlr_opts *opts,
			 void *devhandle)
{
	struct nvme_tcp_ctrlr *tctrlr;
	int rc;

	tctrlr = calloc(1, sizeof(*tctrlr));
	if (tctrlr == NULL) {
		SPDK_ERRLOG("could not allocate ctrlr\n");
		return NULL;
	}

	tctrlr->ctrlr.opts = *opts;
	tctrlr->ctrlr.trid = *trid;

	if (opts->psk[0] != '\0' || opts->tls_psk != NULL) {
		/* Only allow one of them to be set at a time */
		if (opts->tls_psk != NULL && opts->psk[0] != '\0') {
			SPDK_ERRLOG("Only one of spdk_nvme_ctrlr_opts.tls_psk and .psk can be set at "
				    "a time\n");
			free(tctrlr);
			return NULL;
		}
		rc = nvme_tcp_generate_tls_credentials(tctrlr);
		spdk_memset_s(&tctrlr->ctrlr.opts.psk, sizeof(tctrlr->ctrlr.opts.psk), 0,
			      sizeof(tctrlr->ctrlr.opts.psk));

		if (rc != 0) {
			free(tctrlr);
			return NULL;
		}
	}

	if (opts->transport_ack_timeout > NVME_TCP_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT) {
		SPDK_NOTICELOG("transport_ack_timeout exceeds max value %d, use max value\n",
			       NVME_TCP_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT);
		tctrlr->ctrlr.opts.transport_ack_timeout = NVME_TCP_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT;
	}

	rc = nvme_ctrlr_construct(&tctrlr->ctrlr);
	if (rc != 0) {
		free(tctrlr);
		return NULL;
	}

	/* Sequence might be used not only for data digest offload purposes but
	 * to handle a potential COPY operation appended as the result of translation. */
	tctrlr->ctrlr.flags |= SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED;
	tctrlr->ctrlr.adminq = nvme_tcp_ctrlr_create_qpair(&tctrlr->ctrlr, 0,
			       tctrlr->ctrlr.opts.admin_queue_size, 0,
			       tctrlr->ctrlr.opts.admin_queue_size, true);
	if (!tctrlr->ctrlr.adminq) {
		SPDK_ERRLOG("failed to create admin qpair\n");
		nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr);
		return NULL;
	}

	if (nvme_ctrlr_add_process(&tctrlr->ctrlr, 0) != 0) {
		SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n");
		nvme_ctrlr_destruct(&tctrlr->ctrlr);
		return NULL;
	}

	return &tctrlr->ctrlr;
}

static uint32_t
nvme_tcp_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
{
	/* TCP transport doesn't limit the maximum IO transfer size.
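	 * Report UINT32_MAX and let the generic layer cap the transfer size based
	 * on what the controller itself advertises (e.g. MDTS).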
*/ 2729 return UINT32_MAX; 2730 } 2731 2732 static uint16_t 2733 nvme_tcp_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) 2734 { 2735 return NVME_TCP_MAX_SGL_DESCRIPTORS; 2736 } 2737 2738 static int 2739 nvme_tcp_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, 2740 int (*iter_fn)(struct nvme_request *req, void *arg), 2741 void *arg) 2742 { 2743 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2744 struct nvme_tcp_req *tcp_req, *tmp; 2745 int rc; 2746 2747 assert(iter_fn != NULL); 2748 2749 TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { 2750 assert(tcp_req->req != NULL); 2751 2752 rc = iter_fn(tcp_req->req, arg); 2753 if (rc != 0) { 2754 return rc; 2755 } 2756 } 2757 2758 return 0; 2759 } 2760 2761 static void 2762 nvme_tcp_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) 2763 { 2764 struct nvme_tcp_req *tcp_req, *tmp; 2765 struct spdk_nvme_cpl cpl = {}; 2766 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2767 2768 cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; 2769 cpl.status.sct = SPDK_NVME_SCT_GENERIC; 2770 2771 TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { 2772 assert(tcp_req->req != NULL); 2773 if (tcp_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { 2774 continue; 2775 } 2776 2777 nvme_tcp_req_complete(tcp_req, tqpair, &cpl, false); 2778 } 2779 } 2780 2781 static struct spdk_nvme_transport_poll_group * 2782 nvme_tcp_poll_group_create(void) 2783 { 2784 struct nvme_tcp_poll_group *group = calloc(1, sizeof(*group)); 2785 2786 if (group == NULL) { 2787 SPDK_ERRLOG("Unable to allocate poll group.\n"); 2788 return NULL; 2789 } 2790 2791 TAILQ_INIT(&group->needs_poll); 2792 2793 group->sock_group = spdk_sock_group_create(group); 2794 if (group->sock_group == NULL) { 2795 free(group); 2796 SPDK_ERRLOG("Unable to allocate sock group.\n"); 2797 return NULL; 2798 } 2799 2800 return &group->group; 2801 } 2802 2803 static struct spdk_nvme_transport_poll_group * 2804 nvme_tcp_qpair_get_optimal_poll_group(struct spdk_nvme_qpair *qpair) 2805 { 2806 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2807 struct spdk_sock_group *group = NULL; 2808 int rc; 2809 2810 rc = spdk_sock_get_optimal_sock_group(tqpair->sock, &group, NULL); 2811 if (!rc && group != NULL) { 2812 return spdk_sock_group_get_ctx(group); 2813 } 2814 2815 return NULL; 2816 } 2817 2818 static int 2819 nvme_tcp_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) 2820 { 2821 struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group); 2822 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2823 2824 if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) { 2825 return -EPROTO; 2826 } 2827 return 0; 2828 } 2829 2830 static int 2831 nvme_tcp_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) 2832 { 2833 struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group); 2834 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2835 2836 if (tqpair->needs_poll) { 2837 TAILQ_REMOVE(&group->needs_poll, tqpair, link); 2838 tqpair->needs_poll = false; 2839 } 2840 2841 if (tqpair->sock && group->sock_group) { 2842 if (spdk_sock_group_remove_sock(group->sock_group, tqpair->sock)) { 2843 return -EPROTO; 2844 } 2845 } 2846 return 0; 2847 } 2848 2849 static int 2850 nvme_tcp_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, 2851 struct spdk_nvme_qpair *qpair) 2852 { 2853 struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); 2854 struct nvme_tcp_poll_group *group = 
nvme_tcp_poll_group(tgroup); 2855 2856 /* disconnected qpairs won't have a sock to add. */ 2857 if (nvme_qpair_get_state(qpair) >= NVME_QPAIR_CONNECTED) { 2858 if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) { 2859 return -EPROTO; 2860 } 2861 } 2862 2863 return 0; 2864 } 2865 2866 static int 2867 nvme_tcp_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, 2868 struct spdk_nvme_qpair *qpair) 2869 { 2870 struct nvme_tcp_qpair *tqpair; 2871 struct nvme_tcp_poll_group *group; 2872 2873 assert(qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs); 2874 2875 tqpair = nvme_tcp_qpair(qpair); 2876 group = nvme_tcp_poll_group(tgroup); 2877 2878 assert(tqpair->shared_stats == true); 2879 tqpair->stats = &g_dummy_stats; 2880 2881 if (tqpair->needs_poll) { 2882 TAILQ_REMOVE(&group->needs_poll, tqpair, link); 2883 tqpair->needs_poll = false; 2884 } 2885 2886 return 0; 2887 } 2888 2889 static int64_t 2890 nvme_tcp_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, 2891 uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) 2892 { 2893 struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); 2894 struct spdk_nvme_qpair *qpair, *tmp_qpair; 2895 struct nvme_tcp_qpair *tqpair, *tmp_tqpair; 2896 int num_events; 2897 2898 group->completions_per_qpair = completions_per_qpair; 2899 group->num_completions = 0; 2900 group->stats.polls++; 2901 2902 num_events = spdk_sock_group_poll(group->sock_group); 2903 2904 STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { 2905 tqpair = nvme_tcp_qpair(qpair); 2906 if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING) { 2907 if (TAILQ_EMPTY(&tqpair->outstanding_reqs)) { 2908 nvme_transport_ctrlr_disconnect_qpair_done(qpair); 2909 } 2910 } 2911 /* Wait until the qpair transitions to the DISCONNECTED state, otherwise user might 2912 * want to free it from disconnect_qpair_cb, while it's not fully disconnected (and 2913 * might still have outstanding requests) */ 2914 if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTED) { 2915 disconnected_qpair_cb(qpair, tgroup->group->ctx); 2916 } 2917 } 2918 2919 /* If any qpairs were marked as needing to be polled due to an asynchronous write completion 2920 * and they weren't polled as a consequence of calling spdk_sock_group_poll above, poll them now. 
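	 * A qpair is put on needs_poll when one of its PDU writes completes
	 * asynchronously outside of the socket callback; nvme_tcp_qpair_sock_cb()
	 * removes it from the list again before processing completions.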
*/ 2921 TAILQ_FOREACH_SAFE(tqpair, &group->needs_poll, link, tmp_tqpair) { 2922 nvme_tcp_qpair_sock_cb(&tqpair->qpair, group->sock_group, tqpair->sock); 2923 } 2924 2925 if (spdk_unlikely(num_events < 0)) { 2926 return num_events; 2927 } 2928 2929 group->stats.idle_polls += !num_events; 2930 group->stats.socket_completions += num_events; 2931 2932 return group->num_completions; 2933 } 2934 2935 static int 2936 nvme_tcp_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) 2937 { 2938 int rc; 2939 struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); 2940 2941 if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { 2942 return -EBUSY; 2943 } 2944 2945 rc = spdk_sock_group_close(&group->sock_group); 2946 if (rc != 0) { 2947 SPDK_ERRLOG("Failed to close the sock group for a tcp poll group.\n"); 2948 assert(false); 2949 } 2950 2951 free(tgroup); 2952 2953 return 0; 2954 } 2955 2956 static int 2957 nvme_tcp_poll_group_get_stats(struct spdk_nvme_transport_poll_group *tgroup, 2958 struct spdk_nvme_transport_poll_group_stat **_stats) 2959 { 2960 struct nvme_tcp_poll_group *group; 2961 struct spdk_nvme_transport_poll_group_stat *stats; 2962 2963 if (tgroup == NULL || _stats == NULL) { 2964 SPDK_ERRLOG("Invalid stats or group pointer\n"); 2965 return -EINVAL; 2966 } 2967 2968 group = nvme_tcp_poll_group(tgroup); 2969 2970 stats = calloc(1, sizeof(*stats)); 2971 if (!stats) { 2972 SPDK_ERRLOG("Can't allocate memory for TCP stats\n"); 2973 return -ENOMEM; 2974 } 2975 stats->trtype = SPDK_NVME_TRANSPORT_TCP; 2976 memcpy(&stats->tcp, &group->stats, sizeof(group->stats)); 2977 2978 *_stats = stats; 2979 2980 return 0; 2981 } 2982 2983 static void 2984 nvme_tcp_poll_group_free_stats(struct spdk_nvme_transport_poll_group *tgroup, 2985 struct spdk_nvme_transport_poll_group_stat *stats) 2986 { 2987 free(stats); 2988 } 2989 2990 static int 2991 nvme_tcp_ctrlr_get_memory_domains(const struct spdk_nvme_ctrlr *ctrlr, 2992 struct spdk_memory_domain **domains, int array_size) 2993 { 2994 if (domains && array_size > 0) { 2995 domains[0] = spdk_memory_domain_get_system_domain(); 2996 } 2997 2998 return 1; 2999 } 3000 3001 const struct spdk_nvme_transport_ops tcp_ops = { 3002 .name = "TCP", 3003 .type = SPDK_NVME_TRANSPORT_TCP, 3004 .ctrlr_construct = nvme_tcp_ctrlr_construct, 3005 .ctrlr_scan = nvme_fabric_ctrlr_scan, 3006 .ctrlr_destruct = nvme_tcp_ctrlr_destruct, 3007 .ctrlr_enable = nvme_tcp_ctrlr_enable, 3008 3009 .ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4, 3010 .ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8, 3011 .ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4, 3012 .ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8, 3013 .ctrlr_set_reg_4_async = nvme_fabric_ctrlr_set_reg_4_async, 3014 .ctrlr_set_reg_8_async = nvme_fabric_ctrlr_set_reg_8_async, 3015 .ctrlr_get_reg_4_async = nvme_fabric_ctrlr_get_reg_4_async, 3016 .ctrlr_get_reg_8_async = nvme_fabric_ctrlr_get_reg_8_async, 3017 3018 .ctrlr_get_max_xfer_size = nvme_tcp_ctrlr_get_max_xfer_size, 3019 .ctrlr_get_max_sges = nvme_tcp_ctrlr_get_max_sges, 3020 3021 .ctrlr_create_io_qpair = nvme_tcp_ctrlr_create_io_qpair, 3022 .ctrlr_delete_io_qpair = nvme_tcp_ctrlr_delete_io_qpair, 3023 .ctrlr_connect_qpair = nvme_tcp_ctrlr_connect_qpair, 3024 .ctrlr_disconnect_qpair = nvme_tcp_ctrlr_disconnect_qpair, 3025 3026 .ctrlr_get_memory_domains = nvme_tcp_ctrlr_get_memory_domains, 3027 3028 .qpair_abort_reqs = nvme_tcp_qpair_abort_reqs, 3029 .qpair_reset = nvme_tcp_qpair_reset, 3030 .qpair_submit_request = 
nvme_tcp_qpair_submit_request, 3031 .qpair_process_completions = nvme_tcp_qpair_process_completions, 3032 .qpair_iterate_requests = nvme_tcp_qpair_iterate_requests, 3033 .admin_qpair_abort_aers = nvme_tcp_admin_qpair_abort_aers, 3034 3035 .poll_group_create = nvme_tcp_poll_group_create, 3036 .qpair_get_optimal_poll_group = nvme_tcp_qpair_get_optimal_poll_group, 3037 .poll_group_connect_qpair = nvme_tcp_poll_group_connect_qpair, 3038 .poll_group_disconnect_qpair = nvme_tcp_poll_group_disconnect_qpair, 3039 .poll_group_add = nvme_tcp_poll_group_add, 3040 .poll_group_remove = nvme_tcp_poll_group_remove, 3041 .poll_group_process_completions = nvme_tcp_poll_group_process_completions, 3042 .poll_group_destroy = nvme_tcp_poll_group_destroy, 3043 .poll_group_get_stats = nvme_tcp_poll_group_get_stats, 3044 .poll_group_free_stats = nvme_tcp_poll_group_free_stats, 3045 }; 3046 3047 SPDK_NVME_TRANSPORT_REGISTER(tcp, &tcp_ops); 3048 3049 SPDK_TRACE_REGISTER_FN(nvme_tcp, "nvme_tcp", TRACE_GROUP_NVME_TCP) 3050 { 3051 struct spdk_trace_tpoint_opts opts[] = { 3052 { 3053 "NVME_TCP_SUBMIT", TRACE_NVME_TCP_SUBMIT, 3054 OWNER_TYPE_NVME_TCP_QP, OBJECT_NVME_TCP_REQ, 1, 3055 { { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 3056 { "cid", SPDK_TRACE_ARG_TYPE_INT, 4 }, 3057 { "opc", SPDK_TRACE_ARG_TYPE_INT, 4 }, 3058 { "dw10", SPDK_TRACE_ARG_TYPE_PTR, 4 }, 3059 { "dw11", SPDK_TRACE_ARG_TYPE_PTR, 4 }, 3060 { "dw12", SPDK_TRACE_ARG_TYPE_PTR, 4 }, 3061 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 3062 } 3063 }, 3064 { 3065 "NVME_TCP_COMPLETE", TRACE_NVME_TCP_COMPLETE, 3066 OWNER_TYPE_NVME_TCP_QP, OBJECT_NVME_TCP_REQ, 0, 3067 { { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 3068 { "cid", SPDK_TRACE_ARG_TYPE_INT, 4 }, 3069 { "cpl", SPDK_TRACE_ARG_TYPE_PTR, 4 }, 3070 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 3071 } 3072 }, 3073 }; 3074 3075 spdk_trace_register_object(OBJECT_NVME_TCP_REQ, 'p'); 3076 spdk_trace_register_owner_type(OWNER_TYPE_NVME_TCP_QP, 'q'); 3077 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 3078 3079 spdk_trace_tpoint_register_relation(TRACE_SOCK_REQ_QUEUE, OBJECT_NVME_TCP_REQ, 0); 3080 spdk_trace_tpoint_register_relation(TRACE_SOCK_REQ_PEND, OBJECT_NVME_TCP_REQ, 0); 3081 spdk_trace_tpoint_register_relation(TRACE_SOCK_REQ_COMPLETE, OBJECT_NVME_TCP_REQ, 0); 3082 } 3083