1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2018 Intel Corporation. All rights reserved. 3 * Copyright (c) 2020, 2021 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #if defined(__FreeBSD__) 10 #include <sys/event.h> 11 #define SPDK_KEVENT 12 #else 13 #include <sys/epoll.h> 14 #define SPDK_EPOLL 15 #endif 16 17 #if defined(__linux__) 18 #include <linux/errqueue.h> 19 #endif 20 21 #include "spdk/env.h" 22 #include "spdk/log.h" 23 #include "spdk/pipe.h" 24 #include "spdk/sock.h" 25 #include "spdk/util.h" 26 #include "spdk/string.h" 27 #include "spdk_internal/sock.h" 28 #include "../sock_kernel.h" 29 30 #include "openssl/crypto.h" 31 #include "openssl/err.h" 32 #include "openssl/ssl.h" 33 34 #define MAX_TMPBUF 1024 35 #define PORTNUMLEN 32 36 37 #if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY) 38 #define SPDK_ZEROCOPY 39 #endif 40 41 struct spdk_posix_sock { 42 struct spdk_sock base; 43 int fd; 44 45 uint32_t sendmsg_idx; 46 47 struct spdk_pipe *recv_pipe; 48 void *recv_buf; 49 int recv_buf_sz; 50 bool pipe_has_data; 51 bool socket_has_data; 52 bool zcopy; 53 54 int placement_id; 55 56 SSL_CTX *ctx; 57 SSL *ssl; 58 59 TAILQ_ENTRY(spdk_posix_sock) link; 60 }; 61 62 TAILQ_HEAD(spdk_has_data_list, spdk_posix_sock); 63 64 struct spdk_posix_sock_group_impl { 65 struct spdk_sock_group_impl base; 66 int fd; 67 struct spdk_has_data_list socks_with_data; 68 int placement_id; 69 }; 70 71 static struct spdk_sock_impl_opts g_posix_impl_opts = { 72 .recv_buf_size = DEFAULT_SO_RCVBUF_SIZE, 73 .send_buf_size = DEFAULT_SO_SNDBUF_SIZE, 74 .enable_recv_pipe = true, 75 .enable_quickack = false, 76 .enable_placement_id = PLACEMENT_NONE, 77 .enable_zerocopy_send_server = true, 78 .enable_zerocopy_send_client = false, 79 .zerocopy_threshold = 0, 80 .tls_version = 0, 81 .enable_ktls = false, 82 .psk_key = NULL, 83 .psk_key_size = 0, 84 .psk_identity = NULL, 85 .get_key = NULL, 86 .get_key_ctx = NULL, 87 .tls_cipher_suites = NULL 88 }; 89 90 static struct spdk_sock_impl_opts g_ssl_impl_opts = { 91 .recv_buf_size = MIN_SO_RCVBUF_SIZE, 92 .send_buf_size = MIN_SO_SNDBUF_SIZE, 93 .enable_recv_pipe = true, 94 .enable_quickack = false, 95 .enable_placement_id = PLACEMENT_NONE, 96 .enable_zerocopy_send_server = true, 97 .enable_zerocopy_send_client = false, 98 .zerocopy_threshold = 0, 99 .tls_version = 0, 100 .enable_ktls = false, 101 .psk_key = NULL, 102 .psk_identity = NULL 103 }; 104 105 static struct spdk_sock_map g_map = { 106 .entries = STAILQ_HEAD_INITIALIZER(g_map.entries), 107 .mtx = PTHREAD_MUTEX_INITIALIZER 108 }; 109 110 __attribute((destructor)) static void 111 posix_sock_map_cleanup(void) 112 { 113 spdk_sock_map_cleanup(&g_map); 114 } 115 116 #define __posix_sock(sock) (struct spdk_posix_sock *)sock 117 #define __posix_group_impl(group) (struct spdk_posix_sock_group_impl *)group 118 119 static void 120 posix_sock_copy_impl_opts(struct spdk_sock_impl_opts *dest, const struct spdk_sock_impl_opts *src, 121 size_t len) 122 { 123 #define FIELD_OK(field) \ 124 offsetof(struct spdk_sock_impl_opts, field) + sizeof(src->field) <= len 125 126 #define SET_FIELD(field) \ 127 if (FIELD_OK(field)) { \ 128 dest->field = src->field; \ 129 } 130 131 SET_FIELD(recv_buf_size); 132 SET_FIELD(send_buf_size); 133 SET_FIELD(enable_recv_pipe); 134 SET_FIELD(enable_zerocopy_send); 135 SET_FIELD(enable_quickack); 136 SET_FIELD(enable_placement_id); 137 SET_FIELD(enable_zerocopy_send_server); 138 SET_FIELD(enable_zerocopy_send_client); 139 SET_FIELD(zerocopy_threshold); 140 SET_FIELD(tls_version); 141 SET_FIELD(enable_ktls); 142 SET_FIELD(psk_key); 143 SET_FIELD(psk_key_size); 144 SET_FIELD(psk_identity); 145 SET_FIELD(get_key); 146 SET_FIELD(get_key_ctx); 147 SET_FIELD(tls_cipher_suites); 148 149 #undef SET_FIELD 150 #undef FIELD_OK 151 } 152 153 static int 154 _sock_impl_get_opts(struct spdk_sock_impl_opts *opts, struct spdk_sock_impl_opts *impl_opts, 155 size_t *len) 156 { 157 if (!opts || !len) { 158 errno = EINVAL; 159 return -1; 160 } 161 162 assert(sizeof(*opts) >= *len); 163 memset(opts, 0, *len); 164 165 posix_sock_copy_impl_opts(opts, impl_opts, *len); 166 *len = spdk_min(*len, sizeof(*impl_opts)); 167 168 return 0; 169 } 170 171 static int 172 posix_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) 173 { 174 return _sock_impl_get_opts(opts, &g_posix_impl_opts, len); 175 } 176 177 static int 178 ssl_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) 179 { 180 return _sock_impl_get_opts(opts, &g_ssl_impl_opts, len); 181 } 182 183 static int 184 _sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, struct spdk_sock_impl_opts *impl_opts, 185 size_t len) 186 { 187 if (!opts) { 188 errno = EINVAL; 189 return -1; 190 } 191 192 assert(sizeof(*opts) >= len); 193 posix_sock_copy_impl_opts(impl_opts, opts, len); 194 195 return 0; 196 } 197 198 static int 199 posix_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) 200 { 201 return _sock_impl_set_opts(opts, &g_posix_impl_opts, len); 202 } 203 204 static int 205 ssl_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) 206 { 207 return _sock_impl_set_opts(opts, &g_ssl_impl_opts, len); 208 } 209 210 static void 211 _opts_get_impl_opts(const struct spdk_sock_opts *opts, struct spdk_sock_impl_opts *dest, 212 const struct spdk_sock_impl_opts *default_impl) 213 { 214 /* Copy the default impl_opts first to cover cases when user's impl_opts is smaller */ 215 memcpy(dest, default_impl, sizeof(*dest)); 216 217 if (opts->impl_opts != NULL) { 218 assert(sizeof(*dest) >= opts->impl_opts_size); 219 posix_sock_copy_impl_opts(dest, opts->impl_opts, opts->impl_opts_size); 220 } 221 } 222 223 static int 224 posix_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, 225 char *caddr, int clen, uint16_t *cport) 226 { 227 struct spdk_posix_sock *sock = __posix_sock(_sock); 228 struct sockaddr_storage sa; 229 socklen_t salen; 230 int rc; 231 232 assert(sock != NULL); 233 234 memset(&sa, 0, sizeof sa); 235 salen = sizeof sa; 236 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 237 if (rc != 0) { 238 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 239 return -1; 240 } 241 242 switch (sa.ss_family) { 243 case AF_UNIX: 244 /* Acceptable connection types that don't have IPs */ 245 return 0; 246 case AF_INET: 247 case AF_INET6: 248 /* Code below will get IP addresses */ 249 break; 250 default: 251 /* Unsupported socket family */ 252 return -1; 253 } 254 255 rc = get_addr_str((struct sockaddr *)&sa, saddr, slen); 256 if (rc != 0) { 257 SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); 258 return -1; 259 } 260 261 if (sport) { 262 if (sa.ss_family == AF_INET) { 263 *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port); 264 } else if (sa.ss_family == AF_INET6) { 265 *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); 266 } 267 } 268 269 memset(&sa, 0, sizeof sa); 270 salen = sizeof sa; 271 rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen); 272 if (rc != 0) { 273 SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno); 274 return -1; 275 } 276 277 rc = get_addr_str((struct sockaddr *)&sa, caddr, clen); 278 if (rc != 0) { 279 SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); 280 return -1; 281 } 282 283 if (cport) { 284 if (sa.ss_family == AF_INET) { 285 *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port); 286 } else if (sa.ss_family == AF_INET6) { 287 *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); 288 } 289 } 290 291 return 0; 292 } 293 294 enum posix_sock_create_type { 295 SPDK_SOCK_CREATE_LISTEN, 296 SPDK_SOCK_CREATE_CONNECT, 297 }; 298 299 static int 300 posix_sock_alloc_pipe(struct spdk_posix_sock *sock, int sz) 301 { 302 uint8_t *new_buf; 303 struct spdk_pipe *new_pipe; 304 struct iovec siov[2]; 305 struct iovec diov[2]; 306 int sbytes; 307 ssize_t bytes; 308 int rc; 309 310 if (sock->recv_buf_sz == sz) { 311 return 0; 312 } 313 314 /* If the new size is 0, just free the pipe */ 315 if (sz == 0) { 316 spdk_pipe_destroy(sock->recv_pipe); 317 free(sock->recv_buf); 318 sock->recv_pipe = NULL; 319 sock->recv_buf = NULL; 320 return 0; 321 } else if (sz < MIN_SOCK_PIPE_SIZE) { 322 SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE); 323 return -1; 324 } 325 326 /* Round up to next 64 byte multiple */ 327 rc = posix_memalign((void **)&new_buf, 64, sz); 328 if (rc != 0) { 329 SPDK_ERRLOG("socket recv buf allocation failed\n"); 330 return -ENOMEM; 331 } 332 memset(new_buf, 0, sz); 333 334 new_pipe = spdk_pipe_create(new_buf, sz); 335 if (new_pipe == NULL) { 336 SPDK_ERRLOG("socket pipe allocation failed\n"); 337 free(new_buf); 338 return -ENOMEM; 339 } 340 341 if (sock->recv_pipe != NULL) { 342 /* Pull all of the data out of the old pipe */ 343 sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); 344 if (sbytes > sz) { 345 /* Too much data to fit into the new pipe size */ 346 spdk_pipe_destroy(new_pipe); 347 free(new_buf); 348 return -EINVAL; 349 } 350 351 sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov); 352 assert(sbytes == sz); 353 354 bytes = spdk_iovcpy(siov, 2, diov, 2); 355 spdk_pipe_writer_advance(new_pipe, bytes); 356 357 spdk_pipe_destroy(sock->recv_pipe); 358 free(sock->recv_buf); 359 } 360 361 sock->recv_buf_sz = sz; 362 sock->recv_buf = new_buf; 363 sock->recv_pipe = new_pipe; 364 365 return 0; 366 } 367 368 static int 369 posix_sock_set_recvbuf(struct spdk_sock *_sock, int sz) 370 { 371 struct spdk_posix_sock *sock = __posix_sock(_sock); 372 int min_size; 373 int rc; 374 375 assert(sock != NULL); 376 377 if (_sock->impl_opts.enable_recv_pipe) { 378 rc = posix_sock_alloc_pipe(sock, sz); 379 if (rc) { 380 return rc; 381 } 382 } 383 384 /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE and 385 * _sock->impl_opts.recv_buf_size. */ 386 min_size = spdk_max(MIN_SO_RCVBUF_SIZE, _sock->impl_opts.recv_buf_size); 387 388 if (sz < min_size) { 389 sz = min_size; 390 } 391 392 rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); 393 if (rc < 0) { 394 return rc; 395 } 396 397 _sock->impl_opts.recv_buf_size = sz; 398 399 return 0; 400 } 401 402 static int 403 posix_sock_set_sendbuf(struct spdk_sock *_sock, int sz) 404 { 405 struct spdk_posix_sock *sock = __posix_sock(_sock); 406 int min_size; 407 int rc; 408 409 assert(sock != NULL); 410 411 /* Set kernel buffer size to be at least MIN_SO_SNDBUF_SIZE and 412 * _sock->impl_opts.send_buf_size. */ 413 min_size = spdk_max(MIN_SO_SNDBUF_SIZE, _sock->impl_opts.send_buf_size); 414 415 if (sz < min_size) { 416 sz = min_size; 417 } 418 419 rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); 420 if (rc < 0) { 421 return rc; 422 } 423 424 _sock->impl_opts.send_buf_size = sz; 425 426 return 0; 427 } 428 429 static void 430 posix_sock_init(struct spdk_posix_sock *sock, bool enable_zero_copy) 431 { 432 #if defined(SPDK_ZEROCOPY) || defined(__linux__) 433 int flag; 434 int rc; 435 #endif 436 437 #if defined(SPDK_ZEROCOPY) 438 flag = 1; 439 440 if (enable_zero_copy) { 441 /* Try to turn on zero copy sends */ 442 rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag)); 443 if (rc == 0) { 444 sock->zcopy = true; 445 } 446 } 447 #endif 448 449 #if defined(__linux__) 450 flag = 1; 451 452 if (sock->base.impl_opts.enable_quickack) { 453 rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag)); 454 if (rc != 0) { 455 SPDK_ERRLOG("quickack was failed to set\n"); 456 } 457 } 458 459 spdk_sock_get_placement_id(sock->fd, sock->base.impl_opts.enable_placement_id, 460 &sock->placement_id); 461 462 if (sock->base.impl_opts.enable_placement_id == PLACEMENT_MARK) { 463 /* Save placement_id */ 464 spdk_sock_map_insert(&g_map, sock->placement_id, NULL); 465 } 466 #endif 467 } 468 469 static struct spdk_posix_sock * 470 posix_sock_alloc(int fd, struct spdk_sock_impl_opts *impl_opts, bool enable_zero_copy) 471 { 472 struct spdk_posix_sock *sock; 473 474 sock = calloc(1, sizeof(*sock)); 475 if (sock == NULL) { 476 SPDK_ERRLOG("sock allocation failed\n"); 477 return NULL; 478 } 479 480 sock->fd = fd; 481 memcpy(&sock->base.impl_opts, impl_opts, sizeof(*impl_opts)); 482 posix_sock_init(sock, enable_zero_copy); 483 484 return sock; 485 } 486 487 static int 488 posix_fd_create(struct addrinfo *res, struct spdk_sock_opts *opts, 489 struct spdk_sock_impl_opts *impl_opts) 490 { 491 int fd; 492 int val = 1; 493 int rc, sz; 494 #if defined(__linux__) 495 int to; 496 #endif 497 498 fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); 499 if (fd < 0) { 500 /* error */ 501 return -1; 502 } 503 504 sz = impl_opts->recv_buf_size; 505 rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); 506 if (rc) { 507 /* Not fatal */ 508 } 509 510 sz = impl_opts->send_buf_size; 511 rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); 512 if (rc) { 513 /* Not fatal */ 514 } 515 516 rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); 517 if (rc != 0) { 518 close(fd); 519 /* error */ 520 return -1; 521 } 522 rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val); 523 if (rc != 0) { 524 close(fd); 525 /* error */ 526 return -1; 527 } 528 529 #if defined(SO_PRIORITY) 530 if (opts->priority) { 531 rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val); 532 if (rc != 0) { 533 close(fd); 534 /* error */ 535 return -1; 536 } 537 } 538 #endif 539 540 if (res->ai_family == AF_INET6) { 541 rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val); 542 if (rc != 0) { 543 close(fd); 544 /* error */ 545 return -1; 546 } 547 } 548 549 if (opts->ack_timeout) { 550 #if defined(__linux__) 551 to = opts->ack_timeout; 552 rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &to, sizeof(to)); 553 if (rc != 0) { 554 close(fd); 555 /* error */ 556 return -1; 557 } 558 #else 559 SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n"); 560 #endif 561 } 562 563 return fd; 564 } 565 566 static int 567 posix_sock_psk_find_session_server_cb(SSL *ssl, const unsigned char *identity, 568 size_t identity_len, SSL_SESSION **sess) 569 { 570 struct spdk_sock_impl_opts *impl_opts = SSL_get_app_data(ssl); 571 uint8_t key[SSL_MAX_MASTER_KEY_LENGTH] = {}; 572 int keylen; 573 int rc, i; 574 STACK_OF(SSL_CIPHER) *ciphers; 575 const SSL_CIPHER *cipher; 576 const char *cipher_name; 577 const char *user_cipher = NULL; 578 bool found = false; 579 580 if (impl_opts->get_key) { 581 rc = impl_opts->get_key(key, sizeof(key), &user_cipher, identity, impl_opts->get_key_ctx); 582 if (rc < 0) { 583 SPDK_ERRLOG("Unable to find PSK for identity: %s\n", identity); 584 return 0; 585 } 586 keylen = rc; 587 } else { 588 if (impl_opts->psk_key == NULL) { 589 SPDK_ERRLOG("PSK is not set\n"); 590 return 0; 591 } 592 593 SPDK_DEBUGLOG(sock_posix, "Length of Client's PSK ID %lu\n", strlen(impl_opts->psk_identity)); 594 if (strcmp(impl_opts->psk_identity, identity) != 0) { 595 SPDK_ERRLOG("Unknown Client's PSK ID\n"); 596 return 0; 597 } 598 keylen = impl_opts->psk_key_size; 599 600 memcpy(key, impl_opts->psk_key, keylen); 601 user_cipher = impl_opts->tls_cipher_suites; 602 } 603 604 if (user_cipher == NULL) { 605 SPDK_ERRLOG("Cipher suite not set\n"); 606 return 0; 607 } 608 609 *sess = SSL_SESSION_new(); 610 if (*sess == NULL) { 611 SPDK_ERRLOG("Unable to allocate new SSL session\n"); 612 return 0; 613 } 614 615 ciphers = SSL_get_ciphers(ssl); 616 for (i = 0; i < sk_SSL_CIPHER_num(ciphers); i++) { 617 cipher = sk_SSL_CIPHER_value(ciphers, i); 618 cipher_name = SSL_CIPHER_get_name(cipher); 619 620 if (strcmp(user_cipher, cipher_name) == 0) { 621 rc = SSL_SESSION_set_cipher(*sess, cipher); 622 if (rc != 1) { 623 SPDK_ERRLOG("Unable to set cipher: %s\n", cipher_name); 624 goto err; 625 } 626 found = true; 627 break; 628 } 629 } 630 if (found == false) { 631 SPDK_ERRLOG("No suitable cipher found\n"); 632 goto err; 633 } 634 635 SPDK_DEBUGLOG(sock_posix, "Cipher selected: %s\n", cipher_name); 636 637 rc = SSL_SESSION_set_protocol_version(*sess, TLS1_3_VERSION); 638 if (rc != 1) { 639 SPDK_ERRLOG("Unable to set TLS version: %d\n", TLS1_3_VERSION); 640 goto err; 641 } 642 643 rc = SSL_SESSION_set1_master_key(*sess, key, keylen); 644 if (rc != 1) { 645 SPDK_ERRLOG("Unable to set PSK for session\n"); 646 goto err; 647 } 648 649 return 1; 650 651 err: 652 SSL_SESSION_free(*sess); 653 *sess = NULL; 654 return 0; 655 } 656 657 static int 658 posix_sock_psk_use_session_client_cb(SSL *ssl, const EVP_MD *md, const unsigned char **identity, 659 size_t *identity_len, SSL_SESSION **sess) 660 { 661 struct spdk_sock_impl_opts *impl_opts = SSL_get_app_data(ssl); 662 int rc, i; 663 STACK_OF(SSL_CIPHER) *ciphers; 664 const SSL_CIPHER *cipher; 665 const char *cipher_name; 666 long keylen; 667 bool found = false; 668 669 if (impl_opts->psk_key == NULL) { 670 SPDK_ERRLOG("PSK is not set\n"); 671 return 0; 672 } 673 if (impl_opts->psk_key_size > SSL_MAX_MASTER_KEY_LENGTH) { 674 SPDK_ERRLOG("PSK too long\n"); 675 return 0; 676 } 677 keylen = impl_opts->psk_key_size; 678 679 if (impl_opts->tls_cipher_suites == NULL) { 680 SPDK_ERRLOG("Cipher suite not set\n"); 681 return 0; 682 } 683 *sess = SSL_SESSION_new(); 684 if (*sess == NULL) { 685 SPDK_ERRLOG("Unable to allocate new SSL session\n"); 686 return 0; 687 } 688 689 ciphers = SSL_get_ciphers(ssl); 690 for (i = 0; i < sk_SSL_CIPHER_num(ciphers); i++) { 691 cipher = sk_SSL_CIPHER_value(ciphers, i); 692 cipher_name = SSL_CIPHER_get_name(cipher); 693 694 if (strcmp(impl_opts->tls_cipher_suites, cipher_name) == 0) { 695 rc = SSL_SESSION_set_cipher(*sess, cipher); 696 if (rc != 1) { 697 SPDK_ERRLOG("Unable to set cipher: %s\n", cipher_name); 698 goto err; 699 } 700 found = true; 701 break; 702 } 703 } 704 if (found == false) { 705 SPDK_ERRLOG("No suitable cipher found\n"); 706 goto err; 707 } 708 709 SPDK_DEBUGLOG(sock_posix, "Cipher selected: %s\n", cipher_name); 710 711 rc = SSL_SESSION_set_protocol_version(*sess, TLS1_3_VERSION); 712 if (rc != 1) { 713 SPDK_ERRLOG("Unable to set TLS version: %d\n", TLS1_3_VERSION); 714 goto err; 715 } 716 717 rc = SSL_SESSION_set1_master_key(*sess, impl_opts->psk_key, keylen); 718 if (rc != 1) { 719 SPDK_ERRLOG("Unable to set PSK for session\n"); 720 goto err; 721 } 722 723 *identity_len = strlen(impl_opts->psk_identity); 724 *identity = impl_opts->psk_identity; 725 726 return 1; 727 728 err: 729 SSL_SESSION_free(*sess); 730 *sess = NULL; 731 return 0; 732 } 733 734 static SSL_CTX * 735 posix_sock_create_ssl_context(const SSL_METHOD *method, struct spdk_sock_opts *opts, 736 struct spdk_sock_impl_opts *impl_opts) 737 { 738 SSL_CTX *ctx; 739 int tls_version = 0; 740 bool ktls_enabled = false; 741 #ifdef SSL_OP_ENABLE_KTLS 742 long options; 743 #endif 744 745 SSL_library_init(); 746 OpenSSL_add_all_algorithms(); 747 SSL_load_error_strings(); 748 /* Produce a SSL CTX in SSL V2 and V3 standards compliant way */ 749 ctx = SSL_CTX_new(method); 750 if (!ctx) { 751 SPDK_ERRLOG("SSL_CTX_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 752 return NULL; 753 } 754 SPDK_DEBUGLOG(sock_posix, "SSL context created\n"); 755 756 switch (impl_opts->tls_version) { 757 case 0: 758 /* auto-negotioation */ 759 break; 760 case SPDK_TLS_VERSION_1_3: 761 tls_version = TLS1_3_VERSION; 762 break; 763 default: 764 SPDK_ERRLOG("Incorrect TLS version provided: %d\n", impl_opts->tls_version); 765 goto err; 766 } 767 768 if (tls_version) { 769 SPDK_DEBUGLOG(sock_posix, "Hardening TLS version to '%d'='0x%X'\n", impl_opts->tls_version, 770 tls_version); 771 if (!SSL_CTX_set_min_proto_version(ctx, tls_version)) { 772 SPDK_ERRLOG("Unable to set Min TLS version to '%d'='0x%X\n", impl_opts->tls_version, tls_version); 773 goto err; 774 } 775 if (!SSL_CTX_set_max_proto_version(ctx, tls_version)) { 776 SPDK_ERRLOG("Unable to set Max TLS version to '%d'='0x%X\n", impl_opts->tls_version, tls_version); 777 goto err; 778 } 779 } 780 if (impl_opts->enable_ktls) { 781 SPDK_DEBUGLOG(sock_posix, "Enabling kTLS offload\n"); 782 #ifdef SSL_OP_ENABLE_KTLS 783 options = SSL_CTX_set_options(ctx, SSL_OP_ENABLE_KTLS); 784 ktls_enabled = options & SSL_OP_ENABLE_KTLS; 785 #else 786 ktls_enabled = false; 787 #endif 788 if (!ktls_enabled) { 789 SPDK_ERRLOG("Unable to set kTLS offload via SSL_CTX_set_options(). Configure openssl with 'enable-ktls'\n"); 790 goto err; 791 } 792 } 793 794 /* SSL_CTX_set_ciphersuites() return 1 if the requested 795 * cipher suite list was configured, and 0 otherwise. */ 796 if (impl_opts->tls_cipher_suites != NULL && 797 SSL_CTX_set_ciphersuites(ctx, impl_opts->tls_cipher_suites) != 1) { 798 SPDK_ERRLOG("Unable to set TLS cipher suites for SSL'\n"); 799 goto err; 800 } 801 802 return ctx; 803 804 err: 805 SSL_CTX_free(ctx); 806 return NULL; 807 } 808 809 static SSL * 810 ssl_sock_setup_connect(SSL_CTX *ctx, int fd) 811 { 812 SSL *ssl; 813 814 ssl = SSL_new(ctx); 815 if (!ssl) { 816 SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 817 return NULL; 818 } 819 SSL_set_fd(ssl, fd); 820 SSL_set_connect_state(ssl); 821 SSL_set_psk_use_session_callback(ssl, posix_sock_psk_use_session_client_cb); 822 SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); 823 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 824 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 825 SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", 826 SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); 827 return ssl; 828 } 829 830 static SSL * 831 ssl_sock_setup_accept(SSL_CTX *ctx, int fd) 832 { 833 SSL *ssl; 834 835 ssl = SSL_new(ctx); 836 if (!ssl) { 837 SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 838 return NULL; 839 } 840 SSL_set_fd(ssl, fd); 841 SSL_set_accept_state(ssl); 842 SSL_set_psk_find_session_callback(ssl, posix_sock_psk_find_session_server_cb); 843 SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); 844 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 845 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 846 SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", 847 SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); 848 return ssl; 849 } 850 851 static ssize_t 852 SSL_readv(SSL *ssl, const struct iovec *iov, int iovcnt) 853 { 854 int i, rc = 0; 855 ssize_t total = 0; 856 857 for (i = 0; i < iovcnt; i++) { 858 rc = SSL_read(ssl, iov[i].iov_base, iov[i].iov_len); 859 860 if (rc > 0) { 861 total += rc; 862 } 863 if (rc != (int)iov[i].iov_len) { 864 break; 865 } 866 } 867 if (total > 0) { 868 errno = 0; 869 return total; 870 } 871 switch (SSL_get_error(ssl, rc)) { 872 case SSL_ERROR_ZERO_RETURN: 873 errno = ENOTCONN; 874 return 0; 875 case SSL_ERROR_WANT_READ: 876 case SSL_ERROR_WANT_WRITE: 877 case SSL_ERROR_WANT_CONNECT: 878 case SSL_ERROR_WANT_ACCEPT: 879 case SSL_ERROR_WANT_X509_LOOKUP: 880 case SSL_ERROR_WANT_ASYNC: 881 case SSL_ERROR_WANT_ASYNC_JOB: 882 case SSL_ERROR_WANT_CLIENT_HELLO_CB: 883 errno = EAGAIN; 884 return -1; 885 case SSL_ERROR_SYSCALL: 886 case SSL_ERROR_SSL: 887 errno = ENOTCONN; 888 return -1; 889 default: 890 errno = ENOTCONN; 891 return -1; 892 } 893 } 894 895 static ssize_t 896 SSL_writev(SSL *ssl, struct iovec *iov, int iovcnt) 897 { 898 int i, rc = 0; 899 ssize_t total = 0; 900 901 for (i = 0; i < iovcnt; i++) { 902 rc = SSL_write(ssl, iov[i].iov_base, iov[i].iov_len); 903 904 if (rc > 0) { 905 total += rc; 906 } 907 if (rc != (int)iov[i].iov_len) { 908 break; 909 } 910 } 911 if (total > 0) { 912 errno = 0; 913 return total; 914 } 915 switch (SSL_get_error(ssl, rc)) { 916 case SSL_ERROR_ZERO_RETURN: 917 errno = ENOTCONN; 918 return 0; 919 case SSL_ERROR_WANT_READ: 920 case SSL_ERROR_WANT_WRITE: 921 case SSL_ERROR_WANT_CONNECT: 922 case SSL_ERROR_WANT_ACCEPT: 923 case SSL_ERROR_WANT_X509_LOOKUP: 924 case SSL_ERROR_WANT_ASYNC: 925 case SSL_ERROR_WANT_ASYNC_JOB: 926 case SSL_ERROR_WANT_CLIENT_HELLO_CB: 927 errno = EAGAIN; 928 return -1; 929 case SSL_ERROR_SYSCALL: 930 case SSL_ERROR_SSL: 931 errno = ENOTCONN; 932 return -1; 933 default: 934 errno = ENOTCONN; 935 return -1; 936 } 937 } 938 939 static struct spdk_sock * 940 posix_sock_create(const char *ip, int port, 941 enum posix_sock_create_type type, 942 struct spdk_sock_opts *opts, 943 bool enable_ssl) 944 { 945 struct spdk_posix_sock *sock; 946 struct spdk_sock_impl_opts impl_opts; 947 char buf[MAX_TMPBUF]; 948 char portnum[PORTNUMLEN]; 949 char *p; 950 struct addrinfo hints, *res, *res0; 951 int fd, flag; 952 int rc; 953 bool enable_zcopy_user_opts = true; 954 bool enable_zcopy_impl_opts = true; 955 SSL_CTX *ctx = 0; 956 SSL *ssl = 0; 957 958 assert(opts != NULL); 959 if (enable_ssl) { 960 _opts_get_impl_opts(opts, &impl_opts, &g_ssl_impl_opts); 961 } else { 962 _opts_get_impl_opts(opts, &impl_opts, &g_posix_impl_opts); 963 } 964 965 if (ip == NULL) { 966 return NULL; 967 } 968 if (ip[0] == '[') { 969 snprintf(buf, sizeof(buf), "%s", ip + 1); 970 p = strchr(buf, ']'); 971 if (p != NULL) { 972 *p = '\0'; 973 } 974 ip = (const char *) &buf[0]; 975 } 976 977 snprintf(portnum, sizeof portnum, "%d", port); 978 memset(&hints, 0, sizeof hints); 979 hints.ai_family = PF_UNSPEC; 980 hints.ai_socktype = SOCK_STREAM; 981 hints.ai_flags = AI_NUMERICSERV; 982 hints.ai_flags |= AI_PASSIVE; 983 hints.ai_flags |= AI_NUMERICHOST; 984 rc = getaddrinfo(ip, portnum, &hints, &res0); 985 if (rc != 0) { 986 SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc); 987 return NULL; 988 } 989 990 /* try listen */ 991 fd = -1; 992 for (res = res0; res != NULL; res = res->ai_next) { 993 retry: 994 fd = posix_fd_create(res, opts, &impl_opts); 995 if (fd < 0) { 996 continue; 997 } 998 if (type == SPDK_SOCK_CREATE_LISTEN) { 999 rc = bind(fd, res->ai_addr, res->ai_addrlen); 1000 if (rc != 0) { 1001 SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno); 1002 switch (errno) { 1003 case EINTR: 1004 /* interrupted? */ 1005 close(fd); 1006 goto retry; 1007 case EADDRNOTAVAIL: 1008 SPDK_ERRLOG("IP address %s not available. " 1009 "Verify IP address in config file " 1010 "and make sure setup script is " 1011 "run before starting spdk app.\n", ip); 1012 /* FALLTHROUGH */ 1013 default: 1014 /* try next family */ 1015 close(fd); 1016 fd = -1; 1017 continue; 1018 } 1019 } 1020 /* bind OK */ 1021 rc = listen(fd, 512); 1022 if (rc != 0) { 1023 SPDK_ERRLOG("listen() failed, errno = %d\n", errno); 1024 close(fd); 1025 fd = -1; 1026 break; 1027 } 1028 enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_server; 1029 } else if (type == SPDK_SOCK_CREATE_CONNECT) { 1030 rc = connect(fd, res->ai_addr, res->ai_addrlen); 1031 if (rc != 0) { 1032 SPDK_ERRLOG("connect() failed, errno = %d\n", errno); 1033 /* try next family */ 1034 close(fd); 1035 fd = -1; 1036 continue; 1037 } 1038 enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_client; 1039 if (enable_ssl) { 1040 ctx = posix_sock_create_ssl_context(TLS_client_method(), opts, &impl_opts); 1041 if (!ctx) { 1042 SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); 1043 close(fd); 1044 fd = -1; 1045 break; 1046 } 1047 ssl = ssl_sock_setup_connect(ctx, fd); 1048 if (!ssl) { 1049 SPDK_ERRLOG("ssl_sock_setup_connect() failed, errno = %d\n", errno); 1050 close(fd); 1051 fd = -1; 1052 SSL_CTX_free(ctx); 1053 break; 1054 } 1055 } 1056 } 1057 1058 flag = fcntl(fd, F_GETFL); 1059 if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) { 1060 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); 1061 SSL_free(ssl); 1062 SSL_CTX_free(ctx); 1063 close(fd); 1064 fd = -1; 1065 break; 1066 } 1067 break; 1068 } 1069 freeaddrinfo(res0); 1070 1071 if (fd < 0) { 1072 return NULL; 1073 } 1074 1075 /* Only enable zero copy for non-loopback and non-ssl sockets. */ 1076 enable_zcopy_user_opts = opts->zcopy && !sock_is_loopback(fd) && !enable_ssl; 1077 1078 sock = posix_sock_alloc(fd, &impl_opts, enable_zcopy_user_opts && enable_zcopy_impl_opts); 1079 if (sock == NULL) { 1080 SPDK_ERRLOG("sock allocation failed\n"); 1081 SSL_free(ssl); 1082 SSL_CTX_free(ctx); 1083 close(fd); 1084 return NULL; 1085 } 1086 1087 if (ctx) { 1088 sock->ctx = ctx; 1089 } 1090 1091 if (ssl) { 1092 sock->ssl = ssl; 1093 SSL_set_app_data(ssl, &sock->base.impl_opts); 1094 } 1095 1096 return &sock->base; 1097 } 1098 1099 static struct spdk_sock * 1100 posix_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) 1101 { 1102 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, false); 1103 } 1104 1105 static struct spdk_sock * 1106 posix_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) 1107 { 1108 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, false); 1109 } 1110 1111 static struct spdk_sock * 1112 _posix_sock_accept(struct spdk_sock *_sock, bool enable_ssl) 1113 { 1114 struct spdk_posix_sock *sock = __posix_sock(_sock); 1115 struct sockaddr_storage sa; 1116 socklen_t salen; 1117 int rc, fd; 1118 struct spdk_posix_sock *new_sock; 1119 int flag; 1120 SSL_CTX *ctx = 0; 1121 SSL *ssl = 0; 1122 1123 memset(&sa, 0, sizeof(sa)); 1124 salen = sizeof(sa); 1125 1126 assert(sock != NULL); 1127 1128 rc = accept(sock->fd, (struct sockaddr *)&sa, &salen); 1129 1130 if (rc == -1) { 1131 return NULL; 1132 } 1133 1134 fd = rc; 1135 1136 flag = fcntl(fd, F_GETFL); 1137 if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) { 1138 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); 1139 close(fd); 1140 return NULL; 1141 } 1142 1143 #if defined(SO_PRIORITY) 1144 /* The priority is not inherited, so call this function again */ 1145 if (sock->base.opts.priority) { 1146 rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int)); 1147 if (rc != 0) { 1148 close(fd); 1149 return NULL; 1150 } 1151 } 1152 #endif 1153 1154 /* Establish SSL connection */ 1155 if (enable_ssl) { 1156 ctx = posix_sock_create_ssl_context(TLS_server_method(), &sock->base.opts, &sock->base.impl_opts); 1157 if (!ctx) { 1158 SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); 1159 close(fd); 1160 return NULL; 1161 } 1162 ssl = ssl_sock_setup_accept(ctx, fd); 1163 if (!ssl) { 1164 SPDK_ERRLOG("ssl_sock_setup_accept() failed, errno = %d\n", errno); 1165 close(fd); 1166 SSL_CTX_free(ctx); 1167 return NULL; 1168 } 1169 } 1170 1171 /* Inherit the zero copy feature from the listen socket */ 1172 new_sock = posix_sock_alloc(fd, &sock->base.impl_opts, sock->zcopy); 1173 if (new_sock == NULL) { 1174 close(fd); 1175 SSL_free(ssl); 1176 SSL_CTX_free(ctx); 1177 return NULL; 1178 } 1179 1180 if (ctx) { 1181 new_sock->ctx = ctx; 1182 } 1183 1184 if (ssl) { 1185 new_sock->ssl = ssl; 1186 SSL_set_app_data(ssl, &new_sock->base.impl_opts); 1187 } 1188 1189 return &new_sock->base; 1190 } 1191 1192 static struct spdk_sock * 1193 posix_sock_accept(struct spdk_sock *_sock) 1194 { 1195 return _posix_sock_accept(_sock, false); 1196 } 1197 1198 static int 1199 posix_sock_close(struct spdk_sock *_sock) 1200 { 1201 struct spdk_posix_sock *sock = __posix_sock(_sock); 1202 1203 assert(TAILQ_EMPTY(&_sock->pending_reqs)); 1204 1205 if (sock->ssl != NULL) { 1206 SSL_shutdown(sock->ssl); 1207 } 1208 1209 /* If the socket fails to close, the best choice is to 1210 * leak the fd but continue to free the rest of the sock 1211 * memory. */ 1212 close(sock->fd); 1213 1214 SSL_free(sock->ssl); 1215 SSL_CTX_free(sock->ctx); 1216 1217 spdk_pipe_destroy(sock->recv_pipe); 1218 free(sock->recv_buf); 1219 free(sock); 1220 1221 return 0; 1222 } 1223 1224 #ifdef SPDK_ZEROCOPY 1225 static int 1226 _sock_check_zcopy(struct spdk_sock *sock) 1227 { 1228 struct spdk_posix_sock *psock = __posix_sock(sock); 1229 struct msghdr msgh = {}; 1230 uint8_t buf[sizeof(struct cmsghdr) + sizeof(struct sock_extended_err)]; 1231 ssize_t rc; 1232 struct sock_extended_err *serr; 1233 struct cmsghdr *cm; 1234 uint32_t idx; 1235 struct spdk_sock_request *req, *treq; 1236 bool found; 1237 1238 msgh.msg_control = buf; 1239 msgh.msg_controllen = sizeof(buf); 1240 1241 while (true) { 1242 rc = recvmsg(psock->fd, &msgh, MSG_ERRQUEUE); 1243 1244 if (rc < 0) { 1245 if (errno == EWOULDBLOCK || errno == EAGAIN) { 1246 return 0; 1247 } 1248 1249 if (!TAILQ_EMPTY(&sock->pending_reqs)) { 1250 SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries\n"); 1251 } else { 1252 SPDK_WARNLOG("Recvmsg yielded an error!\n"); 1253 } 1254 return 0; 1255 } 1256 1257 cm = CMSG_FIRSTHDR(&msgh); 1258 if (!(cm && 1259 ((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || 1260 (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR)))) { 1261 SPDK_WARNLOG("Unexpected cmsg level or type!\n"); 1262 return 0; 1263 } 1264 1265 serr = (struct sock_extended_err *)CMSG_DATA(cm); 1266 if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) { 1267 SPDK_WARNLOG("Unexpected extended error origin\n"); 1268 return 0; 1269 } 1270 1271 /* Most of the time, the pending_reqs array is in the exact 1272 * order we need such that all of the requests to complete are 1273 * in order, in the front. It is guaranteed that all requests 1274 * belonging to the same sendmsg call are sequential, so once 1275 * we encounter one match we can stop looping as soon as a 1276 * non-match is found. 1277 */ 1278 idx = serr->ee_info; 1279 while (true) { 1280 found = false; 1281 TAILQ_FOREACH_SAFE(req, &sock->pending_reqs, internal.link, treq) { 1282 if (!req->internal.is_zcopy) { 1283 /* This wasn't a zcopy request. It was just waiting in line to complete */ 1284 rc = spdk_sock_request_put(sock, req, 0); 1285 if (rc < 0) { 1286 return rc; 1287 } 1288 } else if (req->internal.offset == idx) { 1289 found = true; 1290 rc = spdk_sock_request_put(sock, req, 0); 1291 if (rc < 0) { 1292 return rc; 1293 } 1294 } else if (found) { 1295 break; 1296 } 1297 } 1298 1299 if (idx == serr->ee_data) { 1300 break; 1301 } 1302 1303 if (idx == UINT32_MAX) { 1304 idx = 0; 1305 } else { 1306 idx++; 1307 } 1308 } 1309 } 1310 1311 return 0; 1312 } 1313 #endif 1314 1315 static int 1316 _sock_flush(struct spdk_sock *sock) 1317 { 1318 struct spdk_posix_sock *psock = __posix_sock(sock); 1319 struct msghdr msg = {}; 1320 int flags; 1321 struct iovec iovs[IOV_BATCH_SIZE]; 1322 int iovcnt; 1323 int retval; 1324 struct spdk_sock_request *req; 1325 int i; 1326 ssize_t rc, sent; 1327 unsigned int offset; 1328 size_t len; 1329 bool is_zcopy = false; 1330 1331 /* Can't flush from within a callback or we end up with recursive calls */ 1332 if (sock->cb_cnt > 0) { 1333 errno = EAGAIN; 1334 return -1; 1335 } 1336 1337 #ifdef SPDK_ZEROCOPY 1338 if (psock->zcopy) { 1339 flags = MSG_ZEROCOPY | MSG_NOSIGNAL; 1340 } else 1341 #endif 1342 { 1343 flags = MSG_NOSIGNAL; 1344 } 1345 1346 iovcnt = spdk_sock_prep_reqs(sock, iovs, 0, NULL, &flags); 1347 if (iovcnt == 0) { 1348 return 0; 1349 } 1350 1351 #ifdef SPDK_ZEROCOPY 1352 is_zcopy = flags & MSG_ZEROCOPY; 1353 #endif 1354 1355 /* Perform the vectored write */ 1356 msg.msg_iov = iovs; 1357 msg.msg_iovlen = iovcnt; 1358 1359 if (psock->ssl) { 1360 rc = SSL_writev(psock->ssl, iovs, iovcnt); 1361 } else { 1362 rc = sendmsg(psock->fd, &msg, flags); 1363 } 1364 if (rc <= 0) { 1365 if (rc == 0 || errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && psock->zcopy)) { 1366 errno = EAGAIN; 1367 } 1368 return -1; 1369 } 1370 1371 sent = rc; 1372 1373 if (is_zcopy) { 1374 /* Handling overflow case, because we use psock->sendmsg_idx - 1 for the 1375 * req->internal.offset, so sendmsg_idx should not be zero */ 1376 if (spdk_unlikely(psock->sendmsg_idx == UINT32_MAX)) { 1377 psock->sendmsg_idx = 1; 1378 } else { 1379 psock->sendmsg_idx++; 1380 } 1381 } 1382 1383 /* Consume the requests that were actually written */ 1384 req = TAILQ_FIRST(&sock->queued_reqs); 1385 while (req) { 1386 offset = req->internal.offset; 1387 1388 /* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */ 1389 req->internal.is_zcopy = is_zcopy; 1390 1391 for (i = 0; i < req->iovcnt; i++) { 1392 /* Advance by the offset first */ 1393 if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) { 1394 offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len; 1395 continue; 1396 } 1397 1398 /* Calculate the remaining length of this element */ 1399 len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset; 1400 1401 if (len > (size_t)rc) { 1402 /* This element was partially sent. */ 1403 req->internal.offset += rc; 1404 return sent; 1405 } 1406 1407 offset = 0; 1408 req->internal.offset += len; 1409 rc -= len; 1410 } 1411 1412 /* Handled a full request. */ 1413 spdk_sock_request_pend(sock, req); 1414 1415 if (!req->internal.is_zcopy && req == TAILQ_FIRST(&sock->pending_reqs)) { 1416 /* The sendmsg syscall above isn't currently asynchronous, 1417 * so it's already done. */ 1418 retval = spdk_sock_request_put(sock, req, 0); 1419 if (retval) { 1420 break; 1421 } 1422 } else { 1423 /* Re-use the offset field to hold the sendmsg call index. The 1424 * index is 0 based, so subtract one here because we've already 1425 * incremented above. */ 1426 req->internal.offset = psock->sendmsg_idx - 1; 1427 } 1428 1429 if (rc == 0) { 1430 break; 1431 } 1432 1433 req = TAILQ_FIRST(&sock->queued_reqs); 1434 } 1435 1436 return sent; 1437 } 1438 1439 static int 1440 posix_sock_flush(struct spdk_sock *sock) 1441 { 1442 #ifdef SPDK_ZEROCOPY 1443 struct spdk_posix_sock *psock = __posix_sock(sock); 1444 1445 if (psock->zcopy && !TAILQ_EMPTY(&sock->pending_reqs)) { 1446 _sock_check_zcopy(sock); 1447 } 1448 #endif 1449 1450 return _sock_flush(sock); 1451 } 1452 1453 static ssize_t 1454 posix_sock_recv_from_pipe(struct spdk_posix_sock *sock, struct iovec *diov, int diovcnt) 1455 { 1456 struct iovec siov[2]; 1457 int sbytes; 1458 ssize_t bytes; 1459 struct spdk_posix_sock_group_impl *group; 1460 1461 sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); 1462 if (sbytes < 0) { 1463 errno = EINVAL; 1464 return -1; 1465 } else if (sbytes == 0) { 1466 errno = EAGAIN; 1467 return -1; 1468 } 1469 1470 bytes = spdk_iovcpy(siov, 2, diov, diovcnt); 1471 1472 if (bytes == 0) { 1473 /* The only way this happens is if diov is 0 length */ 1474 errno = EINVAL; 1475 return -1; 1476 } 1477 1478 spdk_pipe_reader_advance(sock->recv_pipe, bytes); 1479 1480 /* If we drained the pipe, mark it appropriately */ 1481 if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) { 1482 assert(sock->pipe_has_data == true); 1483 1484 group = __posix_group_impl(sock->base.group_impl); 1485 if (group && !sock->socket_has_data) { 1486 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1487 } 1488 1489 sock->pipe_has_data = false; 1490 } 1491 1492 return bytes; 1493 } 1494 1495 static inline ssize_t 1496 posix_sock_read(struct spdk_posix_sock *sock) 1497 { 1498 struct iovec iov[2]; 1499 int bytes_avail, bytes_recvd; 1500 struct spdk_posix_sock_group_impl *group; 1501 1502 bytes_avail = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov); 1503 1504 if (bytes_avail <= 0) { 1505 return bytes_avail; 1506 } 1507 1508 if (sock->ssl) { 1509 bytes_recvd = SSL_readv(sock->ssl, iov, 2); 1510 } else { 1511 bytes_recvd = readv(sock->fd, iov, 2); 1512 } 1513 1514 assert(sock->pipe_has_data == false); 1515 1516 if (bytes_recvd <= 0) { 1517 /* Errors count as draining the socket data */ 1518 if (sock->base.group_impl && sock->socket_has_data) { 1519 group = __posix_group_impl(sock->base.group_impl); 1520 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1521 } 1522 1523 sock->socket_has_data = false; 1524 1525 return bytes_recvd; 1526 } 1527 1528 spdk_pipe_writer_advance(sock->recv_pipe, bytes_recvd); 1529 1530 #if DEBUG 1531 if (sock->base.group_impl) { 1532 assert(sock->socket_has_data == true); 1533 } 1534 #endif 1535 1536 sock->pipe_has_data = true; 1537 if (bytes_recvd < bytes_avail) { 1538 /* We drained the kernel socket entirely. */ 1539 sock->socket_has_data = false; 1540 } 1541 1542 return bytes_recvd; 1543 } 1544 1545 static ssize_t 1546 posix_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) 1547 { 1548 struct spdk_posix_sock *sock = __posix_sock(_sock); 1549 struct spdk_posix_sock_group_impl *group = __posix_group_impl(sock->base.group_impl); 1550 int rc, i; 1551 size_t len; 1552 1553 if (sock->recv_pipe == NULL) { 1554 assert(sock->pipe_has_data == false); 1555 if (group && sock->socket_has_data) { 1556 sock->socket_has_data = false; 1557 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1558 } 1559 if (sock->ssl) { 1560 return SSL_readv(sock->ssl, iov, iovcnt); 1561 } else { 1562 return readv(sock->fd, iov, iovcnt); 1563 } 1564 } 1565 1566 /* If the socket is not in a group, we must assume it always has 1567 * data waiting for us because it is not epolled */ 1568 if (!sock->pipe_has_data && (group == NULL || sock->socket_has_data)) { 1569 /* If the user is receiving a sufficiently large amount of data, 1570 * receive directly to their buffers. */ 1571 len = 0; 1572 for (i = 0; i < iovcnt; i++) { 1573 len += iov[i].iov_len; 1574 } 1575 1576 if (len >= MIN_SOCK_PIPE_SIZE) { 1577 /* TODO: Should this detect if kernel socket is drained? */ 1578 if (sock->ssl) { 1579 return SSL_readv(sock->ssl, iov, iovcnt); 1580 } else { 1581 return readv(sock->fd, iov, iovcnt); 1582 } 1583 } 1584 1585 /* Otherwise, do a big read into our pipe */ 1586 rc = posix_sock_read(sock); 1587 if (rc <= 0) { 1588 return rc; 1589 } 1590 } 1591 1592 return posix_sock_recv_from_pipe(sock, iov, iovcnt); 1593 } 1594 1595 static ssize_t 1596 posix_sock_recv(struct spdk_sock *sock, void *buf, size_t len) 1597 { 1598 struct iovec iov[1]; 1599 1600 iov[0].iov_base = buf; 1601 iov[0].iov_len = len; 1602 1603 return posix_sock_readv(sock, iov, 1); 1604 } 1605 1606 static ssize_t 1607 posix_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) 1608 { 1609 struct spdk_posix_sock *sock = __posix_sock(_sock); 1610 int rc; 1611 1612 /* In order to process a writev, we need to flush any asynchronous writes 1613 * first. */ 1614 rc = _sock_flush(_sock); 1615 if (rc < 0) { 1616 return rc; 1617 } 1618 1619 if (!TAILQ_EMPTY(&_sock->queued_reqs)) { 1620 /* We weren't able to flush all requests */ 1621 errno = EAGAIN; 1622 return -1; 1623 } 1624 1625 if (sock->ssl) { 1626 return SSL_writev(sock->ssl, iov, iovcnt); 1627 } else { 1628 return writev(sock->fd, iov, iovcnt); 1629 } 1630 } 1631 1632 static int 1633 posix_sock_recv_next(struct spdk_sock *_sock, void **buf, void **ctx) 1634 { 1635 struct spdk_posix_sock *sock = __posix_sock(_sock); 1636 struct iovec iov; 1637 ssize_t rc; 1638 1639 if (sock->recv_pipe != NULL) { 1640 errno = ENOTSUP; 1641 return -1; 1642 } 1643 1644 iov.iov_len = spdk_sock_group_get_buf(_sock->group_impl->group, &iov.iov_base, ctx); 1645 if (iov.iov_len == 0) { 1646 errno = ENOBUFS; 1647 return -1; 1648 } 1649 1650 rc = posix_sock_readv(_sock, &iov, 1); 1651 if (rc <= 0) { 1652 spdk_sock_group_provide_buf(_sock->group_impl->group, iov.iov_base, iov.iov_len, *ctx); 1653 return rc; 1654 } 1655 1656 *buf = iov.iov_base; 1657 1658 return rc; 1659 } 1660 1661 static void 1662 posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req) 1663 { 1664 int rc; 1665 1666 spdk_sock_request_queue(sock, req); 1667 1668 /* If there are a sufficient number queued, just flush them out immediately. */ 1669 if (sock->queued_iovcnt >= IOV_BATCH_SIZE) { 1670 rc = _sock_flush(sock); 1671 if (rc < 0 && errno != EAGAIN) { 1672 spdk_sock_abort_requests(sock); 1673 } 1674 } 1675 } 1676 1677 static int 1678 posix_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) 1679 { 1680 struct spdk_posix_sock *sock = __posix_sock(_sock); 1681 int val; 1682 int rc; 1683 1684 assert(sock != NULL); 1685 1686 val = nbytes; 1687 rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val); 1688 if (rc != 0) { 1689 return -1; 1690 } 1691 return 0; 1692 } 1693 1694 static bool 1695 posix_sock_is_ipv6(struct spdk_sock *_sock) 1696 { 1697 struct spdk_posix_sock *sock = __posix_sock(_sock); 1698 struct sockaddr_storage sa; 1699 socklen_t salen; 1700 int rc; 1701 1702 assert(sock != NULL); 1703 1704 memset(&sa, 0, sizeof sa); 1705 salen = sizeof sa; 1706 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 1707 if (rc != 0) { 1708 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 1709 return false; 1710 } 1711 1712 return (sa.ss_family == AF_INET6); 1713 } 1714 1715 static bool 1716 posix_sock_is_ipv4(struct spdk_sock *_sock) 1717 { 1718 struct spdk_posix_sock *sock = __posix_sock(_sock); 1719 struct sockaddr_storage sa; 1720 socklen_t salen; 1721 int rc; 1722 1723 assert(sock != NULL); 1724 1725 memset(&sa, 0, sizeof sa); 1726 salen = sizeof sa; 1727 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 1728 if (rc != 0) { 1729 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 1730 return false; 1731 } 1732 1733 return (sa.ss_family == AF_INET); 1734 } 1735 1736 static bool 1737 posix_sock_is_connected(struct spdk_sock *_sock) 1738 { 1739 struct spdk_posix_sock *sock = __posix_sock(_sock); 1740 uint8_t byte; 1741 int rc; 1742 1743 rc = recv(sock->fd, &byte, 1, MSG_PEEK); 1744 if (rc == 0) { 1745 return false; 1746 } 1747 1748 if (rc < 0) { 1749 if (errno == EAGAIN || errno == EWOULDBLOCK) { 1750 return true; 1751 } 1752 1753 return false; 1754 } 1755 1756 return true; 1757 } 1758 1759 static struct spdk_sock_group_impl * 1760 posix_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint) 1761 { 1762 struct spdk_posix_sock *sock = __posix_sock(_sock); 1763 struct spdk_sock_group_impl *group_impl; 1764 1765 if (sock->placement_id != -1) { 1766 spdk_sock_map_lookup(&g_map, sock->placement_id, &group_impl, hint); 1767 return group_impl; 1768 } 1769 1770 return NULL; 1771 } 1772 1773 static struct spdk_sock_group_impl * 1774 _sock_group_impl_create(uint32_t enable_placement_id) 1775 { 1776 struct spdk_posix_sock_group_impl *group_impl; 1777 int fd; 1778 1779 #if defined(SPDK_EPOLL) 1780 fd = epoll_create1(0); 1781 #elif defined(SPDK_KEVENT) 1782 fd = kqueue(); 1783 #endif 1784 if (fd == -1) { 1785 return NULL; 1786 } 1787 1788 group_impl = calloc(1, sizeof(*group_impl)); 1789 if (group_impl == NULL) { 1790 SPDK_ERRLOG("group_impl allocation failed\n"); 1791 close(fd); 1792 return NULL; 1793 } 1794 1795 group_impl->fd = fd; 1796 TAILQ_INIT(&group_impl->socks_with_data); 1797 group_impl->placement_id = -1; 1798 1799 if (enable_placement_id == PLACEMENT_CPU) { 1800 spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base); 1801 group_impl->placement_id = spdk_env_get_current_core(); 1802 } 1803 1804 return &group_impl->base; 1805 } 1806 1807 static struct spdk_sock_group_impl * 1808 posix_sock_group_impl_create(void) 1809 { 1810 return _sock_group_impl_create(g_posix_impl_opts.enable_placement_id); 1811 } 1812 1813 static struct spdk_sock_group_impl * 1814 ssl_sock_group_impl_create(void) 1815 { 1816 return _sock_group_impl_create(g_ssl_impl_opts.enable_placement_id); 1817 } 1818 1819 static void 1820 posix_sock_mark(struct spdk_posix_sock_group_impl *group, struct spdk_posix_sock *sock, 1821 int placement_id) 1822 { 1823 #if defined(SO_MARK) 1824 int rc; 1825 1826 rc = setsockopt(sock->fd, SOL_SOCKET, SO_MARK, 1827 &placement_id, sizeof(placement_id)); 1828 if (rc != 0) { 1829 /* Not fatal */ 1830 SPDK_ERRLOG("Error setting SO_MARK\n"); 1831 return; 1832 } 1833 1834 rc = spdk_sock_map_insert(&g_map, placement_id, &group->base); 1835 if (rc != 0) { 1836 /* Not fatal */ 1837 SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc); 1838 return; 1839 } 1840 1841 sock->placement_id = placement_id; 1842 #endif 1843 } 1844 1845 static void 1846 posix_sock_update_mark(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1847 { 1848 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1849 1850 if (group->placement_id == -1) { 1851 group->placement_id = spdk_sock_map_find_free(&g_map); 1852 1853 /* If a free placement id is found, update existing sockets in this group */ 1854 if (group->placement_id != -1) { 1855 struct spdk_sock *sock, *tmp; 1856 1857 TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { 1858 posix_sock_mark(group, __posix_sock(sock), group->placement_id); 1859 } 1860 } 1861 } 1862 1863 if (group->placement_id != -1) { 1864 /* 1865 * group placement id is already determined for this poll group. 1866 * Mark socket with group's placement id. 1867 */ 1868 posix_sock_mark(group, __posix_sock(_sock), group->placement_id); 1869 } 1870 } 1871 1872 static int 1873 posix_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1874 { 1875 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1876 struct spdk_posix_sock *sock = __posix_sock(_sock); 1877 int rc; 1878 1879 #if defined(SPDK_EPOLL) 1880 struct epoll_event event; 1881 1882 memset(&event, 0, sizeof(event)); 1883 /* EPOLLERR is always on even if we don't set it, but be explicit for clarity */ 1884 event.events = EPOLLIN | EPOLLERR; 1885 event.data.ptr = sock; 1886 1887 rc = epoll_ctl(group->fd, EPOLL_CTL_ADD, sock->fd, &event); 1888 #elif defined(SPDK_KEVENT) 1889 struct kevent event; 1890 struct timespec ts = {0}; 1891 1892 EV_SET(&event, sock->fd, EVFILT_READ, EV_ADD, 0, 0, sock); 1893 1894 rc = kevent(group->fd, &event, 1, NULL, 0, &ts); 1895 #endif 1896 1897 if (rc != 0) { 1898 return rc; 1899 } 1900 1901 /* switched from another polling group due to scheduling */ 1902 if (spdk_unlikely(sock->recv_pipe != NULL && 1903 (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) { 1904 sock->pipe_has_data = true; 1905 sock->socket_has_data = false; 1906 TAILQ_INSERT_TAIL(&group->socks_with_data, sock, link); 1907 } 1908 1909 if (_sock->impl_opts.enable_placement_id == PLACEMENT_MARK) { 1910 posix_sock_update_mark(_group, _sock); 1911 } else if (sock->placement_id != -1) { 1912 rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base); 1913 if (rc != 0) { 1914 SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc); 1915 /* Do not treat this as an error. The system will continue running. */ 1916 } 1917 } 1918 1919 return rc; 1920 } 1921 1922 static int 1923 posix_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1924 { 1925 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1926 struct spdk_posix_sock *sock = __posix_sock(_sock); 1927 int rc; 1928 1929 if (sock->pipe_has_data || sock->socket_has_data) { 1930 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1931 sock->pipe_has_data = false; 1932 sock->socket_has_data = false; 1933 } 1934 1935 if (sock->placement_id != -1) { 1936 spdk_sock_map_release(&g_map, sock->placement_id); 1937 } 1938 1939 #if defined(SPDK_EPOLL) 1940 struct epoll_event event; 1941 1942 /* Event parameter is ignored but some old kernel version still require it. */ 1943 rc = epoll_ctl(group->fd, EPOLL_CTL_DEL, sock->fd, &event); 1944 #elif defined(SPDK_KEVENT) 1945 struct kevent event; 1946 struct timespec ts = {0}; 1947 1948 EV_SET(&event, sock->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); 1949 1950 rc = kevent(group->fd, &event, 1, NULL, 0, &ts); 1951 if (rc == 0 && event.flags & EV_ERROR) { 1952 rc = -1; 1953 errno = event.data; 1954 } 1955 #endif 1956 1957 spdk_sock_abort_requests(_sock); 1958 1959 return rc; 1960 } 1961 1962 static int 1963 posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, 1964 struct spdk_sock **socks) 1965 { 1966 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1967 struct spdk_sock *sock, *tmp; 1968 int num_events, i, rc; 1969 struct spdk_posix_sock *psock, *ptmp; 1970 #if defined(SPDK_EPOLL) 1971 struct epoll_event events[MAX_EVENTS_PER_POLL]; 1972 #elif defined(SPDK_KEVENT) 1973 struct kevent events[MAX_EVENTS_PER_POLL]; 1974 struct timespec ts = {0}; 1975 #endif 1976 1977 #ifdef SPDK_ZEROCOPY 1978 /* When all of the following conditions are met 1979 * - non-blocking socket 1980 * - zero copy is enabled 1981 * - interrupts suppressed (i.e. busy polling) 1982 * - the NIC tx queue is full at the time sendmsg() is called 1983 * - epoll_wait determines there is an EPOLLIN event for the socket 1984 * then we can get into a situation where data we've sent is queued 1985 * up in the kernel network stack, but interrupts have been suppressed 1986 * because other traffic is flowing so the kernel misses the signal 1987 * to flush the software tx queue. If there wasn't incoming data 1988 * pending on the socket, then epoll_wait would have been sufficient 1989 * to kick off the send operation, but since there is a pending event 1990 * epoll_wait does not trigger the necessary operation. 1991 * 1992 * We deal with this by checking for all of the above conditions and 1993 * additionally looking for EPOLLIN events that were not consumed from 1994 * the last poll loop. We take this to mean that the upper layer is 1995 * unable to consume them because it is blocked waiting for resources 1996 * to free up, and those resources are most likely freed in response 1997 * to a pending asynchronous write completing. 1998 * 1999 * Additionally, sockets that have the same placement_id actually share 2000 * an underlying hardware queue. That means polling one of them is 2001 * equivalent to polling all of them. As a quick mechanism to avoid 2002 * making extra poll() calls, stash the last placement_id during the loop 2003 * and only poll if it's not the same. The overwhelmingly common case 2004 * is that all sockets in this list have the same placement_id because 2005 * SPDK is intentionally grouping sockets by that value, so even 2006 * though this won't stop all extra calls to poll(), it's very fast 2007 * and will catch all of them in practice. 2008 */ 2009 int last_placement_id = -1; 2010 2011 TAILQ_FOREACH(psock, &group->socks_with_data, link) { 2012 if (psock->zcopy && psock->placement_id >= 0 && 2013 psock->placement_id != last_placement_id) { 2014 struct pollfd pfd = {psock->fd, POLLIN | POLLERR, 0}; 2015 2016 poll(&pfd, 1, 0); 2017 last_placement_id = psock->placement_id; 2018 } 2019 } 2020 #endif 2021 2022 /* This must be a TAILQ_FOREACH_SAFE because while flushing, 2023 * a completion callback could remove the sock from the 2024 * group. */ 2025 TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { 2026 rc = _sock_flush(sock); 2027 if (rc < 0 && errno != EAGAIN) { 2028 spdk_sock_abort_requests(sock); 2029 } 2030 } 2031 2032 assert(max_events > 0); 2033 2034 #if defined(SPDK_EPOLL) 2035 num_events = epoll_wait(group->fd, events, max_events, 0); 2036 #elif defined(SPDK_KEVENT) 2037 num_events = kevent(group->fd, NULL, 0, events, max_events, &ts); 2038 #endif 2039 2040 if (num_events == -1) { 2041 return -1; 2042 } else if (num_events == 0 && !TAILQ_EMPTY(&_group->socks)) { 2043 sock = TAILQ_FIRST(&_group->socks); 2044 psock = __posix_sock(sock); 2045 /* poll() is called here to busy poll the queue associated with 2046 * first socket in list and potentially reap incoming data. 2047 */ 2048 if (sock->opts.priority) { 2049 struct pollfd pfd = {0, 0, 0}; 2050 2051 pfd.fd = psock->fd; 2052 pfd.events = POLLIN | POLLERR; 2053 poll(&pfd, 1, 0); 2054 } 2055 } 2056 2057 for (i = 0; i < num_events; i++) { 2058 #if defined(SPDK_EPOLL) 2059 sock = events[i].data.ptr; 2060 psock = __posix_sock(sock); 2061 2062 #ifdef SPDK_ZEROCOPY 2063 if (events[i].events & EPOLLERR) { 2064 rc = _sock_check_zcopy(sock); 2065 /* If the socket was closed or removed from 2066 * the group in response to a send ack, don't 2067 * add it to the array here. */ 2068 if (rc || sock->cb_fn == NULL) { 2069 continue; 2070 } 2071 } 2072 #endif 2073 if ((events[i].events & EPOLLIN) == 0) { 2074 continue; 2075 } 2076 2077 #elif defined(SPDK_KEVENT) 2078 sock = events[i].udata; 2079 psock = __posix_sock(sock); 2080 #endif 2081 2082 /* If the socket is not already in the list, add it now */ 2083 if (!psock->socket_has_data && !psock->pipe_has_data) { 2084 TAILQ_INSERT_TAIL(&group->socks_with_data, psock, link); 2085 } 2086 psock->socket_has_data = true; 2087 } 2088 2089 num_events = 0; 2090 2091 TAILQ_FOREACH_SAFE(psock, &group->socks_with_data, link, ptmp) { 2092 if (num_events == max_events) { 2093 break; 2094 } 2095 2096 /* If the socket's cb_fn is NULL, just remove it from the 2097 * list and do not add it to socks array */ 2098 if (spdk_unlikely(psock->base.cb_fn == NULL)) { 2099 psock->socket_has_data = false; 2100 psock->pipe_has_data = false; 2101 TAILQ_REMOVE(&group->socks_with_data, psock, link); 2102 continue; 2103 } 2104 2105 socks[num_events++] = &psock->base; 2106 } 2107 2108 /* Cycle the has_data list so that each time we poll things aren't 2109 * in the same order. Say we have 6 sockets in the list, named as follows: 2110 * A B C D E F 2111 * And all 6 sockets had epoll events, but max_events is only 3. That means 2112 * psock currently points at D. We want to rearrange the list to the following: 2113 * D E F A B C 2114 * 2115 * The variables below are named according to this example to make it easier to 2116 * follow the swaps. 2117 */ 2118 if (psock != NULL) { 2119 struct spdk_posix_sock *pa, *pc, *pd, *pf; 2120 2121 /* Capture pointers to the elements we need */ 2122 pd = psock; 2123 pc = TAILQ_PREV(pd, spdk_has_data_list, link); 2124 pa = TAILQ_FIRST(&group->socks_with_data); 2125 pf = TAILQ_LAST(&group->socks_with_data, spdk_has_data_list); 2126 2127 /* Break the link between C and D */ 2128 pc->link.tqe_next = NULL; 2129 2130 /* Connect F to A */ 2131 pf->link.tqe_next = pa; 2132 pa->link.tqe_prev = &pf->link.tqe_next; 2133 2134 /* Fix up the list first/last pointers */ 2135 group->socks_with_data.tqh_first = pd; 2136 group->socks_with_data.tqh_last = &pc->link.tqe_next; 2137 2138 /* D is in front of the list, make tqe prev pointer point to the head of list */ 2139 pd->link.tqe_prev = &group->socks_with_data.tqh_first; 2140 } 2141 2142 return num_events; 2143 } 2144 2145 static int 2146 _sock_group_impl_close(struct spdk_sock_group_impl *_group, uint32_t enable_placement_id) 2147 { 2148 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 2149 int rc; 2150 2151 if (enable_placement_id == PLACEMENT_CPU) { 2152 spdk_sock_map_release(&g_map, spdk_env_get_current_core()); 2153 } 2154 2155 rc = close(group->fd); 2156 free(group); 2157 return rc; 2158 } 2159 2160 static int 2161 posix_sock_group_impl_close(struct spdk_sock_group_impl *_group) 2162 { 2163 return _sock_group_impl_close(_group, g_posix_impl_opts.enable_placement_id); 2164 } 2165 2166 static int 2167 ssl_sock_group_impl_close(struct spdk_sock_group_impl *_group) 2168 { 2169 return _sock_group_impl_close(_group, g_ssl_impl_opts.enable_placement_id); 2170 } 2171 2172 static struct spdk_net_impl g_posix_net_impl = { 2173 .name = "posix", 2174 .getaddr = posix_sock_getaddr, 2175 .connect = posix_sock_connect, 2176 .listen = posix_sock_listen, 2177 .accept = posix_sock_accept, 2178 .close = posix_sock_close, 2179 .recv = posix_sock_recv, 2180 .readv = posix_sock_readv, 2181 .writev = posix_sock_writev, 2182 .recv_next = posix_sock_recv_next, 2183 .writev_async = posix_sock_writev_async, 2184 .flush = posix_sock_flush, 2185 .set_recvlowat = posix_sock_set_recvlowat, 2186 .set_recvbuf = posix_sock_set_recvbuf, 2187 .set_sendbuf = posix_sock_set_sendbuf, 2188 .is_ipv6 = posix_sock_is_ipv6, 2189 .is_ipv4 = posix_sock_is_ipv4, 2190 .is_connected = posix_sock_is_connected, 2191 .group_impl_get_optimal = posix_sock_group_impl_get_optimal, 2192 .group_impl_create = posix_sock_group_impl_create, 2193 .group_impl_add_sock = posix_sock_group_impl_add_sock, 2194 .group_impl_remove_sock = posix_sock_group_impl_remove_sock, 2195 .group_impl_poll = posix_sock_group_impl_poll, 2196 .group_impl_close = posix_sock_group_impl_close, 2197 .get_opts = posix_sock_impl_get_opts, 2198 .set_opts = posix_sock_impl_set_opts, 2199 }; 2200 2201 SPDK_NET_IMPL_REGISTER(posix, &g_posix_net_impl, DEFAULT_SOCK_PRIORITY + 1); 2202 2203 static struct spdk_sock * 2204 ssl_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) 2205 { 2206 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, true); 2207 } 2208 2209 static struct spdk_sock * 2210 ssl_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) 2211 { 2212 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, true); 2213 } 2214 2215 static struct spdk_sock * 2216 ssl_sock_accept(struct spdk_sock *_sock) 2217 { 2218 return _posix_sock_accept(_sock, true); 2219 } 2220 2221 static struct spdk_net_impl g_ssl_net_impl = { 2222 .name = "ssl", 2223 .getaddr = posix_sock_getaddr, 2224 .connect = ssl_sock_connect, 2225 .listen = ssl_sock_listen, 2226 .accept = ssl_sock_accept, 2227 .close = posix_sock_close, 2228 .recv = posix_sock_recv, 2229 .readv = posix_sock_readv, 2230 .writev = posix_sock_writev, 2231 .recv_next = posix_sock_recv_next, 2232 .writev_async = posix_sock_writev_async, 2233 .flush = posix_sock_flush, 2234 .set_recvlowat = posix_sock_set_recvlowat, 2235 .set_recvbuf = posix_sock_set_recvbuf, 2236 .set_sendbuf = posix_sock_set_sendbuf, 2237 .is_ipv6 = posix_sock_is_ipv6, 2238 .is_ipv4 = posix_sock_is_ipv4, 2239 .is_connected = posix_sock_is_connected, 2240 .group_impl_get_optimal = posix_sock_group_impl_get_optimal, 2241 .group_impl_create = ssl_sock_group_impl_create, 2242 .group_impl_add_sock = posix_sock_group_impl_add_sock, 2243 .group_impl_remove_sock = posix_sock_group_impl_remove_sock, 2244 .group_impl_poll = posix_sock_group_impl_poll, 2245 .group_impl_close = ssl_sock_group_impl_close, 2246 .get_opts = ssl_sock_impl_get_opts, 2247 .set_opts = ssl_sock_impl_set_opts, 2248 }; 2249 2250 SPDK_NET_IMPL_REGISTER(ssl, &g_ssl_net_impl, DEFAULT_SOCK_PRIORITY); 2251 SPDK_LOG_REGISTER_COMPONENT(sock_posix) 2252