1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2018 Intel Corporation. All rights reserved. 3 * Copyright (c) 2020, 2021 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #if defined(__FreeBSD__) 10 #include <sys/event.h> 11 #define SPDK_KEVENT 12 #else 13 #include <sys/epoll.h> 14 #define SPDK_EPOLL 15 #endif 16 17 #if defined(__linux__) 18 #include <linux/errqueue.h> 19 #endif 20 21 #include "spdk/env.h" 22 #include "spdk/log.h" 23 #include "spdk/pipe.h" 24 #include "spdk/sock.h" 25 #include "spdk/util.h" 26 #include "spdk/string.h" 27 #include "spdk_internal/sock.h" 28 #include "../sock_kernel.h" 29 30 #include "openssl/crypto.h" 31 #include "openssl/err.h" 32 #include "openssl/ssl.h" 33 34 #define MAX_TMPBUF 1024 35 #define PORTNUMLEN 32 36 37 #if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY) 38 #define SPDK_ZEROCOPY 39 #endif 40 41 struct spdk_posix_sock { 42 struct spdk_sock base; 43 int fd; 44 45 uint32_t sendmsg_idx; 46 47 struct spdk_pipe *recv_pipe; 48 int recv_buf_sz; 49 bool pipe_has_data; 50 bool socket_has_data; 51 bool zcopy; 52 53 int placement_id; 54 55 SSL_CTX *ctx; 56 SSL *ssl; 57 58 TAILQ_ENTRY(spdk_posix_sock) link; 59 }; 60 61 TAILQ_HEAD(spdk_has_data_list, spdk_posix_sock); 62 63 struct spdk_posix_sock_group_impl { 64 struct spdk_sock_group_impl base; 65 int fd; 66 struct spdk_interrupt *intr; 67 struct spdk_has_data_list socks_with_data; 68 int placement_id; 69 struct spdk_pipe_group *pipe_group; 70 }; 71 72 static struct spdk_sock_impl_opts g_posix_impl_opts = { 73 .recv_buf_size = DEFAULT_SO_RCVBUF_SIZE, 74 .send_buf_size = DEFAULT_SO_SNDBUF_SIZE, 75 .enable_recv_pipe = true, 76 .enable_quickack = false, 77 .enable_placement_id = PLACEMENT_NONE, 78 .enable_zerocopy_send_server = true, 79 .enable_zerocopy_send_client = false, 80 .zerocopy_threshold = 0, 81 .tls_version = 0, 82 .enable_ktls = false, 83 .psk_key = NULL, 84 .psk_key_size = 0, 85 .psk_identity = NULL, 86 .get_key = NULL, 87 .get_key_ctx = NULL, 88 .tls_cipher_suites = NULL 89 }; 90 91 static struct spdk_sock_impl_opts g_ssl_impl_opts = { 92 .recv_buf_size = MIN_SO_RCVBUF_SIZE, 93 .send_buf_size = MIN_SO_SNDBUF_SIZE, 94 .enable_recv_pipe = true, 95 .enable_quickack = false, 96 .enable_placement_id = PLACEMENT_NONE, 97 .enable_zerocopy_send_server = true, 98 .enable_zerocopy_send_client = false, 99 .zerocopy_threshold = 0, 100 .tls_version = 0, 101 .enable_ktls = false, 102 .psk_key = NULL, 103 .psk_identity = NULL 104 }; 105 106 static struct spdk_sock_map g_map = { 107 .entries = STAILQ_HEAD_INITIALIZER(g_map.entries), 108 .mtx = PTHREAD_MUTEX_INITIALIZER 109 }; 110 111 __attribute((destructor)) static void 112 posix_sock_map_cleanup(void) 113 { 114 spdk_sock_map_cleanup(&g_map); 115 } 116 117 #define __posix_sock(sock) (struct spdk_posix_sock *)sock 118 #define __posix_group_impl(group) (struct spdk_posix_sock_group_impl *)group 119 120 static void 121 posix_sock_copy_impl_opts(struct spdk_sock_impl_opts *dest, const struct spdk_sock_impl_opts *src, 122 size_t len) 123 { 124 #define FIELD_OK(field) \ 125 offsetof(struct spdk_sock_impl_opts, field) + sizeof(src->field) <= len 126 127 #define SET_FIELD(field) \ 128 if (FIELD_OK(field)) { \ 129 dest->field = src->field; \ 130 } 131 132 SET_FIELD(recv_buf_size); 133 SET_FIELD(send_buf_size); 134 SET_FIELD(enable_recv_pipe); 135 SET_FIELD(enable_zerocopy_send); 136 SET_FIELD(enable_quickack); 137 SET_FIELD(enable_placement_id); 138 SET_FIELD(enable_zerocopy_send_server); 139 SET_FIELD(enable_zerocopy_send_client); 140 SET_FIELD(zerocopy_threshold); 141 SET_FIELD(tls_version); 142 SET_FIELD(enable_ktls); 143 SET_FIELD(psk_key); 144 SET_FIELD(psk_key_size); 145 SET_FIELD(psk_identity); 146 SET_FIELD(get_key); 147 SET_FIELD(get_key_ctx); 148 SET_FIELD(tls_cipher_suites); 149 150 #undef SET_FIELD 151 #undef FIELD_OK 152 } 153 154 static int 155 _sock_impl_get_opts(struct spdk_sock_impl_opts *opts, struct spdk_sock_impl_opts *impl_opts, 156 size_t *len) 157 { 158 if (!opts || !len) { 159 errno = EINVAL; 160 return -1; 161 } 162 163 assert(sizeof(*opts) >= *len); 164 memset(opts, 0, *len); 165 166 posix_sock_copy_impl_opts(opts, impl_opts, *len); 167 *len = spdk_min(*len, sizeof(*impl_opts)); 168 169 return 0; 170 } 171 172 static int 173 posix_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) 174 { 175 return _sock_impl_get_opts(opts, &g_posix_impl_opts, len); 176 } 177 178 static int 179 ssl_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) 180 { 181 return _sock_impl_get_opts(opts, &g_ssl_impl_opts, len); 182 } 183 184 static int 185 _sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, struct spdk_sock_impl_opts *impl_opts, 186 size_t len) 187 { 188 if (!opts) { 189 errno = EINVAL; 190 return -1; 191 } 192 193 assert(sizeof(*opts) >= len); 194 posix_sock_copy_impl_opts(impl_opts, opts, len); 195 196 return 0; 197 } 198 199 static int 200 posix_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) 201 { 202 return _sock_impl_set_opts(opts, &g_posix_impl_opts, len); 203 } 204 205 static int 206 ssl_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) 207 { 208 return _sock_impl_set_opts(opts, &g_ssl_impl_opts, len); 209 } 210 211 static void 212 _opts_get_impl_opts(const struct spdk_sock_opts *opts, struct spdk_sock_impl_opts *dest, 213 const struct spdk_sock_impl_opts *default_impl) 214 { 215 /* Copy the default impl_opts first to cover cases when user's impl_opts is smaller */ 216 memcpy(dest, default_impl, sizeof(*dest)); 217 218 if (opts->impl_opts != NULL) { 219 assert(sizeof(*dest) >= opts->impl_opts_size); 220 posix_sock_copy_impl_opts(dest, opts->impl_opts, opts->impl_opts_size); 221 } 222 } 223 224 static int 225 posix_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, 226 char *caddr, int clen, uint16_t *cport) 227 { 228 struct spdk_posix_sock *sock = __posix_sock(_sock); 229 struct sockaddr_storage sa; 230 socklen_t salen; 231 int rc; 232 233 assert(sock != NULL); 234 235 memset(&sa, 0, sizeof sa); 236 salen = sizeof sa; 237 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 238 if (rc != 0) { 239 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 240 return -1; 241 } 242 243 switch (sa.ss_family) { 244 case AF_UNIX: 245 /* Acceptable connection types that don't have IPs */ 246 return 0; 247 case AF_INET: 248 case AF_INET6: 249 /* Code below will get IP addresses */ 250 break; 251 default: 252 /* Unsupported socket family */ 253 return -1; 254 } 255 256 rc = get_addr_str((struct sockaddr *)&sa, saddr, slen); 257 if (rc != 0) { 258 SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); 259 return -1; 260 } 261 262 if (sport) { 263 if (sa.ss_family == AF_INET) { 264 *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port); 265 } else if (sa.ss_family == AF_INET6) { 266 *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); 267 } 268 } 269 270 memset(&sa, 0, sizeof sa); 271 salen = sizeof sa; 272 rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen); 273 if (rc != 0) { 274 SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno); 275 return -1; 276 } 277 278 rc = get_addr_str((struct sockaddr *)&sa, caddr, clen); 279 if (rc != 0) { 280 SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); 281 return -1; 282 } 283 284 if (cport) { 285 if (sa.ss_family == AF_INET) { 286 *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port); 287 } else if (sa.ss_family == AF_INET6) { 288 *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); 289 } 290 } 291 292 return 0; 293 } 294 295 enum posix_sock_create_type { 296 SPDK_SOCK_CREATE_LISTEN, 297 SPDK_SOCK_CREATE_CONNECT, 298 }; 299 300 static int 301 posix_sock_alloc_pipe(struct spdk_posix_sock *sock, int sz) 302 { 303 uint8_t *new_buf, *old_buf; 304 struct spdk_pipe *new_pipe; 305 struct iovec siov[2]; 306 struct iovec diov[2]; 307 int sbytes; 308 ssize_t bytes; 309 int rc; 310 311 if (sock->recv_buf_sz == sz) { 312 return 0; 313 } 314 315 /* If the new size is 0, just free the pipe */ 316 if (sz == 0) { 317 old_buf = spdk_pipe_destroy(sock->recv_pipe); 318 free(old_buf); 319 sock->recv_pipe = NULL; 320 return 0; 321 } else if (sz < MIN_SOCK_PIPE_SIZE) { 322 SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE); 323 return -1; 324 } 325 326 /* Round up to next 64 byte multiple */ 327 rc = posix_memalign((void **)&new_buf, 64, sz); 328 if (rc != 0) { 329 SPDK_ERRLOG("socket recv buf allocation failed\n"); 330 return -ENOMEM; 331 } 332 memset(new_buf, 0, sz); 333 334 new_pipe = spdk_pipe_create(new_buf, sz); 335 if (new_pipe == NULL) { 336 SPDK_ERRLOG("socket pipe allocation failed\n"); 337 free(new_buf); 338 return -ENOMEM; 339 } 340 341 if (sock->recv_pipe != NULL) { 342 /* Pull all of the data out of the old pipe */ 343 sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); 344 if (sbytes > sz) { 345 /* Too much data to fit into the new pipe size */ 346 old_buf = spdk_pipe_destroy(new_pipe); 347 free(old_buf); 348 return -EINVAL; 349 } 350 351 sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov); 352 assert(sbytes == sz); 353 354 bytes = spdk_iovcpy(siov, 2, diov, 2); 355 spdk_pipe_writer_advance(new_pipe, bytes); 356 357 old_buf = spdk_pipe_destroy(sock->recv_pipe); 358 free(old_buf); 359 } 360 361 sock->recv_buf_sz = sz; 362 sock->recv_pipe = new_pipe; 363 364 if (sock->base.group_impl) { 365 struct spdk_posix_sock_group_impl *group; 366 367 group = __posix_group_impl(sock->base.group_impl); 368 spdk_pipe_group_add(group->pipe_group, sock->recv_pipe); 369 } 370 371 return 0; 372 } 373 374 static int 375 posix_sock_set_recvbuf(struct spdk_sock *_sock, int sz) 376 { 377 struct spdk_posix_sock *sock = __posix_sock(_sock); 378 int min_size; 379 int rc; 380 381 assert(sock != NULL); 382 383 if (_sock->impl_opts.enable_recv_pipe) { 384 rc = posix_sock_alloc_pipe(sock, sz); 385 if (rc) { 386 return rc; 387 } 388 } 389 390 /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE and 391 * _sock->impl_opts.recv_buf_size. */ 392 min_size = spdk_max(MIN_SO_RCVBUF_SIZE, _sock->impl_opts.recv_buf_size); 393 394 if (sz < min_size) { 395 sz = min_size; 396 } 397 398 rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); 399 if (rc < 0) { 400 return rc; 401 } 402 403 _sock->impl_opts.recv_buf_size = sz; 404 405 return 0; 406 } 407 408 static int 409 posix_sock_set_sendbuf(struct spdk_sock *_sock, int sz) 410 { 411 struct spdk_posix_sock *sock = __posix_sock(_sock); 412 int min_size; 413 int rc; 414 415 assert(sock != NULL); 416 417 /* Set kernel buffer size to be at least MIN_SO_SNDBUF_SIZE and 418 * _sock->impl_opts.send_buf_size. */ 419 min_size = spdk_max(MIN_SO_SNDBUF_SIZE, _sock->impl_opts.send_buf_size); 420 421 if (sz < min_size) { 422 sz = min_size; 423 } 424 425 rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); 426 if (rc < 0) { 427 return rc; 428 } 429 430 _sock->impl_opts.send_buf_size = sz; 431 432 return 0; 433 } 434 435 static void 436 posix_sock_init(struct spdk_posix_sock *sock, bool enable_zero_copy) 437 { 438 #if defined(SPDK_ZEROCOPY) || defined(__linux__) 439 int flag; 440 int rc; 441 #endif 442 443 #if defined(SPDK_ZEROCOPY) 444 flag = 1; 445 446 if (enable_zero_copy) { 447 /* Try to turn on zero copy sends */ 448 rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag)); 449 if (rc == 0) { 450 sock->zcopy = true; 451 } 452 } 453 #endif 454 455 #if defined(__linux__) 456 flag = 1; 457 458 if (sock->base.impl_opts.enable_quickack) { 459 rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag)); 460 if (rc != 0) { 461 SPDK_ERRLOG("quickack was failed to set\n"); 462 } 463 } 464 465 spdk_sock_get_placement_id(sock->fd, sock->base.impl_opts.enable_placement_id, 466 &sock->placement_id); 467 468 if (sock->base.impl_opts.enable_placement_id == PLACEMENT_MARK) { 469 /* Save placement_id */ 470 spdk_sock_map_insert(&g_map, sock->placement_id, NULL); 471 } 472 #endif 473 } 474 475 static struct spdk_posix_sock * 476 posix_sock_alloc(int fd, struct spdk_sock_impl_opts *impl_opts, bool enable_zero_copy) 477 { 478 struct spdk_posix_sock *sock; 479 480 sock = calloc(1, sizeof(*sock)); 481 if (sock == NULL) { 482 SPDK_ERRLOG("sock allocation failed\n"); 483 return NULL; 484 } 485 486 sock->fd = fd; 487 memcpy(&sock->base.impl_opts, impl_opts, sizeof(*impl_opts)); 488 posix_sock_init(sock, enable_zero_copy); 489 490 return sock; 491 } 492 493 static int 494 posix_fd_create(struct addrinfo *res, struct spdk_sock_opts *opts, 495 struct spdk_sock_impl_opts *impl_opts) 496 { 497 int fd; 498 int val = 1; 499 int rc, sz; 500 #if defined(__linux__) 501 int to; 502 #endif 503 504 fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); 505 if (fd < 0) { 506 /* error */ 507 return -1; 508 } 509 510 sz = impl_opts->recv_buf_size; 511 rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); 512 if (rc) { 513 /* Not fatal */ 514 } 515 516 sz = impl_opts->send_buf_size; 517 rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); 518 if (rc) { 519 /* Not fatal */ 520 } 521 522 rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); 523 if (rc != 0) { 524 close(fd); 525 /* error */ 526 return -1; 527 } 528 rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val); 529 if (rc != 0) { 530 close(fd); 531 /* error */ 532 return -1; 533 } 534 535 #if defined(SO_PRIORITY) 536 if (opts->priority) { 537 rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val); 538 if (rc != 0) { 539 close(fd); 540 /* error */ 541 return -1; 542 } 543 } 544 #endif 545 546 if (res->ai_family == AF_INET6) { 547 rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val); 548 if (rc != 0) { 549 close(fd); 550 /* error */ 551 return -1; 552 } 553 } 554 555 if (opts->ack_timeout) { 556 #if defined(__linux__) 557 to = opts->ack_timeout; 558 rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &to, sizeof(to)); 559 if (rc != 0) { 560 close(fd); 561 /* error */ 562 return -1; 563 } 564 #else 565 SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n"); 566 #endif 567 } 568 569 return fd; 570 } 571 572 static int 573 posix_sock_psk_find_session_server_cb(SSL *ssl, const unsigned char *identity, 574 size_t identity_len, SSL_SESSION **sess) 575 { 576 struct spdk_sock_impl_opts *impl_opts = SSL_get_app_data(ssl); 577 uint8_t key[SSL_MAX_MASTER_KEY_LENGTH] = {}; 578 int keylen; 579 int rc, i; 580 STACK_OF(SSL_CIPHER) *ciphers; 581 const SSL_CIPHER *cipher; 582 const char *cipher_name; 583 const char *user_cipher = NULL; 584 bool found = false; 585 586 if (impl_opts->get_key) { 587 rc = impl_opts->get_key(key, sizeof(key), &user_cipher, identity, impl_opts->get_key_ctx); 588 if (rc < 0) { 589 SPDK_ERRLOG("Unable to find PSK for identity: %s\n", identity); 590 return 0; 591 } 592 keylen = rc; 593 } else { 594 if (impl_opts->psk_key == NULL) { 595 SPDK_ERRLOG("PSK is not set\n"); 596 return 0; 597 } 598 599 SPDK_DEBUGLOG(sock_posix, "Length of Client's PSK ID %lu\n", strlen(impl_opts->psk_identity)); 600 if (strcmp(impl_opts->psk_identity, identity) != 0) { 601 SPDK_ERRLOG("Unknown Client's PSK ID\n"); 602 return 0; 603 } 604 keylen = impl_opts->psk_key_size; 605 606 memcpy(key, impl_opts->psk_key, keylen); 607 user_cipher = impl_opts->tls_cipher_suites; 608 } 609 610 if (user_cipher == NULL) { 611 SPDK_ERRLOG("Cipher suite not set\n"); 612 return 0; 613 } 614 615 *sess = SSL_SESSION_new(); 616 if (*sess == NULL) { 617 SPDK_ERRLOG("Unable to allocate new SSL session\n"); 618 return 0; 619 } 620 621 ciphers = SSL_get_ciphers(ssl); 622 for (i = 0; i < sk_SSL_CIPHER_num(ciphers); i++) { 623 cipher = sk_SSL_CIPHER_value(ciphers, i); 624 cipher_name = SSL_CIPHER_get_name(cipher); 625 626 if (strcmp(user_cipher, cipher_name) == 0) { 627 rc = SSL_SESSION_set_cipher(*sess, cipher); 628 if (rc != 1) { 629 SPDK_ERRLOG("Unable to set cipher: %s\n", cipher_name); 630 goto err; 631 } 632 found = true; 633 break; 634 } 635 } 636 if (found == false) { 637 SPDK_ERRLOG("No suitable cipher found\n"); 638 goto err; 639 } 640 641 SPDK_DEBUGLOG(sock_posix, "Cipher selected: %s\n", cipher_name); 642 643 rc = SSL_SESSION_set_protocol_version(*sess, TLS1_3_VERSION); 644 if (rc != 1) { 645 SPDK_ERRLOG("Unable to set TLS version: %d\n", TLS1_3_VERSION); 646 goto err; 647 } 648 649 rc = SSL_SESSION_set1_master_key(*sess, key, keylen); 650 if (rc != 1) { 651 SPDK_ERRLOG("Unable to set PSK for session\n"); 652 goto err; 653 } 654 655 return 1; 656 657 err: 658 SSL_SESSION_free(*sess); 659 *sess = NULL; 660 return 0; 661 } 662 663 static int 664 posix_sock_psk_use_session_client_cb(SSL *ssl, const EVP_MD *md, const unsigned char **identity, 665 size_t *identity_len, SSL_SESSION **sess) 666 { 667 struct spdk_sock_impl_opts *impl_opts = SSL_get_app_data(ssl); 668 int rc, i; 669 STACK_OF(SSL_CIPHER) *ciphers; 670 const SSL_CIPHER *cipher; 671 const char *cipher_name; 672 long keylen; 673 bool found = false; 674 675 if (impl_opts->psk_key == NULL) { 676 SPDK_ERRLOG("PSK is not set\n"); 677 return 0; 678 } 679 if (impl_opts->psk_key_size > SSL_MAX_MASTER_KEY_LENGTH) { 680 SPDK_ERRLOG("PSK too long\n"); 681 return 0; 682 } 683 keylen = impl_opts->psk_key_size; 684 685 if (impl_opts->tls_cipher_suites == NULL) { 686 SPDK_ERRLOG("Cipher suite not set\n"); 687 return 0; 688 } 689 *sess = SSL_SESSION_new(); 690 if (*sess == NULL) { 691 SPDK_ERRLOG("Unable to allocate new SSL session\n"); 692 return 0; 693 } 694 695 ciphers = SSL_get_ciphers(ssl); 696 for (i = 0; i < sk_SSL_CIPHER_num(ciphers); i++) { 697 cipher = sk_SSL_CIPHER_value(ciphers, i); 698 cipher_name = SSL_CIPHER_get_name(cipher); 699 700 if (strcmp(impl_opts->tls_cipher_suites, cipher_name) == 0) { 701 rc = SSL_SESSION_set_cipher(*sess, cipher); 702 if (rc != 1) { 703 SPDK_ERRLOG("Unable to set cipher: %s\n", cipher_name); 704 goto err; 705 } 706 found = true; 707 break; 708 } 709 } 710 if (found == false) { 711 SPDK_ERRLOG("No suitable cipher found\n"); 712 goto err; 713 } 714 715 SPDK_DEBUGLOG(sock_posix, "Cipher selected: %s\n", cipher_name); 716 717 rc = SSL_SESSION_set_protocol_version(*sess, TLS1_3_VERSION); 718 if (rc != 1) { 719 SPDK_ERRLOG("Unable to set TLS version: %d\n", TLS1_3_VERSION); 720 goto err; 721 } 722 723 rc = SSL_SESSION_set1_master_key(*sess, impl_opts->psk_key, keylen); 724 if (rc != 1) { 725 SPDK_ERRLOG("Unable to set PSK for session\n"); 726 goto err; 727 } 728 729 *identity_len = strlen(impl_opts->psk_identity); 730 *identity = impl_opts->psk_identity; 731 732 return 1; 733 734 err: 735 SSL_SESSION_free(*sess); 736 *sess = NULL; 737 return 0; 738 } 739 740 static SSL_CTX * 741 posix_sock_create_ssl_context(const SSL_METHOD *method, struct spdk_sock_opts *opts, 742 struct spdk_sock_impl_opts *impl_opts) 743 { 744 SSL_CTX *ctx; 745 int tls_version = 0; 746 bool ktls_enabled = false; 747 #ifdef SSL_OP_ENABLE_KTLS 748 long options; 749 #endif 750 751 SSL_library_init(); 752 OpenSSL_add_all_algorithms(); 753 SSL_load_error_strings(); 754 /* Produce a SSL CTX in SSL V2 and V3 standards compliant way */ 755 ctx = SSL_CTX_new(method); 756 if (!ctx) { 757 SPDK_ERRLOG("SSL_CTX_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 758 return NULL; 759 } 760 SPDK_DEBUGLOG(sock_posix, "SSL context created\n"); 761 762 switch (impl_opts->tls_version) { 763 case 0: 764 /* auto-negotioation */ 765 break; 766 case SPDK_TLS_VERSION_1_3: 767 tls_version = TLS1_3_VERSION; 768 break; 769 default: 770 SPDK_ERRLOG("Incorrect TLS version provided: %d\n", impl_opts->tls_version); 771 goto err; 772 } 773 774 if (tls_version) { 775 SPDK_DEBUGLOG(sock_posix, "Hardening TLS version to '%d'='0x%X'\n", impl_opts->tls_version, 776 tls_version); 777 if (!SSL_CTX_set_min_proto_version(ctx, tls_version)) { 778 SPDK_ERRLOG("Unable to set Min TLS version to '%d'='0x%X\n", impl_opts->tls_version, tls_version); 779 goto err; 780 } 781 if (!SSL_CTX_set_max_proto_version(ctx, tls_version)) { 782 SPDK_ERRLOG("Unable to set Max TLS version to '%d'='0x%X\n", impl_opts->tls_version, tls_version); 783 goto err; 784 } 785 } 786 if (impl_opts->enable_ktls) { 787 SPDK_DEBUGLOG(sock_posix, "Enabling kTLS offload\n"); 788 #ifdef SSL_OP_ENABLE_KTLS 789 options = SSL_CTX_set_options(ctx, SSL_OP_ENABLE_KTLS); 790 ktls_enabled = options & SSL_OP_ENABLE_KTLS; 791 #else 792 ktls_enabled = false; 793 #endif 794 if (!ktls_enabled) { 795 SPDK_ERRLOG("Unable to set kTLS offload via SSL_CTX_set_options(). Configure openssl with 'enable-ktls'\n"); 796 goto err; 797 } 798 } 799 800 /* SSL_CTX_set_ciphersuites() return 1 if the requested 801 * cipher suite list was configured, and 0 otherwise. */ 802 if (impl_opts->tls_cipher_suites != NULL && 803 SSL_CTX_set_ciphersuites(ctx, impl_opts->tls_cipher_suites) != 1) { 804 SPDK_ERRLOG("Unable to set TLS cipher suites for SSL'\n"); 805 goto err; 806 } 807 808 return ctx; 809 810 err: 811 SSL_CTX_free(ctx); 812 return NULL; 813 } 814 815 static SSL * 816 ssl_sock_setup_connect(SSL_CTX *ctx, int fd) 817 { 818 SSL *ssl; 819 820 ssl = SSL_new(ctx); 821 if (!ssl) { 822 SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 823 return NULL; 824 } 825 SSL_set_fd(ssl, fd); 826 SSL_set_connect_state(ssl); 827 SSL_set_psk_use_session_callback(ssl, posix_sock_psk_use_session_client_cb); 828 SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); 829 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 830 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 831 SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", 832 SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); 833 return ssl; 834 } 835 836 static SSL * 837 ssl_sock_setup_accept(SSL_CTX *ctx, int fd) 838 { 839 SSL *ssl; 840 841 ssl = SSL_new(ctx); 842 if (!ssl) { 843 SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 844 return NULL; 845 } 846 SSL_set_fd(ssl, fd); 847 SSL_set_accept_state(ssl); 848 SSL_set_psk_find_session_callback(ssl, posix_sock_psk_find_session_server_cb); 849 SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); 850 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 851 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 852 SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", 853 SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); 854 return ssl; 855 } 856 857 static ssize_t 858 SSL_readv(SSL *ssl, const struct iovec *iov, int iovcnt) 859 { 860 int i, rc = 0; 861 ssize_t total = 0; 862 863 for (i = 0; i < iovcnt; i++) { 864 rc = SSL_read(ssl, iov[i].iov_base, iov[i].iov_len); 865 866 if (rc > 0) { 867 total += rc; 868 } 869 if (rc != (int)iov[i].iov_len) { 870 break; 871 } 872 } 873 if (total > 0) { 874 errno = 0; 875 return total; 876 } 877 switch (SSL_get_error(ssl, rc)) { 878 case SSL_ERROR_ZERO_RETURN: 879 errno = ENOTCONN; 880 return 0; 881 case SSL_ERROR_WANT_READ: 882 case SSL_ERROR_WANT_WRITE: 883 case SSL_ERROR_WANT_CONNECT: 884 case SSL_ERROR_WANT_ACCEPT: 885 case SSL_ERROR_WANT_X509_LOOKUP: 886 case SSL_ERROR_WANT_ASYNC: 887 case SSL_ERROR_WANT_ASYNC_JOB: 888 case SSL_ERROR_WANT_CLIENT_HELLO_CB: 889 errno = EAGAIN; 890 return -1; 891 case SSL_ERROR_SYSCALL: 892 case SSL_ERROR_SSL: 893 errno = ENOTCONN; 894 return -1; 895 default: 896 errno = ENOTCONN; 897 return -1; 898 } 899 } 900 901 static ssize_t 902 SSL_writev(SSL *ssl, struct iovec *iov, int iovcnt) 903 { 904 int i, rc = 0; 905 ssize_t total = 0; 906 907 for (i = 0; i < iovcnt; i++) { 908 rc = SSL_write(ssl, iov[i].iov_base, iov[i].iov_len); 909 910 if (rc > 0) { 911 total += rc; 912 } 913 if (rc != (int)iov[i].iov_len) { 914 break; 915 } 916 } 917 if (total > 0) { 918 errno = 0; 919 return total; 920 } 921 switch (SSL_get_error(ssl, rc)) { 922 case SSL_ERROR_ZERO_RETURN: 923 errno = ENOTCONN; 924 return 0; 925 case SSL_ERROR_WANT_READ: 926 case SSL_ERROR_WANT_WRITE: 927 case SSL_ERROR_WANT_CONNECT: 928 case SSL_ERROR_WANT_ACCEPT: 929 case SSL_ERROR_WANT_X509_LOOKUP: 930 case SSL_ERROR_WANT_ASYNC: 931 case SSL_ERROR_WANT_ASYNC_JOB: 932 case SSL_ERROR_WANT_CLIENT_HELLO_CB: 933 errno = EAGAIN; 934 return -1; 935 case SSL_ERROR_SYSCALL: 936 case SSL_ERROR_SSL: 937 errno = ENOTCONN; 938 return -1; 939 default: 940 errno = ENOTCONN; 941 return -1; 942 } 943 } 944 945 static struct spdk_sock * 946 posix_sock_create(const char *ip, int port, 947 enum posix_sock_create_type type, 948 struct spdk_sock_opts *opts, 949 bool enable_ssl) 950 { 951 struct spdk_posix_sock *sock; 952 struct spdk_sock_impl_opts impl_opts; 953 char buf[MAX_TMPBUF]; 954 char portnum[PORTNUMLEN]; 955 char *p; 956 struct addrinfo hints, *res, *res0; 957 int fd, flag; 958 int rc; 959 bool enable_zcopy_user_opts = true; 960 bool enable_zcopy_impl_opts = true; 961 SSL_CTX *ctx = 0; 962 SSL *ssl = 0; 963 964 assert(opts != NULL); 965 if (enable_ssl) { 966 _opts_get_impl_opts(opts, &impl_opts, &g_ssl_impl_opts); 967 } else { 968 _opts_get_impl_opts(opts, &impl_opts, &g_posix_impl_opts); 969 } 970 971 if (ip == NULL) { 972 return NULL; 973 } 974 if (ip[0] == '[') { 975 snprintf(buf, sizeof(buf), "%s", ip + 1); 976 p = strchr(buf, ']'); 977 if (p != NULL) { 978 *p = '\0'; 979 } 980 ip = (const char *) &buf[0]; 981 } 982 983 snprintf(portnum, sizeof portnum, "%d", port); 984 memset(&hints, 0, sizeof hints); 985 hints.ai_family = PF_UNSPEC; 986 hints.ai_socktype = SOCK_STREAM; 987 hints.ai_flags = AI_NUMERICSERV; 988 hints.ai_flags |= AI_PASSIVE; 989 hints.ai_flags |= AI_NUMERICHOST; 990 rc = getaddrinfo(ip, portnum, &hints, &res0); 991 if (rc != 0) { 992 SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc); 993 return NULL; 994 } 995 996 /* try listen */ 997 fd = -1; 998 for (res = res0; res != NULL; res = res->ai_next) { 999 retry: 1000 fd = posix_fd_create(res, opts, &impl_opts); 1001 if (fd < 0) { 1002 continue; 1003 } 1004 if (type == SPDK_SOCK_CREATE_LISTEN) { 1005 rc = bind(fd, res->ai_addr, res->ai_addrlen); 1006 if (rc != 0) { 1007 SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno); 1008 switch (errno) { 1009 case EINTR: 1010 /* interrupted? */ 1011 close(fd); 1012 goto retry; 1013 case EADDRNOTAVAIL: 1014 SPDK_ERRLOG("IP address %s not available. " 1015 "Verify IP address in config file " 1016 "and make sure setup script is " 1017 "run before starting spdk app.\n", ip); 1018 /* FALLTHROUGH */ 1019 default: 1020 /* try next family */ 1021 close(fd); 1022 fd = -1; 1023 continue; 1024 } 1025 } 1026 /* bind OK */ 1027 rc = listen(fd, 512); 1028 if (rc != 0) { 1029 SPDK_ERRLOG("listen() failed, errno = %d\n", errno); 1030 close(fd); 1031 fd = -1; 1032 break; 1033 } 1034 enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_server; 1035 } else if (type == SPDK_SOCK_CREATE_CONNECT) { 1036 rc = connect(fd, res->ai_addr, res->ai_addrlen); 1037 if (rc != 0) { 1038 SPDK_ERRLOG("connect() failed, errno = %d\n", errno); 1039 /* try next family */ 1040 close(fd); 1041 fd = -1; 1042 continue; 1043 } 1044 enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_client; 1045 if (enable_ssl) { 1046 ctx = posix_sock_create_ssl_context(TLS_client_method(), opts, &impl_opts); 1047 if (!ctx) { 1048 SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); 1049 close(fd); 1050 fd = -1; 1051 break; 1052 } 1053 ssl = ssl_sock_setup_connect(ctx, fd); 1054 if (!ssl) { 1055 SPDK_ERRLOG("ssl_sock_setup_connect() failed, errno = %d\n", errno); 1056 close(fd); 1057 fd = -1; 1058 SSL_CTX_free(ctx); 1059 break; 1060 } 1061 } 1062 } 1063 1064 flag = fcntl(fd, F_GETFL); 1065 if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) { 1066 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); 1067 SSL_free(ssl); 1068 SSL_CTX_free(ctx); 1069 close(fd); 1070 fd = -1; 1071 break; 1072 } 1073 break; 1074 } 1075 freeaddrinfo(res0); 1076 1077 if (fd < 0) { 1078 return NULL; 1079 } 1080 1081 /* Only enable zero copy for non-loopback and non-ssl sockets. */ 1082 enable_zcopy_user_opts = opts->zcopy && !sock_is_loopback(fd) && !enable_ssl; 1083 1084 sock = posix_sock_alloc(fd, &impl_opts, enable_zcopy_user_opts && enable_zcopy_impl_opts); 1085 if (sock == NULL) { 1086 SPDK_ERRLOG("sock allocation failed\n"); 1087 SSL_free(ssl); 1088 SSL_CTX_free(ctx); 1089 close(fd); 1090 return NULL; 1091 } 1092 1093 if (ctx) { 1094 sock->ctx = ctx; 1095 } 1096 1097 if (ssl) { 1098 sock->ssl = ssl; 1099 SSL_set_app_data(ssl, &sock->base.impl_opts); 1100 } 1101 1102 return &sock->base; 1103 } 1104 1105 static struct spdk_sock * 1106 posix_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) 1107 { 1108 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, false); 1109 } 1110 1111 static struct spdk_sock * 1112 posix_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) 1113 { 1114 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, false); 1115 } 1116 1117 static struct spdk_sock * 1118 _posix_sock_accept(struct spdk_sock *_sock, bool enable_ssl) 1119 { 1120 struct spdk_posix_sock *sock = __posix_sock(_sock); 1121 struct spdk_posix_sock_group_impl *group = __posix_group_impl(sock->base.group_impl); 1122 struct sockaddr_storage sa; 1123 socklen_t salen; 1124 int rc, fd; 1125 struct spdk_posix_sock *new_sock; 1126 int flag; 1127 SSL_CTX *ctx = 0; 1128 SSL *ssl = 0; 1129 1130 memset(&sa, 0, sizeof(sa)); 1131 salen = sizeof(sa); 1132 1133 assert(sock != NULL); 1134 1135 /* epoll_wait will trigger again if there is more than one request */ 1136 if (group && sock->socket_has_data) { 1137 sock->socket_has_data = false; 1138 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1139 } 1140 1141 rc = accept(sock->fd, (struct sockaddr *)&sa, &salen); 1142 1143 if (rc == -1) { 1144 return NULL; 1145 } 1146 1147 fd = rc; 1148 1149 flag = fcntl(fd, F_GETFL); 1150 if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) { 1151 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); 1152 close(fd); 1153 return NULL; 1154 } 1155 1156 #if defined(SO_PRIORITY) 1157 /* The priority is not inherited, so call this function again */ 1158 if (sock->base.opts.priority) { 1159 rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int)); 1160 if (rc != 0) { 1161 close(fd); 1162 return NULL; 1163 } 1164 } 1165 #endif 1166 1167 /* Establish SSL connection */ 1168 if (enable_ssl) { 1169 ctx = posix_sock_create_ssl_context(TLS_server_method(), &sock->base.opts, &sock->base.impl_opts); 1170 if (!ctx) { 1171 SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); 1172 close(fd); 1173 return NULL; 1174 } 1175 ssl = ssl_sock_setup_accept(ctx, fd); 1176 if (!ssl) { 1177 SPDK_ERRLOG("ssl_sock_setup_accept() failed, errno = %d\n", errno); 1178 close(fd); 1179 SSL_CTX_free(ctx); 1180 return NULL; 1181 } 1182 } 1183 1184 /* Inherit the zero copy feature from the listen socket */ 1185 new_sock = posix_sock_alloc(fd, &sock->base.impl_opts, sock->zcopy); 1186 if (new_sock == NULL) { 1187 close(fd); 1188 SSL_free(ssl); 1189 SSL_CTX_free(ctx); 1190 return NULL; 1191 } 1192 1193 if (ctx) { 1194 new_sock->ctx = ctx; 1195 } 1196 1197 if (ssl) { 1198 new_sock->ssl = ssl; 1199 SSL_set_app_data(ssl, &new_sock->base.impl_opts); 1200 } 1201 1202 return &new_sock->base; 1203 } 1204 1205 static struct spdk_sock * 1206 posix_sock_accept(struct spdk_sock *_sock) 1207 { 1208 return _posix_sock_accept(_sock, false); 1209 } 1210 1211 static int 1212 posix_sock_close(struct spdk_sock *_sock) 1213 { 1214 struct spdk_posix_sock *sock = __posix_sock(_sock); 1215 void *pipe_buf; 1216 1217 assert(TAILQ_EMPTY(&_sock->pending_reqs)); 1218 1219 if (sock->ssl != NULL) { 1220 SSL_shutdown(sock->ssl); 1221 } 1222 1223 /* If the socket fails to close, the best choice is to 1224 * leak the fd but continue to free the rest of the sock 1225 * memory. */ 1226 close(sock->fd); 1227 1228 SSL_free(sock->ssl); 1229 SSL_CTX_free(sock->ctx); 1230 1231 pipe_buf = spdk_pipe_destroy(sock->recv_pipe); 1232 free(pipe_buf); 1233 free(sock); 1234 1235 return 0; 1236 } 1237 1238 #ifdef SPDK_ZEROCOPY 1239 static int 1240 _sock_check_zcopy(struct spdk_sock *sock) 1241 { 1242 struct spdk_posix_sock *psock = __posix_sock(sock); 1243 struct msghdr msgh = {}; 1244 uint8_t buf[sizeof(struct cmsghdr) + sizeof(struct sock_extended_err)]; 1245 ssize_t rc; 1246 struct sock_extended_err *serr; 1247 struct cmsghdr *cm; 1248 uint32_t idx; 1249 struct spdk_sock_request *req, *treq; 1250 bool found; 1251 1252 msgh.msg_control = buf; 1253 msgh.msg_controllen = sizeof(buf); 1254 1255 while (true) { 1256 rc = recvmsg(psock->fd, &msgh, MSG_ERRQUEUE); 1257 1258 if (rc < 0) { 1259 if (errno == EWOULDBLOCK || errno == EAGAIN) { 1260 return 0; 1261 } 1262 1263 if (!TAILQ_EMPTY(&sock->pending_reqs)) { 1264 SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries\n"); 1265 } else { 1266 SPDK_WARNLOG("Recvmsg yielded an error!\n"); 1267 } 1268 return 0; 1269 } 1270 1271 cm = CMSG_FIRSTHDR(&msgh); 1272 if (!(cm && 1273 ((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || 1274 (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR)))) { 1275 SPDK_WARNLOG("Unexpected cmsg level or type!\n"); 1276 return 0; 1277 } 1278 1279 serr = (struct sock_extended_err *)CMSG_DATA(cm); 1280 if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) { 1281 SPDK_WARNLOG("Unexpected extended error origin\n"); 1282 return 0; 1283 } 1284 1285 /* Most of the time, the pending_reqs array is in the exact 1286 * order we need such that all of the requests to complete are 1287 * in order, in the front. It is guaranteed that all requests 1288 * belonging to the same sendmsg call are sequential, so once 1289 * we encounter one match we can stop looping as soon as a 1290 * non-match is found. 1291 */ 1292 idx = serr->ee_info; 1293 while (true) { 1294 found = false; 1295 TAILQ_FOREACH_SAFE(req, &sock->pending_reqs, internal.link, treq) { 1296 if (!req->internal.is_zcopy) { 1297 /* This wasn't a zcopy request. It was just waiting in line to complete */ 1298 rc = spdk_sock_request_put(sock, req, 0); 1299 if (rc < 0) { 1300 return rc; 1301 } 1302 } else if (req->internal.offset == idx) { 1303 found = true; 1304 rc = spdk_sock_request_put(sock, req, 0); 1305 if (rc < 0) { 1306 return rc; 1307 } 1308 } else if (found) { 1309 break; 1310 } 1311 } 1312 1313 if (idx == serr->ee_data) { 1314 break; 1315 } 1316 1317 if (idx == UINT32_MAX) { 1318 idx = 0; 1319 } else { 1320 idx++; 1321 } 1322 } 1323 } 1324 1325 return 0; 1326 } 1327 #endif 1328 1329 static int 1330 _sock_flush(struct spdk_sock *sock) 1331 { 1332 struct spdk_posix_sock *psock = __posix_sock(sock); 1333 struct msghdr msg = {}; 1334 int flags; 1335 struct iovec iovs[IOV_BATCH_SIZE]; 1336 int iovcnt; 1337 int retval; 1338 struct spdk_sock_request *req; 1339 int i; 1340 ssize_t rc, sent; 1341 unsigned int offset; 1342 size_t len; 1343 bool is_zcopy = false; 1344 1345 /* Can't flush from within a callback or we end up with recursive calls */ 1346 if (sock->cb_cnt > 0) { 1347 errno = EAGAIN; 1348 return -1; 1349 } 1350 1351 #ifdef SPDK_ZEROCOPY 1352 if (psock->zcopy) { 1353 flags = MSG_ZEROCOPY | MSG_NOSIGNAL; 1354 } else 1355 #endif 1356 { 1357 flags = MSG_NOSIGNAL; 1358 } 1359 1360 iovcnt = spdk_sock_prep_reqs(sock, iovs, 0, NULL, &flags); 1361 if (iovcnt == 0) { 1362 return 0; 1363 } 1364 1365 #ifdef SPDK_ZEROCOPY 1366 is_zcopy = flags & MSG_ZEROCOPY; 1367 #endif 1368 1369 /* Perform the vectored write */ 1370 msg.msg_iov = iovs; 1371 msg.msg_iovlen = iovcnt; 1372 1373 if (psock->ssl) { 1374 rc = SSL_writev(psock->ssl, iovs, iovcnt); 1375 } else { 1376 rc = sendmsg(psock->fd, &msg, flags); 1377 } 1378 if (rc <= 0) { 1379 if (rc == 0 || errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && psock->zcopy)) { 1380 errno = EAGAIN; 1381 } 1382 return -1; 1383 } 1384 1385 sent = rc; 1386 1387 if (is_zcopy) { 1388 /* Handling overflow case, because we use psock->sendmsg_idx - 1 for the 1389 * req->internal.offset, so sendmsg_idx should not be zero */ 1390 if (spdk_unlikely(psock->sendmsg_idx == UINT32_MAX)) { 1391 psock->sendmsg_idx = 1; 1392 } else { 1393 psock->sendmsg_idx++; 1394 } 1395 } 1396 1397 /* Consume the requests that were actually written */ 1398 req = TAILQ_FIRST(&sock->queued_reqs); 1399 while (req) { 1400 offset = req->internal.offset; 1401 1402 /* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */ 1403 req->internal.is_zcopy = is_zcopy; 1404 1405 for (i = 0; i < req->iovcnt; i++) { 1406 /* Advance by the offset first */ 1407 if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) { 1408 offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len; 1409 continue; 1410 } 1411 1412 /* Calculate the remaining length of this element */ 1413 len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset; 1414 1415 if (len > (size_t)rc) { 1416 /* This element was partially sent. */ 1417 req->internal.offset += rc; 1418 return sent; 1419 } 1420 1421 offset = 0; 1422 req->internal.offset += len; 1423 rc -= len; 1424 } 1425 1426 /* Handled a full request. */ 1427 spdk_sock_request_pend(sock, req); 1428 1429 if (!req->internal.is_zcopy && req == TAILQ_FIRST(&sock->pending_reqs)) { 1430 /* The sendmsg syscall above isn't currently asynchronous, 1431 * so it's already done. */ 1432 retval = spdk_sock_request_put(sock, req, 0); 1433 if (retval) { 1434 break; 1435 } 1436 } else { 1437 /* Re-use the offset field to hold the sendmsg call index. The 1438 * index is 0 based, so subtract one here because we've already 1439 * incremented above. */ 1440 req->internal.offset = psock->sendmsg_idx - 1; 1441 } 1442 1443 if (rc == 0) { 1444 break; 1445 } 1446 1447 req = TAILQ_FIRST(&sock->queued_reqs); 1448 } 1449 1450 return sent; 1451 } 1452 1453 static int 1454 posix_sock_flush(struct spdk_sock *sock) 1455 { 1456 #ifdef SPDK_ZEROCOPY 1457 struct spdk_posix_sock *psock = __posix_sock(sock); 1458 1459 if (psock->zcopy && !TAILQ_EMPTY(&sock->pending_reqs)) { 1460 _sock_check_zcopy(sock); 1461 } 1462 #endif 1463 1464 return _sock_flush(sock); 1465 } 1466 1467 static ssize_t 1468 posix_sock_recv_from_pipe(struct spdk_posix_sock *sock, struct iovec *diov, int diovcnt) 1469 { 1470 struct iovec siov[2]; 1471 int sbytes; 1472 ssize_t bytes; 1473 struct spdk_posix_sock_group_impl *group; 1474 1475 sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); 1476 if (sbytes < 0) { 1477 errno = EINVAL; 1478 return -1; 1479 } else if (sbytes == 0) { 1480 errno = EAGAIN; 1481 return -1; 1482 } 1483 1484 bytes = spdk_iovcpy(siov, 2, diov, diovcnt); 1485 1486 if (bytes == 0) { 1487 /* The only way this happens is if diov is 0 length */ 1488 errno = EINVAL; 1489 return -1; 1490 } 1491 1492 spdk_pipe_reader_advance(sock->recv_pipe, bytes); 1493 1494 /* If we drained the pipe, mark it appropriately */ 1495 if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) { 1496 assert(sock->pipe_has_data == true); 1497 1498 group = __posix_group_impl(sock->base.group_impl); 1499 if (group && !sock->socket_has_data) { 1500 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1501 } 1502 1503 sock->pipe_has_data = false; 1504 } 1505 1506 return bytes; 1507 } 1508 1509 static inline ssize_t 1510 posix_sock_read(struct spdk_posix_sock *sock) 1511 { 1512 struct iovec iov[2]; 1513 int bytes_avail, bytes_recvd; 1514 struct spdk_posix_sock_group_impl *group; 1515 1516 bytes_avail = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov); 1517 1518 if (bytes_avail <= 0) { 1519 return bytes_avail; 1520 } 1521 1522 if (sock->ssl) { 1523 bytes_recvd = SSL_readv(sock->ssl, iov, 2); 1524 } else { 1525 bytes_recvd = readv(sock->fd, iov, 2); 1526 } 1527 1528 assert(sock->pipe_has_data == false); 1529 1530 if (bytes_recvd <= 0) { 1531 /* Errors count as draining the socket data */ 1532 if (sock->base.group_impl && sock->socket_has_data) { 1533 group = __posix_group_impl(sock->base.group_impl); 1534 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1535 } 1536 1537 sock->socket_has_data = false; 1538 1539 return bytes_recvd; 1540 } 1541 1542 spdk_pipe_writer_advance(sock->recv_pipe, bytes_recvd); 1543 1544 #if DEBUG 1545 if (sock->base.group_impl) { 1546 assert(sock->socket_has_data == true); 1547 } 1548 #endif 1549 1550 sock->pipe_has_data = true; 1551 if (bytes_recvd < bytes_avail) { 1552 /* We drained the kernel socket entirely. */ 1553 sock->socket_has_data = false; 1554 } 1555 1556 return bytes_recvd; 1557 } 1558 1559 static ssize_t 1560 posix_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) 1561 { 1562 struct spdk_posix_sock *sock = __posix_sock(_sock); 1563 struct spdk_posix_sock_group_impl *group = __posix_group_impl(sock->base.group_impl); 1564 int rc, i; 1565 size_t len; 1566 1567 if (sock->recv_pipe == NULL) { 1568 assert(sock->pipe_has_data == false); 1569 if (group && sock->socket_has_data) { 1570 sock->socket_has_data = false; 1571 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1572 } 1573 if (sock->ssl) { 1574 return SSL_readv(sock->ssl, iov, iovcnt); 1575 } else { 1576 return readv(sock->fd, iov, iovcnt); 1577 } 1578 } 1579 1580 /* If the socket is not in a group, we must assume it always has 1581 * data waiting for us because it is not epolled */ 1582 if (!sock->pipe_has_data && (group == NULL || sock->socket_has_data)) { 1583 /* If the user is receiving a sufficiently large amount of data, 1584 * receive directly to their buffers. */ 1585 len = 0; 1586 for (i = 0; i < iovcnt; i++) { 1587 len += iov[i].iov_len; 1588 } 1589 1590 if (len >= MIN_SOCK_PIPE_SIZE) { 1591 /* TODO: Should this detect if kernel socket is drained? */ 1592 if (sock->ssl) { 1593 return SSL_readv(sock->ssl, iov, iovcnt); 1594 } else { 1595 return readv(sock->fd, iov, iovcnt); 1596 } 1597 } 1598 1599 /* Otherwise, do a big read into our pipe */ 1600 rc = posix_sock_read(sock); 1601 if (rc <= 0) { 1602 return rc; 1603 } 1604 } 1605 1606 return posix_sock_recv_from_pipe(sock, iov, iovcnt); 1607 } 1608 1609 static ssize_t 1610 posix_sock_recv(struct spdk_sock *sock, void *buf, size_t len) 1611 { 1612 struct iovec iov[1]; 1613 1614 iov[0].iov_base = buf; 1615 iov[0].iov_len = len; 1616 1617 return posix_sock_readv(sock, iov, 1); 1618 } 1619 1620 static ssize_t 1621 posix_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) 1622 { 1623 struct spdk_posix_sock *sock = __posix_sock(_sock); 1624 int rc; 1625 1626 /* In order to process a writev, we need to flush any asynchronous writes 1627 * first. */ 1628 rc = _sock_flush(_sock); 1629 if (rc < 0) { 1630 return rc; 1631 } 1632 1633 if (!TAILQ_EMPTY(&_sock->queued_reqs)) { 1634 /* We weren't able to flush all requests */ 1635 errno = EAGAIN; 1636 return -1; 1637 } 1638 1639 if (sock->ssl) { 1640 return SSL_writev(sock->ssl, iov, iovcnt); 1641 } else { 1642 return writev(sock->fd, iov, iovcnt); 1643 } 1644 } 1645 1646 static int 1647 posix_sock_recv_next(struct spdk_sock *_sock, void **buf, void **ctx) 1648 { 1649 struct spdk_posix_sock *sock = __posix_sock(_sock); 1650 struct iovec iov; 1651 ssize_t rc; 1652 1653 if (sock->recv_pipe != NULL) { 1654 errno = ENOTSUP; 1655 return -1; 1656 } 1657 1658 iov.iov_len = spdk_sock_group_get_buf(_sock->group_impl->group, &iov.iov_base, ctx); 1659 if (iov.iov_len == 0) { 1660 errno = ENOBUFS; 1661 return -1; 1662 } 1663 1664 rc = posix_sock_readv(_sock, &iov, 1); 1665 if (rc <= 0) { 1666 spdk_sock_group_provide_buf(_sock->group_impl->group, iov.iov_base, iov.iov_len, *ctx); 1667 return rc; 1668 } 1669 1670 *buf = iov.iov_base; 1671 1672 return rc; 1673 } 1674 1675 static void 1676 posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req) 1677 { 1678 int rc; 1679 1680 spdk_sock_request_queue(sock, req); 1681 1682 /* If there are a sufficient number queued, just flush them out immediately. */ 1683 if (sock->queued_iovcnt >= IOV_BATCH_SIZE) { 1684 rc = _sock_flush(sock); 1685 if (rc < 0 && errno != EAGAIN) { 1686 spdk_sock_abort_requests(sock); 1687 } 1688 } 1689 } 1690 1691 static int 1692 posix_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) 1693 { 1694 struct spdk_posix_sock *sock = __posix_sock(_sock); 1695 int val; 1696 int rc; 1697 1698 assert(sock != NULL); 1699 1700 val = nbytes; 1701 rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val); 1702 if (rc != 0) { 1703 return -1; 1704 } 1705 return 0; 1706 } 1707 1708 static bool 1709 posix_sock_is_ipv6(struct spdk_sock *_sock) 1710 { 1711 struct spdk_posix_sock *sock = __posix_sock(_sock); 1712 struct sockaddr_storage sa; 1713 socklen_t salen; 1714 int rc; 1715 1716 assert(sock != NULL); 1717 1718 memset(&sa, 0, sizeof sa); 1719 salen = sizeof sa; 1720 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 1721 if (rc != 0) { 1722 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 1723 return false; 1724 } 1725 1726 return (sa.ss_family == AF_INET6); 1727 } 1728 1729 static bool 1730 posix_sock_is_ipv4(struct spdk_sock *_sock) 1731 { 1732 struct spdk_posix_sock *sock = __posix_sock(_sock); 1733 struct sockaddr_storage sa; 1734 socklen_t salen; 1735 int rc; 1736 1737 assert(sock != NULL); 1738 1739 memset(&sa, 0, sizeof sa); 1740 salen = sizeof sa; 1741 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 1742 if (rc != 0) { 1743 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 1744 return false; 1745 } 1746 1747 return (sa.ss_family == AF_INET); 1748 } 1749 1750 static bool 1751 posix_sock_is_connected(struct spdk_sock *_sock) 1752 { 1753 struct spdk_posix_sock *sock = __posix_sock(_sock); 1754 uint8_t byte; 1755 int rc; 1756 1757 rc = recv(sock->fd, &byte, 1, MSG_PEEK); 1758 if (rc == 0) { 1759 return false; 1760 } 1761 1762 if (rc < 0) { 1763 if (errno == EAGAIN || errno == EWOULDBLOCK) { 1764 return true; 1765 } 1766 1767 return false; 1768 } 1769 1770 return true; 1771 } 1772 1773 static struct spdk_sock_group_impl * 1774 posix_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint) 1775 { 1776 struct spdk_posix_sock *sock = __posix_sock(_sock); 1777 struct spdk_sock_group_impl *group_impl; 1778 1779 if (sock->placement_id != -1) { 1780 spdk_sock_map_lookup(&g_map, sock->placement_id, &group_impl, hint); 1781 return group_impl; 1782 } 1783 1784 return NULL; 1785 } 1786 1787 static struct spdk_sock_group_impl * 1788 _sock_group_impl_create(uint32_t enable_placement_id) 1789 { 1790 struct spdk_posix_sock_group_impl *group_impl; 1791 int fd; 1792 1793 #if defined(SPDK_EPOLL) 1794 fd = epoll_create1(0); 1795 #elif defined(SPDK_KEVENT) 1796 fd = kqueue(); 1797 #endif 1798 if (fd == -1) { 1799 return NULL; 1800 } 1801 1802 group_impl = calloc(1, sizeof(*group_impl)); 1803 if (group_impl == NULL) { 1804 SPDK_ERRLOG("group_impl allocation failed\n"); 1805 close(fd); 1806 return NULL; 1807 } 1808 1809 group_impl->pipe_group = spdk_pipe_group_create(); 1810 if (group_impl->pipe_group == NULL) { 1811 SPDK_ERRLOG("pipe_group allocation failed\n"); 1812 free(group_impl); 1813 close(fd); 1814 return NULL; 1815 } 1816 1817 group_impl->fd = fd; 1818 TAILQ_INIT(&group_impl->socks_with_data); 1819 group_impl->placement_id = -1; 1820 1821 if (enable_placement_id == PLACEMENT_CPU) { 1822 spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base); 1823 group_impl->placement_id = spdk_env_get_current_core(); 1824 } 1825 1826 return &group_impl->base; 1827 } 1828 1829 static struct spdk_sock_group_impl * 1830 posix_sock_group_impl_create(void) 1831 { 1832 return _sock_group_impl_create(g_posix_impl_opts.enable_placement_id); 1833 } 1834 1835 static struct spdk_sock_group_impl * 1836 ssl_sock_group_impl_create(void) 1837 { 1838 return _sock_group_impl_create(g_ssl_impl_opts.enable_placement_id); 1839 } 1840 1841 static void 1842 posix_sock_mark(struct spdk_posix_sock_group_impl *group, struct spdk_posix_sock *sock, 1843 int placement_id) 1844 { 1845 #if defined(SO_MARK) 1846 int rc; 1847 1848 rc = setsockopt(sock->fd, SOL_SOCKET, SO_MARK, 1849 &placement_id, sizeof(placement_id)); 1850 if (rc != 0) { 1851 /* Not fatal */ 1852 SPDK_ERRLOG("Error setting SO_MARK\n"); 1853 return; 1854 } 1855 1856 rc = spdk_sock_map_insert(&g_map, placement_id, &group->base); 1857 if (rc != 0) { 1858 /* Not fatal */ 1859 SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc); 1860 return; 1861 } 1862 1863 sock->placement_id = placement_id; 1864 #endif 1865 } 1866 1867 static void 1868 posix_sock_update_mark(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1869 { 1870 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1871 1872 if (group->placement_id == -1) { 1873 group->placement_id = spdk_sock_map_find_free(&g_map); 1874 1875 /* If a free placement id is found, update existing sockets in this group */ 1876 if (group->placement_id != -1) { 1877 struct spdk_sock *sock, *tmp; 1878 1879 TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { 1880 posix_sock_mark(group, __posix_sock(sock), group->placement_id); 1881 } 1882 } 1883 } 1884 1885 if (group->placement_id != -1) { 1886 /* 1887 * group placement id is already determined for this poll group. 1888 * Mark socket with group's placement id. 1889 */ 1890 posix_sock_mark(group, __posix_sock(_sock), group->placement_id); 1891 } 1892 } 1893 1894 static int 1895 posix_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1896 { 1897 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1898 struct spdk_posix_sock *sock = __posix_sock(_sock); 1899 int rc; 1900 1901 #if defined(SPDK_EPOLL) 1902 struct epoll_event event; 1903 1904 memset(&event, 0, sizeof(event)); 1905 /* EPOLLERR is always on even if we don't set it, but be explicit for clarity */ 1906 event.events = EPOLLIN | EPOLLERR; 1907 if (spdk_interrupt_mode_is_enabled()) { 1908 event.events |= EPOLLOUT; 1909 } 1910 1911 event.data.ptr = sock; 1912 1913 rc = epoll_ctl(group->fd, EPOLL_CTL_ADD, sock->fd, &event); 1914 #elif defined(SPDK_KEVENT) 1915 struct kevent event; 1916 struct timespec ts = {0}; 1917 1918 EV_SET(&event, sock->fd, EVFILT_READ, EV_ADD, 0, 0, sock); 1919 1920 rc = kevent(group->fd, &event, 1, NULL, 0, &ts); 1921 #endif 1922 1923 if (rc != 0) { 1924 return rc; 1925 } 1926 1927 /* switched from another polling group due to scheduling */ 1928 if (spdk_unlikely(sock->recv_pipe != NULL && 1929 (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) { 1930 sock->pipe_has_data = true; 1931 sock->socket_has_data = false; 1932 TAILQ_INSERT_TAIL(&group->socks_with_data, sock, link); 1933 } else if (sock->recv_pipe != NULL) { 1934 rc = spdk_pipe_group_add(group->pipe_group, sock->recv_pipe); 1935 assert(rc == 0); 1936 } 1937 1938 if (_sock->impl_opts.enable_placement_id == PLACEMENT_MARK) { 1939 posix_sock_update_mark(_group, _sock); 1940 } else if (sock->placement_id != -1) { 1941 rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base); 1942 if (rc != 0) { 1943 SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc); 1944 /* Do not treat this as an error. The system will continue running. */ 1945 } 1946 } 1947 1948 return rc; 1949 } 1950 1951 static int 1952 posix_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1953 { 1954 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1955 struct spdk_posix_sock *sock = __posix_sock(_sock); 1956 int rc; 1957 1958 if (sock->pipe_has_data || sock->socket_has_data) { 1959 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1960 sock->pipe_has_data = false; 1961 sock->socket_has_data = false; 1962 } else if (sock->recv_pipe != NULL) { 1963 rc = spdk_pipe_group_remove(group->pipe_group, sock->recv_pipe); 1964 assert(rc == 0); 1965 } 1966 1967 if (sock->placement_id != -1) { 1968 spdk_sock_map_release(&g_map, sock->placement_id); 1969 } 1970 1971 #if defined(SPDK_EPOLL) 1972 struct epoll_event event; 1973 1974 /* Event parameter is ignored but some old kernel version still require it. */ 1975 rc = epoll_ctl(group->fd, EPOLL_CTL_DEL, sock->fd, &event); 1976 #elif defined(SPDK_KEVENT) 1977 struct kevent event; 1978 struct timespec ts = {0}; 1979 1980 EV_SET(&event, sock->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); 1981 1982 rc = kevent(group->fd, &event, 1, NULL, 0, &ts); 1983 if (rc == 0 && event.flags & EV_ERROR) { 1984 rc = -1; 1985 errno = event.data; 1986 } 1987 #endif 1988 1989 spdk_sock_abort_requests(_sock); 1990 1991 return rc; 1992 } 1993 1994 static int 1995 posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, 1996 struct spdk_sock **socks) 1997 { 1998 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1999 struct spdk_sock *sock, *tmp; 2000 int num_events, i, rc; 2001 struct spdk_posix_sock *psock, *ptmp; 2002 #if defined(SPDK_EPOLL) 2003 struct epoll_event events[MAX_EVENTS_PER_POLL]; 2004 #elif defined(SPDK_KEVENT) 2005 struct kevent events[MAX_EVENTS_PER_POLL]; 2006 struct timespec ts = {0}; 2007 #endif 2008 2009 #ifdef SPDK_ZEROCOPY 2010 /* When all of the following conditions are met 2011 * - non-blocking socket 2012 * - zero copy is enabled 2013 * - interrupts suppressed (i.e. busy polling) 2014 * - the NIC tx queue is full at the time sendmsg() is called 2015 * - epoll_wait determines there is an EPOLLIN event for the socket 2016 * then we can get into a situation where data we've sent is queued 2017 * up in the kernel network stack, but interrupts have been suppressed 2018 * because other traffic is flowing so the kernel misses the signal 2019 * to flush the software tx queue. If there wasn't incoming data 2020 * pending on the socket, then epoll_wait would have been sufficient 2021 * to kick off the send operation, but since there is a pending event 2022 * epoll_wait does not trigger the necessary operation. 2023 * 2024 * We deal with this by checking for all of the above conditions and 2025 * additionally looking for EPOLLIN events that were not consumed from 2026 * the last poll loop. We take this to mean that the upper layer is 2027 * unable to consume them because it is blocked waiting for resources 2028 * to free up, and those resources are most likely freed in response 2029 * to a pending asynchronous write completing. 2030 * 2031 * Additionally, sockets that have the same placement_id actually share 2032 * an underlying hardware queue. That means polling one of them is 2033 * equivalent to polling all of them. As a quick mechanism to avoid 2034 * making extra poll() calls, stash the last placement_id during the loop 2035 * and only poll if it's not the same. The overwhelmingly common case 2036 * is that all sockets in this list have the same placement_id because 2037 * SPDK is intentionally grouping sockets by that value, so even 2038 * though this won't stop all extra calls to poll(), it's very fast 2039 * and will catch all of them in practice. 2040 */ 2041 int last_placement_id = -1; 2042 2043 TAILQ_FOREACH(psock, &group->socks_with_data, link) { 2044 if (psock->zcopy && psock->placement_id >= 0 && 2045 psock->placement_id != last_placement_id) { 2046 struct pollfd pfd = {psock->fd, POLLIN | POLLERR, 0}; 2047 2048 poll(&pfd, 1, 0); 2049 last_placement_id = psock->placement_id; 2050 } 2051 } 2052 #endif 2053 2054 /* This must be a TAILQ_FOREACH_SAFE because while flushing, 2055 * a completion callback could remove the sock from the 2056 * group. */ 2057 TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { 2058 rc = _sock_flush(sock); 2059 if (rc < 0 && errno != EAGAIN) { 2060 spdk_sock_abort_requests(sock); 2061 } 2062 } 2063 2064 assert(max_events > 0); 2065 2066 #if defined(SPDK_EPOLL) 2067 num_events = epoll_wait(group->fd, events, max_events, 0); 2068 #elif defined(SPDK_KEVENT) 2069 num_events = kevent(group->fd, NULL, 0, events, max_events, &ts); 2070 #endif 2071 2072 if (num_events == -1) { 2073 return -1; 2074 } else if (num_events == 0 && !TAILQ_EMPTY(&_group->socks)) { 2075 sock = TAILQ_FIRST(&_group->socks); 2076 psock = __posix_sock(sock); 2077 /* poll() is called here to busy poll the queue associated with 2078 * first socket in list and potentially reap incoming data. 2079 */ 2080 if (sock->opts.priority) { 2081 struct pollfd pfd = {0, 0, 0}; 2082 2083 pfd.fd = psock->fd; 2084 pfd.events = POLLIN | POLLERR; 2085 poll(&pfd, 1, 0); 2086 } 2087 } 2088 2089 for (i = 0; i < num_events; i++) { 2090 #if defined(SPDK_EPOLL) 2091 sock = events[i].data.ptr; 2092 psock = __posix_sock(sock); 2093 2094 #ifdef SPDK_ZEROCOPY 2095 if (events[i].events & EPOLLERR) { 2096 rc = _sock_check_zcopy(sock); 2097 /* If the socket was closed or removed from 2098 * the group in response to a send ack, don't 2099 * add it to the array here. */ 2100 if (rc || sock->cb_fn == NULL) { 2101 continue; 2102 } 2103 } 2104 #endif 2105 if ((events[i].events & EPOLLIN) == 0) { 2106 continue; 2107 } 2108 2109 #elif defined(SPDK_KEVENT) 2110 sock = events[i].udata; 2111 psock = __posix_sock(sock); 2112 #endif 2113 2114 /* If the socket is not already in the list, add it now */ 2115 if (!psock->socket_has_data && !psock->pipe_has_data) { 2116 TAILQ_INSERT_TAIL(&group->socks_with_data, psock, link); 2117 } 2118 psock->socket_has_data = true; 2119 } 2120 2121 num_events = 0; 2122 2123 TAILQ_FOREACH_SAFE(psock, &group->socks_with_data, link, ptmp) { 2124 if (num_events == max_events) { 2125 break; 2126 } 2127 2128 /* If the socket's cb_fn is NULL, just remove it from the 2129 * list and do not add it to socks array */ 2130 if (spdk_unlikely(psock->base.cb_fn == NULL)) { 2131 psock->socket_has_data = false; 2132 psock->pipe_has_data = false; 2133 TAILQ_REMOVE(&group->socks_with_data, psock, link); 2134 continue; 2135 } 2136 2137 socks[num_events++] = &psock->base; 2138 } 2139 2140 /* Cycle the has_data list so that each time we poll things aren't 2141 * in the same order. Say we have 6 sockets in the list, named as follows: 2142 * A B C D E F 2143 * And all 6 sockets had epoll events, but max_events is only 3. That means 2144 * psock currently points at D. We want to rearrange the list to the following: 2145 * D E F A B C 2146 * 2147 * The variables below are named according to this example to make it easier to 2148 * follow the swaps. 2149 */ 2150 if (psock != NULL) { 2151 struct spdk_posix_sock *pa, *pc, *pd, *pf; 2152 2153 /* Capture pointers to the elements we need */ 2154 pd = psock; 2155 pc = TAILQ_PREV(pd, spdk_has_data_list, link); 2156 pa = TAILQ_FIRST(&group->socks_with_data); 2157 pf = TAILQ_LAST(&group->socks_with_data, spdk_has_data_list); 2158 2159 /* Break the link between C and D */ 2160 pc->link.tqe_next = NULL; 2161 2162 /* Connect F to A */ 2163 pf->link.tqe_next = pa; 2164 pa->link.tqe_prev = &pf->link.tqe_next; 2165 2166 /* Fix up the list first/last pointers */ 2167 group->socks_with_data.tqh_first = pd; 2168 group->socks_with_data.tqh_last = &pc->link.tqe_next; 2169 2170 /* D is in front of the list, make tqe prev pointer point to the head of list */ 2171 pd->link.tqe_prev = &group->socks_with_data.tqh_first; 2172 } 2173 2174 return num_events; 2175 } 2176 2177 static int 2178 posix_sock_group_impl_register_interrupt(struct spdk_sock_group_impl *_group, uint32_t events, 2179 spdk_interrupt_fn fn, void *arg, const char *name) 2180 { 2181 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 2182 2183 group->intr = spdk_interrupt_register_for_events(group->fd, events, fn, arg, name); 2184 2185 return group->intr ? 0 : -1; 2186 } 2187 2188 static void 2189 posix_sock_group_impl_unregister_interrupt(struct spdk_sock_group_impl *_group) 2190 { 2191 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 2192 2193 spdk_interrupt_unregister(&group->intr); 2194 } 2195 2196 static int 2197 _sock_group_impl_close(struct spdk_sock_group_impl *_group, uint32_t enable_placement_id) 2198 { 2199 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 2200 int rc; 2201 2202 if (enable_placement_id == PLACEMENT_CPU) { 2203 spdk_sock_map_release(&g_map, spdk_env_get_current_core()); 2204 } 2205 2206 spdk_pipe_group_destroy(group->pipe_group); 2207 rc = close(group->fd); 2208 free(group); 2209 return rc; 2210 } 2211 2212 static int 2213 posix_sock_group_impl_close(struct spdk_sock_group_impl *_group) 2214 { 2215 return _sock_group_impl_close(_group, g_posix_impl_opts.enable_placement_id); 2216 } 2217 2218 static int 2219 ssl_sock_group_impl_close(struct spdk_sock_group_impl *_group) 2220 { 2221 return _sock_group_impl_close(_group, g_ssl_impl_opts.enable_placement_id); 2222 } 2223 2224 static struct spdk_net_impl g_posix_net_impl = { 2225 .name = "posix", 2226 .getaddr = posix_sock_getaddr, 2227 .connect = posix_sock_connect, 2228 .listen = posix_sock_listen, 2229 .accept = posix_sock_accept, 2230 .close = posix_sock_close, 2231 .recv = posix_sock_recv, 2232 .readv = posix_sock_readv, 2233 .writev = posix_sock_writev, 2234 .recv_next = posix_sock_recv_next, 2235 .writev_async = posix_sock_writev_async, 2236 .flush = posix_sock_flush, 2237 .set_recvlowat = posix_sock_set_recvlowat, 2238 .set_recvbuf = posix_sock_set_recvbuf, 2239 .set_sendbuf = posix_sock_set_sendbuf, 2240 .is_ipv6 = posix_sock_is_ipv6, 2241 .is_ipv4 = posix_sock_is_ipv4, 2242 .is_connected = posix_sock_is_connected, 2243 .group_impl_get_optimal = posix_sock_group_impl_get_optimal, 2244 .group_impl_create = posix_sock_group_impl_create, 2245 .group_impl_add_sock = posix_sock_group_impl_add_sock, 2246 .group_impl_remove_sock = posix_sock_group_impl_remove_sock, 2247 .group_impl_poll = posix_sock_group_impl_poll, 2248 .group_impl_register_interrupt = posix_sock_group_impl_register_interrupt, 2249 .group_impl_unregister_interrupt = posix_sock_group_impl_unregister_interrupt, 2250 .group_impl_close = posix_sock_group_impl_close, 2251 .get_opts = posix_sock_impl_get_opts, 2252 .set_opts = posix_sock_impl_set_opts, 2253 }; 2254 2255 SPDK_NET_IMPL_REGISTER_DEFAULT(posix, &g_posix_net_impl); 2256 2257 static struct spdk_sock * 2258 ssl_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) 2259 { 2260 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, true); 2261 } 2262 2263 static struct spdk_sock * 2264 ssl_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) 2265 { 2266 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, true); 2267 } 2268 2269 static struct spdk_sock * 2270 ssl_sock_accept(struct spdk_sock *_sock) 2271 { 2272 return _posix_sock_accept(_sock, true); 2273 } 2274 2275 static struct spdk_net_impl g_ssl_net_impl = { 2276 .name = "ssl", 2277 .getaddr = posix_sock_getaddr, 2278 .connect = ssl_sock_connect, 2279 .listen = ssl_sock_listen, 2280 .accept = ssl_sock_accept, 2281 .close = posix_sock_close, 2282 .recv = posix_sock_recv, 2283 .readv = posix_sock_readv, 2284 .writev = posix_sock_writev, 2285 .recv_next = posix_sock_recv_next, 2286 .writev_async = posix_sock_writev_async, 2287 .flush = posix_sock_flush, 2288 .set_recvlowat = posix_sock_set_recvlowat, 2289 .set_recvbuf = posix_sock_set_recvbuf, 2290 .set_sendbuf = posix_sock_set_sendbuf, 2291 .is_ipv6 = posix_sock_is_ipv6, 2292 .is_ipv4 = posix_sock_is_ipv4, 2293 .is_connected = posix_sock_is_connected, 2294 .group_impl_get_optimal = posix_sock_group_impl_get_optimal, 2295 .group_impl_create = ssl_sock_group_impl_create, 2296 .group_impl_add_sock = posix_sock_group_impl_add_sock, 2297 .group_impl_remove_sock = posix_sock_group_impl_remove_sock, 2298 .group_impl_poll = posix_sock_group_impl_poll, 2299 .group_impl_register_interrupt = posix_sock_group_impl_register_interrupt, 2300 .group_impl_unregister_interrupt = posix_sock_group_impl_unregister_interrupt, 2301 .group_impl_close = ssl_sock_group_impl_close, 2302 .get_opts = ssl_sock_impl_get_opts, 2303 .set_opts = ssl_sock_impl_set_opts, 2304 }; 2305 2306 SPDK_NET_IMPL_REGISTER(ssl, &g_ssl_net_impl); 2307 SPDK_LOG_REGISTER_COMPONENT(sock_posix) 2308