1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2018 Intel Corporation. All rights reserved. 3 * Copyright (c) 2020, 2021 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #if defined(__FreeBSD__) 10 #include <sys/event.h> 11 #define SPDK_KEVENT 12 #else 13 #include <sys/epoll.h> 14 #define SPDK_EPOLL 15 #endif 16 17 #if defined(__linux__) 18 #include <linux/errqueue.h> 19 #endif 20 21 #include "spdk/env.h" 22 #include "spdk/log.h" 23 #include "spdk/pipe.h" 24 #include "spdk/sock.h" 25 #include "spdk/util.h" 26 #include "spdk/string.h" 27 #include "spdk/net.h" 28 #include "spdk/file.h" 29 #include "spdk_internal/sock.h" 30 #include "spdk/net.h" 31 32 #include "openssl/crypto.h" 33 #include "openssl/err.h" 34 #include "openssl/ssl.h" 35 36 #define MAX_TMPBUF 1024 37 #define PORTNUMLEN 32 38 39 #if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY) 40 #define SPDK_ZEROCOPY 41 #endif 42 43 struct spdk_posix_sock { 44 struct spdk_sock base; 45 int fd; 46 47 uint32_t sendmsg_idx; 48 49 struct spdk_pipe *recv_pipe; 50 int recv_buf_sz; 51 bool pipe_has_data; 52 bool socket_has_data; 53 bool zcopy; 54 55 int placement_id; 56 57 SSL_CTX *ctx; 58 SSL *ssl; 59 60 TAILQ_ENTRY(spdk_posix_sock) link; 61 62 char interface_name[IFNAMSIZ]; 63 }; 64 65 TAILQ_HEAD(spdk_has_data_list, spdk_posix_sock); 66 67 struct spdk_posix_sock_group_impl { 68 struct spdk_sock_group_impl base; 69 int fd; 70 struct spdk_interrupt *intr; 71 struct spdk_has_data_list socks_with_data; 72 int placement_id; 73 struct spdk_pipe_group *pipe_group; 74 }; 75 76 static struct spdk_sock_impl_opts g_posix_impl_opts = { 77 .recv_buf_size = DEFAULT_SO_RCVBUF_SIZE, 78 .send_buf_size = DEFAULT_SO_SNDBUF_SIZE, 79 .enable_recv_pipe = true, 80 .enable_quickack = false, 81 .enable_placement_id = PLACEMENT_NONE, 82 .enable_zerocopy_send_server = true, 83 .enable_zerocopy_send_client = false, 84 .zerocopy_threshold = 0, 85 .tls_version = 0, 86 .enable_ktls = false, 87 .psk_key = NULL, 88 .psk_key_size = 0, 89 .psk_identity = NULL, 90 .get_key = NULL, 91 .get_key_ctx = NULL, 92 .tls_cipher_suites = NULL 93 }; 94 95 static struct spdk_sock_impl_opts g_ssl_impl_opts = { 96 .recv_buf_size = MIN_SO_RCVBUF_SIZE, 97 .send_buf_size = MIN_SO_SNDBUF_SIZE, 98 .enable_recv_pipe = true, 99 .enable_quickack = false, 100 .enable_placement_id = PLACEMENT_NONE, 101 .enable_zerocopy_send_server = true, 102 .enable_zerocopy_send_client = false, 103 .zerocopy_threshold = 0, 104 .tls_version = 0, 105 .enable_ktls = false, 106 .psk_key = NULL, 107 .psk_identity = NULL 108 }; 109 110 static struct spdk_sock_map g_map = { 111 .entries = STAILQ_HEAD_INITIALIZER(g_map.entries), 112 .mtx = PTHREAD_MUTEX_INITIALIZER 113 }; 114 115 __attribute((destructor)) static void 116 posix_sock_map_cleanup(void) 117 { 118 spdk_sock_map_cleanup(&g_map); 119 } 120 121 #define __posix_sock(sock) (struct spdk_posix_sock *)sock 122 #define __posix_group_impl(group) (struct spdk_posix_sock_group_impl *)group 123 124 static void 125 posix_sock_copy_impl_opts(struct spdk_sock_impl_opts *dest, const struct spdk_sock_impl_opts *src, 126 size_t len) 127 { 128 #define FIELD_OK(field) \ 129 offsetof(struct spdk_sock_impl_opts, field) + sizeof(src->field) <= len 130 131 #define SET_FIELD(field) \ 132 if (FIELD_OK(field)) { \ 133 dest->field = src->field; \ 134 } 135 136 SET_FIELD(recv_buf_size); 137 SET_FIELD(send_buf_size); 138 SET_FIELD(enable_recv_pipe); 139 SET_FIELD(enable_zerocopy_send); 140 SET_FIELD(enable_quickack); 141 SET_FIELD(enable_placement_id); 142 SET_FIELD(enable_zerocopy_send_server); 143 SET_FIELD(enable_zerocopy_send_client); 144 SET_FIELD(zerocopy_threshold); 145 SET_FIELD(tls_version); 146 SET_FIELD(enable_ktls); 147 SET_FIELD(psk_key); 148 SET_FIELD(psk_key_size); 149 SET_FIELD(psk_identity); 150 SET_FIELD(get_key); 151 SET_FIELD(get_key_ctx); 152 SET_FIELD(tls_cipher_suites); 153 154 #undef SET_FIELD 155 #undef FIELD_OK 156 } 157 158 static int 159 _sock_impl_get_opts(struct spdk_sock_impl_opts *opts, struct spdk_sock_impl_opts *impl_opts, 160 size_t *len) 161 { 162 if (!opts || !len) { 163 errno = EINVAL; 164 return -1; 165 } 166 167 assert(sizeof(*opts) >= *len); 168 memset(opts, 0, *len); 169 170 posix_sock_copy_impl_opts(opts, impl_opts, *len); 171 *len = spdk_min(*len, sizeof(*impl_opts)); 172 173 return 0; 174 } 175 176 static int 177 posix_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) 178 { 179 return _sock_impl_get_opts(opts, &g_posix_impl_opts, len); 180 } 181 182 static int 183 ssl_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) 184 { 185 return _sock_impl_get_opts(opts, &g_ssl_impl_opts, len); 186 } 187 188 static int 189 _sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, struct spdk_sock_impl_opts *impl_opts, 190 size_t len) 191 { 192 if (!opts) { 193 errno = EINVAL; 194 return -1; 195 } 196 197 assert(sizeof(*opts) >= len); 198 posix_sock_copy_impl_opts(impl_opts, opts, len); 199 200 return 0; 201 } 202 203 static int 204 posix_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) 205 { 206 return _sock_impl_set_opts(opts, &g_posix_impl_opts, len); 207 } 208 209 static int 210 ssl_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) 211 { 212 return _sock_impl_set_opts(opts, &g_ssl_impl_opts, len); 213 } 214 215 static void 216 _opts_get_impl_opts(const struct spdk_sock_opts *opts, struct spdk_sock_impl_opts *dest, 217 const struct spdk_sock_impl_opts *default_impl) 218 { 219 /* Copy the default impl_opts first to cover cases when user's impl_opts is smaller */ 220 memcpy(dest, default_impl, sizeof(*dest)); 221 222 if (opts->impl_opts != NULL) { 223 assert(sizeof(*dest) >= opts->impl_opts_size); 224 posix_sock_copy_impl_opts(dest, opts->impl_opts, opts->impl_opts_size); 225 } 226 } 227 228 static int 229 posix_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, 230 char *caddr, int clen, uint16_t *cport) 231 { 232 struct spdk_posix_sock *sock = __posix_sock(_sock); 233 234 assert(sock != NULL); 235 return spdk_net_getaddr(sock->fd, saddr, slen, sport, caddr, clen, cport); 236 } 237 238 static const char * 239 posix_sock_get_interface_name(struct spdk_sock *_sock) 240 { 241 struct spdk_posix_sock *sock = __posix_sock(_sock); 242 char saddr[64]; 243 int rc; 244 245 rc = spdk_net_getaddr(sock->fd, saddr, sizeof(saddr), NULL, NULL, 0, NULL); 246 if (rc != 0) { 247 return NULL; 248 } 249 250 rc = spdk_net_get_interface_name(saddr, sock->interface_name, 251 sizeof(sock->interface_name)); 252 if (rc != 0) { 253 return NULL; 254 } 255 256 return sock->interface_name; 257 } 258 259 static uint32_t 260 posix_sock_get_numa_socket_id(struct spdk_sock *sock) 261 { 262 const char *interface_name; 263 uint32_t numa_socket_id; 264 int rc; 265 266 interface_name = posix_sock_get_interface_name(sock); 267 if (interface_name == NULL) { 268 return SPDK_ENV_SOCKET_ID_ANY; 269 } 270 271 rc = spdk_read_sysfs_attribute_uint32(&numa_socket_id, 272 "/sys/class/net/%s/device/numa_node", interface_name); 273 if (rc == 0) { 274 return numa_socket_id; 275 } else { 276 return SPDK_ENV_SOCKET_ID_ANY; 277 } 278 } 279 280 enum posix_sock_create_type { 281 SPDK_SOCK_CREATE_LISTEN, 282 SPDK_SOCK_CREATE_CONNECT, 283 }; 284 285 static int 286 posix_sock_alloc_pipe(struct spdk_posix_sock *sock, int sz) 287 { 288 uint8_t *new_buf, *old_buf; 289 struct spdk_pipe *new_pipe; 290 struct iovec siov[2]; 291 struct iovec diov[2]; 292 int sbytes; 293 ssize_t bytes; 294 int rc; 295 296 if (sock->recv_buf_sz == sz) { 297 return 0; 298 } 299 300 /* If the new size is 0, just free the pipe */ 301 if (sz == 0) { 302 old_buf = spdk_pipe_destroy(sock->recv_pipe); 303 free(old_buf); 304 sock->recv_pipe = NULL; 305 return 0; 306 } else if (sz < MIN_SOCK_PIPE_SIZE) { 307 SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE); 308 return -1; 309 } 310 311 /* Round up to next 64 byte multiple */ 312 rc = posix_memalign((void **)&new_buf, 64, sz); 313 if (rc != 0) { 314 SPDK_ERRLOG("socket recv buf allocation failed\n"); 315 return -ENOMEM; 316 } 317 memset(new_buf, 0, sz); 318 319 new_pipe = spdk_pipe_create(new_buf, sz); 320 if (new_pipe == NULL) { 321 SPDK_ERRLOG("socket pipe allocation failed\n"); 322 free(new_buf); 323 return -ENOMEM; 324 } 325 326 if (sock->recv_pipe != NULL) { 327 /* Pull all of the data out of the old pipe */ 328 sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); 329 if (sbytes > sz) { 330 /* Too much data to fit into the new pipe size */ 331 old_buf = spdk_pipe_destroy(new_pipe); 332 free(old_buf); 333 return -EINVAL; 334 } 335 336 sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov); 337 assert(sbytes == sz); 338 339 bytes = spdk_iovcpy(siov, 2, diov, 2); 340 spdk_pipe_writer_advance(new_pipe, bytes); 341 342 old_buf = spdk_pipe_destroy(sock->recv_pipe); 343 free(old_buf); 344 } 345 346 sock->recv_buf_sz = sz; 347 sock->recv_pipe = new_pipe; 348 349 if (sock->base.group_impl) { 350 struct spdk_posix_sock_group_impl *group; 351 352 group = __posix_group_impl(sock->base.group_impl); 353 spdk_pipe_group_add(group->pipe_group, sock->recv_pipe); 354 } 355 356 return 0; 357 } 358 359 static int 360 posix_sock_set_recvbuf(struct spdk_sock *_sock, int sz) 361 { 362 struct spdk_posix_sock *sock = __posix_sock(_sock); 363 int min_size; 364 int rc; 365 366 assert(sock != NULL); 367 368 if (_sock->impl_opts.enable_recv_pipe) { 369 rc = posix_sock_alloc_pipe(sock, sz); 370 if (rc) { 371 return rc; 372 } 373 } 374 375 /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE and 376 * _sock->impl_opts.recv_buf_size. */ 377 min_size = spdk_max(MIN_SO_RCVBUF_SIZE, _sock->impl_opts.recv_buf_size); 378 379 if (sz < min_size) { 380 sz = min_size; 381 } 382 383 rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); 384 if (rc < 0) { 385 return rc; 386 } 387 388 _sock->impl_opts.recv_buf_size = sz; 389 390 return 0; 391 } 392 393 static int 394 posix_sock_set_sendbuf(struct spdk_sock *_sock, int sz) 395 { 396 struct spdk_posix_sock *sock = __posix_sock(_sock); 397 int min_size; 398 int rc; 399 400 assert(sock != NULL); 401 402 /* Set kernel buffer size to be at least MIN_SO_SNDBUF_SIZE and 403 * _sock->impl_opts.send_buf_size. */ 404 min_size = spdk_max(MIN_SO_SNDBUF_SIZE, _sock->impl_opts.send_buf_size); 405 406 if (sz < min_size) { 407 sz = min_size; 408 } 409 410 rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); 411 if (rc < 0) { 412 return rc; 413 } 414 415 _sock->impl_opts.send_buf_size = sz; 416 417 return 0; 418 } 419 420 static void 421 posix_sock_init(struct spdk_posix_sock *sock, bool enable_zero_copy) 422 { 423 #if defined(SPDK_ZEROCOPY) || defined(__linux__) 424 int flag; 425 int rc; 426 #endif 427 428 #if defined(SPDK_ZEROCOPY) 429 flag = 1; 430 431 if (enable_zero_copy) { 432 /* Try to turn on zero copy sends */ 433 rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag)); 434 if (rc == 0) { 435 sock->zcopy = true; 436 } 437 } 438 #endif 439 440 #if defined(__linux__) 441 flag = 1; 442 443 if (sock->base.impl_opts.enable_quickack) { 444 rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag)); 445 if (rc != 0) { 446 SPDK_ERRLOG("quickack was failed to set\n"); 447 } 448 } 449 450 spdk_sock_get_placement_id(sock->fd, sock->base.impl_opts.enable_placement_id, 451 &sock->placement_id); 452 453 if (sock->base.impl_opts.enable_placement_id == PLACEMENT_MARK) { 454 /* Save placement_id */ 455 spdk_sock_map_insert(&g_map, sock->placement_id, NULL); 456 } 457 #endif 458 } 459 460 static struct spdk_posix_sock * 461 posix_sock_alloc(int fd, struct spdk_sock_impl_opts *impl_opts, bool enable_zero_copy) 462 { 463 struct spdk_posix_sock *sock; 464 465 sock = calloc(1, sizeof(*sock)); 466 if (sock == NULL) { 467 SPDK_ERRLOG("sock allocation failed\n"); 468 return NULL; 469 } 470 471 sock->fd = fd; 472 memcpy(&sock->base.impl_opts, impl_opts, sizeof(*impl_opts)); 473 posix_sock_init(sock, enable_zero_copy); 474 475 return sock; 476 } 477 478 static int 479 posix_fd_create(struct addrinfo *res, struct spdk_sock_opts *opts, 480 struct spdk_sock_impl_opts *impl_opts) 481 { 482 int fd; 483 int val = 1; 484 int rc, sz; 485 #if defined(__linux__) 486 int to; 487 #endif 488 489 fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); 490 if (fd < 0) { 491 /* error */ 492 return -1; 493 } 494 495 sz = impl_opts->recv_buf_size; 496 rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); 497 if (rc) { 498 /* Not fatal */ 499 } 500 501 sz = impl_opts->send_buf_size; 502 rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); 503 if (rc) { 504 /* Not fatal */ 505 } 506 507 rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); 508 if (rc != 0) { 509 close(fd); 510 /* error */ 511 return -1; 512 } 513 rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val); 514 if (rc != 0) { 515 close(fd); 516 /* error */ 517 return -1; 518 } 519 520 #if defined(SO_PRIORITY) 521 if (opts->priority) { 522 rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val); 523 if (rc != 0) { 524 close(fd); 525 /* error */ 526 return -1; 527 } 528 } 529 #endif 530 531 if (res->ai_family == AF_INET6) { 532 rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val); 533 if (rc != 0) { 534 close(fd); 535 /* error */ 536 return -1; 537 } 538 } 539 540 if (opts->ack_timeout) { 541 #if defined(__linux__) 542 to = opts->ack_timeout; 543 rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &to, sizeof(to)); 544 if (rc != 0) { 545 close(fd); 546 /* error */ 547 return -1; 548 } 549 #else 550 SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n"); 551 #endif 552 } 553 554 return fd; 555 } 556 557 static int 558 posix_sock_psk_find_session_server_cb(SSL *ssl, const unsigned char *identity, 559 size_t identity_len, SSL_SESSION **sess) 560 { 561 struct spdk_sock_impl_opts *impl_opts = SSL_get_app_data(ssl); 562 uint8_t key[SSL_MAX_MASTER_KEY_LENGTH] = {}; 563 int keylen; 564 int rc, i; 565 STACK_OF(SSL_CIPHER) *ciphers; 566 const SSL_CIPHER *cipher; 567 const char *cipher_name; 568 const char *user_cipher = NULL; 569 bool found = false; 570 571 if (impl_opts->get_key) { 572 rc = impl_opts->get_key(key, sizeof(key), &user_cipher, identity, impl_opts->get_key_ctx); 573 if (rc < 0) { 574 SPDK_ERRLOG("Unable to find PSK for identity: %s\n", identity); 575 return 0; 576 } 577 keylen = rc; 578 } else { 579 if (impl_opts->psk_key == NULL) { 580 SPDK_ERRLOG("PSK is not set\n"); 581 return 0; 582 } 583 584 SPDK_DEBUGLOG(sock_posix, "Length of Client's PSK ID %lu\n", strlen(impl_opts->psk_identity)); 585 if (strcmp(impl_opts->psk_identity, identity) != 0) { 586 SPDK_ERRLOG("Unknown Client's PSK ID\n"); 587 return 0; 588 } 589 keylen = impl_opts->psk_key_size; 590 591 memcpy(key, impl_opts->psk_key, keylen); 592 user_cipher = impl_opts->tls_cipher_suites; 593 } 594 595 if (user_cipher == NULL) { 596 SPDK_ERRLOG("Cipher suite not set\n"); 597 return 0; 598 } 599 600 *sess = SSL_SESSION_new(); 601 if (*sess == NULL) { 602 SPDK_ERRLOG("Unable to allocate new SSL session\n"); 603 return 0; 604 } 605 606 ciphers = SSL_get_ciphers(ssl); 607 for (i = 0; i < sk_SSL_CIPHER_num(ciphers); i++) { 608 cipher = sk_SSL_CIPHER_value(ciphers, i); 609 cipher_name = SSL_CIPHER_get_name(cipher); 610 611 if (strcmp(user_cipher, cipher_name) == 0) { 612 rc = SSL_SESSION_set_cipher(*sess, cipher); 613 if (rc != 1) { 614 SPDK_ERRLOG("Unable to set cipher: %s\n", cipher_name); 615 goto err; 616 } 617 found = true; 618 break; 619 } 620 } 621 if (found == false) { 622 SPDK_ERRLOG("No suitable cipher found\n"); 623 goto err; 624 } 625 626 SPDK_DEBUGLOG(sock_posix, "Cipher selected: %s\n", cipher_name); 627 628 rc = SSL_SESSION_set_protocol_version(*sess, TLS1_3_VERSION); 629 if (rc != 1) { 630 SPDK_ERRLOG("Unable to set TLS version: %d\n", TLS1_3_VERSION); 631 goto err; 632 } 633 634 rc = SSL_SESSION_set1_master_key(*sess, key, keylen); 635 if (rc != 1) { 636 SPDK_ERRLOG("Unable to set PSK for session\n"); 637 goto err; 638 } 639 640 return 1; 641 642 err: 643 SSL_SESSION_free(*sess); 644 *sess = NULL; 645 return 0; 646 } 647 648 static int 649 posix_sock_psk_use_session_client_cb(SSL *ssl, const EVP_MD *md, const unsigned char **identity, 650 size_t *identity_len, SSL_SESSION **sess) 651 { 652 struct spdk_sock_impl_opts *impl_opts = SSL_get_app_data(ssl); 653 int rc, i; 654 STACK_OF(SSL_CIPHER) *ciphers; 655 const SSL_CIPHER *cipher; 656 const char *cipher_name; 657 long keylen; 658 bool found = false; 659 660 if (impl_opts->psk_key == NULL) { 661 SPDK_ERRLOG("PSK is not set\n"); 662 return 0; 663 } 664 if (impl_opts->psk_key_size > SSL_MAX_MASTER_KEY_LENGTH) { 665 SPDK_ERRLOG("PSK too long\n"); 666 return 0; 667 } 668 keylen = impl_opts->psk_key_size; 669 670 if (impl_opts->tls_cipher_suites == NULL) { 671 SPDK_ERRLOG("Cipher suite not set\n"); 672 return 0; 673 } 674 *sess = SSL_SESSION_new(); 675 if (*sess == NULL) { 676 SPDK_ERRLOG("Unable to allocate new SSL session\n"); 677 return 0; 678 } 679 680 ciphers = SSL_get_ciphers(ssl); 681 for (i = 0; i < sk_SSL_CIPHER_num(ciphers); i++) { 682 cipher = sk_SSL_CIPHER_value(ciphers, i); 683 cipher_name = SSL_CIPHER_get_name(cipher); 684 685 if (strcmp(impl_opts->tls_cipher_suites, cipher_name) == 0) { 686 rc = SSL_SESSION_set_cipher(*sess, cipher); 687 if (rc != 1) { 688 SPDK_ERRLOG("Unable to set cipher: %s\n", cipher_name); 689 goto err; 690 } 691 found = true; 692 break; 693 } 694 } 695 if (found == false) { 696 SPDK_ERRLOG("No suitable cipher found\n"); 697 goto err; 698 } 699 700 SPDK_DEBUGLOG(sock_posix, "Cipher selected: %s\n", cipher_name); 701 702 rc = SSL_SESSION_set_protocol_version(*sess, TLS1_3_VERSION); 703 if (rc != 1) { 704 SPDK_ERRLOG("Unable to set TLS version: %d\n", TLS1_3_VERSION); 705 goto err; 706 } 707 708 rc = SSL_SESSION_set1_master_key(*sess, impl_opts->psk_key, keylen); 709 if (rc != 1) { 710 SPDK_ERRLOG("Unable to set PSK for session\n"); 711 goto err; 712 } 713 714 *identity_len = strlen(impl_opts->psk_identity); 715 *identity = impl_opts->psk_identity; 716 717 return 1; 718 719 err: 720 SSL_SESSION_free(*sess); 721 *sess = NULL; 722 return 0; 723 } 724 725 static SSL_CTX * 726 posix_sock_create_ssl_context(const SSL_METHOD *method, struct spdk_sock_opts *opts, 727 struct spdk_sock_impl_opts *impl_opts) 728 { 729 SSL_CTX *ctx; 730 int tls_version = 0; 731 bool ktls_enabled = false; 732 #ifdef SSL_OP_ENABLE_KTLS 733 long options; 734 #endif 735 736 SSL_library_init(); 737 OpenSSL_add_all_algorithms(); 738 SSL_load_error_strings(); 739 /* Produce a SSL CTX in SSL V2 and V3 standards compliant way */ 740 ctx = SSL_CTX_new(method); 741 if (!ctx) { 742 SPDK_ERRLOG("SSL_CTX_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 743 return NULL; 744 } 745 SPDK_DEBUGLOG(sock_posix, "SSL context created\n"); 746 747 switch (impl_opts->tls_version) { 748 case 0: 749 /* auto-negotiation */ 750 break; 751 case SPDK_TLS_VERSION_1_3: 752 tls_version = TLS1_3_VERSION; 753 break; 754 default: 755 SPDK_ERRLOG("Incorrect TLS version provided: %d\n", impl_opts->tls_version); 756 goto err; 757 } 758 759 if (tls_version) { 760 SPDK_DEBUGLOG(sock_posix, "Hardening TLS version to '%d'='0x%X'\n", impl_opts->tls_version, 761 tls_version); 762 if (!SSL_CTX_set_min_proto_version(ctx, tls_version)) { 763 SPDK_ERRLOG("Unable to set Min TLS version to '%d'='0x%X\n", impl_opts->tls_version, tls_version); 764 goto err; 765 } 766 if (!SSL_CTX_set_max_proto_version(ctx, tls_version)) { 767 SPDK_ERRLOG("Unable to set Max TLS version to '%d'='0x%X\n", impl_opts->tls_version, tls_version); 768 goto err; 769 } 770 } 771 if (impl_opts->enable_ktls) { 772 SPDK_DEBUGLOG(sock_posix, "Enabling kTLS offload\n"); 773 #ifdef SSL_OP_ENABLE_KTLS 774 options = SSL_CTX_set_options(ctx, SSL_OP_ENABLE_KTLS); 775 ktls_enabled = options & SSL_OP_ENABLE_KTLS; 776 #else 777 ktls_enabled = false; 778 #endif 779 if (!ktls_enabled) { 780 SPDK_ERRLOG("Unable to set kTLS offload via SSL_CTX_set_options(). Configure openssl with 'enable-ktls'\n"); 781 goto err; 782 } 783 } 784 785 /* SSL_CTX_set_ciphersuites() return 1 if the requested 786 * cipher suite list was configured, and 0 otherwise. */ 787 if (impl_opts->tls_cipher_suites != NULL && 788 SSL_CTX_set_ciphersuites(ctx, impl_opts->tls_cipher_suites) != 1) { 789 SPDK_ERRLOG("Unable to set TLS cipher suites for SSL'\n"); 790 goto err; 791 } 792 793 return ctx; 794 795 err: 796 SSL_CTX_free(ctx); 797 return NULL; 798 } 799 800 static SSL * 801 ssl_sock_setup_connect(SSL_CTX *ctx, int fd) 802 { 803 SSL *ssl; 804 805 ssl = SSL_new(ctx); 806 if (!ssl) { 807 SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 808 return NULL; 809 } 810 SSL_set_fd(ssl, fd); 811 SSL_set_connect_state(ssl); 812 SSL_set_psk_use_session_callback(ssl, posix_sock_psk_use_session_client_cb); 813 SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); 814 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 815 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 816 SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", 817 SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); 818 return ssl; 819 } 820 821 static SSL * 822 ssl_sock_setup_accept(SSL_CTX *ctx, int fd) 823 { 824 SSL *ssl; 825 826 ssl = SSL_new(ctx); 827 if (!ssl) { 828 SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 829 return NULL; 830 } 831 SSL_set_fd(ssl, fd); 832 SSL_set_accept_state(ssl); 833 SSL_set_psk_find_session_callback(ssl, posix_sock_psk_find_session_server_cb); 834 SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); 835 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 836 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 837 SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", 838 SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); 839 return ssl; 840 } 841 842 static ssize_t 843 SSL_readv(SSL *ssl, const struct iovec *iov, int iovcnt) 844 { 845 int i, rc = 0; 846 ssize_t total = 0; 847 848 for (i = 0; i < iovcnt; i++) { 849 rc = SSL_read(ssl, iov[i].iov_base, iov[i].iov_len); 850 851 if (rc > 0) { 852 total += rc; 853 } 854 if (rc != (int)iov[i].iov_len) { 855 break; 856 } 857 } 858 if (total > 0) { 859 errno = 0; 860 return total; 861 } 862 switch (SSL_get_error(ssl, rc)) { 863 case SSL_ERROR_ZERO_RETURN: 864 errno = ENOTCONN; 865 return 0; 866 case SSL_ERROR_WANT_READ: 867 case SSL_ERROR_WANT_WRITE: 868 case SSL_ERROR_WANT_CONNECT: 869 case SSL_ERROR_WANT_ACCEPT: 870 case SSL_ERROR_WANT_X509_LOOKUP: 871 case SSL_ERROR_WANT_ASYNC: 872 case SSL_ERROR_WANT_ASYNC_JOB: 873 case SSL_ERROR_WANT_CLIENT_HELLO_CB: 874 errno = EAGAIN; 875 return -1; 876 case SSL_ERROR_SYSCALL: 877 case SSL_ERROR_SSL: 878 errno = ENOTCONN; 879 return -1; 880 default: 881 errno = ENOTCONN; 882 return -1; 883 } 884 } 885 886 static ssize_t 887 SSL_writev(SSL *ssl, struct iovec *iov, int iovcnt) 888 { 889 int i, rc = 0; 890 ssize_t total = 0; 891 892 for (i = 0; i < iovcnt; i++) { 893 rc = SSL_write(ssl, iov[i].iov_base, iov[i].iov_len); 894 895 if (rc > 0) { 896 total += rc; 897 } 898 if (rc != (int)iov[i].iov_len) { 899 break; 900 } 901 } 902 if (total > 0) { 903 errno = 0; 904 return total; 905 } 906 switch (SSL_get_error(ssl, rc)) { 907 case SSL_ERROR_ZERO_RETURN: 908 errno = ENOTCONN; 909 return 0; 910 case SSL_ERROR_WANT_READ: 911 case SSL_ERROR_WANT_WRITE: 912 case SSL_ERROR_WANT_CONNECT: 913 case SSL_ERROR_WANT_ACCEPT: 914 case SSL_ERROR_WANT_X509_LOOKUP: 915 case SSL_ERROR_WANT_ASYNC: 916 case SSL_ERROR_WANT_ASYNC_JOB: 917 case SSL_ERROR_WANT_CLIENT_HELLO_CB: 918 errno = EAGAIN; 919 return -1; 920 case SSL_ERROR_SYSCALL: 921 case SSL_ERROR_SSL: 922 errno = ENOTCONN; 923 return -1; 924 default: 925 errno = ENOTCONN; 926 return -1; 927 } 928 } 929 930 static struct spdk_sock * 931 posix_sock_create(const char *ip, int port, 932 enum posix_sock_create_type type, 933 struct spdk_sock_opts *opts, 934 bool enable_ssl) 935 { 936 struct spdk_posix_sock *sock; 937 struct spdk_sock_impl_opts impl_opts; 938 char buf[MAX_TMPBUF]; 939 char portnum[PORTNUMLEN]; 940 char *p; 941 const char *src_addr; 942 uint16_t src_port; 943 struct addrinfo hints, *res, *res0, *src_ai; 944 int fd, flag; 945 int rc; 946 bool enable_zcopy_user_opts = true; 947 bool enable_zcopy_impl_opts = true; 948 SSL_CTX *ctx = 0; 949 SSL *ssl = 0; 950 951 assert(opts != NULL); 952 if (enable_ssl) { 953 _opts_get_impl_opts(opts, &impl_opts, &g_ssl_impl_opts); 954 } else { 955 _opts_get_impl_opts(opts, &impl_opts, &g_posix_impl_opts); 956 } 957 958 if (ip == NULL) { 959 return NULL; 960 } 961 if (ip[0] == '[') { 962 snprintf(buf, sizeof(buf), "%s", ip + 1); 963 p = strchr(buf, ']'); 964 if (p != NULL) { 965 *p = '\0'; 966 } 967 ip = (const char *) &buf[0]; 968 } 969 970 snprintf(portnum, sizeof portnum, "%d", port); 971 memset(&hints, 0, sizeof hints); 972 hints.ai_family = PF_UNSPEC; 973 hints.ai_socktype = SOCK_STREAM; 974 hints.ai_flags = AI_NUMERICSERV; 975 hints.ai_flags |= AI_PASSIVE; 976 hints.ai_flags |= AI_NUMERICHOST; 977 rc = getaddrinfo(ip, portnum, &hints, &res0); 978 if (rc != 0) { 979 SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc); 980 return NULL; 981 } 982 983 /* try listen */ 984 fd = -1; 985 for (res = res0; res != NULL; res = res->ai_next) { 986 retry: 987 fd = posix_fd_create(res, opts, &impl_opts); 988 if (fd < 0) { 989 continue; 990 } 991 if (type == SPDK_SOCK_CREATE_LISTEN) { 992 rc = bind(fd, res->ai_addr, res->ai_addrlen); 993 if (rc != 0) { 994 SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno); 995 switch (errno) { 996 case EINTR: 997 /* interrupted? */ 998 close(fd); 999 goto retry; 1000 case EADDRNOTAVAIL: 1001 SPDK_ERRLOG("IP address %s not available. " 1002 "Verify IP address in config file " 1003 "and make sure setup script is " 1004 "run before starting spdk app.\n", ip); 1005 /* FALLTHROUGH */ 1006 default: 1007 /* try next family */ 1008 close(fd); 1009 fd = -1; 1010 continue; 1011 } 1012 } 1013 /* bind OK */ 1014 rc = listen(fd, 512); 1015 if (rc != 0) { 1016 SPDK_ERRLOG("listen() failed, errno = %d\n", errno); 1017 close(fd); 1018 fd = -1; 1019 break; 1020 } 1021 enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_server; 1022 } else if (type == SPDK_SOCK_CREATE_CONNECT) { 1023 src_addr = SPDK_GET_FIELD(opts, src_addr, NULL, opts->opts_size); 1024 src_port = SPDK_GET_FIELD(opts, src_port, 0, opts->opts_size); 1025 if (src_addr != NULL || src_port != 0) { 1026 snprintf(portnum, sizeof(portnum), "%"PRIu16, src_port); 1027 memset(&hints, 0, sizeof hints); 1028 hints.ai_family = AF_UNSPEC; 1029 hints.ai_socktype = SOCK_STREAM; 1030 hints.ai_flags = AI_NUMERICSERV | AI_NUMERICHOST | AI_PASSIVE; 1031 rc = getaddrinfo(src_addr, src_port > 0 ? portnum : NULL, 1032 &hints, &src_ai); 1033 if (rc != 0 || src_ai == NULL) { 1034 SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", 1035 rc != 0 ? gai_strerror(rc) : "", rc); 1036 close(fd); 1037 fd = -1; 1038 break; 1039 } 1040 rc = bind(fd, src_ai->ai_addr, src_ai->ai_addrlen); 1041 if (rc != 0) { 1042 SPDK_ERRLOG("bind() failed errno %d (%s:%s)\n", errno, 1043 src_addr ? src_addr : "", portnum); 1044 close(fd); 1045 fd = -1; 1046 freeaddrinfo(src_ai); 1047 src_ai = NULL; 1048 break; 1049 } 1050 freeaddrinfo(src_ai); 1051 src_ai = NULL; 1052 } 1053 rc = connect(fd, res->ai_addr, res->ai_addrlen); 1054 if (rc != 0) { 1055 SPDK_ERRLOG("connect() failed, errno = %d\n", errno); 1056 /* try next family */ 1057 close(fd); 1058 fd = -1; 1059 continue; 1060 } 1061 enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_client; 1062 if (enable_ssl) { 1063 ctx = posix_sock_create_ssl_context(TLS_client_method(), opts, &impl_opts); 1064 if (!ctx) { 1065 SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); 1066 close(fd); 1067 fd = -1; 1068 break; 1069 } 1070 ssl = ssl_sock_setup_connect(ctx, fd); 1071 if (!ssl) { 1072 SPDK_ERRLOG("ssl_sock_setup_connect() failed, errno = %d\n", errno); 1073 close(fd); 1074 fd = -1; 1075 SSL_CTX_free(ctx); 1076 break; 1077 } 1078 } 1079 } 1080 1081 flag = fcntl(fd, F_GETFL); 1082 if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) { 1083 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); 1084 SSL_free(ssl); 1085 SSL_CTX_free(ctx); 1086 close(fd); 1087 fd = -1; 1088 break; 1089 } 1090 break; 1091 } 1092 freeaddrinfo(res0); 1093 1094 if (fd < 0) { 1095 return NULL; 1096 } 1097 1098 /* Only enable zero copy for non-loopback and non-ssl sockets. */ 1099 enable_zcopy_user_opts = opts->zcopy && !spdk_net_is_loopback(fd) && !enable_ssl; 1100 1101 sock = posix_sock_alloc(fd, &impl_opts, enable_zcopy_user_opts && enable_zcopy_impl_opts); 1102 if (sock == NULL) { 1103 SPDK_ERRLOG("sock allocation failed\n"); 1104 SSL_free(ssl); 1105 SSL_CTX_free(ctx); 1106 close(fd); 1107 return NULL; 1108 } 1109 1110 if (ctx) { 1111 sock->ctx = ctx; 1112 } 1113 1114 if (ssl) { 1115 sock->ssl = ssl; 1116 SSL_set_app_data(ssl, &sock->base.impl_opts); 1117 } 1118 1119 return &sock->base; 1120 } 1121 1122 static struct spdk_sock * 1123 posix_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) 1124 { 1125 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, false); 1126 } 1127 1128 static struct spdk_sock * 1129 posix_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) 1130 { 1131 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, false); 1132 } 1133 1134 static struct spdk_sock * 1135 _posix_sock_accept(struct spdk_sock *_sock, bool enable_ssl) 1136 { 1137 struct spdk_posix_sock *sock = __posix_sock(_sock); 1138 struct spdk_posix_sock_group_impl *group = __posix_group_impl(sock->base.group_impl); 1139 struct sockaddr_storage sa; 1140 socklen_t salen; 1141 int rc, fd; 1142 struct spdk_posix_sock *new_sock; 1143 int flag; 1144 SSL_CTX *ctx = 0; 1145 SSL *ssl = 0; 1146 1147 memset(&sa, 0, sizeof(sa)); 1148 salen = sizeof(sa); 1149 1150 assert(sock != NULL); 1151 1152 /* epoll_wait will trigger again if there is more than one request */ 1153 if (group && sock->socket_has_data) { 1154 sock->socket_has_data = false; 1155 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1156 } 1157 1158 rc = accept(sock->fd, (struct sockaddr *)&sa, &salen); 1159 1160 if (rc == -1) { 1161 return NULL; 1162 } 1163 1164 fd = rc; 1165 1166 flag = fcntl(fd, F_GETFL); 1167 if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) { 1168 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); 1169 close(fd); 1170 return NULL; 1171 } 1172 1173 #if defined(SO_PRIORITY) 1174 /* The priority is not inherited, so call this function again */ 1175 if (sock->base.opts.priority) { 1176 rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int)); 1177 if (rc != 0) { 1178 close(fd); 1179 return NULL; 1180 } 1181 } 1182 #endif 1183 1184 /* Establish SSL connection */ 1185 if (enable_ssl) { 1186 ctx = posix_sock_create_ssl_context(TLS_server_method(), &sock->base.opts, &sock->base.impl_opts); 1187 if (!ctx) { 1188 SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); 1189 close(fd); 1190 return NULL; 1191 } 1192 ssl = ssl_sock_setup_accept(ctx, fd); 1193 if (!ssl) { 1194 SPDK_ERRLOG("ssl_sock_setup_accept() failed, errno = %d\n", errno); 1195 close(fd); 1196 SSL_CTX_free(ctx); 1197 return NULL; 1198 } 1199 } 1200 1201 /* Inherit the zero copy feature from the listen socket */ 1202 new_sock = posix_sock_alloc(fd, &sock->base.impl_opts, sock->zcopy); 1203 if (new_sock == NULL) { 1204 close(fd); 1205 SSL_free(ssl); 1206 SSL_CTX_free(ctx); 1207 return NULL; 1208 } 1209 1210 if (ctx) { 1211 new_sock->ctx = ctx; 1212 } 1213 1214 if (ssl) { 1215 new_sock->ssl = ssl; 1216 SSL_set_app_data(ssl, &new_sock->base.impl_opts); 1217 } 1218 1219 return &new_sock->base; 1220 } 1221 1222 static struct spdk_sock * 1223 posix_sock_accept(struct spdk_sock *_sock) 1224 { 1225 return _posix_sock_accept(_sock, false); 1226 } 1227 1228 static int 1229 posix_sock_close(struct spdk_sock *_sock) 1230 { 1231 struct spdk_posix_sock *sock = __posix_sock(_sock); 1232 void *pipe_buf; 1233 1234 assert(TAILQ_EMPTY(&_sock->pending_reqs)); 1235 1236 if (sock->ssl != NULL) { 1237 SSL_shutdown(sock->ssl); 1238 } 1239 1240 /* If the socket fails to close, the best choice is to 1241 * leak the fd but continue to free the rest of the sock 1242 * memory. */ 1243 close(sock->fd); 1244 1245 SSL_free(sock->ssl); 1246 SSL_CTX_free(sock->ctx); 1247 1248 pipe_buf = spdk_pipe_destroy(sock->recv_pipe); 1249 free(pipe_buf); 1250 free(sock); 1251 1252 return 0; 1253 } 1254 1255 #ifdef SPDK_ZEROCOPY 1256 static int 1257 _sock_check_zcopy(struct spdk_sock *sock) 1258 { 1259 struct spdk_posix_sock *psock = __posix_sock(sock); 1260 struct msghdr msgh = {}; 1261 uint8_t buf[sizeof(struct cmsghdr) + sizeof(struct sock_extended_err)]; 1262 ssize_t rc; 1263 struct sock_extended_err *serr; 1264 struct cmsghdr *cm; 1265 uint32_t idx; 1266 struct spdk_sock_request *req, *treq; 1267 bool found; 1268 1269 msgh.msg_control = buf; 1270 msgh.msg_controllen = sizeof(buf); 1271 1272 while (true) { 1273 rc = recvmsg(psock->fd, &msgh, MSG_ERRQUEUE); 1274 1275 if (rc < 0) { 1276 if (errno == EWOULDBLOCK || errno == EAGAIN) { 1277 return 0; 1278 } 1279 1280 if (!TAILQ_EMPTY(&sock->pending_reqs)) { 1281 SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries\n"); 1282 } else { 1283 SPDK_WARNLOG("Recvmsg yielded an error!\n"); 1284 } 1285 return 0; 1286 } 1287 1288 cm = CMSG_FIRSTHDR(&msgh); 1289 if (!(cm && 1290 ((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || 1291 (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR)))) { 1292 SPDK_WARNLOG("Unexpected cmsg level or type!\n"); 1293 return 0; 1294 } 1295 1296 serr = (struct sock_extended_err *)CMSG_DATA(cm); 1297 if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) { 1298 SPDK_WARNLOG("Unexpected extended error origin\n"); 1299 return 0; 1300 } 1301 1302 /* Most of the time, the pending_reqs array is in the exact 1303 * order we need such that all of the requests to complete are 1304 * in order, in the front. It is guaranteed that all requests 1305 * belonging to the same sendmsg call are sequential, so once 1306 * we encounter one match we can stop looping as soon as a 1307 * non-match is found. 1308 */ 1309 idx = serr->ee_info; 1310 while (true) { 1311 found = false; 1312 TAILQ_FOREACH_SAFE(req, &sock->pending_reqs, internal.link, treq) { 1313 if (!req->internal.is_zcopy) { 1314 /* This wasn't a zcopy request. It was just waiting in line to complete */ 1315 rc = spdk_sock_request_put(sock, req, 0); 1316 if (rc < 0) { 1317 return rc; 1318 } 1319 } else if (req->internal.offset == idx) { 1320 found = true; 1321 rc = spdk_sock_request_put(sock, req, 0); 1322 if (rc < 0) { 1323 return rc; 1324 } 1325 } else if (found) { 1326 break; 1327 } 1328 } 1329 1330 if (idx == serr->ee_data) { 1331 break; 1332 } 1333 1334 if (idx == UINT32_MAX) { 1335 idx = 0; 1336 } else { 1337 idx++; 1338 } 1339 } 1340 } 1341 1342 return 0; 1343 } 1344 #endif 1345 1346 static int 1347 _sock_flush(struct spdk_sock *sock) 1348 { 1349 struct spdk_posix_sock *psock = __posix_sock(sock); 1350 struct msghdr msg = {}; 1351 int flags; 1352 struct iovec iovs[IOV_BATCH_SIZE]; 1353 int iovcnt; 1354 int retval; 1355 struct spdk_sock_request *req; 1356 int i; 1357 ssize_t rc, sent; 1358 unsigned int offset; 1359 size_t len; 1360 bool is_zcopy = false; 1361 1362 /* Can't flush from within a callback or we end up with recursive calls */ 1363 if (sock->cb_cnt > 0) { 1364 errno = EAGAIN; 1365 return -1; 1366 } 1367 1368 #ifdef SPDK_ZEROCOPY 1369 if (psock->zcopy) { 1370 flags = MSG_ZEROCOPY | MSG_NOSIGNAL; 1371 } else 1372 #endif 1373 { 1374 flags = MSG_NOSIGNAL; 1375 } 1376 1377 iovcnt = spdk_sock_prep_reqs(sock, iovs, 0, NULL, &flags); 1378 if (iovcnt == 0) { 1379 return 0; 1380 } 1381 1382 #ifdef SPDK_ZEROCOPY 1383 is_zcopy = flags & MSG_ZEROCOPY; 1384 #endif 1385 1386 /* Perform the vectored write */ 1387 msg.msg_iov = iovs; 1388 msg.msg_iovlen = iovcnt; 1389 1390 if (psock->ssl) { 1391 rc = SSL_writev(psock->ssl, iovs, iovcnt); 1392 } else { 1393 rc = sendmsg(psock->fd, &msg, flags); 1394 } 1395 if (rc <= 0) { 1396 if (rc == 0 || errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && psock->zcopy)) { 1397 errno = EAGAIN; 1398 } 1399 return -1; 1400 } 1401 1402 sent = rc; 1403 1404 if (is_zcopy) { 1405 /* Handling overflow case, because we use psock->sendmsg_idx - 1 for the 1406 * req->internal.offset, so sendmsg_idx should not be zero */ 1407 if (spdk_unlikely(psock->sendmsg_idx == UINT32_MAX)) { 1408 psock->sendmsg_idx = 1; 1409 } else { 1410 psock->sendmsg_idx++; 1411 } 1412 } 1413 1414 /* Consume the requests that were actually written */ 1415 req = TAILQ_FIRST(&sock->queued_reqs); 1416 while (req) { 1417 offset = req->internal.offset; 1418 1419 /* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */ 1420 req->internal.is_zcopy = is_zcopy; 1421 1422 for (i = 0; i < req->iovcnt; i++) { 1423 /* Advance by the offset first */ 1424 if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) { 1425 offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len; 1426 continue; 1427 } 1428 1429 /* Calculate the remaining length of this element */ 1430 len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset; 1431 1432 if (len > (size_t)rc) { 1433 /* This element was partially sent. */ 1434 req->internal.offset += rc; 1435 return sent; 1436 } 1437 1438 offset = 0; 1439 req->internal.offset += len; 1440 rc -= len; 1441 } 1442 1443 /* Handled a full request. */ 1444 spdk_sock_request_pend(sock, req); 1445 1446 if (!req->internal.is_zcopy && req == TAILQ_FIRST(&sock->pending_reqs)) { 1447 /* The sendmsg syscall above isn't currently asynchronous, 1448 * so it's already done. */ 1449 retval = spdk_sock_request_put(sock, req, 0); 1450 if (retval) { 1451 break; 1452 } 1453 } else { 1454 /* Re-use the offset field to hold the sendmsg call index. The 1455 * index is 0 based, so subtract one here because we've already 1456 * incremented above. */ 1457 req->internal.offset = psock->sendmsg_idx - 1; 1458 } 1459 1460 if (rc == 0) { 1461 break; 1462 } 1463 1464 req = TAILQ_FIRST(&sock->queued_reqs); 1465 } 1466 1467 return sent; 1468 } 1469 1470 static int 1471 posix_sock_flush(struct spdk_sock *sock) 1472 { 1473 #ifdef SPDK_ZEROCOPY 1474 struct spdk_posix_sock *psock = __posix_sock(sock); 1475 1476 if (psock->zcopy && !TAILQ_EMPTY(&sock->pending_reqs)) { 1477 _sock_check_zcopy(sock); 1478 } 1479 #endif 1480 1481 return _sock_flush(sock); 1482 } 1483 1484 static ssize_t 1485 posix_sock_recv_from_pipe(struct spdk_posix_sock *sock, struct iovec *diov, int diovcnt) 1486 { 1487 struct iovec siov[2]; 1488 int sbytes; 1489 ssize_t bytes; 1490 struct spdk_posix_sock_group_impl *group; 1491 1492 sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); 1493 if (sbytes < 0) { 1494 errno = EINVAL; 1495 return -1; 1496 } else if (sbytes == 0) { 1497 errno = EAGAIN; 1498 return -1; 1499 } 1500 1501 bytes = spdk_iovcpy(siov, 2, diov, diovcnt); 1502 1503 if (bytes == 0) { 1504 /* The only way this happens is if diov is 0 length */ 1505 errno = EINVAL; 1506 return -1; 1507 } 1508 1509 spdk_pipe_reader_advance(sock->recv_pipe, bytes); 1510 1511 /* If we drained the pipe, mark it appropriately */ 1512 if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) { 1513 assert(sock->pipe_has_data == true); 1514 1515 group = __posix_group_impl(sock->base.group_impl); 1516 if (group && !sock->socket_has_data) { 1517 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1518 } 1519 1520 sock->pipe_has_data = false; 1521 } 1522 1523 return bytes; 1524 } 1525 1526 static inline ssize_t 1527 posix_sock_read(struct spdk_posix_sock *sock) 1528 { 1529 struct iovec iov[2]; 1530 int bytes_avail, bytes_recvd; 1531 struct spdk_posix_sock_group_impl *group; 1532 1533 bytes_avail = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov); 1534 1535 if (bytes_avail <= 0) { 1536 return bytes_avail; 1537 } 1538 1539 if (sock->ssl) { 1540 bytes_recvd = SSL_readv(sock->ssl, iov, 2); 1541 } else { 1542 bytes_recvd = readv(sock->fd, iov, 2); 1543 } 1544 1545 assert(sock->pipe_has_data == false); 1546 1547 if (bytes_recvd <= 0) { 1548 /* Errors count as draining the socket data */ 1549 if (sock->base.group_impl && sock->socket_has_data) { 1550 group = __posix_group_impl(sock->base.group_impl); 1551 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1552 } 1553 1554 sock->socket_has_data = false; 1555 1556 return bytes_recvd; 1557 } 1558 1559 spdk_pipe_writer_advance(sock->recv_pipe, bytes_recvd); 1560 1561 #if DEBUG 1562 if (sock->base.group_impl) { 1563 assert(sock->socket_has_data == true); 1564 } 1565 #endif 1566 1567 sock->pipe_has_data = true; 1568 if (bytes_recvd < bytes_avail) { 1569 /* We drained the kernel socket entirely. */ 1570 sock->socket_has_data = false; 1571 } 1572 1573 return bytes_recvd; 1574 } 1575 1576 static ssize_t 1577 posix_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) 1578 { 1579 struct spdk_posix_sock *sock = __posix_sock(_sock); 1580 struct spdk_posix_sock_group_impl *group = __posix_group_impl(sock->base.group_impl); 1581 int rc, i; 1582 size_t len; 1583 1584 if (sock->recv_pipe == NULL) { 1585 assert(sock->pipe_has_data == false); 1586 if (group && sock->socket_has_data) { 1587 sock->socket_has_data = false; 1588 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1589 } 1590 if (sock->ssl) { 1591 return SSL_readv(sock->ssl, iov, iovcnt); 1592 } else { 1593 return readv(sock->fd, iov, iovcnt); 1594 } 1595 } 1596 1597 /* If the socket is not in a group, we must assume it always has 1598 * data waiting for us because it is not epolled */ 1599 if (!sock->pipe_has_data && (group == NULL || sock->socket_has_data)) { 1600 /* If the user is receiving a sufficiently large amount of data, 1601 * receive directly to their buffers. */ 1602 len = 0; 1603 for (i = 0; i < iovcnt; i++) { 1604 len += iov[i].iov_len; 1605 } 1606 1607 if (len >= MIN_SOCK_PIPE_SIZE) { 1608 /* TODO: Should this detect if kernel socket is drained? */ 1609 if (sock->ssl) { 1610 return SSL_readv(sock->ssl, iov, iovcnt); 1611 } else { 1612 return readv(sock->fd, iov, iovcnt); 1613 } 1614 } 1615 1616 /* Otherwise, do a big read into our pipe */ 1617 rc = posix_sock_read(sock); 1618 if (rc <= 0) { 1619 return rc; 1620 } 1621 } 1622 1623 return posix_sock_recv_from_pipe(sock, iov, iovcnt); 1624 } 1625 1626 static ssize_t 1627 posix_sock_recv(struct spdk_sock *sock, void *buf, size_t len) 1628 { 1629 struct iovec iov[1]; 1630 1631 iov[0].iov_base = buf; 1632 iov[0].iov_len = len; 1633 1634 return posix_sock_readv(sock, iov, 1); 1635 } 1636 1637 static ssize_t 1638 posix_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) 1639 { 1640 struct spdk_posix_sock *sock = __posix_sock(_sock); 1641 int rc; 1642 1643 /* In order to process a writev, we need to flush any asynchronous writes 1644 * first. */ 1645 rc = _sock_flush(_sock); 1646 if (rc < 0) { 1647 return rc; 1648 } 1649 1650 if (!TAILQ_EMPTY(&_sock->queued_reqs)) { 1651 /* We weren't able to flush all requests */ 1652 errno = EAGAIN; 1653 return -1; 1654 } 1655 1656 if (sock->ssl) { 1657 return SSL_writev(sock->ssl, iov, iovcnt); 1658 } else { 1659 return writev(sock->fd, iov, iovcnt); 1660 } 1661 } 1662 1663 static int 1664 posix_sock_recv_next(struct spdk_sock *_sock, void **buf, void **ctx) 1665 { 1666 struct spdk_posix_sock *sock = __posix_sock(_sock); 1667 struct iovec iov; 1668 ssize_t rc; 1669 1670 if (sock->recv_pipe != NULL) { 1671 errno = ENOTSUP; 1672 return -1; 1673 } 1674 1675 iov.iov_len = spdk_sock_group_get_buf(_sock->group_impl->group, &iov.iov_base, ctx); 1676 if (iov.iov_len == 0) { 1677 errno = ENOBUFS; 1678 return -1; 1679 } 1680 1681 rc = posix_sock_readv(_sock, &iov, 1); 1682 if (rc <= 0) { 1683 spdk_sock_group_provide_buf(_sock->group_impl->group, iov.iov_base, iov.iov_len, *ctx); 1684 return rc; 1685 } 1686 1687 *buf = iov.iov_base; 1688 1689 return rc; 1690 } 1691 1692 static void 1693 posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req) 1694 { 1695 int rc; 1696 1697 spdk_sock_request_queue(sock, req); 1698 1699 /* If there are a sufficient number queued, just flush them out immediately. */ 1700 if (sock->queued_iovcnt >= IOV_BATCH_SIZE) { 1701 rc = _sock_flush(sock); 1702 if (rc < 0 && errno != EAGAIN) { 1703 spdk_sock_abort_requests(sock); 1704 } 1705 } 1706 } 1707 1708 static int 1709 posix_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) 1710 { 1711 struct spdk_posix_sock *sock = __posix_sock(_sock); 1712 int val; 1713 int rc; 1714 1715 assert(sock != NULL); 1716 1717 val = nbytes; 1718 rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val); 1719 if (rc != 0) { 1720 return -1; 1721 } 1722 return 0; 1723 } 1724 1725 static bool 1726 posix_sock_is_ipv6(struct spdk_sock *_sock) 1727 { 1728 struct spdk_posix_sock *sock = __posix_sock(_sock); 1729 struct sockaddr_storage sa; 1730 socklen_t salen; 1731 int rc; 1732 1733 assert(sock != NULL); 1734 1735 memset(&sa, 0, sizeof sa); 1736 salen = sizeof sa; 1737 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 1738 if (rc != 0) { 1739 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 1740 return false; 1741 } 1742 1743 return (sa.ss_family == AF_INET6); 1744 } 1745 1746 static bool 1747 posix_sock_is_ipv4(struct spdk_sock *_sock) 1748 { 1749 struct spdk_posix_sock *sock = __posix_sock(_sock); 1750 struct sockaddr_storage sa; 1751 socklen_t salen; 1752 int rc; 1753 1754 assert(sock != NULL); 1755 1756 memset(&sa, 0, sizeof sa); 1757 salen = sizeof sa; 1758 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 1759 if (rc != 0) { 1760 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 1761 return false; 1762 } 1763 1764 return (sa.ss_family == AF_INET); 1765 } 1766 1767 static bool 1768 posix_sock_is_connected(struct spdk_sock *_sock) 1769 { 1770 struct spdk_posix_sock *sock = __posix_sock(_sock); 1771 uint8_t byte; 1772 int rc; 1773 1774 rc = recv(sock->fd, &byte, 1, MSG_PEEK); 1775 if (rc == 0) { 1776 return false; 1777 } 1778 1779 if (rc < 0) { 1780 if (errno == EAGAIN || errno == EWOULDBLOCK) { 1781 return true; 1782 } 1783 1784 return false; 1785 } 1786 1787 return true; 1788 } 1789 1790 static struct spdk_sock_group_impl * 1791 posix_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint) 1792 { 1793 struct spdk_posix_sock *sock = __posix_sock(_sock); 1794 struct spdk_sock_group_impl *group_impl; 1795 1796 if (sock->placement_id != -1) { 1797 spdk_sock_map_lookup(&g_map, sock->placement_id, &group_impl, hint); 1798 return group_impl; 1799 } 1800 1801 return NULL; 1802 } 1803 1804 static struct spdk_sock_group_impl * 1805 _sock_group_impl_create(uint32_t enable_placement_id) 1806 { 1807 struct spdk_posix_sock_group_impl *group_impl; 1808 int fd; 1809 1810 #if defined(SPDK_EPOLL) 1811 fd = epoll_create1(0); 1812 #elif defined(SPDK_KEVENT) 1813 fd = kqueue(); 1814 #endif 1815 if (fd == -1) { 1816 return NULL; 1817 } 1818 1819 group_impl = calloc(1, sizeof(*group_impl)); 1820 if (group_impl == NULL) { 1821 SPDK_ERRLOG("group_impl allocation failed\n"); 1822 close(fd); 1823 return NULL; 1824 } 1825 1826 group_impl->pipe_group = spdk_pipe_group_create(); 1827 if (group_impl->pipe_group == NULL) { 1828 SPDK_ERRLOG("pipe_group allocation failed\n"); 1829 free(group_impl); 1830 close(fd); 1831 return NULL; 1832 } 1833 1834 group_impl->fd = fd; 1835 TAILQ_INIT(&group_impl->socks_with_data); 1836 group_impl->placement_id = -1; 1837 1838 if (enable_placement_id == PLACEMENT_CPU) { 1839 spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base); 1840 group_impl->placement_id = spdk_env_get_current_core(); 1841 } 1842 1843 return &group_impl->base; 1844 } 1845 1846 static struct spdk_sock_group_impl * 1847 posix_sock_group_impl_create(void) 1848 { 1849 return _sock_group_impl_create(g_posix_impl_opts.enable_placement_id); 1850 } 1851 1852 static struct spdk_sock_group_impl * 1853 ssl_sock_group_impl_create(void) 1854 { 1855 return _sock_group_impl_create(g_ssl_impl_opts.enable_placement_id); 1856 } 1857 1858 static void 1859 posix_sock_mark(struct spdk_posix_sock_group_impl *group, struct spdk_posix_sock *sock, 1860 int placement_id) 1861 { 1862 #if defined(SO_MARK) 1863 int rc; 1864 1865 rc = setsockopt(sock->fd, SOL_SOCKET, SO_MARK, 1866 &placement_id, sizeof(placement_id)); 1867 if (rc != 0) { 1868 /* Not fatal */ 1869 SPDK_ERRLOG("Error setting SO_MARK\n"); 1870 return; 1871 } 1872 1873 rc = spdk_sock_map_insert(&g_map, placement_id, &group->base); 1874 if (rc != 0) { 1875 /* Not fatal */ 1876 SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc); 1877 return; 1878 } 1879 1880 sock->placement_id = placement_id; 1881 #endif 1882 } 1883 1884 static void 1885 posix_sock_update_mark(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1886 { 1887 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1888 1889 if (group->placement_id == -1) { 1890 group->placement_id = spdk_sock_map_find_free(&g_map); 1891 1892 /* If a free placement id is found, update existing sockets in this group */ 1893 if (group->placement_id != -1) { 1894 struct spdk_sock *sock, *tmp; 1895 1896 TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { 1897 posix_sock_mark(group, __posix_sock(sock), group->placement_id); 1898 } 1899 } 1900 } 1901 1902 if (group->placement_id != -1) { 1903 /* 1904 * group placement id is already determined for this poll group. 1905 * Mark socket with group's placement id. 1906 */ 1907 posix_sock_mark(group, __posix_sock(_sock), group->placement_id); 1908 } 1909 } 1910 1911 static int 1912 posix_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1913 { 1914 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1915 struct spdk_posix_sock *sock = __posix_sock(_sock); 1916 int rc; 1917 1918 #if defined(SPDK_EPOLL) 1919 struct epoll_event event; 1920 1921 memset(&event, 0, sizeof(event)); 1922 /* EPOLLERR is always on even if we don't set it, but be explicit for clarity */ 1923 event.events = EPOLLIN | EPOLLERR; 1924 if (spdk_interrupt_mode_is_enabled()) { 1925 event.events |= EPOLLOUT; 1926 } 1927 1928 event.data.ptr = sock; 1929 1930 rc = epoll_ctl(group->fd, EPOLL_CTL_ADD, sock->fd, &event); 1931 #elif defined(SPDK_KEVENT) 1932 struct kevent event; 1933 struct timespec ts = {0}; 1934 1935 EV_SET(&event, sock->fd, EVFILT_READ, EV_ADD, 0, 0, sock); 1936 1937 rc = kevent(group->fd, &event, 1, NULL, 0, &ts); 1938 #endif 1939 1940 if (rc != 0) { 1941 return rc; 1942 } 1943 1944 /* switched from another polling group due to scheduling */ 1945 if (spdk_unlikely(sock->recv_pipe != NULL && 1946 (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) { 1947 sock->pipe_has_data = true; 1948 sock->socket_has_data = false; 1949 TAILQ_INSERT_TAIL(&group->socks_with_data, sock, link); 1950 } else if (sock->recv_pipe != NULL) { 1951 rc = spdk_pipe_group_add(group->pipe_group, sock->recv_pipe); 1952 assert(rc == 0); 1953 } 1954 1955 if (_sock->impl_opts.enable_placement_id == PLACEMENT_MARK) { 1956 posix_sock_update_mark(_group, _sock); 1957 } else if (sock->placement_id != -1) { 1958 rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base); 1959 if (rc != 0) { 1960 SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc); 1961 /* Do not treat this as an error. The system will continue running. */ 1962 } 1963 } 1964 1965 return rc; 1966 } 1967 1968 static int 1969 posix_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1970 { 1971 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1972 struct spdk_posix_sock *sock = __posix_sock(_sock); 1973 int rc; 1974 1975 if (sock->pipe_has_data || sock->socket_has_data) { 1976 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1977 sock->pipe_has_data = false; 1978 sock->socket_has_data = false; 1979 } else if (sock->recv_pipe != NULL) { 1980 rc = spdk_pipe_group_remove(group->pipe_group, sock->recv_pipe); 1981 assert(rc == 0); 1982 } 1983 1984 if (sock->placement_id != -1) { 1985 spdk_sock_map_release(&g_map, sock->placement_id); 1986 } 1987 1988 #if defined(SPDK_EPOLL) 1989 struct epoll_event event; 1990 1991 /* Event parameter is ignored but some old kernel version still require it. */ 1992 rc = epoll_ctl(group->fd, EPOLL_CTL_DEL, sock->fd, &event); 1993 #elif defined(SPDK_KEVENT) 1994 struct kevent event; 1995 struct timespec ts = {0}; 1996 1997 EV_SET(&event, sock->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); 1998 1999 rc = kevent(group->fd, &event, 1, NULL, 0, &ts); 2000 if (rc == 0 && event.flags & EV_ERROR) { 2001 rc = -1; 2002 errno = event.data; 2003 } 2004 #endif 2005 2006 spdk_sock_abort_requests(_sock); 2007 2008 return rc; 2009 } 2010 2011 static int 2012 posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, 2013 struct spdk_sock **socks) 2014 { 2015 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 2016 struct spdk_sock *sock, *tmp; 2017 int num_events, i, rc; 2018 struct spdk_posix_sock *psock, *ptmp; 2019 #if defined(SPDK_EPOLL) 2020 struct epoll_event events[MAX_EVENTS_PER_POLL]; 2021 #elif defined(SPDK_KEVENT) 2022 struct kevent events[MAX_EVENTS_PER_POLL]; 2023 struct timespec ts = {0}; 2024 #endif 2025 2026 #ifdef SPDK_ZEROCOPY 2027 /* When all of the following conditions are met 2028 * - non-blocking socket 2029 * - zero copy is enabled 2030 * - interrupts suppressed (i.e. busy polling) 2031 * - the NIC tx queue is full at the time sendmsg() is called 2032 * - epoll_wait determines there is an EPOLLIN event for the socket 2033 * then we can get into a situation where data we've sent is queued 2034 * up in the kernel network stack, but interrupts have been suppressed 2035 * because other traffic is flowing so the kernel misses the signal 2036 * to flush the software tx queue. If there wasn't incoming data 2037 * pending on the socket, then epoll_wait would have been sufficient 2038 * to kick off the send operation, but since there is a pending event 2039 * epoll_wait does not trigger the necessary operation. 2040 * 2041 * We deal with this by checking for all of the above conditions and 2042 * additionally looking for EPOLLIN events that were not consumed from 2043 * the last poll loop. We take this to mean that the upper layer is 2044 * unable to consume them because it is blocked waiting for resources 2045 * to free up, and those resources are most likely freed in response 2046 * to a pending asynchronous write completing. 2047 * 2048 * Additionally, sockets that have the same placement_id actually share 2049 * an underlying hardware queue. That means polling one of them is 2050 * equivalent to polling all of them. As a quick mechanism to avoid 2051 * making extra poll() calls, stash the last placement_id during the loop 2052 * and only poll if it's not the same. The overwhelmingly common case 2053 * is that all sockets in this list have the same placement_id because 2054 * SPDK is intentionally grouping sockets by that value, so even 2055 * though this won't stop all extra calls to poll(), it's very fast 2056 * and will catch all of them in practice. 2057 */ 2058 int last_placement_id = -1; 2059 2060 TAILQ_FOREACH(psock, &group->socks_with_data, link) { 2061 if (psock->zcopy && psock->placement_id >= 0 && 2062 psock->placement_id != last_placement_id) { 2063 struct pollfd pfd = {psock->fd, POLLIN | POLLERR, 0}; 2064 2065 poll(&pfd, 1, 0); 2066 last_placement_id = psock->placement_id; 2067 } 2068 } 2069 #endif 2070 2071 /* This must be a TAILQ_FOREACH_SAFE because while flushing, 2072 * a completion callback could remove the sock from the 2073 * group. */ 2074 TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { 2075 rc = _sock_flush(sock); 2076 if (rc < 0 && errno != EAGAIN) { 2077 spdk_sock_abort_requests(sock); 2078 } 2079 } 2080 2081 assert(max_events > 0); 2082 2083 #if defined(SPDK_EPOLL) 2084 num_events = epoll_wait(group->fd, events, max_events, 0); 2085 #elif defined(SPDK_KEVENT) 2086 num_events = kevent(group->fd, NULL, 0, events, max_events, &ts); 2087 #endif 2088 2089 if (num_events == -1) { 2090 return -1; 2091 } else if (num_events == 0 && !TAILQ_EMPTY(&_group->socks)) { 2092 sock = TAILQ_FIRST(&_group->socks); 2093 psock = __posix_sock(sock); 2094 /* poll() is called here to busy poll the queue associated with 2095 * first socket in list and potentially reap incoming data. 2096 */ 2097 if (sock->opts.priority) { 2098 struct pollfd pfd = {0, 0, 0}; 2099 2100 pfd.fd = psock->fd; 2101 pfd.events = POLLIN | POLLERR; 2102 poll(&pfd, 1, 0); 2103 } 2104 } 2105 2106 for (i = 0; i < num_events; i++) { 2107 #if defined(SPDK_EPOLL) 2108 sock = events[i].data.ptr; 2109 psock = __posix_sock(sock); 2110 2111 #ifdef SPDK_ZEROCOPY 2112 if (events[i].events & EPOLLERR) { 2113 rc = _sock_check_zcopy(sock); 2114 /* If the socket was closed or removed from 2115 * the group in response to a send ack, don't 2116 * add it to the array here. */ 2117 if (rc || sock->cb_fn == NULL) { 2118 continue; 2119 } 2120 } 2121 #endif 2122 if ((events[i].events & EPOLLIN) == 0) { 2123 continue; 2124 } 2125 2126 #elif defined(SPDK_KEVENT) 2127 sock = events[i].udata; 2128 psock = __posix_sock(sock); 2129 #endif 2130 2131 /* If the socket is not already in the list, add it now */ 2132 if (!psock->socket_has_data && !psock->pipe_has_data) { 2133 TAILQ_INSERT_TAIL(&group->socks_with_data, psock, link); 2134 } 2135 psock->socket_has_data = true; 2136 } 2137 2138 num_events = 0; 2139 2140 TAILQ_FOREACH_SAFE(psock, &group->socks_with_data, link, ptmp) { 2141 if (num_events == max_events) { 2142 break; 2143 } 2144 2145 /* If the socket's cb_fn is NULL, just remove it from the 2146 * list and do not add it to socks array */ 2147 if (spdk_unlikely(psock->base.cb_fn == NULL)) { 2148 psock->socket_has_data = false; 2149 psock->pipe_has_data = false; 2150 TAILQ_REMOVE(&group->socks_with_data, psock, link); 2151 continue; 2152 } 2153 2154 socks[num_events++] = &psock->base; 2155 } 2156 2157 /* Cycle the has_data list so that each time we poll things aren't 2158 * in the same order. Say we have 6 sockets in the list, named as follows: 2159 * A B C D E F 2160 * And all 6 sockets had epoll events, but max_events is only 3. That means 2161 * psock currently points at D. We want to rearrange the list to the following: 2162 * D E F A B C 2163 * 2164 * The variables below are named according to this example to make it easier to 2165 * follow the swaps. 2166 */ 2167 if (psock != NULL) { 2168 struct spdk_posix_sock *pa, *pc, *pd, *pf; 2169 2170 /* Capture pointers to the elements we need */ 2171 pd = psock; 2172 pc = TAILQ_PREV(pd, spdk_has_data_list, link); 2173 pa = TAILQ_FIRST(&group->socks_with_data); 2174 pf = TAILQ_LAST(&group->socks_with_data, spdk_has_data_list); 2175 2176 /* Break the link between C and D */ 2177 pc->link.tqe_next = NULL; 2178 2179 /* Connect F to A */ 2180 pf->link.tqe_next = pa; 2181 pa->link.tqe_prev = &pf->link.tqe_next; 2182 2183 /* Fix up the list first/last pointers */ 2184 group->socks_with_data.tqh_first = pd; 2185 group->socks_with_data.tqh_last = &pc->link.tqe_next; 2186 2187 /* D is in front of the list, make tqe prev pointer point to the head of list */ 2188 pd->link.tqe_prev = &group->socks_with_data.tqh_first; 2189 } 2190 2191 return num_events; 2192 } 2193 2194 static int 2195 posix_sock_group_impl_register_interrupt(struct spdk_sock_group_impl *_group, uint32_t events, 2196 spdk_interrupt_fn fn, void *arg, const char *name) 2197 { 2198 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 2199 2200 group->intr = spdk_interrupt_register_for_events(group->fd, events, fn, arg, name); 2201 2202 return group->intr ? 0 : -1; 2203 } 2204 2205 static void 2206 posix_sock_group_impl_unregister_interrupt(struct spdk_sock_group_impl *_group) 2207 { 2208 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 2209 2210 spdk_interrupt_unregister(&group->intr); 2211 } 2212 2213 static int 2214 _sock_group_impl_close(struct spdk_sock_group_impl *_group, uint32_t enable_placement_id) 2215 { 2216 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 2217 int rc; 2218 2219 if (enable_placement_id == PLACEMENT_CPU) { 2220 spdk_sock_map_release(&g_map, spdk_env_get_current_core()); 2221 } 2222 2223 spdk_pipe_group_destroy(group->pipe_group); 2224 rc = close(group->fd); 2225 free(group); 2226 return rc; 2227 } 2228 2229 static int 2230 posix_sock_group_impl_close(struct spdk_sock_group_impl *_group) 2231 { 2232 return _sock_group_impl_close(_group, g_posix_impl_opts.enable_placement_id); 2233 } 2234 2235 static int 2236 ssl_sock_group_impl_close(struct spdk_sock_group_impl *_group) 2237 { 2238 return _sock_group_impl_close(_group, g_ssl_impl_opts.enable_placement_id); 2239 } 2240 2241 static struct spdk_net_impl g_posix_net_impl = { 2242 .name = "posix", 2243 .getaddr = posix_sock_getaddr, 2244 .get_interface_name = posix_sock_get_interface_name, 2245 .get_numa_socket_id = posix_sock_get_numa_socket_id, 2246 .connect = posix_sock_connect, 2247 .listen = posix_sock_listen, 2248 .accept = posix_sock_accept, 2249 .close = posix_sock_close, 2250 .recv = posix_sock_recv, 2251 .readv = posix_sock_readv, 2252 .writev = posix_sock_writev, 2253 .recv_next = posix_sock_recv_next, 2254 .writev_async = posix_sock_writev_async, 2255 .flush = posix_sock_flush, 2256 .set_recvlowat = posix_sock_set_recvlowat, 2257 .set_recvbuf = posix_sock_set_recvbuf, 2258 .set_sendbuf = posix_sock_set_sendbuf, 2259 .is_ipv6 = posix_sock_is_ipv6, 2260 .is_ipv4 = posix_sock_is_ipv4, 2261 .is_connected = posix_sock_is_connected, 2262 .group_impl_get_optimal = posix_sock_group_impl_get_optimal, 2263 .group_impl_create = posix_sock_group_impl_create, 2264 .group_impl_add_sock = posix_sock_group_impl_add_sock, 2265 .group_impl_remove_sock = posix_sock_group_impl_remove_sock, 2266 .group_impl_poll = posix_sock_group_impl_poll, 2267 .group_impl_register_interrupt = posix_sock_group_impl_register_interrupt, 2268 .group_impl_unregister_interrupt = posix_sock_group_impl_unregister_interrupt, 2269 .group_impl_close = posix_sock_group_impl_close, 2270 .get_opts = posix_sock_impl_get_opts, 2271 .set_opts = posix_sock_impl_set_opts, 2272 }; 2273 2274 SPDK_NET_IMPL_REGISTER_DEFAULT(posix, &g_posix_net_impl); 2275 2276 static struct spdk_sock * 2277 ssl_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) 2278 { 2279 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, true); 2280 } 2281 2282 static struct spdk_sock * 2283 ssl_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) 2284 { 2285 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, true); 2286 } 2287 2288 static struct spdk_sock * 2289 ssl_sock_accept(struct spdk_sock *_sock) 2290 { 2291 return _posix_sock_accept(_sock, true); 2292 } 2293 2294 static struct spdk_net_impl g_ssl_net_impl = { 2295 .name = "ssl", 2296 .getaddr = posix_sock_getaddr, 2297 .get_interface_name = posix_sock_get_interface_name, 2298 .get_numa_socket_id = posix_sock_get_numa_socket_id, 2299 .connect = ssl_sock_connect, 2300 .listen = ssl_sock_listen, 2301 .accept = ssl_sock_accept, 2302 .close = posix_sock_close, 2303 .recv = posix_sock_recv, 2304 .readv = posix_sock_readv, 2305 .writev = posix_sock_writev, 2306 .recv_next = posix_sock_recv_next, 2307 .writev_async = posix_sock_writev_async, 2308 .flush = posix_sock_flush, 2309 .set_recvlowat = posix_sock_set_recvlowat, 2310 .set_recvbuf = posix_sock_set_recvbuf, 2311 .set_sendbuf = posix_sock_set_sendbuf, 2312 .is_ipv6 = posix_sock_is_ipv6, 2313 .is_ipv4 = posix_sock_is_ipv4, 2314 .is_connected = posix_sock_is_connected, 2315 .group_impl_get_optimal = posix_sock_group_impl_get_optimal, 2316 .group_impl_create = ssl_sock_group_impl_create, 2317 .group_impl_add_sock = posix_sock_group_impl_add_sock, 2318 .group_impl_remove_sock = posix_sock_group_impl_remove_sock, 2319 .group_impl_poll = posix_sock_group_impl_poll, 2320 .group_impl_register_interrupt = posix_sock_group_impl_register_interrupt, 2321 .group_impl_unregister_interrupt = posix_sock_group_impl_unregister_interrupt, 2322 .group_impl_close = ssl_sock_group_impl_close, 2323 .get_opts = ssl_sock_impl_get_opts, 2324 .set_opts = ssl_sock_impl_set_opts, 2325 }; 2326 2327 SPDK_NET_IMPL_REGISTER(ssl, &g_ssl_net_impl); 2328 SPDK_LOG_REGISTER_COMPONENT(sock_posix) 2329