1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) Intel Corporation. All rights reserved. 3 * Copyright (c) 2020, 2021 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #if defined(__FreeBSD__) 10 #include <sys/event.h> 11 #define SPDK_KEVENT 12 #else 13 #include <sys/epoll.h> 14 #define SPDK_EPOLL 15 #endif 16 17 #if defined(__linux__) 18 #include <linux/errqueue.h> 19 #endif 20 21 #include "spdk/env.h" 22 #include "spdk/log.h" 23 #include "spdk/pipe.h" 24 #include "spdk/sock.h" 25 #include "spdk/util.h" 26 #include "spdk/string.h" 27 #include "spdk_internal/sock.h" 28 #include "../sock_kernel.h" 29 30 #include "openssl/crypto.h" 31 #include "openssl/err.h" 32 #include "openssl/ssl.h" 33 34 #define MAX_TMPBUF 1024 35 #define PORTNUMLEN 32 36 37 #if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY) 38 #define SPDK_ZEROCOPY 39 #endif 40 41 struct spdk_posix_sock { 42 struct spdk_sock base; 43 int fd; 44 45 uint32_t sendmsg_idx; 46 47 struct spdk_pipe *recv_pipe; 48 void *recv_buf; 49 int recv_buf_sz; 50 bool pipe_has_data; 51 bool socket_has_data; 52 bool zcopy; 53 54 int placement_id; 55 56 SSL_CTX *ctx; 57 SSL *ssl; 58 59 TAILQ_ENTRY(spdk_posix_sock) link; 60 }; 61 62 TAILQ_HEAD(spdk_has_data_list, spdk_posix_sock); 63 64 struct spdk_posix_sock_group_impl { 65 struct spdk_sock_group_impl base; 66 int fd; 67 struct spdk_has_data_list socks_with_data; 68 int placement_id; 69 }; 70 71 static struct spdk_sock_impl_opts g_spdk_posix_sock_impl_opts = { 72 .recv_buf_size = MIN_SO_RCVBUF_SIZE, 73 .send_buf_size = MIN_SO_SNDBUF_SIZE, 74 .enable_recv_pipe = true, 75 .enable_quickack = false, 76 .enable_placement_id = PLACEMENT_NONE, 77 .enable_zerocopy_send_server = true, 78 .enable_zerocopy_send_client = false, 79 .zerocopy_threshold = 0 80 }; 81 82 static struct spdk_sock_map g_map = { 83 .entries = STAILQ_HEAD_INITIALIZER(g_map.entries), 84 .mtx = PTHREAD_MUTEX_INITIALIZER 85 }; 86 87 __attribute((destructor)) static void 88 posix_sock_map_cleanup(void) 89 { 90 spdk_sock_map_cleanup(&g_map); 91 } 92 93 #define __posix_sock(sock) (struct spdk_posix_sock *)sock 94 #define __posix_group_impl(group) (struct spdk_posix_sock_group_impl *)group 95 96 static void 97 posix_sock_copy_impl_opts(struct spdk_sock_impl_opts *dest, const struct spdk_sock_impl_opts *src, 98 size_t len) 99 { 100 #define FIELD_OK(field) \ 101 offsetof(struct spdk_sock_impl_opts, field) + sizeof(src->field) <= len 102 103 #define SET_FIELD(field) \ 104 if (FIELD_OK(field)) { \ 105 dest->field = src->field; \ 106 } 107 108 SET_FIELD(recv_buf_size); 109 SET_FIELD(send_buf_size); 110 SET_FIELD(enable_recv_pipe); 111 SET_FIELD(enable_zerocopy_send); 112 SET_FIELD(enable_quickack); 113 SET_FIELD(enable_placement_id); 114 SET_FIELD(enable_zerocopy_send_server); 115 SET_FIELD(enable_zerocopy_send_client); 116 SET_FIELD(zerocopy_threshold); 117 118 #undef SET_FIELD 119 #undef FIELD_OK 120 } 121 122 static int 123 posix_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) 124 { 125 if (!opts || !len) { 126 errno = EINVAL; 127 return -1; 128 } 129 memset(opts, 0, *len); 130 131 posix_sock_copy_impl_opts(opts, &g_spdk_posix_sock_impl_opts, *len); 132 *len = spdk_min(*len, sizeof(g_spdk_posix_sock_impl_opts)); 133 134 return 0; 135 } 136 137 static int 138 posix_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) 139 { 140 if (!opts) { 141 errno = EINVAL; 142 return -1; 143 } 144 145 
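	/* Only the fields that fit within the caller-provided 'len' are copied
	 * (see FIELD_OK above); fields beyond that keep their current values, so
	 * callers built against an older, smaller spdk_sock_impl_opts remain
	 * compatible.
	 */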
posix_sock_copy_impl_opts(&g_spdk_posix_sock_impl_opts, opts, len); 146 147 return 0; 148 } 149 150 static int 151 posix_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, 152 char *caddr, int clen, uint16_t *cport) 153 { 154 struct spdk_posix_sock *sock = __posix_sock(_sock); 155 struct sockaddr_storage sa; 156 socklen_t salen; 157 int rc; 158 159 assert(sock != NULL); 160 161 memset(&sa, 0, sizeof sa); 162 salen = sizeof sa; 163 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 164 if (rc != 0) { 165 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 166 return -1; 167 } 168 169 switch (sa.ss_family) { 170 case AF_UNIX: 171 /* Acceptable connection types that don't have IPs */ 172 return 0; 173 case AF_INET: 174 case AF_INET6: 175 /* Code below will get IP addresses */ 176 break; 177 default: 178 /* Unsupported socket family */ 179 return -1; 180 } 181 182 rc = get_addr_str((struct sockaddr *)&sa, saddr, slen); 183 if (rc != 0) { 184 SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); 185 return -1; 186 } 187 188 if (sport) { 189 if (sa.ss_family == AF_INET) { 190 *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port); 191 } else if (sa.ss_family == AF_INET6) { 192 *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); 193 } 194 } 195 196 memset(&sa, 0, sizeof sa); 197 salen = sizeof sa; 198 rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen); 199 if (rc != 0) { 200 SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno); 201 return -1; 202 } 203 204 rc = get_addr_str((struct sockaddr *)&sa, caddr, clen); 205 if (rc != 0) { 206 SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); 207 return -1; 208 } 209 210 if (cport) { 211 if (sa.ss_family == AF_INET) { 212 *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port); 213 } else if (sa.ss_family == AF_INET6) { 214 *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); 215 } 216 } 217 218 return 0; 219 } 220 221 enum posix_sock_create_type { 222 SPDK_SOCK_CREATE_LISTEN, 223 SPDK_SOCK_CREATE_CONNECT, 224 }; 225 226 static int 227 posix_sock_alloc_pipe(struct spdk_posix_sock *sock, int sz) 228 { 229 uint8_t *new_buf; 230 struct spdk_pipe *new_pipe; 231 struct iovec siov[2]; 232 struct iovec diov[2]; 233 int sbytes; 234 ssize_t bytes; 235 236 if (sock->recv_buf_sz == sz) { 237 return 0; 238 } 239 240 /* If the new size is 0, just free the pipe */ 241 if (sz == 0) { 242 spdk_pipe_destroy(sock->recv_pipe); 243 free(sock->recv_buf); 244 sock->recv_pipe = NULL; 245 sock->recv_buf = NULL; 246 return 0; 247 } else if (sz < MIN_SOCK_PIPE_SIZE) { 248 SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE); 249 return -1; 250 } 251 252 /* Round up to next 64 byte multiple */ 253 new_buf = calloc(SPDK_ALIGN_CEIL(sz + 1, 64), sizeof(uint8_t)); 254 if (!new_buf) { 255 SPDK_ERRLOG("socket recv buf allocation failed\n"); 256 return -ENOMEM; 257 } 258 259 new_pipe = spdk_pipe_create(new_buf, sz + 1); 260 if (new_pipe == NULL) { 261 SPDK_ERRLOG("socket pipe allocation failed\n"); 262 free(new_buf); 263 return -ENOMEM; 264 } 265 266 if (sock->recv_pipe != NULL) { 267 /* Pull all of the data out of the old pipe */ 268 sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); 269 if (sbytes > sz) { 270 /* Too much data to fit into the new pipe size */ 271 spdk_pipe_destroy(new_pipe); 272 free(new_buf); 273 return -EINVAL; 274 } 275 276 sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov); 277 assert(sbytes == sz); 278 279 bytes = spdk_iovcpy(siov, 
2, diov, 2); 280 spdk_pipe_writer_advance(new_pipe, bytes); 281 282 spdk_pipe_destroy(sock->recv_pipe); 283 free(sock->recv_buf); 284 } 285 286 sock->recv_buf_sz = sz; 287 sock->recv_buf = new_buf; 288 sock->recv_pipe = new_pipe; 289 290 return 0; 291 } 292 293 static int 294 posix_sock_set_recvbuf(struct spdk_sock *_sock, int sz) 295 { 296 struct spdk_posix_sock *sock = __posix_sock(_sock); 297 int rc; 298 299 assert(sock != NULL); 300 301 if (g_spdk_posix_sock_impl_opts.enable_recv_pipe) { 302 rc = posix_sock_alloc_pipe(sock, sz); 303 if (rc) { 304 return rc; 305 } 306 } 307 308 /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE */ 309 if (sz < MIN_SO_RCVBUF_SIZE) { 310 sz = MIN_SO_RCVBUF_SIZE; 311 } 312 313 rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); 314 if (rc < 0) { 315 return rc; 316 } 317 318 return 0; 319 } 320 321 static int 322 posix_sock_set_sendbuf(struct spdk_sock *_sock, int sz) 323 { 324 struct spdk_posix_sock *sock = __posix_sock(_sock); 325 int rc; 326 327 assert(sock != NULL); 328 329 if (sz < MIN_SO_SNDBUF_SIZE) { 330 sz = MIN_SO_SNDBUF_SIZE; 331 } 332 333 rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); 334 if (rc < 0) { 335 return rc; 336 } 337 338 return 0; 339 } 340 341 static void 342 posix_sock_init(struct spdk_posix_sock *sock, bool enable_zero_copy) 343 { 344 #if defined(SPDK_ZEROCOPY) || defined(__linux__) 345 int flag; 346 int rc; 347 #endif 348 349 #if defined(SPDK_ZEROCOPY) 350 flag = 1; 351 352 if (enable_zero_copy) { 353 /* Try to turn on zero copy sends */ 354 rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag)); 355 if (rc == 0) { 356 sock->zcopy = true; 357 } 358 } 359 #endif 360 361 #if defined(__linux__) 362 flag = 1; 363 364 if (g_spdk_posix_sock_impl_opts.enable_quickack) { 365 rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag)); 366 if (rc != 0) { 367 SPDK_ERRLOG("quickack was failed to set\n"); 368 } 369 } 370 371 spdk_sock_get_placement_id(sock->fd, g_spdk_posix_sock_impl_opts.enable_placement_id, 372 &sock->placement_id); 373 374 if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_MARK) { 375 /* Save placement_id */ 376 spdk_sock_map_insert(&g_map, sock->placement_id, NULL); 377 } 378 #endif 379 } 380 381 static struct spdk_posix_sock * 382 posix_sock_alloc(int fd, bool enable_zero_copy) 383 { 384 struct spdk_posix_sock *sock; 385 386 sock = calloc(1, sizeof(*sock)); 387 if (sock == NULL) { 388 SPDK_ERRLOG("sock allocation failed\n"); 389 return NULL; 390 } 391 392 sock->fd = fd; 393 posix_sock_init(sock, enable_zero_copy); 394 395 return sock; 396 } 397 398 static int 399 posix_fd_create(struct addrinfo *res, struct spdk_sock_opts *opts) 400 { 401 int fd; 402 int val = 1; 403 int rc, sz; 404 #if defined(__linux__) 405 int to; 406 #endif 407 408 fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); 409 if (fd < 0) { 410 /* error */ 411 return -1; 412 } 413 414 sz = g_spdk_posix_sock_impl_opts.recv_buf_size; 415 rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); 416 if (rc) { 417 /* Not fatal */ 418 } 419 420 sz = g_spdk_posix_sock_impl_opts.send_buf_size; 421 rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); 422 if (rc) { 423 /* Not fatal */ 424 } 425 426 rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); 427 if (rc != 0) { 428 close(fd); 429 /* error */ 430 return -1; 431 } 432 rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val); 433 if (rc != 0) { 434 close(fd); 435 /* error */ 436 
return -1; 437 } 438 439 #if defined(SO_PRIORITY) 440 if (opts->priority) { 441 rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val); 442 if (rc != 0) { 443 close(fd); 444 /* error */ 445 return -1; 446 } 447 } 448 #endif 449 450 if (res->ai_family == AF_INET6) { 451 rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val); 452 if (rc != 0) { 453 close(fd); 454 /* error */ 455 return -1; 456 } 457 } 458 459 if (opts->ack_timeout) { 460 #if defined(__linux__) 461 to = opts->ack_timeout; 462 rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &to, sizeof(to)); 463 if (rc != 0) { 464 close(fd); 465 /* error */ 466 return -1; 467 } 468 #else 469 SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n"); 470 #endif 471 } 472 473 return fd; 474 } 475 476 #define PSK_ID "nqn.2014-08.org.nvmexpress:uuid:f81d4fae-7dec-11d0-a765-00a0c91e6bf6" 477 #define PSK_KEY "1234567890ABCDEF" 478 479 static unsigned int 480 posix_sock_tls_psk_server_cb(SSL *ssl, 481 const char *id, 482 unsigned char *psk, 483 unsigned int max_psk_len) 484 { 485 long key_len; 486 unsigned char *default_psk; 487 488 if (PSK_KEY == NULL) { 489 SPDK_ERRLOG("PSK is not set\n"); 490 goto err; 491 } 492 SPDK_DEBUGLOG(sock_posix, "Length of Client's PSK ID %lu\n", strlen(PSK_ID)); 493 if (id == NULL) { 494 SPDK_ERRLOG("Received empty PSK ID\n"); 495 goto err; 496 } 497 SPDK_DEBUGLOG(sock_posix, "Received PSK ID '%s'\n", id); 498 if (strcmp(PSK_ID, id) != 0) { 499 SPDK_ERRLOG("Unknown Client's PSK ID\n"); 500 goto err; 501 } 502 503 SPDK_DEBUGLOG(sock_posix, "Length of Client's PSK KEY %u\n", max_psk_len); 504 default_psk = OPENSSL_hexstr2buf(PSK_KEY, &key_len); 505 if (default_psk == NULL) { 506 SPDK_ERRLOG("Could not unhexlify PSK\n"); 507 goto err; 508 } 509 if (key_len > max_psk_len) { 510 SPDK_ERRLOG("Insufficient buffer size to copy PSK\n"); 511 goto err; 512 } 513 514 memcpy(psk, default_psk, key_len); 515 516 return key_len; 517 518 err: 519 return 0; 520 } 521 522 static unsigned int 523 posix_sock_tls_psk_client_cb(SSL *ssl, const char *hint, 524 char *identity, 525 unsigned int max_identity_len, 526 unsigned char *psk, 527 unsigned int max_psk_len) 528 { 529 long key_len; 530 unsigned char *default_psk; 531 532 if (hint) { 533 SPDK_DEBUGLOG(sock_posix, "Received PSK identity hint '%s'\n", hint); 534 } 535 536 if (PSK_KEY == NULL) { 537 SPDK_ERRLOG("PSK is not set\n"); 538 goto err; 539 } 540 default_psk = OPENSSL_hexstr2buf(PSK_KEY, &key_len); 541 if (default_psk == NULL) { 542 SPDK_ERRLOG("Could not unhexlify PSK\n"); 543 goto err; 544 } 545 if ((strlen(PSK_ID) + 1 > max_identity_len) 546 || (key_len > max_psk_len)) { 547 SPDK_ERRLOG("PSK ID or Key buffer is not sufficient\n"); 548 goto err; 549 } 550 spdk_strcpy_pad(identity, PSK_ID, strlen(PSK_ID), 0); 551 SPDK_DEBUGLOG(sock_posix, "Sending PSK identity '%s'\n", identity); 552 553 memcpy(psk, default_psk, key_len); 554 SPDK_DEBUGLOG(sock_posix, "Provided out-of-band (OOB) PSK for TLS1.3 client\n"); 555 556 return key_len; 557 558 err: 559 return 0; 560 } 561 562 static SSL_CTX * 563 posix_sock_create_ssl_context(const SSL_METHOD *method, struct spdk_sock_opts *opts) 564 { 565 SSL_CTX *ctx; 566 int tls_version = 0; 567 bool ktls_enabled = false; 568 #ifdef SSL_OP_ENABLE_KTLS 569 long options; 570 #endif 571 572 SSL_library_init(); 573 OpenSSL_add_all_algorithms(); 574 SSL_load_error_strings(); 575 /* Produce a SSL CTX in SSL V2 and V3 standards compliant way */ 576 ctx = SSL_CTX_new(method); 577 if (!ctx) { 578 SPDK_ERRLOG("SSL_CTX_new() failed, msg = 
%s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 579 return NULL; 580 } 581 SPDK_DEBUGLOG(sock_posix, "SSL context created\n"); 582 583 switch (opts->tls_version) { 584 case 0: 585 /* auto-negotioation */ 586 break; 587 case SPDK_TLS_VERSION_1_1: 588 tls_version = TLS1_1_VERSION; 589 break; 590 case SPDK_TLS_VERSION_1_2: 591 tls_version = TLS1_2_VERSION; 592 break; 593 case SPDK_TLS_VERSION_1_3: 594 tls_version = TLS1_3_VERSION; 595 break; 596 default: 597 SPDK_ERRLOG("Incorrect TLS version provided: %d\n", opts->tls_version); 598 goto err; 599 } 600 601 if (tls_version) { 602 SPDK_DEBUGLOG(sock_posix, "Hardening TLS version to '%d'='0x%X'\n", opts->tls_version, tls_version); 603 if (!SSL_CTX_set_min_proto_version(ctx, tls_version)) { 604 SPDK_ERRLOG("Unable to set Min TLS version to '%d'='0x%X\n", opts->tls_version, tls_version); 605 goto err; 606 } 607 if (!SSL_CTX_set_max_proto_version(ctx, tls_version)) { 608 SPDK_ERRLOG("Unable to set Max TLS version to '%d'='0x%X\n", opts->tls_version, tls_version); 609 goto err; 610 } 611 } 612 if (opts->ktls) { 613 SPDK_DEBUGLOG(sock_posix, "Enabling kTLS offload\n"); 614 #ifdef SSL_OP_ENABLE_KTLS 615 options = SSL_CTX_set_options(ctx, SSL_OP_ENABLE_KTLS); 616 ktls_enabled = options & SSL_OP_ENABLE_KTLS; 617 #else 618 ktls_enabled = false; 619 #endif 620 if (!ktls_enabled) { 621 SPDK_ERRLOG("Unable to set kTLS offload via SSL_CTX_set_options(). Configure openssl with 'enable-ktls'\n"); 622 goto err; 623 } 624 } 625 626 return ctx; 627 628 err: 629 SSL_CTX_free(ctx); 630 return NULL; 631 } 632 633 static SSL * 634 ssl_sock_connect_loop(SSL_CTX *ctx, int fd) 635 { 636 int rc; 637 SSL *ssl; 638 int ssl_get_error; 639 640 ssl = SSL_new(ctx); 641 if (!ssl) { 642 SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 643 return NULL; 644 } 645 SSL_set_fd(ssl, fd); 646 SSL_set_psk_client_callback(ssl, posix_sock_tls_psk_client_cb); 647 SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); 648 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 649 while ((rc = SSL_connect(ssl)) != 1) { 650 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 651 ssl_get_error = SSL_get_error(ssl, rc); 652 SPDK_DEBUGLOG(sock_posix, "SSL_connect failed %d = SSL_connect(%p), %d = SSL_get_error(%p, %d)\n", 653 rc, ssl, ssl_get_error, ssl, rc); 654 switch (ssl_get_error) { 655 case SSL_ERROR_WANT_READ: 656 case SSL_ERROR_WANT_WRITE: 657 continue; 658 default: 659 break; 660 } 661 SPDK_ERRLOG("SSL_connect() failed, errno = %d\n", errno); 662 SSL_free(ssl); 663 return NULL; 664 } 665 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 666 SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", 667 SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); 668 return ssl; 669 } 670 671 static SSL * 672 ssl_sock_accept_loop(SSL_CTX *ctx, int fd) 673 { 674 int rc; 675 SSL *ssl; 676 int ssl_get_error; 677 678 ssl = SSL_new(ctx); 679 if (!ssl) { 680 SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); 681 return NULL; 682 } 683 SSL_set_fd(ssl, fd); 684 SSL_set_psk_server_callback(ssl, posix_sock_tls_psk_server_cb); 685 SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); 686 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 687 while ((rc = SSL_accept(ssl)) != 1) { 688 SPDK_DEBUGLOG(sock_posix, "%s = 
SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 689 ssl_get_error = SSL_get_error(ssl, rc); 690 SPDK_DEBUGLOG(sock_posix, "SSL_accept failed %d = SSL_accept(%p), %d = SSL_get_error(%p, %d)\n", rc, 691 ssl, ssl_get_error, ssl, rc); 692 switch (ssl_get_error) { 693 case SSL_ERROR_WANT_READ: 694 case SSL_ERROR_WANT_WRITE: 695 continue; 696 default: 697 break; 698 } 699 SPDK_ERRLOG("SSL_accept() failed, errno = %d\n", errno); 700 SSL_free(ssl); 701 return NULL; 702 } 703 SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); 704 SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", 705 SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); 706 return ssl; 707 } 708 709 static ssize_t 710 SSL_readv(SSL *ssl, const struct iovec *iov, int iovcnt) 711 { 712 int i, rc = 0; 713 ssize_t total = 0; 714 715 for (i = 0; i < iovcnt; i++) { 716 rc = SSL_read(ssl, iov[i].iov_base, iov[i].iov_len); 717 718 if (rc > 0) { 719 total += rc; 720 } 721 if (rc != (int)iov[i].iov_len) { 722 break; 723 } 724 } 725 if (total > 0) { 726 errno = 0; 727 return total; 728 } 729 switch (SSL_get_error(ssl, rc)) { 730 case SSL_ERROR_ZERO_RETURN: 731 errno = ENOTCONN; 732 return 0; 733 case SSL_ERROR_WANT_READ: 734 case SSL_ERROR_WANT_WRITE: 735 case SSL_ERROR_WANT_CONNECT: 736 case SSL_ERROR_WANT_ACCEPT: 737 case SSL_ERROR_WANT_X509_LOOKUP: 738 case SSL_ERROR_WANT_ASYNC: 739 case SSL_ERROR_WANT_ASYNC_JOB: 740 case SSL_ERROR_WANT_CLIENT_HELLO_CB: 741 errno = EAGAIN; 742 return -1; 743 case SSL_ERROR_SYSCALL: 744 case SSL_ERROR_SSL: 745 errno = ENOTCONN; 746 return -1; 747 default: 748 errno = ENOTCONN; 749 return -1; 750 } 751 } 752 753 static ssize_t 754 SSL_writev(SSL *ssl, struct iovec *iov, int iovcnt) 755 { 756 int i, rc = 0; 757 ssize_t total = 0; 758 759 for (i = 0; i < iovcnt; i++) { 760 rc = SSL_write(ssl, iov[i].iov_base, iov[i].iov_len); 761 762 if (rc > 0) { 763 total += rc; 764 } 765 if (rc != (int)iov[i].iov_len) { 766 break; 767 } 768 } 769 if (total > 0) { 770 errno = 0; 771 return total; 772 } 773 switch (SSL_get_error(ssl, rc)) { 774 case SSL_ERROR_ZERO_RETURN: 775 errno = ENOTCONN; 776 return 0; 777 case SSL_ERROR_WANT_READ: 778 case SSL_ERROR_WANT_WRITE: 779 case SSL_ERROR_WANT_CONNECT: 780 case SSL_ERROR_WANT_ACCEPT: 781 case SSL_ERROR_WANT_X509_LOOKUP: 782 case SSL_ERROR_WANT_ASYNC: 783 case SSL_ERROR_WANT_ASYNC_JOB: 784 case SSL_ERROR_WANT_CLIENT_HELLO_CB: 785 errno = EAGAIN; 786 return -1; 787 case SSL_ERROR_SYSCALL: 788 case SSL_ERROR_SSL: 789 errno = ENOTCONN; 790 return -1; 791 default: 792 errno = ENOTCONN; 793 return -1; 794 } 795 } 796 797 static struct spdk_sock * 798 posix_sock_create(const char *ip, int port, 799 enum posix_sock_create_type type, 800 struct spdk_sock_opts *opts, 801 bool enable_ssl) 802 { 803 struct spdk_posix_sock *sock; 804 char buf[MAX_TMPBUF]; 805 char portnum[PORTNUMLEN]; 806 char *p; 807 struct addrinfo hints, *res, *res0; 808 int fd, flag; 809 int rc; 810 bool enable_zcopy_user_opts = true; 811 bool enable_zcopy_impl_opts = true; 812 SSL_CTX *ctx = 0; 813 SSL *ssl = 0; 814 815 assert(opts != NULL); 816 817 if (ip == NULL) { 818 return NULL; 819 } 820 if (ip[0] == '[') { 821 snprintf(buf, sizeof(buf), "%s", ip + 1); 822 p = strchr(buf, ']'); 823 if (p != NULL) { 824 *p = '\0'; 825 } 826 ip = (const char *) &buf[0]; 827 } 828 829 snprintf(portnum, sizeof portnum, "%d", port); 830 memset(&hints, 0, sizeof hints); 831 hints.ai_family = PF_UNSPEC; 832 hints.ai_socktype = SOCK_STREAM; 833 hints.ai_flags = 
AI_NUMERICSERV; 834 hints.ai_flags |= AI_PASSIVE; 835 hints.ai_flags |= AI_NUMERICHOST; 836 rc = getaddrinfo(ip, portnum, &hints, &res0); 837 if (rc != 0) { 838 SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc); 839 return NULL; 840 } 841 842 /* try listen */ 843 fd = -1; 844 for (res = res0; res != NULL; res = res->ai_next) { 845 retry: 846 fd = posix_fd_create(res, opts); 847 if (fd < 0) { 848 continue; 849 } 850 if (type == SPDK_SOCK_CREATE_LISTEN) { 851 if (enable_ssl) { 852 ctx = posix_sock_create_ssl_context(TLS_server_method(), opts); 853 if (!ctx) { 854 SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); 855 close(fd); 856 fd = -1; 857 break; 858 } 859 } 860 rc = bind(fd, res->ai_addr, res->ai_addrlen); 861 if (rc != 0) { 862 SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno); 863 switch (errno) { 864 case EINTR: 865 /* interrupted? */ 866 close(fd); 867 goto retry; 868 case EADDRNOTAVAIL: 869 SPDK_ERRLOG("IP address %s not available. " 870 "Verify IP address in config file " 871 "and make sure setup script is " 872 "run before starting spdk app.\n", ip); 873 /* FALLTHROUGH */ 874 default: 875 /* try next family */ 876 close(fd); 877 fd = -1; 878 continue; 879 } 880 } 881 /* bind OK */ 882 rc = listen(fd, 512); 883 if (rc != 0) { 884 SPDK_ERRLOG("listen() failed, errno = %d\n", errno); 885 close(fd); 886 fd = -1; 887 break; 888 } 889 enable_zcopy_impl_opts = g_spdk_posix_sock_impl_opts.enable_zerocopy_send_server; 890 } else if (type == SPDK_SOCK_CREATE_CONNECT) { 891 rc = connect(fd, res->ai_addr, res->ai_addrlen); 892 if (rc != 0) { 893 SPDK_ERRLOG("connect() failed, errno = %d\n", errno); 894 /* try next family */ 895 close(fd); 896 fd = -1; 897 continue; 898 } 899 enable_zcopy_impl_opts = g_spdk_posix_sock_impl_opts.enable_zerocopy_send_client; 900 if (enable_ssl) { 901 ctx = posix_sock_create_ssl_context(TLS_client_method(), opts); 902 if (!ctx) { 903 SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); 904 close(fd); 905 fd = -1; 906 break; 907 } 908 ssl = ssl_sock_connect_loop(ctx, fd); 909 if (!ssl) { 910 SPDK_ERRLOG("ssl_sock_connect_loop() failed, errno = %d\n", errno); 911 close(fd); 912 fd = -1; 913 SSL_CTX_free(ctx); 914 break; 915 } 916 } 917 } 918 919 flag = fcntl(fd, F_GETFL); 920 if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) { 921 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); 922 close(fd); 923 fd = -1; 924 break; 925 } 926 break; 927 } 928 freeaddrinfo(res0); 929 930 if (fd < 0) { 931 return NULL; 932 } 933 934 /* Only enable zero copy for non-loopback and non-ssl sockets. 
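	 * MSG_ZEROCOPY over loopback generally degrades to a deferred copy in the
	 * kernel, and the SSL path transmits through SSL_writev() rather than
	 * sendmsg(), so the flag could never be applied there.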
*/ 935 enable_zcopy_user_opts = opts->zcopy && !sock_is_loopback(fd) && !enable_ssl; 936 937 sock = posix_sock_alloc(fd, enable_zcopy_user_opts && enable_zcopy_impl_opts); 938 if (sock == NULL) { 939 SPDK_ERRLOG("sock allocation failed\n"); 940 close(fd); 941 return NULL; 942 } 943 944 if (ctx) { 945 sock->ctx = ctx; 946 } 947 948 if (ssl) { 949 sock->ssl = ssl; 950 } 951 952 return &sock->base; 953 } 954 955 static struct spdk_sock * 956 posix_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) 957 { 958 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, false); 959 } 960 961 static struct spdk_sock * 962 posix_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) 963 { 964 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, false); 965 } 966 967 static struct spdk_sock * 968 posix_sock_accept(struct spdk_sock *_sock) 969 { 970 struct spdk_posix_sock *sock = __posix_sock(_sock); 971 struct sockaddr_storage sa; 972 socklen_t salen; 973 int rc, fd; 974 struct spdk_posix_sock *new_sock; 975 int flag; 976 SSL *ssl = 0; 977 978 memset(&sa, 0, sizeof(sa)); 979 salen = sizeof(sa); 980 981 assert(sock != NULL); 982 983 rc = accept(sock->fd, (struct sockaddr *)&sa, &salen); 984 985 if (rc == -1) { 986 return NULL; 987 } 988 989 fd = rc; 990 991 flag = fcntl(fd, F_GETFL); 992 if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) { 993 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); 994 close(fd); 995 return NULL; 996 } 997 998 #if defined(SO_PRIORITY) 999 /* The priority is not inherited, so call this function again */ 1000 if (sock->base.opts.priority) { 1001 rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int)); 1002 if (rc != 0) { 1003 close(fd); 1004 return NULL; 1005 } 1006 } 1007 #endif 1008 1009 /* Establish SSL connection */ 1010 if (sock->ctx) { 1011 ssl = ssl_sock_accept_loop(sock->ctx, fd); 1012 if (!ssl) { 1013 SPDK_ERRLOG("ssl_sock_accept_loop() failed, errno = %d\n", errno); 1014 close(fd); 1015 SSL_CTX_free(sock->ctx); 1016 return NULL; 1017 } 1018 } 1019 1020 /* Inherit the zero copy feature from the listen socket */ 1021 new_sock = posix_sock_alloc(fd, sock->zcopy); 1022 if (new_sock == NULL) { 1023 close(fd); 1024 return NULL; 1025 } 1026 1027 if (sock->ctx) { 1028 new_sock->ctx = sock->ctx; 1029 } 1030 1031 if (ssl) { 1032 new_sock->ssl = ssl; 1033 } 1034 1035 return &new_sock->base; 1036 } 1037 1038 static int 1039 posix_sock_close(struct spdk_sock *_sock) 1040 { 1041 struct spdk_posix_sock *sock = __posix_sock(_sock); 1042 1043 assert(TAILQ_EMPTY(&_sock->pending_reqs)); 1044 1045 /* If the socket fails to close, the best choice is to 1046 * leak the fd but continue to free the rest of the sock 1047 * memory. 
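	 * Retrying close() (e.g. on EINTR) is not safe either: on Linux the
	 * descriptor may already have been released and reused by another thread.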
*/ 1048 close(sock->fd); 1049 1050 spdk_pipe_destroy(sock->recv_pipe); 1051 free(sock->recv_buf); 1052 free(sock); 1053 1054 return 0; 1055 } 1056 1057 #ifdef SPDK_ZEROCOPY 1058 static int 1059 _sock_check_zcopy(struct spdk_sock *sock) 1060 { 1061 struct spdk_posix_sock *psock = __posix_sock(sock); 1062 struct msghdr msgh = {}; 1063 uint8_t buf[sizeof(struct cmsghdr) + sizeof(struct sock_extended_err)]; 1064 ssize_t rc; 1065 struct sock_extended_err *serr; 1066 struct cmsghdr *cm; 1067 uint32_t idx; 1068 struct spdk_sock_request *req, *treq; 1069 bool found; 1070 1071 msgh.msg_control = buf; 1072 msgh.msg_controllen = sizeof(buf); 1073 1074 while (true) { 1075 rc = recvmsg(psock->fd, &msgh, MSG_ERRQUEUE); 1076 1077 if (rc < 0) { 1078 if (errno == EWOULDBLOCK || errno == EAGAIN) { 1079 return 0; 1080 } 1081 1082 if (!TAILQ_EMPTY(&sock->pending_reqs)) { 1083 SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries\n"); 1084 } else { 1085 SPDK_WARNLOG("Recvmsg yielded an error!\n"); 1086 } 1087 return 0; 1088 } 1089 1090 cm = CMSG_FIRSTHDR(&msgh); 1091 if (!(cm && 1092 ((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || 1093 (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR)))) { 1094 SPDK_WARNLOG("Unexpected cmsg level or type!\n"); 1095 return 0; 1096 } 1097 1098 serr = (struct sock_extended_err *)CMSG_DATA(cm); 1099 if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) { 1100 SPDK_WARNLOG("Unexpected extended error origin\n"); 1101 return 0; 1102 } 1103 1104 /* Most of the time, the pending_reqs array is in the exact 1105 * order we need such that all of the requests to complete are 1106 * in order, in the front. It is guaranteed that all requests 1107 * belonging to the same sendmsg call are sequential, so once 1108 * we encounter one match we can stop looping as soon as a 1109 * non-match is found. 1110 */ 1111 for (idx = serr->ee_info; idx <= serr->ee_data; idx++) { 1112 found = false; 1113 TAILQ_FOREACH_SAFE(req, &sock->pending_reqs, internal.link, treq) { 1114 if (!req->internal.is_zcopy) { 1115 /* This wasn't a zcopy request. 
It was just waiting in line to complete */ 1116 rc = spdk_sock_request_put(sock, req, 0); 1117 if (rc < 0) { 1118 return rc; 1119 } 1120 } else if (req->internal.offset == idx) { 1121 found = true; 1122 rc = spdk_sock_request_put(sock, req, 0); 1123 if (rc < 0) { 1124 return rc; 1125 } 1126 } else if (found) { 1127 break; 1128 } 1129 } 1130 } 1131 } 1132 1133 return 0; 1134 } 1135 #endif 1136 1137 static int 1138 _sock_flush(struct spdk_sock *sock) 1139 { 1140 struct spdk_posix_sock *psock = __posix_sock(sock); 1141 struct msghdr msg = {}; 1142 int flags; 1143 struct iovec iovs[IOV_BATCH_SIZE]; 1144 int iovcnt; 1145 int retval; 1146 struct spdk_sock_request *req; 1147 int i; 1148 ssize_t rc; 1149 unsigned int offset; 1150 size_t len; 1151 bool is_zcopy = false; 1152 1153 /* Can't flush from within a callback or we end up with recursive calls */ 1154 if (sock->cb_cnt > 0) { 1155 return 0; 1156 } 1157 1158 #ifdef SPDK_ZEROCOPY 1159 if (psock->zcopy) { 1160 flags = MSG_ZEROCOPY | MSG_NOSIGNAL; 1161 } else 1162 #endif 1163 { 1164 flags = MSG_NOSIGNAL; 1165 } 1166 1167 iovcnt = spdk_sock_prep_reqs(sock, iovs, 0, NULL, &flags); 1168 if (iovcnt == 0) { 1169 return 0; 1170 } 1171 1172 #ifdef SPDK_ZEROCOPY 1173 is_zcopy = flags & MSG_ZEROCOPY; 1174 #endif 1175 1176 /* Perform the vectored write */ 1177 msg.msg_iov = iovs; 1178 msg.msg_iovlen = iovcnt; 1179 1180 if (psock->ssl) { 1181 rc = SSL_writev(psock->ssl, iovs, iovcnt); 1182 } else { 1183 rc = sendmsg(psock->fd, &msg, flags); 1184 } 1185 if (rc <= 0) { 1186 if (errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && psock->zcopy)) { 1187 return 0; 1188 } 1189 return rc; 1190 } 1191 1192 if (is_zcopy) { 1193 /* Handling overflow case, because we use psock->sendmsg_idx - 1 for the 1194 * req->internal.offset, so sendmsg_idx should not be zero */ 1195 if (spdk_unlikely(psock->sendmsg_idx == UINT32_MAX)) { 1196 psock->sendmsg_idx = 1; 1197 } else { 1198 psock->sendmsg_idx++; 1199 } 1200 } 1201 1202 /* Consume the requests that were actually written */ 1203 req = TAILQ_FIRST(&sock->queued_reqs); 1204 while (req) { 1205 offset = req->internal.offset; 1206 1207 /* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */ 1208 req->internal.is_zcopy = is_zcopy; 1209 1210 for (i = 0; i < req->iovcnt; i++) { 1211 /* Advance by the offset first */ 1212 if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) { 1213 offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len; 1214 continue; 1215 } 1216 1217 /* Calculate the remaining length of this element */ 1218 len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset; 1219 1220 if (len > (size_t)rc) { 1221 /* This element was partially sent. */ 1222 req->internal.offset += rc; 1223 return 0; 1224 } 1225 1226 offset = 0; 1227 req->internal.offset += len; 1228 rc -= len; 1229 } 1230 1231 /* Handled a full request. */ 1232 spdk_sock_request_pend(sock, req); 1233 1234 if (!req->internal.is_zcopy && req == TAILQ_FIRST(&sock->pending_reqs)) { 1235 /* The sendmsg syscall above isn't currently asynchronous, 1236 * so it's already done. */ 1237 retval = spdk_sock_request_put(sock, req, 0); 1238 if (retval) { 1239 break; 1240 } 1241 } else { 1242 /* Re-use the offset field to hold the sendmsg call index. The 1243 * index is 0 based, so subtract one here because we've already 1244 * incremented above. 
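	 * _sock_check_zcopy() later matches this index against the
	 * [ee_info, ee_data] range reported in the MSG_ZEROCOPY notification read
	 * from the socket's error queue to find the requests to complete.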
*/ 1245 req->internal.offset = psock->sendmsg_idx - 1; 1246 } 1247 1248 if (rc == 0) { 1249 break; 1250 } 1251 1252 req = TAILQ_FIRST(&sock->queued_reqs); 1253 } 1254 1255 return 0; 1256 } 1257 1258 static int 1259 posix_sock_flush(struct spdk_sock *sock) 1260 { 1261 #ifdef SPDK_ZEROCOPY 1262 struct spdk_posix_sock *psock = __posix_sock(sock); 1263 1264 if (psock->zcopy && !TAILQ_EMPTY(&sock->pending_reqs)) { 1265 _sock_check_zcopy(sock); 1266 } 1267 #endif 1268 1269 return _sock_flush(sock); 1270 } 1271 1272 static ssize_t 1273 posix_sock_recv_from_pipe(struct spdk_posix_sock *sock, struct iovec *diov, int diovcnt) 1274 { 1275 struct iovec siov[2]; 1276 int sbytes; 1277 ssize_t bytes; 1278 struct spdk_posix_sock_group_impl *group; 1279 1280 sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); 1281 if (sbytes < 0) { 1282 errno = EINVAL; 1283 return -1; 1284 } else if (sbytes == 0) { 1285 errno = EAGAIN; 1286 return -1; 1287 } 1288 1289 bytes = spdk_iovcpy(siov, 2, diov, diovcnt); 1290 1291 if (bytes == 0) { 1292 /* The only way this happens is if diov is 0 length */ 1293 errno = EINVAL; 1294 return -1; 1295 } 1296 1297 spdk_pipe_reader_advance(sock->recv_pipe, bytes); 1298 1299 /* If we drained the pipe, mark it appropriately */ 1300 if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) { 1301 assert(sock->pipe_has_data == true); 1302 1303 group = __posix_group_impl(sock->base.group_impl); 1304 if (group && !sock->socket_has_data) { 1305 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1306 } 1307 1308 sock->pipe_has_data = false; 1309 } 1310 1311 return bytes; 1312 } 1313 1314 static inline ssize_t 1315 posix_sock_read(struct spdk_posix_sock *sock) 1316 { 1317 struct iovec iov[2]; 1318 int bytes_avail, bytes_recvd; 1319 struct spdk_posix_sock_group_impl *group; 1320 1321 bytes_avail = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov); 1322 1323 if (bytes_avail <= 0) { 1324 return bytes_avail; 1325 } 1326 1327 if (sock->ssl) { 1328 bytes_recvd = SSL_readv(sock->ssl, iov, 2); 1329 } else { 1330 bytes_recvd = readv(sock->fd, iov, 2); 1331 } 1332 1333 assert(sock->pipe_has_data == false); 1334 1335 if (bytes_recvd <= 0) { 1336 /* Errors count as draining the socket data */ 1337 if (sock->base.group_impl && sock->socket_has_data) { 1338 group = __posix_group_impl(sock->base.group_impl); 1339 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1340 } 1341 1342 sock->socket_has_data = false; 1343 1344 return bytes_recvd; 1345 } 1346 1347 spdk_pipe_writer_advance(sock->recv_pipe, bytes_recvd); 1348 1349 #if DEBUG 1350 if (sock->base.group_impl) { 1351 assert(sock->socket_has_data == true); 1352 } 1353 #endif 1354 1355 sock->pipe_has_data = true; 1356 if (bytes_recvd < bytes_avail) { 1357 /* We drained the kernel socket entirely. 
*/ 1358 sock->socket_has_data = false; 1359 } 1360 1361 return bytes_recvd; 1362 } 1363 1364 static ssize_t 1365 posix_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) 1366 { 1367 struct spdk_posix_sock *sock = __posix_sock(_sock); 1368 struct spdk_posix_sock_group_impl *group = __posix_group_impl(sock->base.group_impl); 1369 int rc, i; 1370 size_t len; 1371 1372 if (sock->recv_pipe == NULL) { 1373 assert(sock->pipe_has_data == false); 1374 if (group && sock->socket_has_data) { 1375 sock->socket_has_data = false; 1376 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1377 } 1378 if (sock->ssl) { 1379 return SSL_readv(sock->ssl, iov, iovcnt); 1380 } else { 1381 return readv(sock->fd, iov, iovcnt); 1382 } 1383 } 1384 1385 /* If the socket is not in a group, we must assume it always has 1386 * data waiting for us because it is not epolled */ 1387 if (!sock->pipe_has_data && (group == NULL || sock->socket_has_data)) { 1388 /* If the user is receiving a sufficiently large amount of data, 1389 * receive directly to their buffers. */ 1390 len = 0; 1391 for (i = 0; i < iovcnt; i++) { 1392 len += iov[i].iov_len; 1393 } 1394 1395 if (len >= MIN_SOCK_PIPE_SIZE) { 1396 /* TODO: Should this detect if kernel socket is drained? */ 1397 if (sock->ssl) { 1398 return SSL_readv(sock->ssl, iov, iovcnt); 1399 } else { 1400 return readv(sock->fd, iov, iovcnt); 1401 } 1402 } 1403 1404 /* Otherwise, do a big read into our pipe */ 1405 rc = posix_sock_read(sock); 1406 if (rc <= 0) { 1407 return rc; 1408 } 1409 } 1410 1411 return posix_sock_recv_from_pipe(sock, iov, iovcnt); 1412 } 1413 1414 static ssize_t 1415 posix_sock_recv(struct spdk_sock *sock, void *buf, size_t len) 1416 { 1417 struct iovec iov[1]; 1418 1419 iov[0].iov_base = buf; 1420 iov[0].iov_len = len; 1421 1422 return posix_sock_readv(sock, iov, 1); 1423 } 1424 1425 static void 1426 posix_sock_readv_async(struct spdk_sock *sock, struct spdk_sock_request *req) 1427 { 1428 req->cb_fn(req->cb_arg, -ENOTSUP); 1429 } 1430 1431 static ssize_t 1432 posix_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) 1433 { 1434 struct spdk_posix_sock *sock = __posix_sock(_sock); 1435 int rc; 1436 1437 /* In order to process a writev, we need to flush any asynchronous writes 1438 * first. */ 1439 rc = _sock_flush(_sock); 1440 if (rc < 0) { 1441 return rc; 1442 } 1443 1444 if (!TAILQ_EMPTY(&_sock->queued_reqs)) { 1445 /* We weren't able to flush all requests */ 1446 errno = EAGAIN; 1447 return -1; 1448 } 1449 1450 if (sock->ssl) { 1451 return SSL_writev(sock->ssl, iov, iovcnt); 1452 } else { 1453 return writev(sock->fd, iov, iovcnt); 1454 } 1455 } 1456 1457 static void 1458 posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req) 1459 { 1460 int rc; 1461 1462 spdk_sock_request_queue(sock, req); 1463 1464 /* If there are a sufficient number queued, just flush them out immediately. 
*/ 1465 if (sock->queued_iovcnt >= IOV_BATCH_SIZE) { 1466 rc = _sock_flush(sock); 1467 if (rc) { 1468 spdk_sock_abort_requests(sock); 1469 } 1470 } 1471 } 1472 1473 static int 1474 posix_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) 1475 { 1476 struct spdk_posix_sock *sock = __posix_sock(_sock); 1477 int val; 1478 int rc; 1479 1480 assert(sock != NULL); 1481 1482 val = nbytes; 1483 rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val); 1484 if (rc != 0) { 1485 return -1; 1486 } 1487 return 0; 1488 } 1489 1490 static bool 1491 posix_sock_is_ipv6(struct spdk_sock *_sock) 1492 { 1493 struct spdk_posix_sock *sock = __posix_sock(_sock); 1494 struct sockaddr_storage sa; 1495 socklen_t salen; 1496 int rc; 1497 1498 assert(sock != NULL); 1499 1500 memset(&sa, 0, sizeof sa); 1501 salen = sizeof sa; 1502 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 1503 if (rc != 0) { 1504 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 1505 return false; 1506 } 1507 1508 return (sa.ss_family == AF_INET6); 1509 } 1510 1511 static bool 1512 posix_sock_is_ipv4(struct spdk_sock *_sock) 1513 { 1514 struct spdk_posix_sock *sock = __posix_sock(_sock); 1515 struct sockaddr_storage sa; 1516 socklen_t salen; 1517 int rc; 1518 1519 assert(sock != NULL); 1520 1521 memset(&sa, 0, sizeof sa); 1522 salen = sizeof sa; 1523 rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); 1524 if (rc != 0) { 1525 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); 1526 return false; 1527 } 1528 1529 return (sa.ss_family == AF_INET); 1530 } 1531 1532 static bool 1533 posix_sock_is_connected(struct spdk_sock *_sock) 1534 { 1535 struct spdk_posix_sock *sock = __posix_sock(_sock); 1536 uint8_t byte; 1537 int rc; 1538 1539 rc = recv(sock->fd, &byte, 1, MSG_PEEK); 1540 if (rc == 0) { 1541 return false; 1542 } 1543 1544 if (rc < 0) { 1545 if (errno == EAGAIN || errno == EWOULDBLOCK) { 1546 return true; 1547 } 1548 1549 return false; 1550 } 1551 1552 return true; 1553 } 1554 1555 static struct spdk_sock_group_impl * 1556 posix_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint) 1557 { 1558 struct spdk_posix_sock *sock = __posix_sock(_sock); 1559 struct spdk_sock_group_impl *group_impl; 1560 1561 if (sock->placement_id != -1) { 1562 spdk_sock_map_lookup(&g_map, sock->placement_id, &group_impl, hint); 1563 return group_impl; 1564 } 1565 1566 return NULL; 1567 } 1568 1569 static struct spdk_sock_group_impl * 1570 posix_sock_group_impl_create(void) 1571 { 1572 struct spdk_posix_sock_group_impl *group_impl; 1573 int fd; 1574 1575 #if defined(SPDK_EPOLL) 1576 fd = epoll_create1(0); 1577 #elif defined(SPDK_KEVENT) 1578 fd = kqueue(); 1579 #endif 1580 if (fd == -1) { 1581 return NULL; 1582 } 1583 1584 group_impl = calloc(1, sizeof(*group_impl)); 1585 if (group_impl == NULL) { 1586 SPDK_ERRLOG("group_impl allocation failed\n"); 1587 close(fd); 1588 return NULL; 1589 } 1590 1591 group_impl->fd = fd; 1592 TAILQ_INIT(&group_impl->socks_with_data); 1593 group_impl->placement_id = -1; 1594 1595 if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) { 1596 spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base); 1597 group_impl->placement_id = spdk_env_get_current_core(); 1598 } 1599 1600 return &group_impl->base; 1601 } 1602 1603 static void 1604 posix_sock_mark(struct spdk_posix_sock_group_impl *group, struct spdk_posix_sock *sock, 1605 int placement_id) 1606 { 1607 #if defined(SO_MARK) 1608 int rc; 1609 1610 rc = 
setsockopt(sock->fd, SOL_SOCKET, SO_MARK, 1611 &placement_id, sizeof(placement_id)); 1612 if (rc != 0) { 1613 /* Not fatal */ 1614 SPDK_ERRLOG("Error setting SO_MARK\n"); 1615 return; 1616 } 1617 1618 rc = spdk_sock_map_insert(&g_map, placement_id, &group->base); 1619 if (rc != 0) { 1620 /* Not fatal */ 1621 SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc); 1622 return; 1623 } 1624 1625 sock->placement_id = placement_id; 1626 #endif 1627 } 1628 1629 static void 1630 posix_sock_update_mark(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1631 { 1632 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1633 1634 if (group->placement_id == -1) { 1635 group->placement_id = spdk_sock_map_find_free(&g_map); 1636 1637 /* If a free placement id is found, update existing sockets in this group */ 1638 if (group->placement_id != -1) { 1639 struct spdk_sock *sock, *tmp; 1640 1641 TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { 1642 posix_sock_mark(group, __posix_sock(sock), group->placement_id); 1643 } 1644 } 1645 } 1646 1647 if (group->placement_id != -1) { 1648 /* 1649 * group placement id is already determined for this poll group. 1650 * Mark socket with group's placement id. 1651 */ 1652 posix_sock_mark(group, __posix_sock(_sock), group->placement_id); 1653 } 1654 } 1655 1656 static int 1657 posix_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1658 { 1659 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1660 struct spdk_posix_sock *sock = __posix_sock(_sock); 1661 int rc; 1662 1663 #if defined(SPDK_EPOLL) 1664 struct epoll_event event; 1665 1666 memset(&event, 0, sizeof(event)); 1667 /* EPOLLERR is always on even if we don't set it, but be explicit for clarity */ 1668 event.events = EPOLLIN | EPOLLERR; 1669 event.data.ptr = sock; 1670 1671 rc = epoll_ctl(group->fd, EPOLL_CTL_ADD, sock->fd, &event); 1672 #elif defined(SPDK_KEVENT) 1673 struct kevent event; 1674 struct timespec ts = {0}; 1675 1676 EV_SET(&event, sock->fd, EVFILT_READ, EV_ADD, 0, 0, sock); 1677 1678 rc = kevent(group->fd, &event, 1, NULL, 0, &ts); 1679 #endif 1680 1681 if (rc != 0) { 1682 return rc; 1683 } 1684 1685 /* switched from another polling group due to scheduling */ 1686 if (spdk_unlikely(sock->recv_pipe != NULL && 1687 (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) { 1688 sock->pipe_has_data = true; 1689 sock->socket_has_data = false; 1690 TAILQ_INSERT_TAIL(&group->socks_with_data, sock, link); 1691 } 1692 1693 if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_MARK) { 1694 posix_sock_update_mark(_group, _sock); 1695 } else if (sock->placement_id != -1) { 1696 rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base); 1697 if (rc != 0) { 1698 SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc); 1699 /* Do not treat this as an error. The system will continue running. 
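	 * The socket is still polled through the group's epoll/kqueue fd; only the
	 * placement hint consumed by posix_sock_group_impl_get_optimal() is lost.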
*/ 1700 } 1701 } 1702 1703 return rc; 1704 } 1705 1706 static int 1707 posix_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) 1708 { 1709 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1710 struct spdk_posix_sock *sock = __posix_sock(_sock); 1711 int rc; 1712 1713 if (sock->pipe_has_data || sock->socket_has_data) { 1714 TAILQ_REMOVE(&group->socks_with_data, sock, link); 1715 sock->pipe_has_data = false; 1716 sock->socket_has_data = false; 1717 } 1718 1719 if (sock->placement_id != -1) { 1720 spdk_sock_map_release(&g_map, sock->placement_id); 1721 } 1722 1723 #if defined(SPDK_EPOLL) 1724 struct epoll_event event; 1725 1726 /* Event parameter is ignored but some old kernel version still require it. */ 1727 rc = epoll_ctl(group->fd, EPOLL_CTL_DEL, sock->fd, &event); 1728 #elif defined(SPDK_KEVENT) 1729 struct kevent event; 1730 struct timespec ts = {0}; 1731 1732 EV_SET(&event, sock->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); 1733 1734 rc = kevent(group->fd, &event, 1, NULL, 0, &ts); 1735 if (rc == 0 && event.flags & EV_ERROR) { 1736 rc = -1; 1737 errno = event.data; 1738 } 1739 #endif 1740 1741 spdk_sock_abort_requests(_sock); 1742 1743 return rc; 1744 } 1745 1746 static int 1747 posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, 1748 struct spdk_sock **socks) 1749 { 1750 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1751 struct spdk_sock *sock, *tmp; 1752 int num_events, i, rc; 1753 struct spdk_posix_sock *psock, *ptmp; 1754 #if defined(SPDK_EPOLL) 1755 struct epoll_event events[MAX_EVENTS_PER_POLL]; 1756 #elif defined(SPDK_KEVENT) 1757 struct kevent events[MAX_EVENTS_PER_POLL]; 1758 struct timespec ts = {0}; 1759 #endif 1760 1761 #ifdef SPDK_ZEROCOPY 1762 /* When all of the following conditions are met 1763 * - non-blocking socket 1764 * - zero copy is enabled 1765 * - interrupts suppressed (i.e. busy polling) 1766 * - the NIC tx queue is full at the time sendmsg() is called 1767 * - epoll_wait determines there is an EPOLLIN event for the socket 1768 * then we can get into a situation where data we've sent is queued 1769 * up in the kernel network stack, but interrupts have been suppressed 1770 * because other traffic is flowing so the kernel misses the signal 1771 * to flush the software tx queue. If there wasn't incoming data 1772 * pending on the socket, then epoll_wait would have been sufficient 1773 * to kick off the send operation, but since there is a pending event 1774 * epoll_wait does not trigger the necessary operation. 1775 * 1776 * We deal with this by checking for all of the above conditions and 1777 * additionally looking for EPOLLIN events that were not consumed from 1778 * the last poll loop. We take this to mean that the upper layer is 1779 * unable to consume them because it is blocked waiting for resources 1780 * to free up, and those resources are most likely freed in response 1781 * to a pending asynchronous write completing. 1782 * 1783 * Additionally, sockets that have the same placement_id actually share 1784 * an underlying hardware queue. That means polling one of them is 1785 * equivalent to polling all of them. As a quick mechanism to avoid 1786 * making extra poll() calls, stash the last placement_id during the loop 1787 * and only poll if it's not the same. 
The overwhelmingly common case 1788 * is that all sockets in this list have the same placement_id because 1789 * SPDK is intentionally grouping sockets by that value, so even 1790 * though this won't stop all extra calls to poll(), it's very fast 1791 * and will catch all of them in practice. 1792 */ 1793 int last_placement_id = -1; 1794 1795 TAILQ_FOREACH(psock, &group->socks_with_data, link) { 1796 if (psock->zcopy && psock->placement_id >= 0 && 1797 psock->placement_id != last_placement_id) { 1798 struct pollfd pfd = {psock->fd, POLLIN | POLLERR, 0}; 1799 1800 poll(&pfd, 1, 0); 1801 last_placement_id = psock->placement_id; 1802 } 1803 } 1804 #endif 1805 1806 /* This must be a TAILQ_FOREACH_SAFE because while flushing, 1807 * a completion callback could remove the sock from the 1808 * group. */ 1809 TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { 1810 rc = _sock_flush(sock); 1811 if (rc) { 1812 spdk_sock_abort_requests(sock); 1813 } 1814 } 1815 1816 assert(max_events > 0); 1817 1818 #if defined(SPDK_EPOLL) 1819 num_events = epoll_wait(group->fd, events, max_events, 0); 1820 #elif defined(SPDK_KEVENT) 1821 num_events = kevent(group->fd, NULL, 0, events, max_events, &ts); 1822 #endif 1823 1824 if (num_events == -1) { 1825 return -1; 1826 } else if (num_events == 0 && !TAILQ_EMPTY(&_group->socks)) { 1827 sock = TAILQ_FIRST(&_group->socks); 1828 psock = __posix_sock(sock); 1829 /* poll() is called here to busy poll the queue associated with 1830 * first socket in list and potentially reap incoming data. 1831 */ 1832 if (sock->opts.priority) { 1833 struct pollfd pfd = {0, 0, 0}; 1834 1835 pfd.fd = psock->fd; 1836 pfd.events = POLLIN | POLLERR; 1837 poll(&pfd, 1, 0); 1838 } 1839 } 1840 1841 for (i = 0; i < num_events; i++) { 1842 #if defined(SPDK_EPOLL) 1843 sock = events[i].data.ptr; 1844 psock = __posix_sock(sock); 1845 1846 #ifdef SPDK_ZEROCOPY 1847 if (events[i].events & EPOLLERR) { 1848 rc = _sock_check_zcopy(sock); 1849 /* If the socket was closed or removed from 1850 * the group in response to a send ack, don't 1851 * add it to the array here. */ 1852 if (rc || sock->cb_fn == NULL) { 1853 continue; 1854 } 1855 } 1856 #endif 1857 if ((events[i].events & EPOLLIN) == 0) { 1858 continue; 1859 } 1860 1861 #elif defined(SPDK_KEVENT) 1862 sock = events[i].udata; 1863 psock = __posix_sock(sock); 1864 #endif 1865 1866 /* If the socket is not already in the list, add it now */ 1867 if (!psock->socket_has_data && !psock->pipe_has_data) { 1868 TAILQ_INSERT_TAIL(&group->socks_with_data, psock, link); 1869 } 1870 psock->socket_has_data = true; 1871 } 1872 1873 num_events = 0; 1874 1875 TAILQ_FOREACH_SAFE(psock, &group->socks_with_data, link, ptmp) { 1876 if (num_events == max_events) { 1877 break; 1878 } 1879 1880 /* If the socket's cb_fn is NULL, just remove it from the 1881 * list and do not add it to socks array */ 1882 if (spdk_unlikely(psock->base.cb_fn == NULL)) { 1883 psock->socket_has_data = false; 1884 psock->pipe_has_data = false; 1885 TAILQ_REMOVE(&group->socks_with_data, psock, link); 1886 continue; 1887 } 1888 1889 socks[num_events++] = &psock->base; 1890 } 1891 1892 /* Cycle the has_data list so that each time we poll things aren't 1893 * in the same order. Say we have 6 sockets in the list, named as follows: 1894 * A B C D E F 1895 * And all 6 sockets had epoll events, but max_events is only 3. That means 1896 * psock currently points at D. 
We want to rearrange the list to the following: 1897 * D E F A B C 1898 * 1899 * The variables below are named according to this example to make it easier to 1900 * follow the swaps. 1901 */ 1902 if (psock != NULL) { 1903 struct spdk_posix_sock *pa, *pc, *pd, *pf; 1904 1905 /* Capture pointers to the elements we need */ 1906 pd = psock; 1907 pc = TAILQ_PREV(pd, spdk_has_data_list, link); 1908 pa = TAILQ_FIRST(&group->socks_with_data); 1909 pf = TAILQ_LAST(&group->socks_with_data, spdk_has_data_list); 1910 1911 /* Break the link between C and D */ 1912 pc->link.tqe_next = NULL; 1913 1914 /* Connect F to A */ 1915 pf->link.tqe_next = pa; 1916 pa->link.tqe_prev = &pf->link.tqe_next; 1917 1918 /* Fix up the list first/last pointers */ 1919 group->socks_with_data.tqh_first = pd; 1920 group->socks_with_data.tqh_last = &pc->link.tqe_next; 1921 1922 /* D is in front of the list, make tqe prev pointer point to the head of list */ 1923 pd->link.tqe_prev = &group->socks_with_data.tqh_first; 1924 } 1925 1926 return num_events; 1927 } 1928 1929 static int 1930 posix_sock_group_impl_close(struct spdk_sock_group_impl *_group) 1931 { 1932 struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); 1933 int rc; 1934 1935 if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) { 1936 spdk_sock_map_release(&g_map, spdk_env_get_current_core()); 1937 } 1938 1939 rc = close(group->fd); 1940 free(group); 1941 return rc; 1942 } 1943 1944 static struct spdk_net_impl g_posix_net_impl = { 1945 .name = "posix", 1946 .getaddr = posix_sock_getaddr, 1947 .connect = posix_sock_connect, 1948 .listen = posix_sock_listen, 1949 .accept = posix_sock_accept, 1950 .close = posix_sock_close, 1951 .recv = posix_sock_recv, 1952 .readv = posix_sock_readv, 1953 .readv_async = posix_sock_readv_async, 1954 .writev = posix_sock_writev, 1955 .writev_async = posix_sock_writev_async, 1956 .flush = posix_sock_flush, 1957 .set_recvlowat = posix_sock_set_recvlowat, 1958 .set_recvbuf = posix_sock_set_recvbuf, 1959 .set_sendbuf = posix_sock_set_sendbuf, 1960 .is_ipv6 = posix_sock_is_ipv6, 1961 .is_ipv4 = posix_sock_is_ipv4, 1962 .is_connected = posix_sock_is_connected, 1963 .group_impl_get_optimal = posix_sock_group_impl_get_optimal, 1964 .group_impl_create = posix_sock_group_impl_create, 1965 .group_impl_add_sock = posix_sock_group_impl_add_sock, 1966 .group_impl_remove_sock = posix_sock_group_impl_remove_sock, 1967 .group_impl_poll = posix_sock_group_impl_poll, 1968 .group_impl_close = posix_sock_group_impl_close, 1969 .get_opts = posix_sock_impl_get_opts, 1970 .set_opts = posix_sock_impl_set_opts, 1971 }; 1972 1973 SPDK_NET_IMPL_REGISTER(posix, &g_posix_net_impl, DEFAULT_SOCK_PRIORITY); 1974 1975 static struct spdk_sock * 1976 ssl_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) 1977 { 1978 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, true); 1979 } 1980 1981 static struct spdk_sock * 1982 ssl_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) 1983 { 1984 return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, true); 1985 } 1986 1987 static struct spdk_net_impl g_ssl_net_impl = { 1988 .name = "ssl", 1989 .getaddr = posix_sock_getaddr, 1990 .connect = ssl_sock_connect, 1991 .listen = ssl_sock_listen, 1992 .accept = posix_sock_accept, 1993 .close = posix_sock_close, 1994 .recv = posix_sock_recv, 1995 .readv = posix_sock_readv, 1996 .writev = posix_sock_writev, 1997 .writev_async = posix_sock_writev_async, 1998 .flush = posix_sock_flush, 1999 
.set_recvlowat = posix_sock_set_recvlowat, 2000 .set_recvbuf = posix_sock_set_recvbuf, 2001 .set_sendbuf = posix_sock_set_sendbuf, 2002 .is_ipv6 = posix_sock_is_ipv6, 2003 .is_ipv4 = posix_sock_is_ipv4, 2004 .is_connected = posix_sock_is_connected, 2005 .group_impl_get_optimal = posix_sock_group_impl_get_optimal, 2006 .group_impl_create = posix_sock_group_impl_create, 2007 .group_impl_add_sock = posix_sock_group_impl_add_sock, 2008 .group_impl_remove_sock = posix_sock_group_impl_remove_sock, 2009 .group_impl_poll = posix_sock_group_impl_poll, 2010 .group_impl_close = posix_sock_group_impl_close, 2011 .get_opts = posix_sock_impl_get_opts, 2012 .set_opts = posix_sock_impl_set_opts, 2013 }; 2014 2015 SPDK_NET_IMPL_REGISTER(ssl, &g_ssl_net_impl, DEFAULT_SOCK_PRIORITY); 2016 SPDK_LOG_REGISTER_COMPONENT(sock_posix) 2017
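
/*
 * Example (illustrative sketch, not part of this module): an application can
 * override the defaults in g_spdk_posix_sock_impl_opts through the public
 * spdk_sock_impl_get_opts()/spdk_sock_impl_set_opts() API, typically before
 * creating any sockets, e.g. to turn on client-side zero-copy sends:
 *
 *	struct spdk_sock_impl_opts impl_opts = {};
 *	size_t sz = sizeof(impl_opts);
 *
 *	if (spdk_sock_impl_get_opts("posix", &impl_opts, &sz) == 0) {
 *		impl_opts.enable_zerocopy_send_client = true;
 *		impl_opts.zerocopy_threshold = 4096;
 *		spdk_sock_impl_set_opts("posix", &impl_opts, sz);
 *	}
 *
 * The "ssl" implementation registered above shares this option structure, so
 * the same calls apply to it with "ssl" as the impl name.
 */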