1 /* 2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1988, 1993 36 * The Regents of the University of California. All rights reserved. 
37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. All advertising materials mentioning features or use of this software 47 * must display the following acknowledgement: 48 * This product includes software developed by the University of 49 * California, Berkeley and its contributors. 50 * 4. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 
65 * 66 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 67 * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.17 2002/10/11 11:46:44 ume Exp $ 68 */ 69 70 #include "opt_ipsec.h" 71 #include "opt_inet.h" 72 #include "opt_inet6.h" 73 #include "opt_tcpdebug.h" 74 75 #include <sys/param.h> 76 #include <sys/systm.h> 77 #include <sys/kernel.h> 78 #include <sys/malloc.h> 79 #include <sys/sysctl.h> 80 #include <sys/globaldata.h> 81 #include <sys/thread.h> 82 83 #include <sys/mbuf.h> 84 #ifdef INET6 85 #include <sys/domain.h> 86 #endif /* INET6 */ 87 #include <sys/socket.h> 88 #include <sys/socketvar.h> 89 #include <sys/protosw.h> 90 91 #include <sys/thread2.h> 92 #include <sys/msgport2.h> 93 #include <sys/socketvar2.h> 94 95 #include <net/if.h> 96 #include <net/netisr.h> 97 #include <net/route.h> 98 99 #include <net/netmsg2.h> 100 101 #include <netinet/in.h> 102 #include <netinet/in_systm.h> 103 #ifdef INET6 104 #include <netinet/ip6.h> 105 #endif 106 #include <netinet/in_pcb.h> 107 #ifdef INET6 108 #include <netinet6/in6_pcb.h> 109 #endif 110 #include <netinet/in_var.h> 111 #include <netinet/ip_var.h> 112 #ifdef INET6 113 #include <netinet6/ip6_var.h> 114 #include <netinet6/tcp6_var.h> 115 #endif 116 #include <netinet/tcp.h> 117 #include <netinet/tcp_fsm.h> 118 #include <netinet/tcp_seq.h> 119 #include <netinet/tcp_timer.h> 120 #include <netinet/tcp_timer2.h> 121 #include <netinet/tcp_var.h> 122 #include <netinet/tcpip.h> 123 #ifdef TCPDEBUG 124 #include <netinet/tcp_debug.h> 125 #endif 126 127 #ifdef IPSEC 128 #include <netinet6/ipsec.h> 129 #endif /*IPSEC*/ 130 131 /* 132 * TCP protocol interface to socket abstraction. 133 */ 134 extern char *tcpstates[]; /* XXX ??? 
*/

/*
 * Prototypes for static helpers that the request handlers below call
 * before their definitions appear.
 */
static int	tcp_attach (struct socket *, struct pru_attach_info *);
static void	tcp_connect (netmsg_t msg);
#ifdef INET6
static void	tcp6_connect (netmsg_t msg);
static int	tcp6_connect_oncpu(struct tcpcb *tp, int flags,
				struct mbuf **mp,
				struct sockaddr_in6 *sin6,
				struct in6_addr *addr6);
#endif /* INET6 */
static struct tcpcb *
		tcp_disconnect (struct tcpcb *);
static struct tcpcb *
		tcp_usrclosed (struct tcpcb *);

/*
 * Optional tracing hooks for the request handlers.  The macros refer to
 * variables named (tp) and (so) in the scope where they are expanded:
 *
 *   TCPDEBUG0       declares (ostate), the TCP state saved before the
 *                   request is processed
 *   TCPDEBUG1()     stores tp->t_state in (ostate), or 0 when tp is NULL
 *   TCPDEBUG2(req)  calls tcp_trace() when tp is non-NULL and the socket
 *                   has SO_DEBUG set
 *
 * Without TCPDEBUG all three expand to nothing.
 */
#ifdef TCPDEBUG
#define TCPDEBUG0	int ostate = 0
#define TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define TCPDEBUG2(req)	if (tp && (so->so_options & SO_DEBUG)) \
				tcp_trace(TA_USER, ostate, tp, 0, 0, req)
#else
#define TCPDEBUG0
#define TCPDEBUG1()
#define TCPDEBUG2(req)
#endif

/*
 * Consulted by tcp_connect() for sockets that have no local port yet.
 * When non-zero, tcp_connect() chooses the local address with
 * in_pcbladdr() and binds through in_pcbconn_bind(); when zero it falls
 * back to in_pcbbind().
 */
static int	tcp_lport_extension = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lportext, CTLFLAG_RW,
    &tcp_lport_extension, 0, "");

/*
 * Some poorly optimized programs use TCP_NOPUSH in an attempt to improve
 * performance and end up with a small amount of data sitting in the send
 * buffer.  That data is _not_ pushed onto the network until more data is
 * written to the socket or the write side of the socket is shut down.
 */
static int	tcp_disable_nopush = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_nopush, CTLFLAG_RW,
    &tcp_disable_nopush, 0, "TCP_NOPUSH socket option will have no effect");

/*
 * TCP attaches to socket via pru_attach(), reserving space,
 * and an internet control block.  This is likely occurring on
 * cpu0 and may have to move later when we bind/connect.
180 */ 181 static void 182 tcp_usr_attach(netmsg_t msg) 183 { 184 struct socket *so = msg->base.nm_so; 185 struct pru_attach_info *ai = msg->attach.nm_ai; 186 int error; 187 struct inpcb *inp; 188 struct tcpcb *tp = NULL; 189 TCPDEBUG0; 190 191 soreference(so); 192 inp = so->so_pcb; 193 TCPDEBUG1(); 194 if (inp) { 195 error = EISCONN; 196 goto out; 197 } 198 199 error = tcp_attach(so, ai); 200 if (error) 201 goto out; 202 203 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 204 so->so_linger = TCP_LINGERTIME; 205 tp = sototcpcb(so); 206 out: 207 sofree(so); /* from ref above */ 208 TCPDEBUG2(PRU_ATTACH); 209 lwkt_replymsg(&msg->lmsg, error); 210 } 211 212 /* 213 * pru_detach() detaches the TCP protocol from the socket. 214 * If the protocol state is non-embryonic, then can't 215 * do this directly: have to initiate a pru_disconnect(), 216 * which may finish later; embryonic TCB's can just 217 * be discarded here. 218 */ 219 static void 220 tcp_usr_detach(netmsg_t msg) 221 { 222 struct socket *so = msg->base.nm_so; 223 int error = 0; 224 struct inpcb *inp; 225 struct tcpcb *tp; 226 TCPDEBUG0; 227 228 inp = so->so_pcb; 229 230 /* 231 * If the inp is already detached it may have been due to an async 232 * close. Just return as if no error occured. 233 * 234 * It's possible for the tcpcb (tp) to disconnect from the inp due 235 * to tcp_drop()->tcp_close() being called. This may occur *after* 236 * the detach message has been queued so we may find a NULL tp here. 237 */ 238 if (inp) { 239 if ((tp = intotcpcb(inp)) != NULL) { 240 TCPDEBUG1(); 241 tp = tcp_disconnect(tp); 242 TCPDEBUG2(PRU_DETACH); 243 } 244 } 245 lwkt_replymsg(&msg->lmsg, error); 246 } 247 248 /* 249 * NOTE: ignore_error is non-zero for certain disconnection races 250 * which we want to silently allow, otherwise close() may return 251 * an unexpected error. 252 * 253 * NOTE: The variables (msg) and (tp) are assumed. 
254 */ 255 #define COMMON_START(so, inp, ignore_error) \ 256 TCPDEBUG0; \ 257 \ 258 inp = so->so_pcb; \ 259 do { \ 260 if (inp == NULL) { \ 261 error = ignore_error ? 0 : EINVAL; \ 262 tp = NULL; \ 263 goto out; \ 264 } \ 265 tp = intotcpcb(inp); \ 266 TCPDEBUG1(); \ 267 } while(0) 268 269 #define COMMON_END1(req, noreply) \ 270 out: do { \ 271 TCPDEBUG2(req); \ 272 if (!(noreply)) \ 273 lwkt_replymsg(&msg->lmsg, error); \ 274 return; \ 275 } while(0) 276 277 #define COMMON_END(req) COMMON_END1((req), 0) 278 279 /* 280 * Give the socket an address. 281 */ 282 static void 283 tcp_usr_bind(netmsg_t msg) 284 { 285 struct socket *so = msg->bind.base.nm_so; 286 struct sockaddr *nam = msg->bind.nm_nam; 287 struct thread *td = msg->bind.nm_td; 288 int error = 0; 289 struct inpcb *inp; 290 struct tcpcb *tp; 291 struct sockaddr_in *sinp; 292 293 COMMON_START(so, inp, 0); 294 295 /* 296 * Must check for multicast addresses and disallow binding 297 * to them. 298 */ 299 sinp = (struct sockaddr_in *)nam; 300 if (sinp->sin_family == AF_INET && 301 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 302 error = EAFNOSUPPORT; 303 goto out; 304 } 305 error = in_pcbbind(inp, nam, td); 306 if (error) 307 goto out; 308 COMMON_END(PRU_BIND); 309 310 } 311 312 #ifdef INET6 313 314 static void 315 tcp6_usr_bind(netmsg_t msg) 316 { 317 struct socket *so = msg->bind.base.nm_so; 318 struct sockaddr *nam = msg->bind.nm_nam; 319 struct thread *td = msg->bind.nm_td; 320 int error = 0; 321 struct inpcb *inp; 322 struct tcpcb *tp; 323 struct sockaddr_in6 *sin6p; 324 325 COMMON_START(so, inp, 0); 326 327 /* 328 * Must check for multicast addresses and disallow binding 329 * to them. 
330 */ 331 sin6p = (struct sockaddr_in6 *)nam; 332 if (sin6p->sin6_family == AF_INET6 && 333 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 334 error = EAFNOSUPPORT; 335 goto out; 336 } 337 inp->inp_vflag &= ~INP_IPV4; 338 inp->inp_vflag |= INP_IPV6; 339 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { 340 if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) 341 inp->inp_vflag |= INP_IPV4; 342 else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 343 struct sockaddr_in sin; 344 345 in6_sin6_2_sin(&sin, sin6p); 346 inp->inp_vflag |= INP_IPV4; 347 inp->inp_vflag &= ~INP_IPV6; 348 error = in_pcbbind(inp, (struct sockaddr *)&sin, td); 349 goto out; 350 } 351 } 352 error = in6_pcbbind(inp, nam, td); 353 if (error) 354 goto out; 355 COMMON_END(PRU_BIND); 356 } 357 #endif /* INET6 */ 358 359 #ifdef SMP 360 361 struct netmsg_inswildcard { 362 struct netmsg_base base; 363 struct inpcb *nm_inp; 364 }; 365 366 static void 367 in_pcbinswildcardhash_handler(netmsg_t msg) 368 { 369 struct netmsg_inswildcard *nm = (struct netmsg_inswildcard *)msg; 370 int cpu = mycpuid, nextcpu; 371 372 in_pcbinswildcardhash_oncpu(nm->nm_inp, &tcbinfo[cpu]); 373 374 nextcpu = cpu + 1; 375 if (nextcpu < ncpus2) 376 lwkt_forwardmsg(cpu_portfn(nextcpu), &nm->base.lmsg); 377 else 378 lwkt_replymsg(&nm->base.lmsg, 0); 379 } 380 381 #endif 382 383 /* 384 * Prepare to accept connections. 
385 */ 386 static void 387 tcp_usr_listen(netmsg_t msg) 388 { 389 struct socket *so = msg->listen.base.nm_so; 390 struct thread *td = msg->listen.nm_td; 391 int error = 0; 392 struct inpcb *inp; 393 struct tcpcb *tp; 394 #ifdef SMP 395 struct netmsg_inswildcard nm; 396 #endif 397 398 COMMON_START(so, inp, 0); 399 400 if (tp->t_flags & TF_LISTEN) 401 goto out; 402 403 if (inp->inp_lport == 0) { 404 error = in_pcbbind(inp, NULL, td); 405 if (error) 406 goto out; 407 } 408 409 tp->t_state = TCPS_LISTEN; 410 tp->t_flags |= TF_LISTEN; 411 tp->tt_msg = NULL; /* Catch any invalid timer usage */ 412 413 #ifdef SMP 414 if (ncpus > 1) { 415 /* 416 * We have to set the flag because we can't have other cpus 417 * messing with our inp's flags. 418 */ 419 KASSERT(!(inp->inp_flags & INP_CONNECTED), 420 ("already on connhash")); 421 KASSERT(!(inp->inp_flags & INP_WILDCARD), 422 ("already on wildcardhash")); 423 KASSERT(!(inp->inp_flags & INP_WILDCARD_MP), 424 ("already on MP wildcardhash")); 425 inp->inp_flags |= INP_WILDCARD_MP; 426 427 KKASSERT(so->so_port == cpu_portfn(0)); 428 KKASSERT(&curthread->td_msgport == cpu_portfn(0)); 429 KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]); 430 431 netmsg_init(&nm.base, NULL, &curthread->td_msgport, 432 MSGF_PRIORITY, in_pcbinswildcardhash_handler); 433 nm.nm_inp = inp; 434 lwkt_domsg(cpu_portfn(1), &nm.base.lmsg, 0); 435 } 436 #endif 437 in_pcbinswildcardhash(inp); 438 COMMON_END(PRU_LISTEN); 439 } 440 441 #ifdef INET6 442 443 static void 444 tcp6_usr_listen(netmsg_t msg) 445 { 446 struct socket *so = msg->listen.base.nm_so; 447 struct thread *td = msg->listen.nm_td; 448 int error = 0; 449 struct inpcb *inp; 450 struct tcpcb *tp; 451 #ifdef SMP 452 struct netmsg_inswildcard nm; 453 #endif 454 455 COMMON_START(so, inp, 0); 456 457 if (tp->t_flags & TF_LISTEN) 458 goto out; 459 460 if (inp->inp_lport == 0) { 461 if (!(inp->inp_flags & IN6P_IPV6_V6ONLY)) 462 inp->inp_vflag |= INP_IPV4; 463 else 464 inp->inp_vflag &= ~INP_IPV4; 465 error = 
in6_pcbbind(inp, NULL, td); 466 if (error) 467 goto out; 468 } 469 470 tp->t_state = TCPS_LISTEN; 471 tp->t_flags |= TF_LISTEN; 472 tp->tt_msg = NULL; /* Catch any invalid timer usage */ 473 474 #ifdef SMP 475 if (ncpus > 1) { 476 /* 477 * We have to set the flag because we can't have other cpus 478 * messing with our inp's flags. 479 */ 480 KASSERT(!(inp->inp_flags & INP_CONNECTED), 481 ("already on connhash")); 482 KASSERT(!(inp->inp_flags & INP_WILDCARD), 483 ("already on wildcardhash")); 484 KASSERT(!(inp->inp_flags & INP_WILDCARD_MP), 485 ("already on MP wildcardhash")); 486 inp->inp_flags |= INP_WILDCARD_MP; 487 488 KKASSERT(so->so_port == cpu_portfn(0)); 489 KKASSERT(&curthread->td_msgport == cpu_portfn(0)); 490 KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]); 491 492 netmsg_init(&nm.base, NULL, &curthread->td_msgport, 493 MSGF_PRIORITY, in_pcbinswildcardhash_handler); 494 nm.nm_inp = inp; 495 lwkt_domsg(cpu_portfn(1), &nm.base.lmsg, 0); 496 } 497 #endif 498 in_pcbinswildcardhash(inp); 499 COMMON_END(PRU_LISTEN); 500 } 501 #endif /* INET6 */ 502 503 /* 504 * Initiate connection to peer. 505 * Create a template for use in transmissions on this connection. 506 * Enter SYN_SENT state, and mark socket as connecting. 507 * Start keep-alive timer, and seed output sequence space. 508 * Send initial segment on connection. 509 */ 510 static void 511 tcp_usr_connect(netmsg_t msg) 512 { 513 struct socket *so = msg->connect.base.nm_so; 514 struct sockaddr *nam = msg->connect.nm_nam; 515 struct thread *td = msg->connect.nm_td; 516 int error = 0; 517 struct inpcb *inp; 518 struct tcpcb *tp; 519 struct sockaddr_in *sinp; 520 521 COMMON_START(so, inp, 0); 522 523 /* 524 * Must disallow TCP ``connections'' to multicast addresses. 
525 */ 526 sinp = (struct sockaddr_in *)nam; 527 if (sinp->sin_family == AF_INET 528 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 529 error = EAFNOSUPPORT; 530 goto out; 531 } 532 533 if (!prison_remote_ip(td, (struct sockaddr*)sinp)) { 534 error = EAFNOSUPPORT; /* IPv6 only jail */ 535 goto out; 536 } 537 538 tcp_connect(msg); 539 /* msg is invalid now */ 540 return; 541 out: 542 if (msg->connect.nm_m) { 543 m_freem(msg->connect.nm_m); 544 msg->connect.nm_m = NULL; 545 } 546 lwkt_replymsg(&msg->lmsg, error); 547 } 548 549 #ifdef INET6 550 551 static void 552 tcp6_usr_connect(netmsg_t msg) 553 { 554 struct socket *so = msg->connect.base.nm_so; 555 struct sockaddr *nam = msg->connect.nm_nam; 556 struct thread *td = msg->connect.nm_td; 557 int error = 0; 558 struct inpcb *inp; 559 struct tcpcb *tp; 560 struct sockaddr_in6 *sin6p; 561 562 COMMON_START(so, inp, 0); 563 564 /* 565 * Must disallow TCP ``connections'' to multicast addresses. 566 */ 567 sin6p = (struct sockaddr_in6 *)nam; 568 if (sin6p->sin6_family == AF_INET6 569 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 570 error = EAFNOSUPPORT; 571 goto out; 572 } 573 574 if (!prison_remote_ip(td, nam)) { 575 error = EAFNOSUPPORT; /* IPv4 only jail */ 576 goto out; 577 } 578 579 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 580 struct sockaddr_in *sinp; 581 582 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { 583 error = EINVAL; 584 goto out; 585 } 586 sinp = kmalloc(sizeof(*sinp), M_LWKTMSG, M_INTWAIT); 587 in6_sin6_2_sin(sinp, sin6p); 588 inp->inp_vflag |= INP_IPV4; 589 inp->inp_vflag &= ~INP_IPV6; 590 msg->connect.nm_nam = (struct sockaddr *)sinp; 591 msg->connect.nm_reconnect |= NMSG_RECONNECT_NAMALLOC; 592 tcp_connect(msg); 593 /* msg is invalid now */ 594 return; 595 } 596 inp->inp_vflag &= ~INP_IPV4; 597 inp->inp_vflag |= INP_IPV6; 598 inp->inp_inc.inc_isipv6 = 1; 599 600 msg->connect.nm_reconnect |= NMSG_RECONNECT_FALLBACK; 601 tcp6_connect(msg); 602 /* msg is invalid now */ 603 return; 604 out: 605 if 
(msg->connect.nm_m) { 606 m_freem(msg->connect.nm_m); 607 msg->connect.nm_m = NULL; 608 } 609 lwkt_replymsg(&msg->lmsg, error); 610 } 611 612 #endif /* INET6 */ 613 614 /* 615 * Initiate disconnect from peer. 616 * If connection never passed embryonic stage, just drop; 617 * else if don't need to let data drain, then can just drop anyways, 618 * else have to begin TCP shutdown process: mark socket disconnecting, 619 * drain unread data, state switch to reflect user close, and 620 * send segment (e.g. FIN) to peer. Socket will be really disconnected 621 * when peer sends FIN and acks ours. 622 * 623 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 624 */ 625 static void 626 tcp_usr_disconnect(netmsg_t msg) 627 { 628 struct socket *so = msg->disconnect.base.nm_so; 629 int error = 0; 630 struct inpcb *inp; 631 struct tcpcb *tp; 632 633 COMMON_START(so, inp, 1); 634 tp = tcp_disconnect(tp); 635 COMMON_END(PRU_DISCONNECT); 636 } 637 638 /* 639 * Accept a connection. Essentially all the work is 640 * done at higher levels; just return the address 641 * of the peer, storing through addr. 
642 */ 643 static void 644 tcp_usr_accept(netmsg_t msg) 645 { 646 struct socket *so = msg->accept.base.nm_so; 647 struct sockaddr **nam = msg->accept.nm_nam; 648 int error = 0; 649 struct inpcb *inp; 650 struct tcpcb *tp = NULL; 651 TCPDEBUG0; 652 653 inp = so->so_pcb; 654 if (so->so_state & SS_ISDISCONNECTED) { 655 error = ECONNABORTED; 656 goto out; 657 } 658 if (inp == 0) { 659 error = EINVAL; 660 goto out; 661 } 662 663 tp = intotcpcb(inp); 664 TCPDEBUG1(); 665 in_setpeeraddr(so, nam); 666 COMMON_END(PRU_ACCEPT); 667 } 668 669 #ifdef INET6 670 static void 671 tcp6_usr_accept(netmsg_t msg) 672 { 673 struct socket *so = msg->accept.base.nm_so; 674 struct sockaddr **nam = msg->accept.nm_nam; 675 int error = 0; 676 struct inpcb *inp; 677 struct tcpcb *tp = NULL; 678 TCPDEBUG0; 679 680 inp = so->so_pcb; 681 682 if (so->so_state & SS_ISDISCONNECTED) { 683 error = ECONNABORTED; 684 goto out; 685 } 686 if (inp == 0) { 687 error = EINVAL; 688 goto out; 689 } 690 tp = intotcpcb(inp); 691 TCPDEBUG1(); 692 in6_mapped_peeraddr(so, nam); 693 COMMON_END(PRU_ACCEPT); 694 } 695 #endif /* INET6 */ 696 /* 697 * Mark the connection as being incapable of further output. 698 */ 699 static void 700 tcp_usr_shutdown(netmsg_t msg) 701 { 702 struct socket *so = msg->shutdown.base.nm_so; 703 int error = 0; 704 struct inpcb *inp; 705 struct tcpcb *tp; 706 707 COMMON_START(so, inp, 0); 708 socantsendmore(so); 709 tp = tcp_usrclosed(tp); 710 if (tp) 711 error = tcp_output(tp); 712 COMMON_END(PRU_SHUTDOWN); 713 } 714 715 /* 716 * After a receive, possibly send window update to peer. 717 */ 718 static void 719 tcp_usr_rcvd(netmsg_t msg) 720 { 721 struct socket *so = msg->rcvd.base.nm_so; 722 int error = 0; 723 struct inpcb *inp; 724 struct tcpcb *tp; 725 726 COMMON_START(so, inp, 0); 727 tcp_output(tp); 728 COMMON_END(PRU_RCVD); 729 } 730 731 /* 732 * Do a send by putting data in output queue and updating urgent 733 * marker if URG set. Possibly send more data. 
 * Unlike the other pru_*() routines, the mbuf chains are our
 * responsibility.  We must either enqueue them or free them.  The other
 * pru_* routines generally are caller-frees.
 *
 * Control mbufs and destination addresses are not accepted (asserted).
 * No reply is sent when the request carries PRUS_NOREPLY.
 */
static void
tcp_usr_send(netmsg_t msg)
{
	struct socket *so = msg->send.base.nm_so;
	int flags = msg->send.nm_flags;
	struct mbuf *m = msg->send.nm_m;
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	TCPDEBUG0;

	KKASSERT(msg->send.nm_control == NULL);
	KKASSERT(msg->send.nm_addr == NULL);
	KKASSERT((flags & PRUS_FREEADDR) == 0);

	inp = so->so_pcb;

	if (inp == NULL) {
		/*
		 * OOPS! we lost a race, the TCP session got reset after
		 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
		 * network interrupt in the non-critical section of sosend().
		 */
		m_freem(m);
		error = ECONNRESET;	/* XXX EPIPE? */
		tp = NULL;
		TCPDEBUG1();
		goto out;
	}
	tp = intotcpcb(inp);
	TCPDEBUG1();

#ifdef foo
	/*
	 * This is no longer necessary, since:
	 * - sosendtcp() has already checked it for us
	 * - It does not work with asynchronous sends
	 */

	/*
	 * Don't let too much OOB data build up
	 */
	if (flags & PRUS_OOB) {
		if (ssb_space(&so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
	}
#endif

	/*
	 * Pump the data into the socket.
	 */
	if (m)
		ssb_appendstream(&so->so_snd, m);
	if (flags & PRUS_OOB) {
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
		/* TF_FORCE is only set for the duration of this output call. */
		tp->t_flags |= TF_FORCE;
		error = tcp_output(tp);
		tp->t_flags &= ~TF_FORCE;
	} else {
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			tp = tcp_usrclosed(tp);
		}
		/* tp is checked here because tcp_usrclosed() may hand back NULL. */
		if (tp != NULL) {
			if (flags & PRUS_MORETOCOME)
				tp->t_flags |= TF_MORETOCOME;
			error = tcp_output(tp);
			if (flags & PRUS_MORETOCOME)
				tp->t_flags &= ~TF_MORETOCOME;
		}
	}
	COMMON_END1((flags & PRUS_OOB) ? PRU_SENDOOB :
		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND),
		   (flags & PRUS_NOREPLY));
}

/*
 * Abort the connection: the tcpcb is dropped with ECONNABORTED.  A
 * socket whose pcb is already gone is not treated as an error
 * (COMMON_START is invoked with ignore_error set).
 *
 * NOTE: (so) is referenced from soabort*() and netmsg_pru_abort()
 *	 will sofree() it when we return.
 */
static void
tcp_usr_abort(netmsg_t msg)
{
	struct socket *so = msg->abort.base.nm_so;
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;

	COMMON_START(so, inp, 1);
	tp = tcp_drop(tp, ECONNABORTED);
	COMMON_END(PRU_ABORT);
}

/*
 * Receive out-of-band data.
 *
 * Replies EINVAL when there is no out-of-band data to fetch this way
 * (no mark pending, SO_OOBINLINE set, or the byte was already consumed)
 * and EWOULDBLOCK when the urgent byte has not arrived yet.  Otherwise
 * the single urgent byte is stored into the caller supplied mbuf; unless
 * MSG_PEEK is given the byte is then marked as consumed.
 */
static void
tcp_usr_rcvoob(netmsg_t msg)
{
	struct socket *so = msg->rcvoob.base.nm_so;
	struct mbuf *m = msg->rcvoob.nm_m;
	int flags = msg->rcvoob.nm_flags;
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;

	COMMON_START(so, inp, 0);
	if ((so->so_oobmark == 0 &&
	     (so->so_state & SS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    tp->t_oobflags & TCPOOB_HADDATA) {
		error = EINVAL;
		goto out;
	}
	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = tp->t_iobc;
	/* Flip HAVEDATA off and HADDATA on in one step. */
	if ((flags & MSG_PEEK) == 0)
		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
	COMMON_END(PRU_RCVOOB);
}

/* pru_savefaddr for IPv4 TCP sockets: thin wrapper around in_savefaddr(). */
static void
tcp_usr_savefaddr(struct socket *so, const struct sockaddr *faddr)
{
	in_savefaddr(so, faddr);
}

#ifdef INET6
/* pru_savefaddr for IPv6 TCP sockets: wraps in6_mapped_savefaddr(). */
static void
tcp6_usr_savefaddr(struct socket *so, const struct sockaddr *faddr)
{
	in6_mapped_savefaddr(so, faddr);
}
#endif

/*
 * User request dispatch table for TCP over IPv4.
 */
/* xxx - should be const */
struct pr_usrreqs tcp_usrreqs = {
	.pru_abort = tcp_usr_abort,
	.pru_accept = tcp_usr_accept,
	.pru_attach = tcp_usr_attach,
	.pru_bind = tcp_usr_bind,
	.pru_connect = tcp_usr_connect,
	.pru_connect2 = pr_generic_notsupp,
	.pru_control = in_control_dispatch,
	.pru_detach = tcp_usr_detach,
	.pru_disconnect = tcp_usr_disconnect,
	.pru_listen = tcp_usr_listen,
	.pru_peeraddr = in_setpeeraddr_dispatch,
	.pru_rcvd = tcp_usr_rcvd,
	.pru_rcvoob = tcp_usr_rcvoob,
	.pru_send = tcp_usr_send,
	.pru_sense = pru_sense_null,
	.pru_shutdown = tcp_usr_shutdown,
	.pru_sockaddr = in_setsockaddr_dispatch,
	.pru_sosend = sosendtcp,
	.pru_soreceive = soreceive,
	.pru_savefaddr = tcp_usr_savefaddr
};

#ifdef INET6
/*
 * User request dispatch table for TCP over IPv6.  Handlers that do not
 * depend on the address family are shared with tcp_usrreqs.
 */
struct pr_usrreqs tcp6_usrreqs = {
	.pru_abort = tcp_usr_abort,
	.pru_accept = tcp6_usr_accept,
	.pru_attach = tcp_usr_attach,
	.pru_bind = tcp6_usr_bind,
	.pru_connect = tcp6_usr_connect,
	.pru_connect2 = pr_generic_notsupp,
	.pru_control = in6_control_dispatch,
	.pru_detach = tcp_usr_detach,
	.pru_disconnect = tcp_usr_disconnect,
	.pru_listen = tcp6_usr_listen,
	.pru_peeraddr = in6_mapped_peeraddr_dispatch,
	.pru_rcvd = tcp_usr_rcvd,
	.pru_rcvoob = tcp_usr_rcvoob,
	.pru_send = tcp_usr_send,
	.pru_sense = pru_sense_null,
	.pru_shutdown = tcp_usr_shutdown,
	.pru_sockaddr = in6_mapped_sockaddr_dispatch,
	.pru_sosend = sosendtcp,
	.pru_soreceive = soreceive,
	.pru_savefaddr = tcp6_usr_savefaddr
};
#endif /* INET6 */

/*
 * Second half of an IPv4 connect, run by tcp_connect() once the request
 * is on the thread that will own the connection.
 *
 * (sin) is the destination and (if_sin) the local address to use when
 * the pcb has none yet.  Returns EADDRINUSE, after freeing (m), when
 * the connection's addresses and ports are already present in this
 * cpu's hash.  Otherwise the pcb is entered into the connection hash,
 * the tcpcb goes to SYN_SENT, (m) is appended to the send buffer and
 * the result of tcp_output() is returned.  (m) is consumed in all
 * cases.
 */
static int
tcp_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m,
		  struct sockaddr_in *sin, struct sockaddr_in *if_sin)
{
	struct inpcb *inp = tp->t_inpcb, *oinp;
	struct socket *so = inp->inp_socket;
	struct route *ro = &inp->inp_route;

	oinp = in_pcblookup_hash(&tcbinfo[mycpu->gd_cpuid],
				 sin->sin_addr, sin->sin_port,
				 (inp->inp_laddr.s_addr != INADDR_ANY ?
				  inp->inp_laddr : if_sin->sin_addr),
				inp->inp_lport, 0, NULL);
	if (oinp != NULL) {
		m_freem(m);
		return (EADDRINUSE);
	}
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		inp->inp_laddr = if_sin->sin_addr;
	inp->inp_faddr = sin->sin_addr;
	inp->inp_fport = sin->sin_port;
	inp->inp_cpcbinfo = &tcbinfo[mycpu->gd_cpuid];
	in_pcbinsconnhash(inp);

	/*
	 * We are now on the inpcb's owner CPU, if the cached route was
	 * freed because the rtentry's owner CPU is not the current CPU
	 * (e.g. in tcp_connect()), then we try to reallocate it here with
	 * the hope that a rtentry may be cloned from a RTF_PRCLONING
	 * rtentry.
	 */
	if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
	    ro->ro_rt == NULL) {
		bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
		ro->ro_dst.sa_family = AF_INET;
		ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
		((struct sockaddr_in *)&ro->ro_dst)->sin_addr =
			sin->sin_addr;
		rtalloc(ro);
	}

	/*
	 * Now that no more errors can occur, change the protocol processing
	 * port to the current thread (which is the correct thread).
	 *
	 * Create TCP timer message now; we are on the tcpcb's owner
	 * CPU/thread.
	 */
	tcp_create_timermsg(tp, &curthread->td_msgport);

	/*
	 * Compute window scaling to request.  Use a larger scaling than
	 * needed for the initial receive buffer in case the receive buffer
	 * gets expanded.
	 */
	if (tp->request_r_scale < TCP_MIN_WINSHIFT)
		tp->request_r_scale = TCP_MIN_WINSHIFT;
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	       (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat
	) {
		tp->request_r_scale++;
	}

	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep);
	tp->iss = tcp_new_isn(tp);
	tcp_sendseqinit(tp);
	if (m) {
		ssb_appendstream(&so->so_snd, m);
		m = NULL;
		if (flags & PRUS_OOB)
			tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
	}

	/*
	 * Close the send side of the connection after
	 * the data is sent if flagged.
	 *
	 * NOTE(review): tcp_usr_send() checks the result of tcp_usrclosed()
	 * for NULL before calling tcp_output(); this path does not.
	 * Confirm that tcp_usrclosed() cannot return NULL for a tcpcb in
	 * SYN_SENT.
	 */
	if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) {
		socantsendmore(so);
		tp = tcp_usrclosed(tp);
	}
	return (tcp_output(tp));
}

/*
 * Common subroutine to open a TCP connection to remote host specified
 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
 * port number if needed.  Call in_pcbladdr to do the routing and to choose
 * a local host address (interface).
 * Initialize connection parameters and enter SYN-SENT state.
 *
 * This routine owns (msg): it either replies to it or, on SMP, forwards
 * it to the protocol thread chosen by tcp_addrport(), where tcp_connect()
 * runs again with NMSG_RECONNECT_RECONNECT set.  Before replying it frees
 * any data mbuf still attached to the message, and the destination
 * address when NMSG_RECONNECT_NAMALLOC marks it as allocated.
 */
static void
tcp_connect(netmsg_t msg)
{
	struct socket *so = msg->connect.base.nm_so;
	struct sockaddr *nam = msg->connect.nm_nam;
	struct thread *td = msg->connect.nm_td;
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct sockaddr_in *if_sin;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error, calc_laddr = 1;
#ifdef SMP
	lwkt_port_t port;
#endif

	COMMON_START(so, inp, 0);

	/*
	 * Reconnect our pcb if we have to
	 */
	if (msg->connect.nm_reconnect & NMSG_RECONNECT_RECONNECT) {
		msg->connect.nm_reconnect &= ~NMSG_RECONNECT_RECONNECT;
		in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
	}

	/*
	 * Bind if we have to
	 */
	if (inp->inp_lport == 0) {
		if (tcp_lport_extension) {
			KKASSERT(inp->inp_laddr.s_addr == INADDR_ANY);

			/*
			 * Choose the local address first, then bind with
			 * the destination known.
			 */
			error = in_pcbladdr(inp, nam, &if_sin, td);
			if (error)
				goto out;
			inp->inp_laddr.s_addr = if_sin->sin_addr.s_addr;

			error = in_pcbconn_bind(inp, nam, td);
			if (error)
				goto out;

			/* if_sin is already valid, skip the lookup below */
			calc_laddr = 0;
		} else {
			error = in_pcbbind(inp, NULL, td);
			if (error)
				goto out;
		}
	}

	if (calc_laddr) {
		/*
		 * Calculate the correct protocol processing thread.  The
		 * connect operation must run there.  Set the forwarding
		 * port before we forward the message or it will get bounced
		 * right back to us.
		 */
		error = in_pcbladdr(inp, nam, &if_sin, td);
		if (error)
			goto out;
	}
	KKASSERT(inp->inp_socket == so);

#ifdef SMP
	port = tcp_addrport(sin->sin_addr.s_addr, sin->sin_port,
			    (inp->inp_laddr.s_addr ?
			     inp->inp_laddr.s_addr : if_sin->sin_addr.s_addr),
			    inp->inp_lport);

	if (port != &curthread->td_msgport) {
		struct route *ro = &inp->inp_route;

		/*
		 * in_pcbladdr() may have allocated a route entry for us
		 * on the current CPU, but we need a route entry on the
		 * inpcb's owner CPU, so free it here.
		 */
		if (ro->ro_rt != NULL)
			RTFREE(ro->ro_rt);
		bzero(ro, sizeof(*ro));

		/*
		 * We are moving the protocol processing port the socket
		 * is on, we have to unlink here and re-link on the
		 * target cpu.
		 */
		in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
		sosetport(so, port);
		msg->connect.nm_reconnect |= NMSG_RECONNECT_RECONNECT;
		msg->connect.base.nm_dispatch = tcp_connect;

		lwkt_forwardmsg(port, &msg->connect.base.lmsg);
		/* msg invalid now */
		return;
	}
#else
	KKASSERT(so->so_port == &curthread->td_msgport);
#endif
	error = tcp_connect_oncpu(tp, msg->connect.nm_flags,
				  msg->connect.nm_m, sin, if_sin);
	/* tcp_connect_oncpu() consumed the mbuf, do not free it below */
	msg->connect.nm_m = NULL;
out:
	if (msg->connect.nm_m) {
		m_freem(msg->connect.nm_m);
		msg->connect.nm_m = NULL;
	}
	if (msg->connect.nm_reconnect & NMSG_RECONNECT_NAMALLOC) {
		kfree(msg->connect.nm_nam, M_LWKTMSG);
		msg->connect.nm_nam = NULL;
	}
	lwkt_replymsg(&msg->connect.base.lmsg, error);
	/* msg invalid now */
}

#ifdef INET6

/*
 * IPv6 counterpart of tcp_connect().  Binds a local port if needed,
 * selects the local address with in6_pcbladdr(), moves the request to
 * the thread returned by tcp6_addrport() on SMP, and finishes in
 * tcp6_connect_oncpu().
 *
 * This routine owns (msg).  When the attempt fails and the message
 * carries NMSG_RECONNECT_FALLBACK, the message is handed to tcp_connect()
 * instead of being replied to.
 */
static void
tcp6_connect(netmsg_t msg)
{
	struct tcpcb *tp;
	struct socket *so = msg->connect.base.nm_so;
	struct sockaddr *nam = msg->connect.nm_nam;
	struct thread *td = msg->connect.nm_td;
	struct inpcb *inp;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct in6_addr *addr6;
#ifdef SMP
	lwkt_port_t port;
#endif
	int error;

	COMMON_START(so, inp, 0);

	/*
	 * Reconnect our pcb if we have to
	 */
	if (msg->connect.nm_reconnect & NMSG_RECONNECT_RECONNECT) {
		msg->connect.nm_reconnect &= ~NMSG_RECONNECT_RECONNECT;
		in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
	}

	/*
	 * Bind if we have to
	 */
	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, NULL, td);
		if (error)
			goto out;
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 */
	error = in6_pcbladdr(inp, nam, &addr6, td);
	if (error)
		goto out;

#ifdef SMP
	port = tcp6_addrport();	/* XXX hack for now, always cpu0 */

	if (port != &curthread->td_msgport) {
		struct route *ro = &inp->inp_route;

		/*
		 * in_pcbladdr() may have allocated a route entry for us
		 * on the current CPU, but we need a route entry on the
		 * inpcb's owner CPU, so free it here.
		 */
		if (ro->ro_rt != NULL)
			RTFREE(ro->ro_rt);
		bzero(ro, sizeof(*ro));

		/*
		 * Unlink the pcb here; it is re-linked on the target cpu
		 * when the forwarded message is processed.
		 */
		in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
		sosetport(so, port);
		msg->connect.nm_reconnect |= NMSG_RECONNECT_RECONNECT;
		msg->connect.base.nm_dispatch = tcp6_connect;

		lwkt_forwardmsg(port, &msg->connect.base.lmsg);
		/* msg invalid now */
		return;
	}
#endif
	error = tcp6_connect_oncpu(tp, msg->connect.nm_flags,
				   &msg->connect.nm_m, sin6, addr6);
	/* nm_m may still be intact */
out:
	if (error && (msg->connect.nm_reconnect & NMSG_RECONNECT_FALLBACK)) {
		tcp_connect(msg);
		/* msg invalid now */
	} else {
		if (msg->connect.nm_m) {
			m_freem(msg->connect.nm_m);
			msg->connect.nm_m = NULL;
		}
		if (msg->connect.nm_reconnect & NMSG_RECONNECT_NAMALLOC) {
			kfree(msg->connect.nm_nam, M_LWKTMSG);
			msg->connect.nm_nam = NULL;
		}
		lwkt_replymsg(&msg->connect.base.lmsg, error);
		/* msg invalid now */
	}
}

static int
1240 tcp6_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf **mp, 1241 struct sockaddr_in6 *sin6, struct in6_addr *addr6) 1242 { 1243 struct mbuf *m = *mp; 1244 struct inpcb *inp = tp->t_inpcb; 1245 struct socket *so = inp->inp_socket; 1246 struct inpcb *oinp; 1247 1248 /* 1249 * Cannot simply call in_pcbconnect, because there might be an 1250 * earlier incarnation of this same connection still in 1251 * TIME_WAIT state, creating an ADDRINUSE error. 1252 */ 1253 oinp = in6_pcblookup_hash(inp->inp_cpcbinfo, 1254 &sin6->sin6_addr, sin6->sin6_port, 1255 (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? 1256 addr6 : &inp->in6p_laddr), 1257 inp->inp_lport, 0, NULL); 1258 if (oinp) 1259 return (EADDRINUSE); 1260 1261 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 1262 inp->in6p_laddr = *addr6; 1263 inp->in6p_faddr = sin6->sin6_addr; 1264 inp->inp_fport = sin6->sin6_port; 1265 if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0) 1266 inp->in6p_flowinfo = sin6->sin6_flowinfo; 1267 in_pcbinsconnhash(inp); 1268 1269 /* 1270 * Now that no more errors can occur, change the protocol processing 1271 * port to the current thread (which is the correct thread). 1272 * 1273 * Create TCP timer message now; we are on the tcpcb's owner 1274 * CPU/thread. 1275 */ 1276 tcp_create_timermsg(tp, &curthread->td_msgport); 1277 1278 /* Compute window scaling to request. 
*/ 1279 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 1280 tp->request_r_scale = TCP_MIN_WINSHIFT; 1281 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1282 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat) { 1283 tp->request_r_scale++; 1284 } 1285 1286 soisconnecting(so); 1287 tcpstat.tcps_connattempt++; 1288 tp->t_state = TCPS_SYN_SENT; 1289 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1290 tp->iss = tcp_new_isn(tp); 1291 tcp_sendseqinit(tp); 1292 if (m) { 1293 ssb_appendstream(&so->so_snd, m); 1294 *mp = NULL; 1295 if (flags & PRUS_OOB) 1296 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1297 } 1298 1299 /* 1300 * Close the send side of the connection after 1301 * the data is sent if flagged. 1302 */ 1303 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1304 socantsendmore(so); 1305 tp = tcp_usrclosed(tp); 1306 } 1307 return (tcp_output(tp)); 1308 } 1309 1310 #endif /* INET6 */ 1311 1312 /* 1313 * The new sockopt interface makes it possible for us to block in the 1314 * copyin/out step (if we take a page fault). Taking a page fault while 1315 * in a critical section is probably a Bad Thing. (Since sockets and pcbs 1316 * both now use TSM, there probably isn't any need for this function to 1317 * run in a critical section any more. This needs more examination.) 
1318 */ 1319 void 1320 tcp_ctloutput(netmsg_t msg) 1321 { 1322 struct socket *so = msg->base.nm_so; 1323 struct sockopt *sopt = msg->ctloutput.nm_sopt; 1324 int error, opt, optval, opthz; 1325 struct inpcb *inp; 1326 struct tcpcb *tp; 1327 1328 error = 0; 1329 inp = so->so_pcb; 1330 if (inp == NULL) { 1331 error = ECONNRESET; 1332 goto done; 1333 } 1334 1335 if (sopt->sopt_level != IPPROTO_TCP) { 1336 #ifdef INET6 1337 if (INP_CHECK_SOCKAF(so, AF_INET6)) 1338 ip6_ctloutput_dispatch(msg); 1339 else 1340 #endif /* INET6 */ 1341 ip_ctloutput(msg); 1342 /* msg invalid now */ 1343 return; 1344 } 1345 tp = intotcpcb(inp); 1346 1347 switch (sopt->sopt_dir) { 1348 case SOPT_SET: 1349 error = soopt_to_kbuf(sopt, &optval, sizeof optval, 1350 sizeof optval); 1351 if (error) 1352 break; 1353 switch (sopt->sopt_name) { 1354 case TCP_FASTKEEP: 1355 if (optval > 0) 1356 tp->t_keepidle = tp->t_keepintvl; 1357 else 1358 tp->t_keepidle = tcp_keepidle; 1359 tcp_timer_keep_activity(tp, 0); 1360 break; 1361 #ifdef TCP_SIGNATURE 1362 case TCP_SIGNATURE_ENABLE: 1363 if (tp->t_state == TCPS_CLOSED) { 1364 /* 1365 * This is the only safe state that this 1366 * option could be changed. Some segments 1367 * could already have been sent in other 1368 * states. 
1369 */ 1370 if (optval > 0) 1371 tp->t_flags |= TF_SIGNATURE; 1372 else 1373 tp->t_flags &= ~TF_SIGNATURE; 1374 } else { 1375 error = EOPNOTSUPP; 1376 } 1377 break; 1378 #endif /* TCP_SIGNATURE */ 1379 case TCP_NODELAY: 1380 case TCP_NOOPT: 1381 switch (sopt->sopt_name) { 1382 case TCP_NODELAY: 1383 opt = TF_NODELAY; 1384 break; 1385 case TCP_NOOPT: 1386 opt = TF_NOOPT; 1387 break; 1388 default: 1389 opt = 0; /* dead code to fool gcc */ 1390 break; 1391 } 1392 1393 if (optval) 1394 tp->t_flags |= opt; 1395 else 1396 tp->t_flags &= ~opt; 1397 break; 1398 1399 case TCP_NOPUSH: 1400 if (tcp_disable_nopush) 1401 break; 1402 if (optval) 1403 tp->t_flags |= TF_NOPUSH; 1404 else { 1405 tp->t_flags &= ~TF_NOPUSH; 1406 error = tcp_output(tp); 1407 } 1408 break; 1409 1410 case TCP_MAXSEG: 1411 /* 1412 * Must be between 0 and maxseg. If the requested 1413 * maxseg is too small to satisfy the desired minmss, 1414 * pump it up (silently so sysctl modifications of 1415 * minmss do not create unexpected program failures). 1416 * Handle degenerate cases. 
1417 */ 1418 if (optval > 0 && optval <= tp->t_maxseg) { 1419 if (optval + 40 < tcp_minmss) { 1420 optval = tcp_minmss - 40; 1421 if (optval < 0) 1422 optval = 1; 1423 } 1424 tp->t_maxseg = optval; 1425 } else { 1426 error = EINVAL; 1427 } 1428 break; 1429 1430 case TCP_KEEPINIT: 1431 opthz = ((int64_t)optval * hz) / 1000; 1432 if (opthz >= 1) 1433 tp->t_keepinit = opthz; 1434 else 1435 error = EINVAL; 1436 break; 1437 1438 case TCP_KEEPIDLE: 1439 opthz = ((int64_t)optval * hz) / 1000; 1440 if (opthz >= 1) { 1441 tp->t_keepidle = opthz; 1442 tcp_timer_keep_activity(tp, 0); 1443 } else { 1444 error = EINVAL; 1445 } 1446 break; 1447 1448 case TCP_KEEPINTVL: 1449 opthz = ((int64_t)optval * hz) / 1000; 1450 if (opthz >= 1) { 1451 tp->t_keepintvl = opthz; 1452 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1453 } else { 1454 error = EINVAL; 1455 } 1456 break; 1457 1458 case TCP_KEEPCNT: 1459 if (optval > 0) { 1460 tp->t_keepcnt = optval; 1461 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1462 } else { 1463 error = EINVAL; 1464 } 1465 break; 1466 1467 default: 1468 error = ENOPROTOOPT; 1469 break; 1470 } 1471 break; 1472 1473 case SOPT_GET: 1474 switch (sopt->sopt_name) { 1475 #ifdef TCP_SIGNATURE 1476 case TCP_SIGNATURE_ENABLE: 1477 optval = (tp->t_flags & TF_SIGNATURE) ? 
1 : 0; 1478 break; 1479 #endif /* TCP_SIGNATURE */ 1480 case TCP_NODELAY: 1481 optval = tp->t_flags & TF_NODELAY; 1482 break; 1483 case TCP_MAXSEG: 1484 optval = tp->t_maxseg; 1485 break; 1486 case TCP_NOOPT: 1487 optval = tp->t_flags & TF_NOOPT; 1488 break; 1489 case TCP_NOPUSH: 1490 optval = tp->t_flags & TF_NOPUSH; 1491 break; 1492 case TCP_KEEPINIT: 1493 optval = ((int64_t)tp->t_keepinit * 1000) / hz; 1494 break; 1495 case TCP_KEEPIDLE: 1496 optval = ((int64_t)tp->t_keepidle * 1000) / hz; 1497 break; 1498 case TCP_KEEPINTVL: 1499 optval = ((int64_t)tp->t_keepintvl * 1000) / hz; 1500 break; 1501 case TCP_KEEPCNT: 1502 optval = tp->t_keepcnt; 1503 break; 1504 default: 1505 error = ENOPROTOOPT; 1506 break; 1507 } 1508 if (error == 0) 1509 soopt_from_kbuf(sopt, &optval, sizeof optval); 1510 break; 1511 } 1512 done: 1513 lwkt_replymsg(&msg->lmsg, error); 1514 } 1515 1516 /* 1517 * tcp_sendspace and tcp_recvspace are the default send and receive window 1518 * sizes, respectively. These are obsolescent (this information should 1519 * be set by the route). 1520 * 1521 * Use a default that does not require tcp window scaling to be turned 1522 * on. Individual programs or the administrator can increase the default. 1523 */ 1524 u_long tcp_sendspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1525 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 1526 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); 1527 u_long tcp_recvspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1528 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, 1529 &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 1530 1531 /* 1532 * Attach TCP protocol to socket, allocating internet protocol control 1533 * block, tcp control block, bufer space, and entering LISTEN state 1534 * if to accept connections. 
1535 */ 1536 static int 1537 tcp_attach(struct socket *so, struct pru_attach_info *ai) 1538 { 1539 struct tcpcb *tp; 1540 struct inpcb *inp; 1541 int error; 1542 int cpu; 1543 #ifdef INET6 1544 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; 1545 #endif 1546 1547 if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) { 1548 lwkt_gettoken(&so->so_rcv.ssb_token); 1549 error = soreserve(so, tcp_sendspace, tcp_recvspace, 1550 ai->sb_rlimit); 1551 lwkt_reltoken(&so->so_rcv.ssb_token); 1552 if (error) 1553 return (error); 1554 } 1555 atomic_set_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE); 1556 atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); 1557 cpu = mycpu->gd_cpuid; 1558 1559 /* 1560 * Set the default port for protocol processing. This will likely 1561 * change when we connect. 1562 */ 1563 error = in_pcballoc(so, &tcbinfo[cpu]); 1564 if (error) 1565 return (error); 1566 inp = so->so_pcb; 1567 #ifdef INET6 1568 if (isipv6) { 1569 inp->inp_vflag |= INP_IPV6; 1570 inp->in6p_hops = -1; /* use kernel default */ 1571 } 1572 else 1573 #endif 1574 inp->inp_vflag |= INP_IPV4; 1575 tp = tcp_newtcpcb(inp); 1576 if (tp == NULL) { 1577 /* 1578 * Make sure the socket is destroyed by the pcbdetach. 1579 */ 1580 soreference(so); 1581 #ifdef INET6 1582 if (isipv6) 1583 in6_pcbdetach(inp); 1584 else 1585 #endif 1586 in_pcbdetach(inp); 1587 sofree(so); /* from ref above */ 1588 return (ENOBUFS); 1589 } 1590 tp->t_state = TCPS_CLOSED; 1591 return (0); 1592 } 1593 1594 /* 1595 * Initiate (or continue) disconnect. 1596 * If embryonic state, just send reset (once). 1597 * If in ``let data drain'' option and linger null, just drop. 1598 * Otherwise (hard), mark socket disconnecting and drop 1599 * current input data; switch states based on user close, and 1600 * send segment to peer (with FIN). 
1601 */ 1602 static struct tcpcb * 1603 tcp_disconnect(struct tcpcb *tp) 1604 { 1605 struct socket *so = tp->t_inpcb->inp_socket; 1606 1607 if (tp->t_state < TCPS_ESTABLISHED) { 1608 tp = tcp_close(tp); 1609 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 1610 tp = tcp_drop(tp, 0); 1611 } else { 1612 lwkt_gettoken(&so->so_rcv.ssb_token); 1613 soisdisconnecting(so); 1614 sbflush(&so->so_rcv.sb); 1615 tp = tcp_usrclosed(tp); 1616 if (tp) 1617 tcp_output(tp); 1618 lwkt_reltoken(&so->so_rcv.ssb_token); 1619 } 1620 return (tp); 1621 } 1622 1623 /* 1624 * User issued close, and wish to trail through shutdown states: 1625 * if never received SYN, just forget it. If got a SYN from peer, 1626 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1627 * If already got a FIN from peer, then almost done; go to LAST_ACK 1628 * state. In all other cases, have already sent FIN to peer (e.g. 1629 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1630 * for peer to send FIN or not respond to keep-alives, etc. 1631 * We can let the user exit from the close as soon as the FIN is acked. 1632 */ 1633 static struct tcpcb * 1634 tcp_usrclosed(struct tcpcb *tp) 1635 { 1636 1637 switch (tp->t_state) { 1638 1639 case TCPS_CLOSED: 1640 case TCPS_LISTEN: 1641 tp->t_state = TCPS_CLOSED; 1642 tp = tcp_close(tp); 1643 break; 1644 1645 case TCPS_SYN_SENT: 1646 case TCPS_SYN_RECEIVED: 1647 tp->t_flags |= TF_NEEDFIN; 1648 break; 1649 1650 case TCPS_ESTABLISHED: 1651 tp->t_state = TCPS_FIN_WAIT_1; 1652 break; 1653 1654 case TCPS_CLOSE_WAIT: 1655 tp->t_state = TCPS_LAST_ACK; 1656 break; 1657 } 1658 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1659 soisdisconnected(tp->t_inpcb->inp_socket); 1660 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 1661 if (tp->t_state == TCPS_FIN_WAIT_2) { 1662 tcp_callout_reset(tp, tp->tt_2msl, tp->t_maxidle, 1663 tcp_timer_2msl); 1664 } 1665 } 1666 return (tp); 1667 } 1668