1 /* 2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1988, 1993 36 * The Regents of the University of California. All rights reserved. 37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. All advertising materials mentioning features or use of this software 47 * must display the following acknowledgement: 48 * This product includes software developed by the University of 49 * California, Berkeley and its contributors. 50 * 4. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 67 * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.17 2002/10/11 11:46:44 ume Exp $ 68 */ 69 70 #include "opt_ipsec.h" 71 #include "opt_inet.h" 72 #include "opt_inet6.h" 73 #include "opt_tcpdebug.h" 74 75 #include <sys/param.h> 76 #include <sys/systm.h> 77 #include <sys/kernel.h> 78 #include <sys/malloc.h> 79 #include <sys/sysctl.h> 80 #include <sys/globaldata.h> 81 #include <sys/thread.h> 82 83 #include <sys/mbuf.h> 84 #ifdef INET6 85 #include <sys/domain.h> 86 #endif /* INET6 */ 87 #include <sys/socket.h> 88 #include <sys/socketvar.h> 89 #include <sys/socketops.h> 90 #include <sys/protosw.h> 91 92 #include <sys/thread2.h> 93 #include <sys/msgport2.h> 94 #include <sys/socketvar2.h> 95 96 #include <net/if.h> 97 #include <net/netisr.h> 98 #include <net/route.h> 99 100 #include <net/netmsg2.h> 101 102 #include <netinet/in.h> 103 #include <netinet/in_systm.h> 104 #ifdef INET6 105 #include <netinet/ip6.h> 106 #endif 107 #include <netinet/in_pcb.h> 108 #ifdef INET6 109 #include <netinet6/in6_pcb.h> 110 #endif 111 #include <netinet/in_var.h> 112 #include <netinet/ip_var.h> 113 #ifdef INET6 114 #include <netinet6/ip6_var.h> 115 #include <netinet6/tcp6_var.h> 116 #endif 117 #include <netinet/tcp.h> 118 #include <netinet/tcp_fsm.h> 119 #include <netinet/tcp_seq.h> 120 #include <netinet/tcp_timer.h> 121 #include <netinet/tcp_timer2.h> 122 #include <netinet/tcp_var.h> 123 #include <netinet/tcpip.h> 124 #ifdef TCPDEBUG 125 #include <netinet/tcp_debug.h> 126 #endif 127 128 #ifdef IPSEC 129 #include <netinet6/ipsec.h> 130 #endif /*IPSEC*/ 131 132 /* 133 * TCP protocol interface to socket abstraction. 134 */ 135 extern char *tcpstates[]; /* XXX ??? */ 136 137 static int tcp_attach (struct socket *, struct pru_attach_info *); 138 static void tcp_connect (netmsg_t msg); 139 #ifdef INET6 140 static void tcp6_connect (netmsg_t msg); 141 static int tcp6_connect_oncpu(struct tcpcb *tp, int flags, 142 struct mbuf **mp, 143 struct sockaddr_in6 *sin6, 144 struct in6_addr *addr6); 145 #endif /* INET6 */ 146 static struct tcpcb * 147 tcp_disconnect (struct tcpcb *); 148 static struct tcpcb * 149 tcp_usrclosed (struct tcpcb *); 150 151 #ifdef TCPDEBUG 152 #define TCPDEBUG0 int ostate = 0 153 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 154 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ 155 tcp_trace(TA_USER, ostate, tp, 0, 0, req) 156 #else 157 #define TCPDEBUG0 158 #define TCPDEBUG1() 159 #define TCPDEBUG2(req) 160 #endif 161 162 static int tcp_lport_extension = 1; 163 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lportext, CTLFLAG_RW, 164 &tcp_lport_extension, 0, ""); 165 166 /* 167 * For some ill optimized programs, which try to use TCP_NOPUSH 168 * to improve performance, will have small amount of data sits 169 * in the sending buffer. These small amount of data will _not_ 170 * be pushed into the network until more data are written into 171 * the socket or the socket write side is shutdown. 172 */ 173 static int tcp_disable_nopush = 1; 174 SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_nopush, CTLFLAG_RW, 175 &tcp_disable_nopush, 0, "TCP_NOPUSH socket option will have no effect"); 176 177 /* 178 * TCP attaches to socket via pru_attach(), reserving space, 179 * and an internet control block. This is likely occuring on 180 * cpu0 and may have to move later when we bind/connect. 181 */ 182 static void 183 tcp_usr_attach(netmsg_t msg) 184 { 185 struct socket *so = msg->base.nm_so; 186 struct pru_attach_info *ai = msg->attach.nm_ai; 187 int error; 188 struct inpcb *inp; 189 struct tcpcb *tp = NULL; 190 TCPDEBUG0; 191 192 soreference(so); 193 inp = so->so_pcb; 194 TCPDEBUG1(); 195 if (inp) { 196 error = EISCONN; 197 goto out; 198 } 199 200 error = tcp_attach(so, ai); 201 if (error) 202 goto out; 203 204 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 205 so->so_linger = TCP_LINGERTIME; 206 tp = sototcpcb(so); 207 out: 208 sofree(so); /* from ref above */ 209 TCPDEBUG2(PRU_ATTACH); 210 lwkt_replymsg(&msg->lmsg, error); 211 } 212 213 /* 214 * pru_detach() detaches the TCP protocol from the socket. 215 * If the protocol state is non-embryonic, then can't 216 * do this directly: have to initiate a pru_disconnect(), 217 * which may finish later; embryonic TCB's can just 218 * be discarded here. 219 */ 220 static void 221 tcp_usr_detach(netmsg_t msg) 222 { 223 struct socket *so = msg->base.nm_so; 224 int error = 0; 225 struct inpcb *inp; 226 struct tcpcb *tp; 227 TCPDEBUG0; 228 229 inp = so->so_pcb; 230 231 /* 232 * If the inp is already detached it may have been due to an async 233 * close. Just return as if no error occured. 234 * 235 * It's possible for the tcpcb (tp) to disconnect from the inp due 236 * to tcp_drop()->tcp_close() being called. This may occur *after* 237 * the detach message has been queued so we may find a NULL tp here. 238 */ 239 if (inp) { 240 if ((tp = intotcpcb(inp)) != NULL) { 241 TCPDEBUG1(); 242 tp = tcp_disconnect(tp); 243 TCPDEBUG2(PRU_DETACH); 244 } 245 } 246 lwkt_replymsg(&msg->lmsg, error); 247 } 248 249 /* 250 * NOTE: ignore_error is non-zero for certain disconnection races 251 * which we want to silently allow, otherwise close() may return 252 * an unexpected error. 253 * 254 * NOTE: The variables (msg) and (tp) are assumed. 255 */ 256 #define COMMON_START(so, inp, ignore_error) \ 257 TCPDEBUG0; \ 258 \ 259 inp = so->so_pcb; \ 260 do { \ 261 if (inp == NULL) { \ 262 error = ignore_error ? 0 : EINVAL; \ 263 tp = NULL; \ 264 goto out; \ 265 } \ 266 tp = intotcpcb(inp); \ 267 TCPDEBUG1(); \ 268 } while(0) 269 270 #define COMMON_END1(req, noreply) \ 271 out: do { \ 272 TCPDEBUG2(req); \ 273 if (!(noreply)) \ 274 lwkt_replymsg(&msg->lmsg, error); \ 275 return; \ 276 } while(0) 277 278 #define COMMON_END(req) COMMON_END1((req), 0) 279 280 /* 281 * Give the socket an address. 282 */ 283 static void 284 tcp_usr_bind(netmsg_t msg) 285 { 286 struct socket *so = msg->bind.base.nm_so; 287 struct sockaddr *nam = msg->bind.nm_nam; 288 struct thread *td = msg->bind.nm_td; 289 int error = 0; 290 struct inpcb *inp; 291 struct tcpcb *tp; 292 struct sockaddr_in *sinp; 293 294 COMMON_START(so, inp, 0); 295 296 /* 297 * Must check for multicast addresses and disallow binding 298 * to them. 299 */ 300 sinp = (struct sockaddr_in *)nam; 301 if (sinp->sin_family == AF_INET && 302 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 303 error = EAFNOSUPPORT; 304 goto out; 305 } 306 error = in_pcbbind(inp, nam, td); 307 if (error) 308 goto out; 309 COMMON_END(PRU_BIND); 310 311 } 312 313 #ifdef INET6 314 315 static void 316 tcp6_usr_bind(netmsg_t msg) 317 { 318 struct socket *so = msg->bind.base.nm_so; 319 struct sockaddr *nam = msg->bind.nm_nam; 320 struct thread *td = msg->bind.nm_td; 321 int error = 0; 322 struct inpcb *inp; 323 struct tcpcb *tp; 324 struct sockaddr_in6 *sin6p; 325 326 COMMON_START(so, inp, 0); 327 328 /* 329 * Must check for multicast addresses and disallow binding 330 * to them. 331 */ 332 sin6p = (struct sockaddr_in6 *)nam; 333 if (sin6p->sin6_family == AF_INET6 && 334 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 335 error = EAFNOSUPPORT; 336 goto out; 337 } 338 inp->inp_vflag &= ~INP_IPV4; 339 inp->inp_vflag |= INP_IPV6; 340 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { 341 if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) 342 inp->inp_vflag |= INP_IPV4; 343 else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 344 struct sockaddr_in sin; 345 346 in6_sin6_2_sin(&sin, sin6p); 347 inp->inp_vflag |= INP_IPV4; 348 inp->inp_vflag &= ~INP_IPV6; 349 error = in_pcbbind(inp, (struct sockaddr *)&sin, td); 350 goto out; 351 } 352 } 353 error = in6_pcbbind(inp, nam, td); 354 if (error) 355 goto out; 356 COMMON_END(PRU_BIND); 357 } 358 #endif /* INET6 */ 359 360 #ifdef SMP 361 362 struct netmsg_inswildcard { 363 struct netmsg_base base; 364 struct inpcb *nm_inp; 365 }; 366 367 static void 368 in_pcbinswildcardhash_handler(netmsg_t msg) 369 { 370 struct netmsg_inswildcard *nm = (struct netmsg_inswildcard *)msg; 371 int cpu = mycpuid, nextcpu; 372 373 in_pcbinswildcardhash_oncpu(nm->nm_inp, &tcbinfo[cpu]); 374 375 nextcpu = cpu + 1; 376 if (nextcpu < ncpus2) 377 lwkt_forwardmsg(netisr_portfn(nextcpu), &nm->base.lmsg); 378 else 379 lwkt_replymsg(&nm->base.lmsg, 0); 380 } 381 382 #endif 383 384 /* 385 * Prepare to accept connections. 386 */ 387 static void 388 tcp_usr_listen(netmsg_t msg) 389 { 390 struct socket *so = msg->listen.base.nm_so; 391 struct thread *td = msg->listen.nm_td; 392 int error = 0; 393 struct inpcb *inp; 394 struct tcpcb *tp; 395 #ifdef SMP 396 struct netmsg_inswildcard nm; 397 #endif 398 399 COMMON_START(so, inp, 0); 400 401 if (tp->t_flags & TF_LISTEN) 402 goto out; 403 404 if (inp->inp_lport == 0) { 405 error = in_pcbbind(inp, NULL, td); 406 if (error) 407 goto out; 408 } 409 410 tp->t_state = TCPS_LISTEN; 411 tp->t_flags |= TF_LISTEN; 412 tp->tt_msg = NULL; /* Catch any invalid timer usage */ 413 414 #ifdef SMP 415 if (ncpus > 1) { 416 /* 417 * We have to set the flag because we can't have other cpus 418 * messing with our inp's flags. 419 */ 420 KASSERT(!(inp->inp_flags & INP_CONNECTED), 421 ("already on connhash")); 422 KASSERT(!(inp->inp_flags & INP_WILDCARD), 423 ("already on wildcardhash")); 424 KASSERT(!(inp->inp_flags & INP_WILDCARD_MP), 425 ("already on MP wildcardhash")); 426 inp->inp_flags |= INP_WILDCARD_MP; 427 428 KKASSERT(so->so_port == netisr_portfn(0)); 429 KKASSERT(&curthread->td_msgport == netisr_portfn(0)); 430 KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]); 431 432 netmsg_init(&nm.base, NULL, &curthread->td_msgport, 433 MSGF_PRIORITY, in_pcbinswildcardhash_handler); 434 nm.nm_inp = inp; 435 lwkt_domsg(netisr_portfn(1), &nm.base.lmsg, 0); 436 } 437 #endif 438 in_pcbinswildcardhash(inp); 439 COMMON_END(PRU_LISTEN); 440 } 441 442 #ifdef INET6 443 444 static void 445 tcp6_usr_listen(netmsg_t msg) 446 { 447 struct socket *so = msg->listen.base.nm_so; 448 struct thread *td = msg->listen.nm_td; 449 int error = 0; 450 struct inpcb *inp; 451 struct tcpcb *tp; 452 #ifdef SMP 453 struct netmsg_inswildcard nm; 454 #endif 455 456 COMMON_START(so, inp, 0); 457 458 if (tp->t_flags & TF_LISTEN) 459 goto out; 460 461 if (inp->inp_lport == 0) { 462 if (!(inp->inp_flags & IN6P_IPV6_V6ONLY)) 463 inp->inp_vflag |= INP_IPV4; 464 else 465 inp->inp_vflag &= ~INP_IPV4; 466 error = in6_pcbbind(inp, NULL, td); 467 if (error) 468 goto out; 469 } 470 471 tp->t_state = TCPS_LISTEN; 472 tp->t_flags |= TF_LISTEN; 473 tp->tt_msg = NULL; /* Catch any invalid timer usage */ 474 475 #ifdef SMP 476 if (ncpus > 1) { 477 /* 478 * We have to set the flag because we can't have other cpus 479 * messing with our inp's flags. 480 */ 481 KASSERT(!(inp->inp_flags & INP_CONNECTED), 482 ("already on connhash")); 483 KASSERT(!(inp->inp_flags & INP_WILDCARD), 484 ("already on wildcardhash")); 485 KASSERT(!(inp->inp_flags & INP_WILDCARD_MP), 486 ("already on MP wildcardhash")); 487 inp->inp_flags |= INP_WILDCARD_MP; 488 489 KKASSERT(so->so_port == netisr_portfn(0)); 490 KKASSERT(&curthread->td_msgport == netisr_portfn(0)); 491 KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]); 492 493 netmsg_init(&nm.base, NULL, &curthread->td_msgport, 494 MSGF_PRIORITY, in_pcbinswildcardhash_handler); 495 nm.nm_inp = inp; 496 lwkt_domsg(netisr_portfn(1), &nm.base.lmsg, 0); 497 } 498 #endif 499 in_pcbinswildcardhash(inp); 500 COMMON_END(PRU_LISTEN); 501 } 502 #endif /* INET6 */ 503 504 /* 505 * Initiate connection to peer. 506 * Create a template for use in transmissions on this connection. 507 * Enter SYN_SENT state, and mark socket as connecting. 508 * Start keep-alive timer, and seed output sequence space. 509 * Send initial segment on connection. 510 */ 511 static void 512 tcp_usr_connect(netmsg_t msg) 513 { 514 struct socket *so = msg->connect.base.nm_so; 515 struct sockaddr *nam = msg->connect.nm_nam; 516 struct thread *td = msg->connect.nm_td; 517 int error = 0; 518 struct inpcb *inp; 519 struct tcpcb *tp; 520 struct sockaddr_in *sinp; 521 522 COMMON_START(so, inp, 0); 523 524 /* 525 * Must disallow TCP ``connections'' to multicast addresses. 526 */ 527 sinp = (struct sockaddr_in *)nam; 528 if (sinp->sin_family == AF_INET 529 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 530 error = EAFNOSUPPORT; 531 goto out; 532 } 533 534 if (!prison_remote_ip(td, (struct sockaddr*)sinp)) { 535 error = EAFNOSUPPORT; /* IPv6 only jail */ 536 goto out; 537 } 538 539 tcp_connect(msg); 540 /* msg is invalid now */ 541 return; 542 out: 543 if (msg->connect.nm_m) { 544 m_freem(msg->connect.nm_m); 545 msg->connect.nm_m = NULL; 546 } 547 lwkt_replymsg(&msg->lmsg, error); 548 } 549 550 #ifdef INET6 551 552 static void 553 tcp6_usr_connect(netmsg_t msg) 554 { 555 struct socket *so = msg->connect.base.nm_so; 556 struct sockaddr *nam = msg->connect.nm_nam; 557 struct thread *td = msg->connect.nm_td; 558 int error = 0; 559 struct inpcb *inp; 560 struct tcpcb *tp; 561 struct sockaddr_in6 *sin6p; 562 563 COMMON_START(so, inp, 0); 564 565 /* 566 * Must disallow TCP ``connections'' to multicast addresses. 567 */ 568 sin6p = (struct sockaddr_in6 *)nam; 569 if (sin6p->sin6_family == AF_INET6 570 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 571 error = EAFNOSUPPORT; 572 goto out; 573 } 574 575 if (!prison_remote_ip(td, nam)) { 576 error = EAFNOSUPPORT; /* IPv4 only jail */ 577 goto out; 578 } 579 580 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 581 struct sockaddr_in *sinp; 582 583 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { 584 error = EINVAL; 585 goto out; 586 } 587 sinp = kmalloc(sizeof(*sinp), M_LWKTMSG, M_INTWAIT); 588 in6_sin6_2_sin(sinp, sin6p); 589 inp->inp_vflag |= INP_IPV4; 590 inp->inp_vflag &= ~INP_IPV6; 591 msg->connect.nm_nam = (struct sockaddr *)sinp; 592 msg->connect.nm_reconnect |= NMSG_RECONNECT_NAMALLOC; 593 tcp_connect(msg); 594 /* msg is invalid now */ 595 return; 596 } 597 inp->inp_vflag &= ~INP_IPV4; 598 inp->inp_vflag |= INP_IPV6; 599 inp->inp_inc.inc_isipv6 = 1; 600 601 msg->connect.nm_reconnect |= NMSG_RECONNECT_FALLBACK; 602 tcp6_connect(msg); 603 /* msg is invalid now */ 604 return; 605 out: 606 if (msg->connect.nm_m) { 607 m_freem(msg->connect.nm_m); 608 msg->connect.nm_m = NULL; 609 } 610 lwkt_replymsg(&msg->lmsg, error); 611 } 612 613 #endif /* INET6 */ 614 615 /* 616 * Initiate disconnect from peer. 617 * If connection never passed embryonic stage, just drop; 618 * else if don't need to let data drain, then can just drop anyways, 619 * else have to begin TCP shutdown process: mark socket disconnecting, 620 * drain unread data, state switch to reflect user close, and 621 * send segment (e.g. FIN) to peer. Socket will be really disconnected 622 * when peer sends FIN and acks ours. 623 * 624 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 625 */ 626 static void 627 tcp_usr_disconnect(netmsg_t msg) 628 { 629 struct socket *so = msg->disconnect.base.nm_so; 630 int error = 0; 631 struct inpcb *inp; 632 struct tcpcb *tp; 633 634 COMMON_START(so, inp, 1); 635 tp = tcp_disconnect(tp); 636 COMMON_END(PRU_DISCONNECT); 637 } 638 639 /* 640 * Accept a connection. Essentially all the work is 641 * done at higher levels; just return the address 642 * of the peer, storing through addr. 643 */ 644 static void 645 tcp_usr_accept(netmsg_t msg) 646 { 647 struct socket *so = msg->accept.base.nm_so; 648 struct sockaddr **nam = msg->accept.nm_nam; 649 int error = 0; 650 struct inpcb *inp; 651 struct tcpcb *tp = NULL; 652 TCPDEBUG0; 653 654 inp = so->so_pcb; 655 if (so->so_state & SS_ISDISCONNECTED) { 656 error = ECONNABORTED; 657 goto out; 658 } 659 if (inp == 0) { 660 error = EINVAL; 661 goto out; 662 } 663 664 tp = intotcpcb(inp); 665 TCPDEBUG1(); 666 in_setpeeraddr(so, nam); 667 COMMON_END(PRU_ACCEPT); 668 } 669 670 #ifdef INET6 671 static void 672 tcp6_usr_accept(netmsg_t msg) 673 { 674 struct socket *so = msg->accept.base.nm_so; 675 struct sockaddr **nam = msg->accept.nm_nam; 676 int error = 0; 677 struct inpcb *inp; 678 struct tcpcb *tp = NULL; 679 TCPDEBUG0; 680 681 inp = so->so_pcb; 682 683 if (so->so_state & SS_ISDISCONNECTED) { 684 error = ECONNABORTED; 685 goto out; 686 } 687 if (inp == 0) { 688 error = EINVAL; 689 goto out; 690 } 691 tp = intotcpcb(inp); 692 TCPDEBUG1(); 693 in6_mapped_peeraddr(so, nam); 694 COMMON_END(PRU_ACCEPT); 695 } 696 #endif /* INET6 */ 697 /* 698 * Mark the connection as being incapable of further output. 699 */ 700 static void 701 tcp_usr_shutdown(netmsg_t msg) 702 { 703 struct socket *so = msg->shutdown.base.nm_so; 704 int error = 0; 705 struct inpcb *inp; 706 struct tcpcb *tp; 707 708 COMMON_START(so, inp, 0); 709 socantsendmore(so); 710 tp = tcp_usrclosed(tp); 711 if (tp) 712 error = tcp_output(tp); 713 COMMON_END(PRU_SHUTDOWN); 714 } 715 716 /* 717 * After a receive, possibly send window update to peer. 718 */ 719 static void 720 tcp_usr_rcvd(netmsg_t msg) 721 { 722 struct socket *so = msg->rcvd.base.nm_so; 723 int error = 0, noreply = 0; 724 struct inpcb *inp; 725 struct tcpcb *tp; 726 727 COMMON_START(so, inp, 0); 728 729 if (msg->rcvd.nm_pru_flags & PRUR_ASYNC) { 730 noreply = 1; 731 so_async_rcvd_reply(so); 732 } 733 tcp_output(tp); 734 735 COMMON_END1(PRU_RCVD, noreply); 736 } 737 738 /* 739 * Do a send by putting data in output queue and updating urgent 740 * marker if URG set. Possibly send more data. Unlike the other 741 * pru_*() routines, the mbuf chains are our responsibility. We 742 * must either enqueue them or free them. The other pru_* routines 743 * generally are caller-frees. 744 */ 745 static void 746 tcp_usr_send(netmsg_t msg) 747 { 748 struct socket *so = msg->send.base.nm_so; 749 int flags = msg->send.nm_flags; 750 struct mbuf *m = msg->send.nm_m; 751 int error = 0; 752 struct inpcb *inp; 753 struct tcpcb *tp; 754 TCPDEBUG0; 755 756 KKASSERT(msg->send.nm_control == NULL); 757 KKASSERT(msg->send.nm_addr == NULL); 758 KKASSERT((flags & PRUS_FREEADDR) == 0); 759 760 inp = so->so_pcb; 761 762 if (inp == NULL) { 763 /* 764 * OOPS! we lost a race, the TCP session got reset after 765 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a 766 * network interrupt in the non-critical section of sosend(). 767 */ 768 m_freem(m); 769 error = ECONNRESET; /* XXX EPIPE? */ 770 tp = NULL; 771 TCPDEBUG1(); 772 goto out; 773 } 774 tp = intotcpcb(inp); 775 TCPDEBUG1(); 776 777 #ifdef foo 778 /* 779 * This is no longer necessary, since: 780 * - sosendtcp() has already checked it for us 781 * - It does not work with asynchronized send 782 */ 783 784 /* 785 * Don't let too much OOB data build up 786 */ 787 if (flags & PRUS_OOB) { 788 if (ssb_space(&so->so_snd) < -512) { 789 m_freem(m); 790 error = ENOBUFS; 791 goto out; 792 } 793 } 794 #endif 795 796 /* 797 * Pump the data into the socket. 798 */ 799 if (m) 800 ssb_appendstream(&so->so_snd, m); 801 if (flags & PRUS_OOB) { 802 /* 803 * According to RFC961 (Assigned Protocols), 804 * the urgent pointer points to the last octet 805 * of urgent data. We continue, however, 806 * to consider it to indicate the first octet 807 * of data past the urgent section. 808 * Otherwise, snd_up should be one lower. 809 */ 810 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 811 tp->t_flags |= TF_FORCE; 812 error = tcp_output(tp); 813 tp->t_flags &= ~TF_FORCE; 814 } else { 815 if (flags & PRUS_EOF) { 816 /* 817 * Close the send side of the connection after 818 * the data is sent. 819 */ 820 socantsendmore(so); 821 tp = tcp_usrclosed(tp); 822 } 823 if (tp != NULL) { 824 if (flags & PRUS_MORETOCOME) 825 tp->t_flags |= TF_MORETOCOME; 826 error = tcp_output(tp); 827 if (flags & PRUS_MORETOCOME) 828 tp->t_flags &= ~TF_MORETOCOME; 829 } 830 } 831 COMMON_END1((flags & PRUS_OOB) ? PRU_SENDOOB : 832 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND), 833 (flags & PRUS_NOREPLY)); 834 } 835 836 /* 837 * NOTE: (so) is referenced from soabort*() and netmsg_pru_abort() 838 * will sofree() it when we return. 839 */ 840 static void 841 tcp_usr_abort(netmsg_t msg) 842 { 843 struct socket *so = msg->abort.base.nm_so; 844 int error = 0; 845 struct inpcb *inp; 846 struct tcpcb *tp; 847 848 COMMON_START(so, inp, 1); 849 tp = tcp_drop(tp, ECONNABORTED); 850 COMMON_END(PRU_ABORT); 851 } 852 853 /* 854 * Receive out-of-band data. 855 */ 856 static void 857 tcp_usr_rcvoob(netmsg_t msg) 858 { 859 struct socket *so = msg->rcvoob.base.nm_so; 860 struct mbuf *m = msg->rcvoob.nm_m; 861 int flags = msg->rcvoob.nm_flags; 862 int error = 0; 863 struct inpcb *inp; 864 struct tcpcb *tp; 865 866 COMMON_START(so, inp, 0); 867 if ((so->so_oobmark == 0 && 868 (so->so_state & SS_RCVATMARK) == 0) || 869 so->so_options & SO_OOBINLINE || 870 tp->t_oobflags & TCPOOB_HADDATA) { 871 error = EINVAL; 872 goto out; 873 } 874 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 875 error = EWOULDBLOCK; 876 goto out; 877 } 878 m->m_len = 1; 879 *mtod(m, caddr_t) = tp->t_iobc; 880 if ((flags & MSG_PEEK) == 0) 881 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 882 COMMON_END(PRU_RCVOOB); 883 } 884 885 static void 886 tcp_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) 887 { 888 in_savefaddr(so, faddr); 889 } 890 891 #ifdef INET6 892 static void 893 tcp6_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) 894 { 895 in6_mapped_savefaddr(so, faddr); 896 } 897 #endif 898 899 /* xxx - should be const */ 900 struct pr_usrreqs tcp_usrreqs = { 901 .pru_abort = tcp_usr_abort, 902 .pru_accept = tcp_usr_accept, 903 .pru_attach = tcp_usr_attach, 904 .pru_bind = tcp_usr_bind, 905 .pru_connect = tcp_usr_connect, 906 .pru_connect2 = pr_generic_notsupp, 907 .pru_control = in_control_dispatch, 908 .pru_detach = tcp_usr_detach, 909 .pru_disconnect = tcp_usr_disconnect, 910 .pru_listen = tcp_usr_listen, 911 .pru_peeraddr = in_setpeeraddr_dispatch, 912 .pru_rcvd = tcp_usr_rcvd, 913 .pru_rcvoob = tcp_usr_rcvoob, 914 .pru_send = tcp_usr_send, 915 .pru_sense = pru_sense_null, 916 .pru_shutdown = tcp_usr_shutdown, 917 .pru_sockaddr = in_setsockaddr_dispatch, 918 .pru_sosend = sosendtcp, 919 .pru_soreceive = sorecvtcp, 920 .pru_savefaddr = tcp_usr_savefaddr 921 }; 922 923 #ifdef INET6 924 struct pr_usrreqs tcp6_usrreqs = { 925 .pru_abort = tcp_usr_abort, 926 .pru_accept = tcp6_usr_accept, 927 .pru_attach = tcp_usr_attach, 928 .pru_bind = tcp6_usr_bind, 929 .pru_connect = tcp6_usr_connect, 930 .pru_connect2 = pr_generic_notsupp, 931 .pru_control = in6_control_dispatch, 932 .pru_detach = tcp_usr_detach, 933 .pru_disconnect = tcp_usr_disconnect, 934 .pru_listen = tcp6_usr_listen, 935 .pru_peeraddr = in6_mapped_peeraddr_dispatch, 936 .pru_rcvd = tcp_usr_rcvd, 937 .pru_rcvoob = tcp_usr_rcvoob, 938 .pru_send = tcp_usr_send, 939 .pru_sense = pru_sense_null, 940 .pru_shutdown = tcp_usr_shutdown, 941 .pru_sockaddr = in6_mapped_sockaddr_dispatch, 942 .pru_sosend = sosendtcp, 943 .pru_soreceive = sorecvtcp, 944 .pru_savefaddr = tcp6_usr_savefaddr 945 }; 946 #endif /* INET6 */ 947 948 static int 949 tcp_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m, 950 struct sockaddr_in *sin, struct sockaddr_in *if_sin) 951 { 952 struct inpcb *inp = tp->t_inpcb, *oinp; 953 struct socket *so = inp->inp_socket; 954 struct route *ro = &inp->inp_route; 955 956 oinp = in_pcblookup_hash(&tcbinfo[mycpu->gd_cpuid], 957 sin->sin_addr, sin->sin_port, 958 (inp->inp_laddr.s_addr != INADDR_ANY ? 959 inp->inp_laddr : if_sin->sin_addr), 960 inp->inp_lport, 0, NULL); 961 if (oinp != NULL) { 962 m_freem(m); 963 return (EADDRINUSE); 964 } 965 if (inp->inp_laddr.s_addr == INADDR_ANY) 966 inp->inp_laddr = if_sin->sin_addr; 967 inp->inp_faddr = sin->sin_addr; 968 inp->inp_fport = sin->sin_port; 969 inp->inp_cpcbinfo = &tcbinfo[mycpu->gd_cpuid]; 970 in_pcbinsconnhash(inp); 971 972 /* 973 * We are now on the inpcb's owner CPU, if the cached route was 974 * freed because the rtentry's owner CPU is not the current CPU 975 * (e.g. in tcp_connect()), then we try to reallocate it here with 976 * the hope that a rtentry may be cloned from a RTF_PRCLONING 977 * rtentry. 978 */ 979 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/ 980 ro->ro_rt == NULL) { 981 bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); 982 ro->ro_dst.sa_family = AF_INET; 983 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 984 ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = 985 sin->sin_addr; 986 rtalloc(ro); 987 } 988 989 /* 990 * Now that no more errors can occur, change the protocol processing 991 * port to the current thread (which is the correct thread). 992 * 993 * Create TCP timer message now; we are on the tcpcb's owner 994 * CPU/thread. 995 */ 996 tcp_create_timermsg(tp, &curthread->td_msgport); 997 998 /* 999 * Compute window scaling to request. Use a larger scaling then 1000 * needed for the initial receive buffer in case the receive buffer 1001 * gets expanded. 1002 */ 1003 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 1004 tp->request_r_scale = TCP_MIN_WINSHIFT; 1005 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1006 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat 1007 ) { 1008 tp->request_r_scale++; 1009 } 1010 1011 soisconnecting(so); 1012 tcpstat.tcps_connattempt++; 1013 tp->t_state = TCPS_SYN_SENT; 1014 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1015 tp->iss = tcp_new_isn(tp); 1016 tcp_sendseqinit(tp); 1017 if (m) { 1018 ssb_appendstream(&so->so_snd, m); 1019 m = NULL; 1020 if (flags & PRUS_OOB) 1021 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1022 } 1023 1024 /* 1025 * Close the send side of the connection after 1026 * the data is sent if flagged. 1027 */ 1028 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1029 socantsendmore(so); 1030 tp = tcp_usrclosed(tp); 1031 } 1032 return (tcp_output(tp)); 1033 } 1034 1035 /* 1036 * Common subroutine to open a TCP connection to remote host specified 1037 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 1038 * port number if needed. Call in_pcbladdr to do the routing and to choose 1039 * a local host address (interface). 1040 * Initialize connection parameters and enter SYN-SENT state. 1041 */ 1042 static void 1043 tcp_connect(netmsg_t msg) 1044 { 1045 struct socket *so = msg->connect.base.nm_so; 1046 struct sockaddr *nam = msg->connect.nm_nam; 1047 struct thread *td = msg->connect.nm_td; 1048 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 1049 struct sockaddr_in *if_sin; 1050 struct inpcb *inp; 1051 struct tcpcb *tp; 1052 int error, calc_laddr = 1; 1053 #ifdef SMP 1054 lwkt_port_t port; 1055 #endif 1056 1057 COMMON_START(so, inp, 0); 1058 1059 /* 1060 * Reconnect our pcb if we have to 1061 */ 1062 if (msg->connect.nm_reconnect & NMSG_RECONNECT_RECONNECT) { 1063 msg->connect.nm_reconnect &= ~NMSG_RECONNECT_RECONNECT; 1064 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1065 } 1066 1067 /* 1068 * Bind if we have to 1069 */ 1070 if (inp->inp_lport == 0) { 1071 if (tcp_lport_extension) { 1072 KKASSERT(inp->inp_laddr.s_addr == INADDR_ANY); 1073 1074 error = in_pcbladdr(inp, nam, &if_sin, td); 1075 if (error) 1076 goto out; 1077 inp->inp_laddr.s_addr = if_sin->sin_addr.s_addr; 1078 1079 error = in_pcbconn_bind(inp, nam, td); 1080 if (error) 1081 goto out; 1082 1083 calc_laddr = 0; 1084 } else { 1085 error = in_pcbbind(inp, NULL, td); 1086 if (error) 1087 goto out; 1088 } 1089 } 1090 1091 if (calc_laddr) { 1092 /* 1093 * Calculate the correct protocol processing thread. The 1094 * connect operation must run there. Set the forwarding 1095 * port before we forward the message or it will get bounced 1096 * right back to us. 1097 */ 1098 error = in_pcbladdr(inp, nam, &if_sin, td); 1099 if (error) 1100 goto out; 1101 } 1102 KKASSERT(inp->inp_socket == so); 1103 1104 #ifdef SMP 1105 port = tcp_addrport(sin->sin_addr.s_addr, sin->sin_port, 1106 (inp->inp_laddr.s_addr ? 1107 inp->inp_laddr.s_addr : if_sin->sin_addr.s_addr), 1108 inp->inp_lport); 1109 1110 if (port != &curthread->td_msgport) { 1111 struct route *ro = &inp->inp_route; 1112 1113 /* 1114 * in_pcbladdr() may have allocated a route entry for us 1115 * on the current CPU, but we need a route entry on the 1116 * inpcb's owner CPU, so free it here. 1117 */ 1118 if (ro->ro_rt != NULL) 1119 RTFREE(ro->ro_rt); 1120 bzero(ro, sizeof(*ro)); 1121 1122 /* 1123 * We are moving the protocol processing port the socket 1124 * is on, we have to unlink here and re-link on the 1125 * target cpu. 1126 */ 1127 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1128 sosetport(so, port); 1129 msg->connect.nm_reconnect |= NMSG_RECONNECT_RECONNECT; 1130 msg->connect.base.nm_dispatch = tcp_connect; 1131 1132 lwkt_forwardmsg(port, &msg->connect.base.lmsg); 1133 /* msg invalid now */ 1134 return; 1135 } 1136 #else 1137 KKASSERT(so->so_port == &curthread->td_msgport); 1138 #endif 1139 error = tcp_connect_oncpu(tp, msg->connect.nm_flags, 1140 msg->connect.nm_m, sin, if_sin); 1141 msg->connect.nm_m = NULL; 1142 out: 1143 if (msg->connect.nm_m) { 1144 m_freem(msg->connect.nm_m); 1145 msg->connect.nm_m = NULL; 1146 } 1147 if (msg->connect.nm_reconnect & NMSG_RECONNECT_NAMALLOC) { 1148 kfree(msg->connect.nm_nam, M_LWKTMSG); 1149 msg->connect.nm_nam = NULL; 1150 } 1151 lwkt_replymsg(&msg->connect.base.lmsg, error); 1152 /* msg invalid now */ 1153 } 1154 1155 #ifdef INET6 1156 1157 static void 1158 tcp6_connect(netmsg_t msg) 1159 { 1160 struct tcpcb *tp; 1161 struct socket *so = msg->connect.base.nm_so; 1162 struct sockaddr *nam = msg->connect.nm_nam; 1163 struct thread *td = msg->connect.nm_td; 1164 struct inpcb *inp; 1165 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; 1166 struct in6_addr *addr6; 1167 #ifdef SMP 1168 lwkt_port_t port; 1169 #endif 1170 int error; 1171 1172 COMMON_START(so, inp, 0); 1173 1174 /* 1175 * Reconnect our pcb if we have to 1176 */ 1177 if (msg->connect.nm_reconnect & NMSG_RECONNECT_RECONNECT) { 1178 msg->connect.nm_reconnect &= ~NMSG_RECONNECT_RECONNECT; 1179 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1180 } 1181 1182 /* 1183 * Bind if we have to 1184 */ 1185 if (inp->inp_lport == 0) { 1186 error = in6_pcbbind(inp, NULL, td); 1187 if (error) 1188 goto out; 1189 } 1190 1191 /* 1192 * Cannot simply call in_pcbconnect, because there might be an 1193 * earlier incarnation of this same connection still in 1194 * TIME_WAIT state, creating an ADDRINUSE error. 1195 */ 1196 error = in6_pcbladdr(inp, nam, &addr6, td); 1197 if (error) 1198 goto out; 1199 1200 #ifdef SMP 1201 port = tcp6_addrport(); /* XXX hack for now, always cpu0 */ 1202 1203 if (port != &curthread->td_msgport) { 1204 struct route *ro = &inp->inp_route; 1205 1206 /* 1207 * in_pcbladdr() may have allocated a route entry for us 1208 * on the current CPU, but we need a route entry on the 1209 * inpcb's owner CPU, so free it here. 1210 */ 1211 if (ro->ro_rt != NULL) 1212 RTFREE(ro->ro_rt); 1213 bzero(ro, sizeof(*ro)); 1214 1215 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1216 sosetport(so, port); 1217 msg->connect.nm_reconnect |= NMSG_RECONNECT_RECONNECT; 1218 msg->connect.base.nm_dispatch = tcp6_connect; 1219 1220 lwkt_forwardmsg(port, &msg->connect.base.lmsg); 1221 /* msg invalid now */ 1222 return; 1223 } 1224 #endif 1225 error = tcp6_connect_oncpu(tp, msg->connect.nm_flags, 1226 &msg->connect.nm_m, sin6, addr6); 1227 /* nm_m may still be intact */ 1228 out: 1229 if (error && (msg->connect.nm_reconnect & NMSG_RECONNECT_FALLBACK)) { 1230 tcp_connect(msg); 1231 /* msg invalid now */ 1232 } else { 1233 if (msg->connect.nm_m) { 1234 m_freem(msg->connect.nm_m); 1235 msg->connect.nm_m = NULL; 1236 } 1237 if (msg->connect.nm_reconnect & NMSG_RECONNECT_NAMALLOC) { 1238 kfree(msg->connect.nm_nam, M_LWKTMSG); 1239 msg->connect.nm_nam = NULL; 1240 } 1241 lwkt_replymsg(&msg->connect.base.lmsg, error); 1242 /* msg invalid now */ 1243 } 1244 } 1245 1246 static int 1247 tcp6_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf **mp, 1248 struct sockaddr_in6 *sin6, struct in6_addr *addr6) 1249 { 1250 struct mbuf *m = *mp; 1251 struct inpcb *inp = tp->t_inpcb; 1252 struct socket *so = inp->inp_socket; 1253 struct inpcb *oinp; 1254 1255 /* 1256 * Cannot simply call in_pcbconnect, because there might be an 1257 * earlier incarnation of this same connection still in 1258 * TIME_WAIT state, creating an ADDRINUSE error. 1259 */ 1260 oinp = in6_pcblookup_hash(inp->inp_cpcbinfo, 1261 &sin6->sin6_addr, sin6->sin6_port, 1262 (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? 1263 addr6 : &inp->in6p_laddr), 1264 inp->inp_lport, 0, NULL); 1265 if (oinp) 1266 return (EADDRINUSE); 1267 1268 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 1269 inp->in6p_laddr = *addr6; 1270 inp->in6p_faddr = sin6->sin6_addr; 1271 inp->inp_fport = sin6->sin6_port; 1272 if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0) 1273 inp->in6p_flowinfo = sin6->sin6_flowinfo; 1274 in_pcbinsconnhash(inp); 1275 1276 /* 1277 * Now that no more errors can occur, change the protocol processing 1278 * port to the current thread (which is the correct thread). 1279 * 1280 * Create TCP timer message now; we are on the tcpcb's owner 1281 * CPU/thread. 1282 */ 1283 tcp_create_timermsg(tp, &curthread->td_msgport); 1284 1285 /* Compute window scaling to request. */ 1286 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 1287 tp->request_r_scale = TCP_MIN_WINSHIFT; 1288 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1289 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat) { 1290 tp->request_r_scale++; 1291 } 1292 1293 soisconnecting(so); 1294 tcpstat.tcps_connattempt++; 1295 tp->t_state = TCPS_SYN_SENT; 1296 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1297 tp->iss = tcp_new_isn(tp); 1298 tcp_sendseqinit(tp); 1299 if (m) { 1300 ssb_appendstream(&so->so_snd, m); 1301 *mp = NULL; 1302 if (flags & PRUS_OOB) 1303 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1304 } 1305 1306 /* 1307 * Close the send side of the connection after 1308 * the data is sent if flagged. 1309 */ 1310 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1311 socantsendmore(so); 1312 tp = tcp_usrclosed(tp); 1313 } 1314 return (tcp_output(tp)); 1315 } 1316 1317 #endif /* INET6 */ 1318 1319 /* 1320 * The new sockopt interface makes it possible for us to block in the 1321 * copyin/out step (if we take a page fault). Taking a page fault while 1322 * in a critical section is probably a Bad Thing. (Since sockets and pcbs 1323 * both now use TSM, there probably isn't any need for this function to 1324 * run in a critical section any more. This needs more examination.) 1325 */ 1326 void 1327 tcp_ctloutput(netmsg_t msg) 1328 { 1329 struct socket *so = msg->base.nm_so; 1330 struct sockopt *sopt = msg->ctloutput.nm_sopt; 1331 int error, opt, optval, opthz; 1332 struct inpcb *inp; 1333 struct tcpcb *tp; 1334 1335 error = 0; 1336 inp = so->so_pcb; 1337 if (inp == NULL) { 1338 error = ECONNRESET; 1339 goto done; 1340 } 1341 1342 if (sopt->sopt_level != IPPROTO_TCP) { 1343 #ifdef INET6 1344 if (INP_CHECK_SOCKAF(so, AF_INET6)) 1345 ip6_ctloutput_dispatch(msg); 1346 else 1347 #endif /* INET6 */ 1348 ip_ctloutput(msg); 1349 /* msg invalid now */ 1350 return; 1351 } 1352 tp = intotcpcb(inp); 1353 1354 switch (sopt->sopt_dir) { 1355 case SOPT_SET: 1356 error = soopt_to_kbuf(sopt, &optval, sizeof optval, 1357 sizeof optval); 1358 if (error) 1359 break; 1360 switch (sopt->sopt_name) { 1361 case TCP_FASTKEEP: 1362 if (optval > 0) 1363 tp->t_keepidle = tp->t_keepintvl; 1364 else 1365 tp->t_keepidle = tcp_keepidle; 1366 tcp_timer_keep_activity(tp, 0); 1367 break; 1368 #ifdef TCP_SIGNATURE 1369 case TCP_SIGNATURE_ENABLE: 1370 if (tp->t_state == TCPS_CLOSED) { 1371 /* 1372 * This is the only safe state that this 1373 * option could be changed. Some segments 1374 * could already have been sent in other 1375 * states. 1376 */ 1377 if (optval > 0) 1378 tp->t_flags |= TF_SIGNATURE; 1379 else 1380 tp->t_flags &= ~TF_SIGNATURE; 1381 } else { 1382 error = EOPNOTSUPP; 1383 } 1384 break; 1385 #endif /* TCP_SIGNATURE */ 1386 case TCP_NODELAY: 1387 case TCP_NOOPT: 1388 switch (sopt->sopt_name) { 1389 case TCP_NODELAY: 1390 opt = TF_NODELAY; 1391 break; 1392 case TCP_NOOPT: 1393 opt = TF_NOOPT; 1394 break; 1395 default: 1396 opt = 0; /* dead code to fool gcc */ 1397 break; 1398 } 1399 1400 if (optval) 1401 tp->t_flags |= opt; 1402 else 1403 tp->t_flags &= ~opt; 1404 break; 1405 1406 case TCP_NOPUSH: 1407 if (tcp_disable_nopush) 1408 break; 1409 if (optval) 1410 tp->t_flags |= TF_NOPUSH; 1411 else { 1412 tp->t_flags &= ~TF_NOPUSH; 1413 error = tcp_output(tp); 1414 } 1415 break; 1416 1417 case TCP_MAXSEG: 1418 /* 1419 * Must be between 0 and maxseg. If the requested 1420 * maxseg is too small to satisfy the desired minmss, 1421 * pump it up (silently so sysctl modifications of 1422 * minmss do not create unexpected program failures). 1423 * Handle degenerate cases. 1424 */ 1425 if (optval > 0 && optval <= tp->t_maxseg) { 1426 if (optval + 40 < tcp_minmss) { 1427 optval = tcp_minmss - 40; 1428 if (optval < 0) 1429 optval = 1; 1430 } 1431 tp->t_maxseg = optval; 1432 } else { 1433 error = EINVAL; 1434 } 1435 break; 1436 1437 case TCP_KEEPINIT: 1438 opthz = ((int64_t)optval * hz) / 1000; 1439 if (opthz >= 1) 1440 tp->t_keepinit = opthz; 1441 else 1442 error = EINVAL; 1443 break; 1444 1445 case TCP_KEEPIDLE: 1446 opthz = ((int64_t)optval * hz) / 1000; 1447 if (opthz >= 1) { 1448 tp->t_keepidle = opthz; 1449 tcp_timer_keep_activity(tp, 0); 1450 } else { 1451 error = EINVAL; 1452 } 1453 break; 1454 1455 case TCP_KEEPINTVL: 1456 opthz = ((int64_t)optval * hz) / 1000; 1457 if (opthz >= 1) { 1458 tp->t_keepintvl = opthz; 1459 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1460 } else { 1461 error = EINVAL; 1462 } 1463 break; 1464 1465 case TCP_KEEPCNT: 1466 if (optval > 0) { 1467 tp->t_keepcnt = optval; 1468 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1469 } else { 1470 error = EINVAL; 1471 } 1472 break; 1473 1474 default: 1475 error = ENOPROTOOPT; 1476 break; 1477 } 1478 break; 1479 1480 case SOPT_GET: 1481 switch (sopt->sopt_name) { 1482 #ifdef TCP_SIGNATURE 1483 case TCP_SIGNATURE_ENABLE: 1484 optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; 1485 break; 1486 #endif /* TCP_SIGNATURE */ 1487 case TCP_NODELAY: 1488 optval = tp->t_flags & TF_NODELAY; 1489 break; 1490 case TCP_MAXSEG: 1491 optval = tp->t_maxseg; 1492 break; 1493 case TCP_NOOPT: 1494 optval = tp->t_flags & TF_NOOPT; 1495 break; 1496 case TCP_NOPUSH: 1497 optval = tp->t_flags & TF_NOPUSH; 1498 break; 1499 case TCP_KEEPINIT: 1500 optval = ((int64_t)tp->t_keepinit * 1000) / hz; 1501 break; 1502 case TCP_KEEPIDLE: 1503 optval = ((int64_t)tp->t_keepidle * 1000) / hz; 1504 break; 1505 case TCP_KEEPINTVL: 1506 optval = ((int64_t)tp->t_keepintvl * 1000) / hz; 1507 break; 1508 case TCP_KEEPCNT: 1509 optval = tp->t_keepcnt; 1510 break; 1511 default: 1512 error = ENOPROTOOPT; 1513 break; 1514 } 1515 if (error == 0) 1516 soopt_from_kbuf(sopt, &optval, sizeof optval); 1517 break; 1518 } 1519 done: 1520 lwkt_replymsg(&msg->lmsg, error); 1521 } 1522 1523 /* 1524 * tcp_sendspace and tcp_recvspace are the default send and receive window 1525 * sizes, respectively. These are obsolescent (this information should 1526 * be set by the route). 1527 * 1528 * Use a default that does not require tcp window scaling to be turned 1529 * on. Individual programs or the administrator can increase the default. 1530 */ 1531 u_long tcp_sendspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1532 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 1533 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); 1534 u_long tcp_recvspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1535 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, 1536 &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 1537 1538 /* 1539 * Attach TCP protocol to socket, allocating internet protocol control 1540 * block, tcp control block, bufer space, and entering LISTEN state 1541 * if to accept connections. 1542 */ 1543 static int 1544 tcp_attach(struct socket *so, struct pru_attach_info *ai) 1545 { 1546 struct tcpcb *tp; 1547 struct inpcb *inp; 1548 int error; 1549 int cpu; 1550 #ifdef INET6 1551 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; 1552 #endif 1553 1554 if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) { 1555 lwkt_gettoken(&so->so_rcv.ssb_token); 1556 error = soreserve(so, tcp_sendspace, tcp_recvspace, 1557 ai->sb_rlimit); 1558 lwkt_reltoken(&so->so_rcv.ssb_token); 1559 if (error) 1560 return (error); 1561 } 1562 atomic_set_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE); 1563 atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); 1564 cpu = mycpu->gd_cpuid; 1565 1566 /* 1567 * Set the default port for protocol processing. This will likely 1568 * change when we connect. 1569 */ 1570 error = in_pcballoc(so, &tcbinfo[cpu]); 1571 if (error) 1572 return (error); 1573 inp = so->so_pcb; 1574 #ifdef INET6 1575 if (isipv6) { 1576 inp->inp_vflag |= INP_IPV6; 1577 inp->in6p_hops = -1; /* use kernel default */ 1578 } 1579 else 1580 #endif 1581 inp->inp_vflag |= INP_IPV4; 1582 tp = tcp_newtcpcb(inp); 1583 if (tp == NULL) { 1584 /* 1585 * Make sure the socket is destroyed by the pcbdetach. 1586 */ 1587 soreference(so); 1588 #ifdef INET6 1589 if (isipv6) 1590 in6_pcbdetach(inp); 1591 else 1592 #endif 1593 in_pcbdetach(inp); 1594 sofree(so); /* from ref above */ 1595 return (ENOBUFS); 1596 } 1597 tp->t_state = TCPS_CLOSED; 1598 return (0); 1599 } 1600 1601 /* 1602 * Initiate (or continue) disconnect. 1603 * If embryonic state, just send reset (once). 1604 * If in ``let data drain'' option and linger null, just drop. 1605 * Otherwise (hard), mark socket disconnecting and drop 1606 * current input data; switch states based on user close, and 1607 * send segment to peer (with FIN). 1608 */ 1609 static struct tcpcb * 1610 tcp_disconnect(struct tcpcb *tp) 1611 { 1612 struct socket *so = tp->t_inpcb->inp_socket; 1613 1614 if (tp->t_state < TCPS_ESTABLISHED) { 1615 tp = tcp_close(tp); 1616 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 1617 tp = tcp_drop(tp, 0); 1618 } else { 1619 lwkt_gettoken(&so->so_rcv.ssb_token); 1620 soisdisconnecting(so); 1621 sbflush(&so->so_rcv.sb); 1622 tp = tcp_usrclosed(tp); 1623 if (tp) 1624 tcp_output(tp); 1625 lwkt_reltoken(&so->so_rcv.ssb_token); 1626 } 1627 return (tp); 1628 } 1629 1630 /* 1631 * User issued close, and wish to trail through shutdown states: 1632 * if never received SYN, just forget it. If got a SYN from peer, 1633 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1634 * If already got a FIN from peer, then almost done; go to LAST_ACK 1635 * state. In all other cases, have already sent FIN to peer (e.g. 1636 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1637 * for peer to send FIN or not respond to keep-alives, etc. 1638 * We can let the user exit from the close as soon as the FIN is acked. 1639 */ 1640 static struct tcpcb * 1641 tcp_usrclosed(struct tcpcb *tp) 1642 { 1643 1644 switch (tp->t_state) { 1645 1646 case TCPS_CLOSED: 1647 case TCPS_LISTEN: 1648 tp->t_state = TCPS_CLOSED; 1649 tp = tcp_close(tp); 1650 break; 1651 1652 case TCPS_SYN_SENT: 1653 case TCPS_SYN_RECEIVED: 1654 tp->t_flags |= TF_NEEDFIN; 1655 break; 1656 1657 case TCPS_ESTABLISHED: 1658 tp->t_state = TCPS_FIN_WAIT_1; 1659 break; 1660 1661 case TCPS_CLOSE_WAIT: 1662 tp->t_state = TCPS_LAST_ACK; 1663 break; 1664 } 1665 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1666 soisdisconnected(tp->t_inpcb->inp_socket); 1667 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 1668 if (tp->t_state == TCPS_FIN_WAIT_2) { 1669 tcp_callout_reset(tp, tp->tt_2msl, tp->t_maxidle, 1670 tcp_timer_2msl); 1671 } 1672 } 1673 return (tp); 1674 } 1675