1 /* 2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1988, 1993 36 * The Regents of the University of California. All rights reserved. 37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. Neither the name of the University nor the names of its contributors 47 * may be used to endorse or promote products derived from this software 48 * without specific prior written permission. 49 * 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 61 * 62 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 63 * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.17 2002/10/11 11:46:44 ume Exp $ 64 */ 65 66 #include "opt_ipsec.h" 67 #include "opt_inet.h" 68 #include "opt_inet6.h" 69 #include "opt_tcpdebug.h" 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/kernel.h> 74 #include <sys/malloc.h> 75 #include <sys/sysctl.h> 76 #include <sys/globaldata.h> 77 #include <sys/thread.h> 78 79 #include <sys/mbuf.h> 80 #ifdef INET6 81 #include <sys/domain.h> 82 #endif /* INET6 */ 83 #include <sys/socket.h> 84 #include <sys/socketvar.h> 85 #include <sys/socketops.h> 86 #include <sys/protosw.h> 87 88 #include <sys/thread2.h> 89 #include <sys/msgport2.h> 90 #include <sys/socketvar2.h> 91 92 #include <net/if.h> 93 #include <net/netisr.h> 94 #include <net/route.h> 95 96 #include <net/netmsg2.h> 97 #include <net/netisr2.h> 98 99 #include <netinet/in.h> 100 #include <netinet/in_systm.h> 101 #ifdef INET6 102 #include <netinet/ip6.h> 103 #endif 104 #include <netinet/in_pcb.h> 105 #ifdef INET6 106 #include <netinet6/in6_pcb.h> 107 #endif 108 #include <netinet/in_var.h> 109 #include <netinet/ip_var.h> 110 #ifdef INET6 111 #include <netinet6/ip6_var.h> 112 #include <netinet6/tcp6_var.h> 113 #endif 114 #include <netinet/tcp.h> 115 #include <netinet/tcp_fsm.h> 116 #include <netinet/tcp_seq.h> 117 #include <netinet/tcp_timer.h> 118 #include <netinet/tcp_timer2.h> 119 #include <netinet/tcp_var.h> 120 #include <netinet/tcpip.h> 121 #ifdef TCPDEBUG 122 #include <netinet/tcp_debug.h> 123 #endif 124 125 #ifdef IPSEC 126 #include <netinet6/ipsec.h> 127 #endif /*IPSEC*/ 128 129 /* 130 * TCP protocol interface to socket abstraction. 131 */ 132 extern char *tcpstates[]; /* XXX ??? */ 133 134 static int tcp_attach (struct socket *, struct pru_attach_info *); 135 static void tcp_connect (netmsg_t msg); 136 #ifdef INET6 137 static void tcp6_connect (netmsg_t msg); 138 static int tcp6_connect_oncpu(struct tcpcb *tp, int flags, 139 struct mbuf **mp, 140 struct sockaddr_in6 *sin6, 141 struct in6_addr *addr6); 142 #endif /* INET6 */ 143 static struct tcpcb * 144 tcp_disconnect (struct tcpcb *); 145 static struct tcpcb * 146 tcp_usrclosed (struct tcpcb *); 147 148 #ifdef TCPDEBUG 149 #define TCPDEBUG0 int ostate = 0 150 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 151 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ 152 tcp_trace(TA_USER, ostate, tp, 0, 0, req) 153 #else 154 #define TCPDEBUG0 155 #define TCPDEBUG1() 156 #define TCPDEBUG2(req) 157 #endif 158 159 static int tcp_lport_extension = 1; 160 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lportext, CTLFLAG_RW, 161 &tcp_lport_extension, 0, ""); 162 163 /* 164 * For some ill optimized programs, which try to use TCP_NOPUSH 165 * to improve performance, will have small amount of data sits 166 * in the sending buffer. These small amount of data will _not_ 167 * be pushed into the network until more data are written into 168 * the socket or the socket write side is shutdown. 169 */ 170 static int tcp_disable_nopush = 1; 171 SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_nopush, CTLFLAG_RW, 172 &tcp_disable_nopush, 0, "TCP_NOPUSH socket option will have no effect"); 173 174 /* 175 * Allocate socket buffer space. 176 */ 177 static int 178 tcp_usr_preattach(struct socket *so, int proto __unused, 179 struct pru_attach_info *ai) 180 { 181 int error; 182 183 if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) { 184 error = soreserve(so, tcp_sendspace, tcp_recvspace, 185 ai->sb_rlimit); 186 if (error) 187 return (error); 188 } 189 atomic_set_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE); 190 atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE | SSB_PREALLOC); 191 192 return 0; 193 } 194 195 /* 196 * TCP attaches to socket via pru_attach(), reserving space, 197 * and an internet control block. This socket may move to 198 * other CPU later when we bind/connect. 199 */ 200 static void 201 tcp_usr_attach(netmsg_t msg) 202 { 203 struct socket *so = msg->base.nm_so; 204 struct pru_attach_info *ai = msg->attach.nm_ai; 205 int error; 206 struct inpcb *inp; 207 struct tcpcb *tp = NULL; 208 TCPDEBUG0; 209 210 inp = so->so_pcb; 211 KASSERT(inp == NULL, ("tcp socket attached")); 212 TCPDEBUG1(); 213 214 error = tcp_attach(so, ai); 215 if (error) 216 goto out; 217 218 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 219 so->so_linger = TCP_LINGERTIME; 220 tp = sototcpcb(so); 221 out: 222 TCPDEBUG2(PRU_ATTACH); 223 lwkt_replymsg(&msg->lmsg, error); 224 } 225 226 /* 227 * pru_detach() detaches the TCP protocol from the socket. 228 * If the protocol state is non-embryonic, then can't 229 * do this directly: have to initiate a pru_disconnect(), 230 * which may finish later; embryonic TCB's can just 231 * be discarded here. 232 */ 233 static void 234 tcp_usr_detach(netmsg_t msg) 235 { 236 struct socket *so = msg->base.nm_so; 237 int error = 0; 238 struct inpcb *inp; 239 struct tcpcb *tp; 240 TCPDEBUG0; 241 242 inp = so->so_pcb; 243 244 /* 245 * If the inp is already detached or never attached, it may have 246 * been due to an async close or async attach failure. Just return 247 * as if no error occured. 248 */ 249 if (inp) { 250 tp = intotcpcb(inp); 251 KASSERT(tp != NULL, ("tcp_usr_detach: tp is NULL")); 252 TCPDEBUG1(); 253 tp = tcp_disconnect(tp); 254 TCPDEBUG2(PRU_DETACH); 255 } 256 lwkt_replymsg(&msg->lmsg, error); 257 } 258 259 /* 260 * NOTE: ignore_error is non-zero for certain disconnection races 261 * which we want to silently allow, otherwise close() may return 262 * an unexpected error. 263 * 264 * NOTE: The variables (msg) and (tp) are assumed. 265 */ 266 #define COMMON_START(so, inp, ignore_error) \ 267 TCPDEBUG0; \ 268 \ 269 inp = so->so_pcb; \ 270 do { \ 271 if (inp == NULL) { \ 272 error = ignore_error ? 0 : EINVAL; \ 273 tp = NULL; \ 274 goto out; \ 275 } \ 276 tp = intotcpcb(inp); \ 277 TCPDEBUG1(); \ 278 } while(0) 279 280 #define COMMON_END1(req, noreply) \ 281 out: do { \ 282 TCPDEBUG2(req); \ 283 if (!(noreply)) \ 284 lwkt_replymsg(&msg->lmsg, error); \ 285 return; \ 286 } while(0) 287 288 #define COMMON_END(req) COMMON_END1((req), 0) 289 290 static void 291 tcp_sosetport(struct lwkt_msg *msg, lwkt_port_t port) 292 { 293 sosetport(((struct netmsg_base *)msg)->nm_so, port); 294 } 295 296 /* 297 * Give the socket an address. 298 */ 299 static void 300 tcp_usr_bind(netmsg_t msg) 301 { 302 struct socket *so = msg->bind.base.nm_so; 303 struct sockaddr *nam = msg->bind.nm_nam; 304 struct thread *td = msg->bind.nm_td; 305 int error = 0; 306 struct inpcb *inp; 307 struct tcpcb *tp; 308 struct sockaddr_in *sinp; 309 lwkt_port_t port0 = netisr_cpuport(0); 310 311 COMMON_START(so, inp, 0); 312 313 /* 314 * Must check for multicast addresses and disallow binding 315 * to them. 316 */ 317 sinp = (struct sockaddr_in *)nam; 318 if (sinp->sin_family == AF_INET && 319 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 320 error = EAFNOSUPPORT; 321 goto out; 322 } 323 324 /* 325 * Check "already bound" here (in_pcbbind() does the same check 326 * though), so we don't forward a connected socket to netisr0, 327 * which would panic in the following in_pcbunlink(). 328 */ 329 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) { 330 error = EINVAL; /* already bound */ 331 goto out; 332 } 333 334 /* 335 * Use netisr0 to serialize in_pcbbind(), so that pru_detach and 336 * pru_bind for different sockets on the same local port could be 337 * properly ordered. The original race is illustrated here for 338 * reference. 339 * 340 * s1 = socket(); 341 * bind(s1, *.PORT); 342 * close(s1); <----- asynchronous 343 * s2 = socket(); 344 * bind(s2, *.PORT); 345 * 346 * All will expect bind(s2, *.PORT) to succeed. However, it will 347 * fail, if following sequence happens due to random socket initial 348 * msgport and asynchronous close(2): 349 * 350 * netisrN netisrM 351 * : : 352 * : pru_bind(s2) [*.PORT is used by s1] 353 * pru_detach(s1) : 354 */ 355 if (&curthread->td_msgport != port0) { 356 lwkt_msg_t lmsg = &msg->bind.base.lmsg; 357 358 KASSERT((msg->bind.nm_flags & PRUB_RELINK) == 0, 359 ("already asked to relink")); 360 361 in_pcbunlink(so->so_pcb, &tcbinfo[mycpuid]); 362 msg->bind.nm_flags |= PRUB_RELINK; 363 364 /* See the related comment in tcp_connect() */ 365 lwkt_setmsg_receipt(lmsg, tcp_sosetport); 366 lwkt_forwardmsg(port0, lmsg); 367 /* msg invalid now */ 368 return; 369 } 370 KASSERT(so->so_port == port0, ("so_port is not netisr0")); 371 372 if (msg->bind.nm_flags & PRUB_RELINK) { 373 msg->bind.nm_flags &= ~PRUB_RELINK; 374 in_pcblink(so->so_pcb, &tcbinfo[mycpuid]); 375 } 376 KASSERT(inp->inp_pcbinfo == &tcbinfo[0], ("pcbinfo is not tcbinfo0")); 377 378 error = in_pcbbind(inp, nam, td); 379 if (error) 380 goto out; 381 382 COMMON_END(PRU_BIND); 383 } 384 385 #ifdef INET6 386 387 static void 388 tcp6_usr_bind(netmsg_t msg) 389 { 390 struct socket *so = msg->bind.base.nm_so; 391 struct sockaddr *nam = msg->bind.nm_nam; 392 struct thread *td = msg->bind.nm_td; 393 int error = 0; 394 struct inpcb *inp; 395 struct tcpcb *tp; 396 struct sockaddr_in6 *sin6p; 397 398 COMMON_START(so, inp, 0); 399 400 /* 401 * Must check for multicast addresses and disallow binding 402 * to them. 403 */ 404 sin6p = (struct sockaddr_in6 *)nam; 405 if (sin6p->sin6_family == AF_INET6 && 406 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 407 error = EAFNOSUPPORT; 408 goto out; 409 } 410 error = in6_pcbbind(inp, nam, td); 411 if (error) 412 goto out; 413 COMMON_END(PRU_BIND); 414 } 415 #endif /* INET6 */ 416 417 struct netmsg_inswildcard { 418 struct netmsg_base base; 419 struct inpcb *nm_inp; 420 }; 421 422 static void 423 in_pcbinswildcardhash_handler(netmsg_t msg) 424 { 425 struct netmsg_inswildcard *nm = (struct netmsg_inswildcard *)msg; 426 int cpu = mycpuid, nextcpu; 427 428 in_pcbinswildcardhash_oncpu(nm->nm_inp, &tcbinfo[cpu]); 429 430 nextcpu = cpu + 1; 431 if (nextcpu < ncpus2) 432 lwkt_forwardmsg(netisr_cpuport(nextcpu), &nm->base.lmsg); 433 else 434 lwkt_replymsg(&nm->base.lmsg, 0); 435 } 436 437 /* 438 * Prepare to accept connections. 439 */ 440 static void 441 tcp_usr_listen(netmsg_t msg) 442 { 443 struct socket *so = msg->listen.base.nm_so; 444 struct thread *td = msg->listen.nm_td; 445 int error = 0; 446 struct inpcb *inp; 447 struct tcpcb *tp; 448 struct netmsg_inswildcard nm; 449 lwkt_port_t port0 = netisr_cpuport(0); 450 451 COMMON_START(so, inp, 0); 452 453 if (&curthread->td_msgport != port0) { 454 lwkt_msg_t lmsg = &msg->listen.base.lmsg; 455 456 KASSERT((msg->listen.nm_flags & PRUL_RELINK) == 0, 457 ("already asked to relink")); 458 459 in_pcbunlink(so->so_pcb, &tcbinfo[mycpuid]); 460 msg->listen.nm_flags |= PRUL_RELINK; 461 462 /* See the related comment in tcp_connect() */ 463 lwkt_setmsg_receipt(lmsg, tcp_sosetport); 464 lwkt_forwardmsg(port0, lmsg); 465 /* msg invalid now */ 466 return; 467 } 468 KASSERT(so->so_port == port0, ("so_port is not netisr0")); 469 470 if (msg->listen.nm_flags & PRUL_RELINK) { 471 msg->listen.nm_flags &= ~PRUL_RELINK; 472 in_pcblink(so->so_pcb, &tcbinfo[mycpuid]); 473 } 474 KASSERT(inp->inp_pcbinfo == &tcbinfo[0], ("pcbinfo is not tcbinfo0")); 475 476 if (tp->t_flags & TF_LISTEN) 477 goto out; 478 479 if (inp->inp_lport == 0) { 480 error = in_pcbbind(inp, NULL, td); 481 if (error) 482 goto out; 483 } 484 485 tp->t_state = TCPS_LISTEN; 486 tp->t_flags |= TF_LISTEN; 487 tp->tt_msg = NULL; /* Catch any invalid timer usage */ 488 489 /* 490 * Create tcpcb per-cpu port cache 491 * 492 * NOTE: 493 * This _must_ be done before installing this inpcb into 494 * wildcard hash. 495 */ 496 tcp_pcbport_create(tp); 497 498 if (ncpus2 > 1) { 499 /* 500 * Put this inpcb into wildcard hash on other cpus. 501 */ 502 ASSERT_INP_NOTINHASH(inp); 503 netmsg_init(&nm.base, NULL, &curthread->td_msgport, 504 MSGF_PRIORITY, in_pcbinswildcardhash_handler); 505 nm.nm_inp = inp; 506 lwkt_domsg(netisr_cpuport(1), &nm.base.lmsg, 0); 507 } 508 in_pcbinswildcardhash(inp); 509 COMMON_END(PRU_LISTEN); 510 } 511 512 #ifdef INET6 513 514 static void 515 tcp6_usr_listen(netmsg_t msg) 516 { 517 struct socket *so = msg->listen.base.nm_so; 518 struct thread *td = msg->listen.nm_td; 519 int error = 0; 520 struct inpcb *inp; 521 struct tcpcb *tp; 522 struct netmsg_inswildcard nm; 523 524 COMMON_START(so, inp, 0); 525 526 if (tp->t_flags & TF_LISTEN) 527 goto out; 528 529 if (inp->inp_lport == 0) { 530 error = in6_pcbbind(inp, NULL, td); 531 if (error) 532 goto out; 533 } 534 535 tp->t_state = TCPS_LISTEN; 536 tp->t_flags |= TF_LISTEN; 537 tp->tt_msg = NULL; /* Catch any invalid timer usage */ 538 539 /* 540 * Create tcpcb per-cpu port cache 541 * 542 * NOTE: 543 * This _must_ be done before installing this inpcb into 544 * wildcard hash. 545 */ 546 tcp_pcbport_create(tp); 547 548 if (ncpus2 > 1) { 549 /* 550 * Put this inpcb into wildcard hash on other cpus. 551 */ 552 KKASSERT(so->so_port == netisr_cpuport(0)); 553 ASSERT_IN_NETISR(0); 554 KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]); 555 ASSERT_INP_NOTINHASH(inp); 556 557 netmsg_init(&nm.base, NULL, &curthread->td_msgport, 558 MSGF_PRIORITY, in_pcbinswildcardhash_handler); 559 nm.nm_inp = inp; 560 lwkt_domsg(netisr_cpuport(1), &nm.base.lmsg, 0); 561 } 562 in_pcbinswildcardhash(inp); 563 COMMON_END(PRU_LISTEN); 564 } 565 #endif /* INET6 */ 566 567 /* 568 * Initiate connection to peer. 569 * Create a template for use in transmissions on this connection. 570 * Enter SYN_SENT state, and mark socket as connecting. 571 * Start keep-alive timer, and seed output sequence space. 572 * Send initial segment on connection. 573 */ 574 static void 575 tcp_usr_connect(netmsg_t msg) 576 { 577 struct socket *so = msg->connect.base.nm_so; 578 struct sockaddr *nam = msg->connect.nm_nam; 579 struct thread *td = msg->connect.nm_td; 580 int error = 0; 581 struct inpcb *inp; 582 struct tcpcb *tp; 583 struct sockaddr_in *sinp; 584 585 COMMON_START(so, inp, 0); 586 587 /* 588 * Must disallow TCP ``connections'' to multicast addresses. 589 */ 590 sinp = (struct sockaddr_in *)nam; 591 if (sinp->sin_family == AF_INET 592 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 593 error = EAFNOSUPPORT; 594 goto out; 595 } 596 597 if (!prison_remote_ip(td, (struct sockaddr*)sinp)) { 598 error = EAFNOSUPPORT; /* IPv6 only jail */ 599 goto out; 600 } 601 602 tcp_connect(msg); 603 /* msg is invalid now */ 604 return; 605 out: 606 if (msg->connect.nm_m) { 607 m_freem(msg->connect.nm_m); 608 msg->connect.nm_m = NULL; 609 } 610 if (msg->connect.nm_flags & PRUC_HELDTD) 611 lwkt_rele(td); 612 if (error && (msg->connect.nm_flags & PRUC_ASYNC)) { 613 so->so_error = error; 614 soisdisconnected(so); 615 } 616 lwkt_replymsg(&msg->lmsg, error); 617 } 618 619 #ifdef INET6 620 621 static void 622 tcp6_usr_connect(netmsg_t msg) 623 { 624 struct socket *so = msg->connect.base.nm_so; 625 struct sockaddr *nam = msg->connect.nm_nam; 626 struct thread *td = msg->connect.nm_td; 627 int error = 0; 628 struct inpcb *inp; 629 struct tcpcb *tp; 630 struct sockaddr_in6 *sin6p; 631 632 COMMON_START(so, inp, 0); 633 634 /* 635 * Must disallow TCP ``connections'' to multicast addresses. 636 */ 637 sin6p = (struct sockaddr_in6 *)nam; 638 if (sin6p->sin6_family == AF_INET6 639 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 640 error = EAFNOSUPPORT; 641 goto out; 642 } 643 644 if (!prison_remote_ip(td, nam)) { 645 error = EAFNOSUPPORT; /* IPv4 only jail */ 646 goto out; 647 } 648 649 /* Reject v4-mapped address */ 650 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 651 error = EADDRNOTAVAIL; 652 goto out; 653 } 654 655 inp->inp_inc.inc_isipv6 = 1; 656 tcp6_connect(msg); 657 /* msg is invalid now */ 658 return; 659 out: 660 if (msg->connect.nm_m) { 661 m_freem(msg->connect.nm_m); 662 msg->connect.nm_m = NULL; 663 } 664 lwkt_replymsg(&msg->lmsg, error); 665 } 666 667 #endif /* INET6 */ 668 669 /* 670 * Initiate disconnect from peer. 671 * If connection never passed embryonic stage, just drop; 672 * else if don't need to let data drain, then can just drop anyways, 673 * else have to begin TCP shutdown process: mark socket disconnecting, 674 * drain unread data, state switch to reflect user close, and 675 * send segment (e.g. FIN) to peer. Socket will be really disconnected 676 * when peer sends FIN and acks ours. 677 * 678 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 679 */ 680 static void 681 tcp_usr_disconnect(netmsg_t msg) 682 { 683 struct socket *so = msg->disconnect.base.nm_so; 684 int error = 0; 685 struct inpcb *inp; 686 struct tcpcb *tp; 687 688 COMMON_START(so, inp, 1); 689 tp = tcp_disconnect(tp); 690 COMMON_END(PRU_DISCONNECT); 691 } 692 693 /* 694 * Accept a connection. Essentially all the work is 695 * done at higher levels; just return the address 696 * of the peer, storing through addr. 697 */ 698 static void 699 tcp_usr_accept(netmsg_t msg) 700 { 701 struct socket *so = msg->accept.base.nm_so; 702 struct sockaddr **nam = msg->accept.nm_nam; 703 int error = 0; 704 struct inpcb *inp; 705 struct tcpcb *tp = NULL; 706 TCPDEBUG0; 707 708 inp = so->so_pcb; 709 if (so->so_state & SS_ISDISCONNECTED) { 710 error = ECONNABORTED; 711 goto out; 712 } 713 if (inp == NULL) { 714 error = EINVAL; 715 goto out; 716 } 717 718 tp = intotcpcb(inp); 719 TCPDEBUG1(); 720 in_setpeeraddr(so, nam); 721 COMMON_END(PRU_ACCEPT); 722 } 723 724 #ifdef INET6 725 static void 726 tcp6_usr_accept(netmsg_t msg) 727 { 728 struct socket *so = msg->accept.base.nm_so; 729 struct sockaddr **nam = msg->accept.nm_nam; 730 int error = 0; 731 struct inpcb *inp; 732 struct tcpcb *tp = NULL; 733 TCPDEBUG0; 734 735 inp = so->so_pcb; 736 737 if (so->so_state & SS_ISDISCONNECTED) { 738 error = ECONNABORTED; 739 goto out; 740 } 741 if (inp == NULL) { 742 error = EINVAL; 743 goto out; 744 } 745 tp = intotcpcb(inp); 746 TCPDEBUG1(); 747 in6_setpeeraddr(so, nam); 748 COMMON_END(PRU_ACCEPT); 749 } 750 #endif /* INET6 */ 751 752 /* 753 * Mark the connection as being incapable of further output. 754 */ 755 static void 756 tcp_usr_shutdown(netmsg_t msg) 757 { 758 struct socket *so = msg->shutdown.base.nm_so; 759 int error = 0; 760 struct inpcb *inp; 761 struct tcpcb *tp; 762 763 COMMON_START(so, inp, 0); 764 socantsendmore(so); 765 tp = tcp_usrclosed(tp); 766 if (tp) 767 error = tcp_output(tp); 768 COMMON_END(PRU_SHUTDOWN); 769 } 770 771 /* 772 * After a receive, possibly send window update to peer. 773 */ 774 static void 775 tcp_usr_rcvd(netmsg_t msg) 776 { 777 struct socket *so = msg->rcvd.base.nm_so; 778 int error = 0, noreply = 0; 779 struct inpcb *inp; 780 struct tcpcb *tp; 781 782 COMMON_START(so, inp, 0); 783 784 if (msg->rcvd.nm_pru_flags & PRUR_ASYNC) { 785 noreply = 1; 786 so_async_rcvd_reply(so); 787 } 788 tcp_output(tp); 789 790 COMMON_END1(PRU_RCVD, noreply); 791 } 792 793 /* 794 * Do a send by putting data in output queue and updating urgent 795 * marker if URG set. Possibly send more data. Unlike the other 796 * pru_*() routines, the mbuf chains are our responsibility. We 797 * must either enqueue them or free them. The other pru_* routines 798 * generally are caller-frees. 799 */ 800 static void 801 tcp_usr_send(netmsg_t msg) 802 { 803 struct socket *so = msg->send.base.nm_so; 804 int flags = msg->send.nm_flags; 805 struct mbuf *m = msg->send.nm_m; 806 int error = 0; 807 struct inpcb *inp; 808 struct tcpcb *tp; 809 TCPDEBUG0; 810 811 KKASSERT(msg->send.nm_control == NULL); 812 KKASSERT(msg->send.nm_addr == NULL); 813 KKASSERT((flags & PRUS_FREEADDR) == 0); 814 815 inp = so->so_pcb; 816 817 if (inp == NULL) { 818 /* 819 * OOPS! we lost a race, the TCP session got reset after 820 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a 821 * network interrupt in the non-critical section of sosend(). 822 */ 823 m_freem(m); 824 error = ECONNRESET; /* XXX EPIPE? */ 825 tp = NULL; 826 TCPDEBUG1(); 827 goto out; 828 } 829 tp = intotcpcb(inp); 830 TCPDEBUG1(); 831 832 #ifdef foo 833 /* 834 * This is no longer necessary, since: 835 * - sosendtcp() has already checked it for us 836 * - It does not work with asynchronized send 837 */ 838 839 /* 840 * Don't let too much OOB data build up 841 */ 842 if (flags & PRUS_OOB) { 843 if (ssb_space(&so->so_snd) < -512) { 844 m_freem(m); 845 error = ENOBUFS; 846 goto out; 847 } 848 } 849 #endif 850 851 /* 852 * Pump the data into the socket. 853 */ 854 if (m) { 855 ssb_appendstream(&so->so_snd, m); 856 sowwakeup(so); 857 } 858 if (flags & PRUS_OOB) { 859 /* 860 * According to RFC961 (Assigned Protocols), 861 * the urgent pointer points to the last octet 862 * of urgent data. We continue, however, 863 * to consider it to indicate the first octet 864 * of data past the urgent section. 865 * Otherwise, snd_up should be one lower. 866 */ 867 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 868 tp->t_flags |= TF_FORCE; 869 error = tcp_output(tp); 870 tp->t_flags &= ~TF_FORCE; 871 } else { 872 if (flags & PRUS_EOF) { 873 /* 874 * Close the send side of the connection after 875 * the data is sent. 876 */ 877 socantsendmore(so); 878 tp = tcp_usrclosed(tp); 879 } 880 if (tp != NULL && !tcp_output_pending(tp)) { 881 if (flags & PRUS_MORETOCOME) 882 tp->t_flags |= TF_MORETOCOME; 883 error = tcp_output_fair(tp); 884 if (flags & PRUS_MORETOCOME) 885 tp->t_flags &= ~TF_MORETOCOME; 886 } 887 } 888 COMMON_END1((flags & PRUS_OOB) ? PRU_SENDOOB : 889 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND), 890 (flags & PRUS_NOREPLY)); 891 } 892 893 /* 894 * NOTE: (so) is referenced from soabort*() and netmsg_pru_abort() 895 * will sofree() it when we return. 896 */ 897 static void 898 tcp_usr_abort(netmsg_t msg) 899 { 900 struct socket *so = msg->abort.base.nm_so; 901 int error = 0; 902 struct inpcb *inp; 903 struct tcpcb *tp; 904 905 COMMON_START(so, inp, 1); 906 tp = tcp_drop(tp, ECONNABORTED); 907 COMMON_END(PRU_ABORT); 908 } 909 910 /* 911 * Receive out-of-band data. 912 */ 913 static void 914 tcp_usr_rcvoob(netmsg_t msg) 915 { 916 struct socket *so = msg->rcvoob.base.nm_so; 917 struct mbuf *m = msg->rcvoob.nm_m; 918 int flags = msg->rcvoob.nm_flags; 919 int error = 0; 920 struct inpcb *inp; 921 struct tcpcb *tp; 922 923 COMMON_START(so, inp, 0); 924 if ((so->so_oobmark == 0 && 925 (so->so_state & SS_RCVATMARK) == 0) || 926 so->so_options & SO_OOBINLINE || 927 tp->t_oobflags & TCPOOB_HADDATA) { 928 error = EINVAL; 929 goto out; 930 } 931 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 932 error = EWOULDBLOCK; 933 goto out; 934 } 935 m->m_len = 1; 936 *mtod(m, caddr_t) = tp->t_iobc; 937 if ((flags & MSG_PEEK) == 0) 938 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 939 COMMON_END(PRU_RCVOOB); 940 } 941 942 static void 943 tcp_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) 944 { 945 in_savefaddr(so, faddr); 946 } 947 948 #ifdef INET6 949 static void 950 tcp6_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) 951 { 952 in6_savefaddr(so, faddr); 953 } 954 #endif 955 956 static int 957 tcp_usr_preconnect(struct socket *so, const struct sockaddr *nam, 958 struct thread *td __unused) 959 { 960 const struct sockaddr_in *sinp; 961 962 sinp = (const struct sockaddr_in *)nam; 963 if (sinp->sin_family == AF_INET && 964 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 965 return EAFNOSUPPORT; 966 967 soisconnecting(so); 968 return 0; 969 } 970 971 /* xxx - should be const */ 972 struct pr_usrreqs tcp_usrreqs = { 973 .pru_abort = tcp_usr_abort, 974 .pru_accept = tcp_usr_accept, 975 .pru_attach = tcp_usr_attach, 976 .pru_bind = tcp_usr_bind, 977 .pru_connect = tcp_usr_connect, 978 .pru_connect2 = pr_generic_notsupp, 979 .pru_control = in_control_dispatch, 980 .pru_detach = tcp_usr_detach, 981 .pru_disconnect = tcp_usr_disconnect, 982 .pru_listen = tcp_usr_listen, 983 .pru_peeraddr = in_setpeeraddr_dispatch, 984 .pru_rcvd = tcp_usr_rcvd, 985 .pru_rcvoob = tcp_usr_rcvoob, 986 .pru_send = tcp_usr_send, 987 .pru_sense = pru_sense_null, 988 .pru_shutdown = tcp_usr_shutdown, 989 .pru_sockaddr = in_setsockaddr_dispatch, 990 .pru_sosend = sosendtcp, 991 .pru_soreceive = sorecvtcp, 992 .pru_savefaddr = tcp_usr_savefaddr, 993 .pru_preconnect = tcp_usr_preconnect, 994 .pru_preattach = tcp_usr_preattach 995 }; 996 997 #ifdef INET6 998 struct pr_usrreqs tcp6_usrreqs = { 999 .pru_abort = tcp_usr_abort, 1000 .pru_accept = tcp6_usr_accept, 1001 .pru_attach = tcp_usr_attach, 1002 .pru_bind = tcp6_usr_bind, 1003 .pru_connect = tcp6_usr_connect, 1004 .pru_connect2 = pr_generic_notsupp, 1005 .pru_control = in6_control_dispatch, 1006 .pru_detach = tcp_usr_detach, 1007 .pru_disconnect = tcp_usr_disconnect, 1008 .pru_listen = tcp6_usr_listen, 1009 .pru_peeraddr = in6_setpeeraddr_dispatch, 1010 .pru_rcvd = tcp_usr_rcvd, 1011 .pru_rcvoob = tcp_usr_rcvoob, 1012 .pru_send = tcp_usr_send, 1013 .pru_sense = pru_sense_null, 1014 .pru_shutdown = tcp_usr_shutdown, 1015 .pru_sockaddr = in6_setsockaddr_dispatch, 1016 .pru_sosend = sosendtcp, 1017 .pru_soreceive = sorecvtcp, 1018 .pru_savefaddr = tcp6_usr_savefaddr 1019 }; 1020 #endif /* INET6 */ 1021 1022 static int 1023 tcp_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m, 1024 struct sockaddr_in *sin, struct sockaddr_in *if_sin) 1025 { 1026 struct inpcb *inp = tp->t_inpcb, *oinp; 1027 struct socket *so = inp->inp_socket; 1028 struct route *ro = &inp->inp_route; 1029 1030 KASSERT(inp->inp_pcbinfo == &tcbinfo[mycpu->gd_cpuid], 1031 ("pcbinfo mismatch")); 1032 1033 oinp = in_pcblookup_hash(inp->inp_pcbinfo, 1034 sin->sin_addr, sin->sin_port, 1035 (inp->inp_laddr.s_addr != INADDR_ANY ? 1036 inp->inp_laddr : if_sin->sin_addr), 1037 inp->inp_lport, 0, NULL); 1038 if (oinp != NULL) { 1039 m_freem(m); 1040 return (EADDRINUSE); 1041 } 1042 if (inp->inp_laddr.s_addr == INADDR_ANY) 1043 inp->inp_laddr = if_sin->sin_addr; 1044 inp->inp_faddr = sin->sin_addr; 1045 inp->inp_fport = sin->sin_port; 1046 in_pcbinsconnhash(inp); 1047 1048 /* 1049 * We are now on the inpcb's owner CPU, if the cached route was 1050 * freed because the rtentry's owner CPU is not the current CPU 1051 * (e.g. in tcp_connect()), then we try to reallocate it here with 1052 * the hope that a rtentry may be cloned from a RTF_PRCLONING 1053 * rtentry. 1054 */ 1055 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/ 1056 ro->ro_rt == NULL) { 1057 bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); 1058 ro->ro_dst.sa_family = AF_INET; 1059 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 1060 ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = 1061 sin->sin_addr; 1062 rtalloc(ro); 1063 } 1064 1065 /* 1066 * Now that no more errors can occur, change the protocol processing 1067 * port to the current thread (which is the correct thread). 1068 * 1069 * Create TCP timer message now; we are on the tcpcb's owner 1070 * CPU/thread. 1071 */ 1072 tcp_create_timermsg(tp, &curthread->td_msgport); 1073 1074 /* 1075 * Compute window scaling to request. Use a larger scaling then 1076 * needed for the initial receive buffer in case the receive buffer 1077 * gets expanded. 1078 */ 1079 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 1080 tp->request_r_scale = TCP_MIN_WINSHIFT; 1081 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1082 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat 1083 ) { 1084 tp->request_r_scale++; 1085 } 1086 1087 soisconnecting(so); 1088 tcpstat.tcps_connattempt++; 1089 tp->t_state = TCPS_SYN_SENT; 1090 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1091 tp->iss = tcp_new_isn(tp); 1092 tcp_sendseqinit(tp); 1093 if (m) { 1094 ssb_appendstream(&so->so_snd, m); 1095 m = NULL; 1096 if (flags & PRUS_OOB) 1097 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1098 } 1099 1100 /* 1101 * Close the send side of the connection after 1102 * the data is sent if flagged. 1103 */ 1104 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1105 socantsendmore(so); 1106 tp = tcp_usrclosed(tp); 1107 } 1108 return (tcp_output(tp)); 1109 } 1110 1111 /* 1112 * Common subroutine to open a TCP connection to remote host specified 1113 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 1114 * port number if needed. Call in_pcbladdr to do the routing and to choose 1115 * a local host address (interface). 1116 * Initialize connection parameters and enter SYN-SENT state. 1117 */ 1118 static void 1119 tcp_connect(netmsg_t msg) 1120 { 1121 struct socket *so = msg->connect.base.nm_so; 1122 struct sockaddr *nam = msg->connect.nm_nam; 1123 struct thread *td = msg->connect.nm_td; 1124 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 1125 struct sockaddr_in *if_sin = NULL; 1126 struct inpcb *inp; 1127 struct tcpcb *tp; 1128 int error; 1129 lwkt_port_t port; 1130 1131 COMMON_START(so, inp, 0); 1132 1133 /* 1134 * Reconnect our pcb if we have to 1135 */ 1136 if (msg->connect.nm_flags & PRUC_RECONNECT) { 1137 msg->connect.nm_flags &= ~PRUC_RECONNECT; 1138 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1139 } 1140 1141 /* 1142 * Bind if we have to 1143 */ 1144 if (inp->inp_lport == 0) { 1145 if (tcp_lport_extension) { 1146 KKASSERT(inp->inp_laddr.s_addr == INADDR_ANY); 1147 1148 error = in_pcbladdr(inp, nam, &if_sin, td); 1149 if (error) 1150 goto out; 1151 inp->inp_laddr.s_addr = if_sin->sin_addr.s_addr; 1152 1153 error = in_pcbbind_remote(inp, nam, td); 1154 if (error) 1155 goto out; 1156 1157 msg->connect.nm_flags |= PRUC_HASLADDR; 1158 } else { 1159 error = in_pcbbind(inp, NULL, td); 1160 if (error) 1161 goto out; 1162 } 1163 } 1164 1165 if ((msg->connect.nm_flags & PRUC_HASLADDR) == 0) { 1166 /* 1167 * Calculate the correct protocol processing thread. The 1168 * connect operation must run there. Set the forwarding 1169 * port before we forward the message or it will get bounced 1170 * right back to us. 1171 */ 1172 error = in_pcbladdr(inp, nam, &if_sin, td); 1173 if (error) 1174 goto out; 1175 } 1176 KKASSERT(inp->inp_socket == so); 1177 1178 port = tcp_addrport(sin->sin_addr.s_addr, sin->sin_port, 1179 (inp->inp_laddr.s_addr != INADDR_ANY ? 1180 inp->inp_laddr.s_addr : if_sin->sin_addr.s_addr), 1181 inp->inp_lport); 1182 1183 if (port != &curthread->td_msgport) { 1184 lwkt_msg_t lmsg = &msg->connect.base.lmsg; 1185 1186 /* 1187 * in_pcbladdr() may have allocated a route entry for us 1188 * on the current CPU, but we need a route entry on the 1189 * inpcb's owner CPU, so free it here. 1190 */ 1191 in_pcbresetroute(inp); 1192 1193 /* 1194 * We are moving the protocol processing port the socket 1195 * is on, we have to unlink here and re-link on the 1196 * target cpu. 1197 */ 1198 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1199 msg->connect.nm_flags |= PRUC_RECONNECT; 1200 msg->connect.base.nm_dispatch = tcp_connect; 1201 1202 /* 1203 * Use message put done receipt to change this socket's 1204 * so_port, i.e. _after_ this message was put onto the 1205 * target netisr's msgport but _before_ the message could 1206 * be pulled from the target netisr's msgport, so that: 1207 * - The upper half (socket code) will not see the new 1208 * msgport before this message reaches the new msgport 1209 * and messages for this socket will be ordered. 1210 * - This message will see the new msgport, when its 1211 * handler is called in the target netisr. 1212 * 1213 * NOTE: 1214 * We MUST use messege put done receipt to change this 1215 * socket's so_port: 1216 * If we changed the so_port in this netisr after the 1217 * lwkt_forwardmsg (so messages for this socket will be 1218 * ordered) and changed the so_port in the target netisr 1219 * at the very beginning of this message's handler, we 1220 * would suffer so_port overwritten race, given this 1221 * message might be forwarded again. 1222 * 1223 * NOTE: 1224 * This mechanism depends on that the netisr's msgport 1225 * is spin msgport (currently it is :). 1226 * 1227 * If the upper half saw the new msgport before this 1228 * message reached the target netisr's msgport, the 1229 * messages sent from the upper half could reach the new 1230 * msgport before this message, thus there would be 1231 * message reordering. The worst case could be soclose() 1232 * saw the new msgport and the detach message could reach 1233 * the new msgport before this message, i.e. the inpcb 1234 * could have been destroyed when this message was still 1235 * pending on or on its way to the new msgport. Other 1236 * weird cases could also happen, e.g. inpcb->inp_pcbinfo, 1237 * since we have unlinked this inpcb from the current 1238 * pcbinfo first. 1239 */ 1240 lwkt_setmsg_receipt(lmsg, tcp_sosetport); 1241 lwkt_forwardmsg(port, lmsg); 1242 /* msg invalid now */ 1243 return; 1244 } else if (msg->connect.nm_flags & PRUC_HELDTD) { 1245 /* 1246 * The original thread is no longer needed; release it. 1247 */ 1248 lwkt_rele(td); 1249 msg->connect.nm_flags &= ~PRUC_HELDTD; 1250 } 1251 error = tcp_connect_oncpu(tp, msg->connect.nm_sndflags, 1252 msg->connect.nm_m, sin, if_sin); 1253 msg->connect.nm_m = NULL; 1254 out: 1255 if (msg->connect.nm_m) { 1256 m_freem(msg->connect.nm_m); 1257 msg->connect.nm_m = NULL; 1258 } 1259 if (msg->connect.nm_flags & PRUC_HELDTD) 1260 lwkt_rele(td); 1261 if (error && (msg->connect.nm_flags & PRUC_ASYNC)) { 1262 so->so_error = error; 1263 soisdisconnected(so); 1264 } 1265 lwkt_replymsg(&msg->connect.base.lmsg, error); 1266 /* msg invalid now */ 1267 } 1268 1269 #ifdef INET6 1270 1271 static void 1272 tcp6_connect(netmsg_t msg) 1273 { 1274 struct tcpcb *tp; 1275 struct socket *so = msg->connect.base.nm_so; 1276 struct sockaddr *nam = msg->connect.nm_nam; 1277 struct thread *td = msg->connect.nm_td; 1278 struct inpcb *inp; 1279 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; 1280 struct in6_addr *addr6; 1281 lwkt_port_t port; 1282 int error; 1283 1284 COMMON_START(so, inp, 0); 1285 1286 /* 1287 * Reconnect our pcb if we have to 1288 */ 1289 if (msg->connect.nm_flags & PRUC_RECONNECT) { 1290 msg->connect.nm_flags &= ~PRUC_RECONNECT; 1291 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1292 } 1293 1294 /* 1295 * Bind if we have to 1296 */ 1297 if (inp->inp_lport == 0) { 1298 error = in6_pcbbind(inp, NULL, td); 1299 if (error) 1300 goto out; 1301 } 1302 1303 /* 1304 * Cannot simply call in_pcbconnect, because there might be an 1305 * earlier incarnation of this same connection still in 1306 * TIME_WAIT state, creating an ADDRINUSE error. 1307 */ 1308 error = in6_pcbladdr(inp, nam, &addr6, td); 1309 if (error) 1310 goto out; 1311 1312 port = tcp6_addrport(); /* XXX hack for now, always cpu0 */ 1313 1314 if (port != &curthread->td_msgport) { 1315 lwkt_msg_t lmsg = &msg->connect.base.lmsg; 1316 1317 /* 1318 * in_pcbladdr() may have allocated a route entry for us 1319 * on the current CPU, but we need a route entry on the 1320 * inpcb's owner CPU, so free it here. 1321 */ 1322 in_pcbresetroute(inp); 1323 1324 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1325 msg->connect.nm_flags |= PRUC_RECONNECT; 1326 msg->connect.base.nm_dispatch = tcp6_connect; 1327 1328 /* See the related comment in tcp_connect() */ 1329 lwkt_setmsg_receipt(lmsg, tcp_sosetport); 1330 lwkt_forwardmsg(port, lmsg); 1331 /* msg invalid now */ 1332 return; 1333 } 1334 error = tcp6_connect_oncpu(tp, msg->connect.nm_sndflags, 1335 &msg->connect.nm_m, sin6, addr6); 1336 /* nm_m may still be intact */ 1337 out: 1338 if (msg->connect.nm_m) { 1339 m_freem(msg->connect.nm_m); 1340 msg->connect.nm_m = NULL; 1341 } 1342 lwkt_replymsg(&msg->connect.base.lmsg, error); 1343 /* msg invalid now */ 1344 } 1345 1346 static int 1347 tcp6_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf **mp, 1348 struct sockaddr_in6 *sin6, struct in6_addr *addr6) 1349 { 1350 struct mbuf *m = *mp; 1351 struct inpcb *inp = tp->t_inpcb; 1352 struct socket *so = inp->inp_socket; 1353 struct inpcb *oinp; 1354 1355 /* 1356 * Cannot simply call in_pcbconnect, because there might be an 1357 * earlier incarnation of this same connection still in 1358 * TIME_WAIT state, creating an ADDRINUSE error. 1359 */ 1360 oinp = in6_pcblookup_hash(inp->inp_pcbinfo, 1361 &sin6->sin6_addr, sin6->sin6_port, 1362 (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? 1363 addr6 : &inp->in6p_laddr), 1364 inp->inp_lport, 0, NULL); 1365 if (oinp) 1366 return (EADDRINUSE); 1367 1368 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 1369 inp->in6p_laddr = *addr6; 1370 inp->in6p_faddr = sin6->sin6_addr; 1371 inp->inp_fport = sin6->sin6_port; 1372 if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0) 1373 inp->in6p_flowinfo = sin6->sin6_flowinfo; 1374 in_pcbinsconnhash(inp); 1375 1376 /* 1377 * Now that no more errors can occur, change the protocol processing 1378 * port to the current thread (which is the correct thread). 1379 * 1380 * Create TCP timer message now; we are on the tcpcb's owner 1381 * CPU/thread. 1382 */ 1383 tcp_create_timermsg(tp, &curthread->td_msgport); 1384 1385 /* Compute window scaling to request. */ 1386 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 1387 tp->request_r_scale = TCP_MIN_WINSHIFT; 1388 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1389 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat) { 1390 tp->request_r_scale++; 1391 } 1392 1393 soisconnecting(so); 1394 tcpstat.tcps_connattempt++; 1395 tp->t_state = TCPS_SYN_SENT; 1396 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1397 tp->iss = tcp_new_isn(tp); 1398 tcp_sendseqinit(tp); 1399 if (m) { 1400 ssb_appendstream(&so->so_snd, m); 1401 *mp = NULL; 1402 if (flags & PRUS_OOB) 1403 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1404 } 1405 1406 /* 1407 * Close the send side of the connection after 1408 * the data is sent if flagged. 1409 */ 1410 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1411 socantsendmore(so); 1412 tp = tcp_usrclosed(tp); 1413 } 1414 return (tcp_output(tp)); 1415 } 1416 1417 #endif /* INET6 */ 1418 1419 /* 1420 * The new sockopt interface makes it possible for us to block in the 1421 * copyin/out step (if we take a page fault). Taking a page fault while 1422 * in a critical section is probably a Bad Thing. (Since sockets and pcbs 1423 * both now use TSM, there probably isn't any need for this function to 1424 * run in a critical section any more. This needs more examination.) 1425 */ 1426 void 1427 tcp_ctloutput(netmsg_t msg) 1428 { 1429 struct socket *so = msg->base.nm_so; 1430 struct sockopt *sopt = msg->ctloutput.nm_sopt; 1431 int error, opt, optval, opthz; 1432 struct inpcb *inp; 1433 struct tcpcb *tp; 1434 1435 error = 0; 1436 inp = so->so_pcb; 1437 if (inp == NULL) { 1438 error = ECONNRESET; 1439 goto done; 1440 } 1441 tp = intotcpcb(inp); 1442 1443 /* Get socket's owner cpuid hint */ 1444 if (sopt->sopt_level == SOL_SOCKET && 1445 sopt->sopt_dir == SOPT_GET && 1446 sopt->sopt_name == SO_CPUHINT) { 1447 if (tp->t_flags & TF_LISTEN) { 1448 /* 1449 * Listen sockets owner cpuid is always 0, 1450 * which does not make sense if SO_REUSEPORT 1451 * is not set. 1452 */ 1453 if (so->so_options & SO_REUSEPORT) 1454 optval = (inp->inp_lgrpindex & ncpus2_mask); 1455 else 1456 optval = -1; /* no hint */ 1457 } else { 1458 optval = mycpuid; 1459 } 1460 soopt_from_kbuf(sopt, &optval, sizeof(optval)); 1461 goto done; 1462 } 1463 1464 if (sopt->sopt_level != IPPROTO_TCP) { 1465 if (sopt->sopt_level == IPPROTO_IP) { 1466 switch (sopt->sopt_name) { 1467 case IP_MULTICAST_IF: 1468 case IP_MULTICAST_VIF: 1469 case IP_MULTICAST_TTL: 1470 case IP_MULTICAST_LOOP: 1471 case IP_ADD_MEMBERSHIP: 1472 case IP_DROP_MEMBERSHIP: 1473 /* 1474 * Multicast does not make sense on 1475 * TCP sockets. 1476 */ 1477 error = EOPNOTSUPP; 1478 goto done; 1479 } 1480 } 1481 #ifdef INET6 1482 if (INP_CHECK_SOCKAF(so, AF_INET6)) 1483 ip6_ctloutput_dispatch(msg); 1484 else 1485 #endif /* INET6 */ 1486 ip_ctloutput(msg); 1487 /* msg invalid now */ 1488 return; 1489 } 1490 1491 switch (sopt->sopt_dir) { 1492 case SOPT_SET: 1493 error = soopt_to_kbuf(sopt, &optval, sizeof optval, 1494 sizeof optval); 1495 if (error) 1496 break; 1497 switch (sopt->sopt_name) { 1498 case TCP_FASTKEEP: 1499 if (optval > 0) 1500 tp->t_keepidle = tp->t_keepintvl; 1501 else 1502 tp->t_keepidle = tcp_keepidle; 1503 tcp_timer_keep_activity(tp, 0); 1504 break; 1505 #ifdef TCP_SIGNATURE 1506 case TCP_SIGNATURE_ENABLE: 1507 if (tp->t_state == TCPS_CLOSED) { 1508 /* 1509 * This is the only safe state that this 1510 * option could be changed. Some segments 1511 * could already have been sent in other 1512 * states. 1513 */ 1514 if (optval > 0) 1515 tp->t_flags |= TF_SIGNATURE; 1516 else 1517 tp->t_flags &= ~TF_SIGNATURE; 1518 } else { 1519 error = EOPNOTSUPP; 1520 } 1521 break; 1522 #endif /* TCP_SIGNATURE */ 1523 case TCP_NODELAY: 1524 case TCP_NOOPT: 1525 switch (sopt->sopt_name) { 1526 case TCP_NODELAY: 1527 opt = TF_NODELAY; 1528 break; 1529 case TCP_NOOPT: 1530 opt = TF_NOOPT; 1531 break; 1532 default: 1533 opt = 0; /* dead code to fool gcc */ 1534 break; 1535 } 1536 1537 if (optval) 1538 tp->t_flags |= opt; 1539 else 1540 tp->t_flags &= ~opt; 1541 break; 1542 1543 case TCP_NOPUSH: 1544 if (tcp_disable_nopush) 1545 break; 1546 if (optval) 1547 tp->t_flags |= TF_NOPUSH; 1548 else { 1549 tp->t_flags &= ~TF_NOPUSH; 1550 error = tcp_output(tp); 1551 } 1552 break; 1553 1554 case TCP_MAXSEG: 1555 /* 1556 * Must be between 0 and maxseg. If the requested 1557 * maxseg is too small to satisfy the desired minmss, 1558 * pump it up (silently so sysctl modifications of 1559 * minmss do not create unexpected program failures). 1560 * Handle degenerate cases. 1561 */ 1562 if (optval > 0 && optval <= tp->t_maxseg) { 1563 if (optval + 40 < tcp_minmss) { 1564 optval = tcp_minmss - 40; 1565 if (optval < 0) 1566 optval = 1; 1567 } 1568 tp->t_maxseg = optval; 1569 } else { 1570 error = EINVAL; 1571 } 1572 break; 1573 1574 case TCP_KEEPINIT: 1575 opthz = ((int64_t)optval * hz) / 1000; 1576 if (opthz >= 1) 1577 tp->t_keepinit = opthz; 1578 else 1579 error = EINVAL; 1580 break; 1581 1582 case TCP_KEEPIDLE: 1583 opthz = ((int64_t)optval * hz) / 1000; 1584 if (opthz >= 1) { 1585 tp->t_keepidle = opthz; 1586 tcp_timer_keep_activity(tp, 0); 1587 } else { 1588 error = EINVAL; 1589 } 1590 break; 1591 1592 case TCP_KEEPINTVL: 1593 opthz = ((int64_t)optval * hz) / 1000; 1594 if (opthz >= 1) { 1595 tp->t_keepintvl = opthz; 1596 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1597 } else { 1598 error = EINVAL; 1599 } 1600 break; 1601 1602 case TCP_KEEPCNT: 1603 if (optval > 0) { 1604 tp->t_keepcnt = optval; 1605 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1606 } else { 1607 error = EINVAL; 1608 } 1609 break; 1610 1611 default: 1612 error = ENOPROTOOPT; 1613 break; 1614 } 1615 break; 1616 1617 case SOPT_GET: 1618 switch (sopt->sopt_name) { 1619 #ifdef TCP_SIGNATURE 1620 case TCP_SIGNATURE_ENABLE: 1621 optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; 1622 break; 1623 #endif /* TCP_SIGNATURE */ 1624 case TCP_NODELAY: 1625 optval = tp->t_flags & TF_NODELAY; 1626 break; 1627 case TCP_MAXSEG: 1628 optval = tp->t_maxseg; 1629 break; 1630 case TCP_NOOPT: 1631 optval = tp->t_flags & TF_NOOPT; 1632 break; 1633 case TCP_NOPUSH: 1634 optval = tp->t_flags & TF_NOPUSH; 1635 break; 1636 case TCP_KEEPINIT: 1637 optval = ((int64_t)tp->t_keepinit * 1000) / hz; 1638 break; 1639 case TCP_KEEPIDLE: 1640 optval = ((int64_t)tp->t_keepidle * 1000) / hz; 1641 break; 1642 case TCP_KEEPINTVL: 1643 optval = ((int64_t)tp->t_keepintvl * 1000) / hz; 1644 break; 1645 case TCP_KEEPCNT: 1646 optval = tp->t_keepcnt; 1647 break; 1648 default: 1649 error = ENOPROTOOPT; 1650 break; 1651 } 1652 if (error == 0) 1653 soopt_from_kbuf(sopt, &optval, sizeof optval); 1654 break; 1655 } 1656 done: 1657 lwkt_replymsg(&msg->lmsg, error); 1658 } 1659 1660 /* 1661 * tcp_sendspace and tcp_recvspace are the default send and receive window 1662 * sizes, respectively. These are obsolescent (this information should 1663 * be set by the route). 1664 * 1665 * Use a default that does not require tcp window scaling to be turned 1666 * on. Individual programs or the administrator can increase the default. 1667 */ 1668 u_long tcp_sendspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1669 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 1670 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); 1671 u_long tcp_recvspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1672 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, 1673 &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 1674 1675 /* 1676 * Attach TCP protocol to socket, allocating internet protocol control 1677 * block, tcp control block, buffer space, and entering CLOSED state. 1678 */ 1679 static int 1680 tcp_attach(struct socket *so, struct pru_attach_info *ai) 1681 { 1682 struct tcpcb *tp; 1683 struct inpcb *inp; 1684 int error; 1685 int cpu; 1686 #ifdef INET6 1687 boolean_t isipv6 = INP_CHECK_SOCKAF(so, AF_INET6); 1688 #endif 1689 1690 if (ai != NULL) { 1691 error = tcp_usr_preattach(so, 0 /* don't care */, ai); 1692 if (error) 1693 return (error); 1694 } else { 1695 /* Post attach; do nothing */ 1696 } 1697 1698 cpu = mycpu->gd_cpuid; 1699 1700 /* 1701 * Set the default pcbinfo. This will likely change when we 1702 * bind/connect. 1703 */ 1704 error = in_pcballoc(so, &tcbinfo[cpu]); 1705 if (error) 1706 return (error); 1707 inp = so->so_pcb; 1708 #ifdef INET6 1709 if (isipv6) 1710 inp->in6p_hops = -1; /* use kernel default */ 1711 #endif 1712 tp = tcp_newtcpcb(inp); 1713 KASSERT(tp != NULL, ("tcp_newtcpcb failed")); 1714 tp->t_state = TCPS_CLOSED; 1715 /* Keep a reference for asynchronized pru_rcvd */ 1716 soreference(so); 1717 return (0); 1718 } 1719 1720 /* 1721 * Initiate (or continue) disconnect. 1722 * If embryonic state, just send reset (once). 1723 * If in ``let data drain'' option and linger null, just drop. 1724 * Otherwise (hard), mark socket disconnecting and drop 1725 * current input data; switch states based on user close, and 1726 * send segment to peer (with FIN). 1727 */ 1728 static struct tcpcb * 1729 tcp_disconnect(struct tcpcb *tp) 1730 { 1731 struct socket *so = tp->t_inpcb->inp_socket; 1732 1733 if (tp->t_state < TCPS_ESTABLISHED) { 1734 tp = tcp_close(tp); 1735 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 1736 tp = tcp_drop(tp, 0); 1737 } else { 1738 lwkt_gettoken(&so->so_rcv.ssb_token); 1739 soisdisconnecting(so); 1740 sbflush(&so->so_rcv.sb); 1741 tp = tcp_usrclosed(tp); 1742 if (tp) 1743 tcp_output(tp); 1744 lwkt_reltoken(&so->so_rcv.ssb_token); 1745 } 1746 return (tp); 1747 } 1748 1749 /* 1750 * User issued close, and wish to trail through shutdown states: 1751 * if never received SYN, just forget it. If got a SYN from peer, 1752 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1753 * If already got a FIN from peer, then almost done; go to LAST_ACK 1754 * state. In all other cases, have already sent FIN to peer (e.g. 1755 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1756 * for peer to send FIN or not respond to keep-alives, etc. 1757 * We can let the user exit from the close as soon as the FIN is acked. 1758 */ 1759 static struct tcpcb * 1760 tcp_usrclosed(struct tcpcb *tp) 1761 { 1762 1763 switch (tp->t_state) { 1764 1765 case TCPS_CLOSED: 1766 case TCPS_LISTEN: 1767 tp->t_state = TCPS_CLOSED; 1768 tp = tcp_close(tp); 1769 break; 1770 1771 case TCPS_SYN_SENT: 1772 case TCPS_SYN_RECEIVED: 1773 tp->t_flags |= TF_NEEDFIN; 1774 break; 1775 1776 case TCPS_ESTABLISHED: 1777 tp->t_state = TCPS_FIN_WAIT_1; 1778 break; 1779 1780 case TCPS_CLOSE_WAIT: 1781 tp->t_state = TCPS_LAST_ACK; 1782 break; 1783 } 1784 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1785 soisdisconnected(tp->t_inpcb->inp_socket); 1786 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 1787 if (tp->t_state == TCPS_FIN_WAIT_2) { 1788 tcp_callout_reset(tp, tp->tt_2msl, tp->t_maxidle, 1789 tcp_timer_2msl); 1790 } 1791 } 1792 return (tp); 1793 } 1794