/*	$OpenBSD: tcp_input.c,v 1.272 2014/01/24 18:54:58 henning Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <dev/rndvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct ifnet *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
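
/*
 * Worked example of the modulo comparisons above (illustrative, not
 * from the original source): timestamps live in a 32-bit circular
 * space, so the subtraction wraps modulo 2^32 and the result is read
 * as signed.  For instance, TSTMP_GEQ(1, 0xfffffffe) computes
 * (int)(1 - 0xfffffffe) == 3 >= 0, so a timestamp that has just
 * wrapped past zero still compares as "newer" than one taken shortly
 * before the wrap.
 */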

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0, \
		    tp->t_inpcb->inp_rtableid); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (m && (m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif && \
	    (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;
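
	/*
	 * Worked example of the overlap arithmetic above (illustrative,
	 * not from the original source): suppose the preceding entry p
	 * covers [100, 150) (th_seq 100, th_reseqlen 50) and the incoming
	 * segment covers [130, 170) (th_seq 130, *tlen 40).  Then
	 * i = 100 + 50 - 130 = 20, so the first 20 bytes duplicate data
	 * already queued: m_adj() trims them and the segment becomes
	 * [150, 170).  Had i >= *tlen, the segment would be a complete
	 * duplicate and would be dropped.
	 */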

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif
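
/*
 * Note on the return value (restating the behavior above, not from the
 * original source): both tcp_reass() and tcp_flush_queue() return TH_FIN
 * once the data handed to the socket buffer reaches a queued FIN, and 0
 * otherwise.  tcp_input() folds that value back into tiflags
 * ("tiflags = tcp_reass(...)"), so a FIN that arrived out of order is
 * processed only when the sequence space leading up to it is complete.
 */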

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 * See below for AF specific multicast.
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		if (IN_MULTICAST(ip->ip_dst.s_addr) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif,
		    m->m_pkthdr.rdomain))
			goto drop;
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		tcpstat.tcps_inswcsum++;
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		     (optlen > TCPOLEN_TSTAMP_APPA &&
		      optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
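
	/*
	 * Layout assumed by the fast path above (illustrative, not from
	 * the original source).  RFC 1323 appendix A recommends sending
	 * the timestamp option as a 12-byte block:
	 *
	 *	NOP(0x01) NOP(0x01) TIMESTAMP(0x08) LEN(0x0a)
	 *	4-byte TS value, 4-byte TS echo reply
	 *
	 * so the first 32-bit word is 0x0101080a, which is what the
	 * single htonl(TCPOPT_TSTAMP_HDR) comparison checks in one load,
	 * and TCPOLEN_TSTAMP_APPA is 12.
	 */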
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	if (m->m_pkthdr.pf.statekey)
		inp = m->m_pkthdr.pf.statekey->inp;
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.rdomain);
			break;
		}
#if NPF > 0
		if (m->m_pkthdr.pf.statekey && inp) {
			m->m_pkthdr.pf.statekey->inp = inp;
			inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
		}
#endif
	}
	if (inp == NULL) {
		int	inpl_reverse = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_reverse = 1;
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.rdomain);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.rdomain);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}
	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp)->t_inpcb == inp);

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		goto drop;

	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
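
	/*
	 * Example of the unscaling (illustrative, not from the original
	 * source): with a negotiated snd_scale of 6, an on-the-wire
	 * th_win of 512 advertises 512 << 6 = 32768 bytes.  Per RFC 1323
	 * the scale factor never applies to segments carrying SYN, which
	 * is why those take the unshifted value.
	 */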

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;
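
			/*
			 * Summary of the syn_cache_get() contract used
			 * above (restating the inline comments, not from
			 * the original source): NULL means no cached SYN
			 * matches this ACK, so it is answered with an
			 * RST; (struct socket *)(-1) means the handshake
			 * completed but the socket could not be created,
			 * an RST was already sent and the mbuf may have
			 * been reused for it; any other value is the
			 * socket of a fully established connection.
			 */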

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  This is a compromise:
				 * it is much better for the peer to receive
				 * an RST, since the RST will be the final
				 * packet for the exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says like
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple passages describing
				 * deprecated address handling - worse, they
				 * do not exactly agree.  I believe 5.5.4 is
				 * the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;

					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
						tp = NULL;
						goto dropwithreset;
					}
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1)
					goto drop;
				return;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	if (m->m_pkthdr.pf.statekey && !m->m_pkthdr.pf.statekey->inp &&
	    !inp->inp_pf_sk) {
		m->m_pkthdr.pf.statekey->inp = inp;
		inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
	}
	/* The statekey has finished finding the inp, it is no longer needed. */
	m->m_pkthdr.pf.statekey = NULL;
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat.tcps_rcvnosec++;
		goto drop;
	}

	/* Latch SA */
	if (inp->inp_tdb_in != tdb) {
		if (tdb) {
			tdb_add_inp(tdb, inp, 1);
			if (inp->inp_ipo == NULL) {
				inp->inp_ipo = ipsec_add_policy(inp, af,
				    IPSP_DIRECTION_OUT);
				if (inp->inp_ipo == NULL) {
					goto drop;
				}
			}
			if (inp->inp_ipo->ipo_dstid == NULL &&
			    tdb->tdb_srcid != NULL) {
				inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
				tdb->tdb_srcid->ref_count++;
			}
			if (inp->inp_ipsec_remotecred == NULL &&
			    tdb->tdb_remote_cred != NULL) {
				inp->inp_ipsec_remotecred =
				    tdb->tdb_remote_cred;
				tdb->tdb_remote_cred->ref_count++;
			}
			if (inp->inp_ipsec_remoteauth == NULL &&
			    tdb->tdb_remote_auth != NULL) {
				inp->inp_ipsec_remoteauth =
				    tdb->tdb_remote_auth;
				tdb->tdb_remote_auth->ref_count++;
			}
		} else { /* Just reset */
			TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
			    inp_tdb_in_next);
			inp->inp_tdb_in = NULL;
		}
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.rdomain))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
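
	/*
	 * RFC 3168 codepoints checked above (illustrative reminder, not
	 * from the original source): the low two bits of the
	 * TOS/traffic-class byte encode Not-ECT (00), ECT(1) (01),
	 * ECT(0) (10) and CE (11), so (iptos & IPTOS_ECN_MASK) ==
	 * IPTOS_ECN_CE matches only packets that a router marked
	 * "congestion experienced".
	 */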

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(&so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;
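
	/*
	 * Example of the window computation above (illustrative, not
	 * from the original source): if we previously advertised up to
	 * rcv_adv = rcv_nxt + 4096 but the socket buffer now has only
	 * 2048 bytes free, rcv_wnd is kept at 4096 so that data the peer
	 * was already told it may send is never treated as out of
	 * window; the window may stop growing, but it is never retracted.
	 */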

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		     SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
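
	/*
	 * ECN negotiation example for the SYN_SENT code above
	 * (illustrative, not from the original source): we sent
	 * SYN|ECE|CWR.  A SYN-ACK carrying ACK|ECE with CWR clear, or a
	 * simultaneous-open SYN carrying ECE|CWR, marks the peer as ECN
	 * capable (TF_ECN_PERMIT); a plain SYN-ACK leaves ECN off for
	 * the connection.
	 */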
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			if (inp->inp_pf_sk) {
				inp->inp_pf_sk->inp = NULL;
				inp->inp_pf_sk = NULL;
			}
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than tp->ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}
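
	/*
	 * Scale of the PAWS idle test above (illustrative, assuming the
	 * traditional PR_SLOWHZ of 2; not from the original source):
	 * tcp_now advances at PR_SLOWHZ ticks per second, and
	 * TCP_PAWS_IDLE is 24 * 24 * 60 * 60 * PR_SLOWHZ ticks, i.e.
	 * 24 days.  A peer whose timestamp clock has not been sampled
	 * for that long may legitimately have wrapped, so ts_recent is
	 * invalidated instead of dropping its segments forever.
	 */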

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = tlen;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * Cf fix from Braden, see Stevens p. 870
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
		    ((tiflags & (TH_SYN|TH_FIN)) != 0)))
			tp->ts_recent = opti.ts_val;
		else
			tp->ts_recent = 0;
		tp->ts_recent_age = tcp_now;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat.tcps_drops++;
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat.tcps_connects++;
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat.tcps_cwr_ecn++;
				}
			}
			tcpstat.tcps_ecn_rcvece++;
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat.tcps_ecn_rcvcwr++;
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat.tcps_rcvacktooold++;
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#ifdef TCP_SACK
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat.tcps_cwr_frecovery++;
						tcpstat.tcps_sack_recovery_episode++;
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat.tcps_cwr_frecovery++;
					tcpstat.tcps_sndrexmitfast++;
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
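
		/*
		 * Worked numbers for the fast retransmit above
		 * (illustrative, not from the original source): with
		 * snd_wnd = snd_cwnd = 65536 and t_maxseg = 1460, the
		 * third duplicate ACK computes win = 65536 / 2 / 1460 =
		 * 22, so snd_ssthresh = 22 * 1460 = 32120.  The lost
		 * segment is resent with snd_cwnd pinned to one segment,
		 * then cwnd is reinflated to ssthresh + 3 * 1460 = 36500
		 * to account for the segments the duplicate ACKs proved
		 * have left the network.
		 */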
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						tp->t_flags |= TF_NEEDOUTPUT;
#else
					tp->snd_cwnd += tp->t_maxseg;
					tp->t_flags |= TF_NEEDOUTPUT;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
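
		/*
		 * Growth example (illustrative, not from the original
		 * source): with t_maxseg = 1460, each ACK in slow start
		 * adds a full 1460 bytes to snd_cwnd.  Once cwnd exceeds
		 * ssthresh, say cw = 58400 (40 segments), an ACK adds
		 * only 1460 * 1460 / 58400 = 36 bytes, i.e. roughly one
		 * segment per round trip, bounded by the largest window
		 * representable at our scale factor.
		 */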
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(&so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
			    tp->snd_fack) + tp->retran_data;
		}
#endif

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}
1948 */
1949 if (so->so_state & SS_CANTRCVMORE) {
1950 soisdisconnected(so);
1951 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
1952 }
1953 tp->t_state = TCPS_FIN_WAIT_2;
1954 }
1955 break;
1956
1957 /*
1958 * In CLOSING STATE in addition to the processing for
1959 * the ESTABLISHED state if the ACK acknowledges our FIN
1960 * then enter the TIME-WAIT state, otherwise ignore
1961 * the segment.
1962 */
1963 case TCPS_CLOSING:
1964 if (ourfinisacked) {
1965 tp->t_state = TCPS_TIME_WAIT;
1966 tcp_canceltimers(tp);
1967 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
1968 soisdisconnected(so);
1969 }
1970 break;
1971
1972 /*
1973 * In LAST_ACK, we may still be waiting for data to drain
1974 * and/or to be acked, as well as for the ack of our FIN.
1975 * If our FIN is now acknowledged, delete the TCB,
1976 * enter the closed state and return.
1977 */
1978 case TCPS_LAST_ACK:
1979 if (ourfinisacked) {
1980 tp = tcp_close(tp);
1981 goto drop;
1982 }
1983 break;
1984
1985 /*
1986 * In TIME_WAIT state the only thing that should arrive
1987 * is a retransmission of the remote FIN. Acknowledge
1988 * it and restart the finack timer.
1989 */
1990 case TCPS_TIME_WAIT:
1991 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
1992 goto dropafterack;
1993 }
1994 }
1995
1996 step6:
1997 /*
1998 * Update window information.
1999 * Don't look at window if no ACK: TACs send garbage on first SYN.
2000 */
2001 if ((tiflags & TH_ACK) &&
2002 (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
2003 (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2004 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2005 /* keep track of pure window updates */
2006 if (tlen == 0 &&
2007 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2008 tcpstat.tcps_rcvwinupd++;
2009 tp->snd_wnd = tiwin;
2010 tp->snd_wl1 = th->th_seq;
2011 tp->snd_wl2 = th->th_ack;
2012 if (tp->snd_wnd > tp->max_sndwnd)
2013 tp->max_sndwnd = tp->snd_wnd;
2014 tp->t_flags |= TF_NEEDOUTPUT;
2015 }
2016
2017 /*
2018 * Process segments with URG.
2019 */
2020 if ((tiflags & TH_URG) && th->th_urp &&
2021 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2022 /*
2023 * This is a kludge, but if we receive and accept
2024 * random urgent pointers, we'll crash in
2025 * soreceive. It's hard to imagine someone
2026 * actually wanting to send this much urgent data.
2027 */
2028 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2029 th->th_urp = 0; /* XXX */
2030 tiflags &= ~TH_URG; /* XXX */
2031 goto dodata; /* XXX */
2032 }
2033 /*
2034 * If this segment advances the known urgent pointer,
2035 * then mark the data stream. This should not happen
2036 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2037 * a FIN has been received from the remote side.
2038 * In these states we ignore the URG.
2039 *
2040 * According to RFC961 (Assigned Protocols),
2041 * the urgent pointer points to the last octet
2042 * of urgent data. We continue, however,
2043 * to consider it to indicate the first octet
2044 * of data past the urgent section as the original
2045 * spec states (in one of two places).
2046 */
2047 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2048 tp->rcv_up = th->th_seq + th->th_urp;
2049 so->so_oobmark = so->so_rcv.sb_cc +
2050 (tp->rcv_up - tp->rcv_nxt) - 1;
2051 if (so->so_oobmark == 0)
2052 so->so_state |= SS_RCVATMARK;
2053 sohasoutofband(so);
2054 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2055 }
2056 /*
2057 * Remove out of band data so it doesn't get presented to the user.
2058 * This can happen independent of advancing the URG pointer, 2059 * but if two URG's are pending at once, some out-of-band 2060 * data may creep in... ick. 2061 */ 2062 if (th->th_urp <= (u_int16_t) tlen 2063 #ifdef SO_OOBINLINE 2064 && (so->so_options & SO_OOBINLINE) == 0 2065 #endif 2066 ) 2067 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2068 } else 2069 /* 2070 * If no out of band data is expected, 2071 * pull receive urgent pointer along 2072 * with the receive window. 2073 */ 2074 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2075 tp->rcv_up = tp->rcv_nxt; 2076 dodata: /* XXX */ 2077 2078 /* 2079 * Process the segment text, merging it into the TCP sequencing queue, 2080 * and arranging for acknowledgment of receipt if necessary. 2081 * This process logically involves adjusting tp->rcv_wnd as data 2082 * is presented to the user (this happens in tcp_usrreq.c, 2083 * case PRU_RCVD). If a FIN has already been received on this 2084 * connection then we just ignore the text. 2085 */ 2086 if ((tlen || (tiflags & TH_FIN)) && 2087 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2088 #ifdef TCP_SACK 2089 tcp_seq laststart = th->th_seq; 2090 tcp_seq lastend = th->th_seq + tlen; 2091 #endif 2092 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2093 tp->t_state == TCPS_ESTABLISHED) { 2094 TCP_SETUP_ACK(tp, tiflags, m); 2095 tp->rcv_nxt += tlen; 2096 tiflags = th->th_flags & TH_FIN; 2097 tcpstat.tcps_rcvpack++; 2098 tcpstat.tcps_rcvbyte += tlen; 2099 ND6_HINT(tp); 2100 if (so->so_state & SS_CANTRCVMORE) 2101 m_freem(m); 2102 else { 2103 m_adj(m, hdroptlen); 2104 sbappendstream(&so->so_rcv, m); 2105 } 2106 tp->t_flags |= TF_BLOCKOUTPUT; 2107 sorwakeup(so); 2108 tp->t_flags &= ~TF_BLOCKOUTPUT; 2109 } else { 2110 m_adj(m, hdroptlen); 2111 tiflags = tcp_reass(tp, th, m, &tlen); 2112 tp->t_flags |= TF_ACKNOW; 2113 } 2114 #ifdef TCP_SACK 2115 if (tp->sack_enable) 2116 tcp_update_sack_list(tp, laststart, lastend); 2117 #endif 2118 2119 /* 2120 * variable len never referenced again in modern BSD, 2121 * so why bother computing it ?? 2122 */ 2123 #if 0 2124 /* 2125 * Note the amount of data that peer has sent into 2126 * our window, in order to estimate the sender's 2127 * buffer size. 2128 */ 2129 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2130 #endif /* 0 */ 2131 } else { 2132 m_freem(m); 2133 tiflags &= ~TH_FIN; 2134 } 2135 2136 /* 2137 * If FIN is received ACK the FIN and let the user know 2138 * that the connection is closing. Ignore a FIN received before 2139 * the connection is fully established. 2140 */ 2141 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2142 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2143 socantrcvmore(so); 2144 tp->t_flags |= TF_ACKNOW; 2145 tp->rcv_nxt++; 2146 } 2147 switch (tp->t_state) { 2148 2149 /* 2150 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2151 */ 2152 case TCPS_ESTABLISHED: 2153 tp->t_state = TCPS_CLOSE_WAIT; 2154 break; 2155 2156 /* 2157 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2158 * enter the CLOSING state. 2159 */ 2160 case TCPS_FIN_WAIT_1: 2161 tp->t_state = TCPS_CLOSING; 2162 break; 2163 2164 /* 2165 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2166 * starting the time-wait timer, turning off the other 2167 * standard timers. 2168 */ 2169 case TCPS_FIN_WAIT_2: 2170 tp->t_state = TCPS_TIME_WAIT; 2171 tcp_canceltimers(tp); 2172 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2173 soisdisconnected(so); 2174 break; 2175 2176 /* 2177 * In TIME_WAIT state restart the 2 MSL time_wait timer. 
2178 */ 2179 case TCPS_TIME_WAIT: 2180 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2181 break; 2182 } 2183 } 2184 if (so->so_options & SO_DEBUG) { 2185 switch (tp->pf) { 2186 #ifdef INET6 2187 case PF_INET6: 2188 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2189 0, tlen); 2190 break; 2191 #endif /* INET6 */ 2192 case PF_INET: 2193 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2194 0, tlen); 2195 break; 2196 } 2197 } 2198 2199 /* 2200 * Return any desired output. 2201 */ 2202 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2203 (void) tcp_output(tp); 2204 return; 2205 2206 badsyn: 2207 /* 2208 * Received a bad SYN. Increment counters and dropwithreset. 2209 */ 2210 tcpstat.tcps_badsyn++; 2211 tp = NULL; 2212 goto dropwithreset; 2213 2214 dropafterack_ratelim: 2215 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2216 tcp_ackdrop_ppslim) == 0) { 2217 /* XXX stat */ 2218 goto drop; 2219 } 2220 /* ...fall into dropafterack... */ 2221 2222 dropafterack: 2223 /* 2224 * Generate an ACK dropping incoming segment if it occupies 2225 * sequence space, where the ACK reflects our state. 2226 */ 2227 if (tiflags & TH_RST) 2228 goto drop; 2229 m_freem(m); 2230 tp->t_flags |= TF_ACKNOW; 2231 (void) tcp_output(tp); 2232 return; 2233 2234 dropwithreset_ratelim: 2235 /* 2236 * We may want to rate-limit RSTs in certain situations, 2237 * particularly if we are sending an RST in response to 2238 * an attempt to connect to or otherwise communicate with 2239 * a port for which we have no socket. 2240 */ 2241 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2242 tcp_rst_ppslim) == 0) { 2243 /* XXX stat */ 2244 goto drop; 2245 } 2246 /* ...fall into dropwithreset... */ 2247 2248 dropwithreset: 2249 /* 2250 * Generate a RST, dropping incoming segment. 2251 * Make ACK acceptable to originator of segment. 2252 * Don't bother to respond to RST. 2253 */ 2254 if (tiflags & TH_RST) 2255 goto drop; 2256 if (tiflags & TH_ACK) { 2257 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2258 TH_RST, m->m_pkthdr.rdomain); 2259 } else { 2260 if (tiflags & TH_SYN) 2261 tlen++; 2262 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2263 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.rdomain); 2264 } 2265 m_freem(m); 2266 return; 2267 2268 drop: 2269 /* 2270 * Drop space held by incoming segment and return. 
2271 */ 2272 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2273 switch (tp->pf) { 2274 #ifdef INET6 2275 case PF_INET6: 2276 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2277 0, tlen); 2278 break; 2279 #endif /* INET6 */ 2280 case PF_INET: 2281 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2282 0, tlen); 2283 break; 2284 } 2285 } 2286 2287 m_freem(m); 2288 return; 2289 } 2290 2291 int 2292 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2293 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2294 u_int rtableid) 2295 { 2296 u_int16_t mss = 0; 2297 int opt, optlen; 2298 #ifdef TCP_SIGNATURE 2299 caddr_t sigp = NULL; 2300 struct tdb *tdb = NULL; 2301 #endif /* TCP_SIGNATURE */ 2302 2303 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2304 opt = cp[0]; 2305 if (opt == TCPOPT_EOL) 2306 break; 2307 if (opt == TCPOPT_NOP) 2308 optlen = 1; 2309 else { 2310 if (cnt < 2) 2311 break; 2312 optlen = cp[1]; 2313 if (optlen < 2 || optlen > cnt) 2314 break; 2315 } 2316 switch (opt) { 2317 2318 default: 2319 continue; 2320 2321 case TCPOPT_MAXSEG: 2322 if (optlen != TCPOLEN_MAXSEG) 2323 continue; 2324 if (!(th->th_flags & TH_SYN)) 2325 continue; 2326 if (TCPS_HAVERCVDSYN(tp->t_state)) 2327 continue; 2328 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 2329 NTOHS(mss); 2330 oi->maxseg = mss; 2331 break; 2332 2333 case TCPOPT_WINDOW: 2334 if (optlen != TCPOLEN_WINDOW) 2335 continue; 2336 if (!(th->th_flags & TH_SYN)) 2337 continue; 2338 if (TCPS_HAVERCVDSYN(tp->t_state)) 2339 continue; 2340 tp->t_flags |= TF_RCVD_SCALE; 2341 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2342 break; 2343 2344 case TCPOPT_TIMESTAMP: 2345 if (optlen != TCPOLEN_TIMESTAMP) 2346 continue; 2347 oi->ts_present = 1; 2348 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2349 NTOHL(oi->ts_val); 2350 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2351 NTOHL(oi->ts_ecr); 2352 2353 if (!(th->th_flags & TH_SYN)) 2354 continue; 2355 if (TCPS_HAVERCVDSYN(tp->t_state)) 2356 continue; 2357 /* 2358 * A timestamp received in a SYN makes 2359 * it ok to send timestamp requests and replies. 
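 * On the wire the option parsed above is kind 8, length 10, followed by
 * the 32-bit TSval and TSecr fields, e.g. 08 0a <TSval> <TSecr>; the
 * fields are copied out with bcopy() above because option data in the
 * header need not be suitably aligned for direct loads.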
2360 */ 2361 tp->t_flags |= TF_RCVD_TSTMP; 2362 tp->ts_recent = oi->ts_val; 2363 tp->ts_recent_age = tcp_now; 2364 break; 2365 2366 #ifdef TCP_SACK 2367 case TCPOPT_SACK_PERMITTED: 2368 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2369 continue; 2370 if (!(th->th_flags & TH_SYN)) 2371 continue; 2372 if (TCPS_HAVERCVDSYN(tp->t_state)) 2373 continue; 2374 /* MUST only be set on SYN */ 2375 tp->t_flags |= TF_SACK_PERMIT; 2376 break; 2377 case TCPOPT_SACK: 2378 tcp_sack_option(tp, th, cp, optlen); 2379 break; 2380 #endif 2381 #ifdef TCP_SIGNATURE 2382 case TCPOPT_SIGNATURE: 2383 if (optlen != TCPOLEN_SIGNATURE) 2384 continue; 2385 2386 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2387 return (-1); 2388 2389 sigp = cp + 2; 2390 break; 2391 #endif /* TCP_SIGNATURE */ 2392 } 2393 } 2394 2395 #ifdef TCP_SIGNATURE 2396 if (tp->t_flags & TF_SIGNATURE) { 2397 union sockaddr_union src, dst; 2398 2399 memset(&src, 0, sizeof(union sockaddr_union)); 2400 memset(&dst, 0, sizeof(union sockaddr_union)); 2401 2402 switch (tp->pf) { 2403 case 0: 2404 #ifdef INET 2405 case AF_INET: 2406 src.sa.sa_len = sizeof(struct sockaddr_in); 2407 src.sa.sa_family = AF_INET; 2408 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2409 dst.sa.sa_len = sizeof(struct sockaddr_in); 2410 dst.sa.sa_family = AF_INET; 2411 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2412 break; 2413 #endif 2414 #ifdef INET6 2415 case AF_INET6: 2416 src.sa.sa_len = sizeof(struct sockaddr_in6); 2417 src.sa.sa_family = AF_INET6; 2418 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2419 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2420 dst.sa.sa_family = AF_INET6; 2421 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2422 break; 2423 #endif /* INET6 */ 2424 } 2425 2426 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2427 0, &src, &dst, IPPROTO_TCP); 2428 2429 /* 2430 * We don't have an SA for this peer, so we turn off 2431 * TF_SIGNATURE on the listen socket 2432 */ 2433 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2434 tp->t_flags &= ~TF_SIGNATURE; 2435 2436 } 2437 2438 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2439 tcpstat.tcps_rcvbadsig++; 2440 return (-1); 2441 } 2442 2443 if (sigp) { 2444 char sig[16]; 2445 2446 if (tdb == NULL) { 2447 tcpstat.tcps_rcvbadsig++; 2448 return (-1); 2449 } 2450 2451 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2452 return (-1); 2453 2454 if (timingsafe_bcmp(sig, sigp, 16)) { 2455 tcpstat.tcps_rcvbadsig++; 2456 return (-1); 2457 } 2458 2459 tcpstat.tcps_rcvgoodsig++; 2460 } 2461 #endif /* TCP_SIGNATURE */ 2462 2463 return (0); 2464 } 2465 2466 #if defined(TCP_SACK) 2467 u_long 2468 tcp_seq_subtract(u_long a, u_long b) 2469 { 2470 return ((long)(a - b)); 2471 } 2472 #endif 2473 2474 2475 #ifdef TCP_SACK 2476 /* 2477 * This function is called upon receipt of new valid data (while not in header 2478 * prediction mode), and it updates the ordered list of sacks. 2479 */ 2480 void 2481 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2482 tcp_seq rcv_lastend) 2483 { 2484 /* 2485 * First reported block MUST be the most recent one. Subsequent 2486 * blocks SHOULD be in the order in which they arrived at the 2487 * receiver. These two conditions make the implementation fully 2488 * compliant with RFC 2018. 
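 * For example: with rcv_nxt at 100 and the reassembly queue empty, an
 * out-of-order segment covering [200,300) leaves sackblks[0] = {200,300};
 * if [150,200) arrives next, the adjacent ranges are coalesced and
 * reported as the single first block {150,300}.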
2489 */ 2490 int i, j = 0, count = 0, lastpos = -1; 2491 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2492 2493 /* First clean up current list of sacks */ 2494 for (i = 0; i < tp->rcv_numsacks; i++) { 2495 sack = tp->sackblks[i]; 2496 if (sack.start == 0 && sack.end == 0) { 2497 count++; /* count = number of blocks to be discarded */ 2498 continue; 2499 } 2500 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2501 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2502 count++; 2503 } else { 2504 temp[j].start = tp->sackblks[i].start; 2505 temp[j++].end = tp->sackblks[i].end; 2506 } 2507 } 2508 tp->rcv_numsacks -= count; 2509 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2510 tcp_clean_sackreport(tp); 2511 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2512 /* ==> need first sack block */ 2513 tp->sackblks[0].start = rcv_laststart; 2514 tp->sackblks[0].end = rcv_lastend; 2515 tp->rcv_numsacks = 1; 2516 } 2517 return; 2518 } 2519 /* Otherwise, sack blocks are already present. */ 2520 for (i = 0; i < tp->rcv_numsacks; i++) 2521 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2522 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2523 return; /* sack list remains unchanged */ 2524 /* 2525 * From here, segment just received should be (part of) the 1st sack. 2526 * Go through list, possibly coalescing sack block entries. 2527 */ 2528 firstsack.start = rcv_laststart; 2529 firstsack.end = rcv_lastend; 2530 for (i = 0; i < tp->rcv_numsacks; i++) { 2531 sack = tp->sackblks[i]; 2532 if (SEQ_LT(sack.end, firstsack.start) || 2533 SEQ_GT(sack.start, firstsack.end)) 2534 continue; /* no overlap */ 2535 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2536 /* 2537 * identical block; delete it here since we will 2538 * move it to the front of the list. 2539 */ 2540 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2541 lastpos = i; /* last posn with a zero entry */ 2542 continue; 2543 } 2544 if (SEQ_LEQ(sack.start, firstsack.start)) 2545 firstsack.start = sack.start; /* merge blocks */ 2546 if (SEQ_GEQ(sack.end, firstsack.end)) 2547 firstsack.end = sack.end; /* merge blocks */ 2548 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2549 lastpos = i; /* last posn with a zero entry */ 2550 } 2551 if (lastpos != -1) { /* at least one merge */ 2552 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2553 sack = tp->sackblks[i]; 2554 if (sack.start == 0 && sack.end == 0) 2555 continue; 2556 temp[j++] = sack; 2557 } 2558 tp->rcv_numsacks = j; /* including first blk (added later) */ 2559 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2560 tp->sackblks[i] = temp[i]; 2561 } else { /* no merges -- shift sacks by 1 */ 2562 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2563 tp->rcv_numsacks++; 2564 for (i = tp->rcv_numsacks-1; i > 0; i--) 2565 tp->sackblks[i] = tp->sackblks[i-1]; 2566 } 2567 tp->sackblks[0] = firstsack; 2568 return; 2569 } 2570 2571 /* 2572 * Process the TCP SACK option. tp->snd_holes is an ordered list 2573 * of holes (oldest to newest, in terms of the sequence space). 2574 */ 2575 void 2576 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2577 { 2578 int tmp_olen; 2579 u_char *tmp_cp; 2580 struct sackhole *cur, *p, *temp; 2581 2582 if (!tp->sack_enable) 2583 return; 2584 /* SACK without ACK doesn't make sense. */ 2585 if ((th->th_flags & TH_ACK) == 0) 2586 return; 2587 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
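 * For example, if snd_una is 1000 and snd_max is 5000, an ACK of 900 is
 * stale and an ACK of 6000 covers data we never sent; in either case the
 * SACK blocks riding on that segment cannot be trusted, so they are
 * ignored.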
*/ 2588 if (SEQ_LT(th->th_ack, tp->snd_una) || 2589 SEQ_GT(th->th_ack, tp->snd_max)) 2590 return; 2591 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2592 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2593 return; 2594 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2595 tmp_cp = cp + 2; 2596 tmp_olen = optlen - 2; 2597 tcpstat.tcps_sack_rcv_opts++; 2598 if (tp->snd_numholes < 0) 2599 tp->snd_numholes = 0; 2600 if (tp->t_maxseg == 0) 2601 panic("tcp_sack_option"); /* Should never happen */ 2602 while (tmp_olen > 0) { 2603 struct sackblk sack; 2604 2605 bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq)); 2606 NTOHL(sack.start); 2607 bcopy(tmp_cp + sizeof(tcp_seq), 2608 (char *) &(sack.end), sizeof(tcp_seq)); 2609 NTOHL(sack.end); 2610 tmp_olen -= TCPOLEN_SACK; 2611 tmp_cp += TCPOLEN_SACK; 2612 if (SEQ_LEQ(sack.end, sack.start)) 2613 continue; /* bad SACK fields */ 2614 if (SEQ_LEQ(sack.end, tp->snd_una)) 2615 continue; /* old block */ 2616 #if defined(TCP_SACK) && defined(TCP_FACK) 2617 /* Updates snd_fack. */ 2618 if (SEQ_GT(sack.end, tp->snd_fack)) 2619 tp->snd_fack = sack.end; 2620 #endif /* TCP_FACK */ 2621 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2622 if (SEQ_LT(sack.start, th->th_ack)) 2623 continue; 2624 } 2625 if (SEQ_GT(sack.end, tp->snd_max)) 2626 continue; 2627 if (tp->snd_holes == NULL) { /* first hole */ 2628 tp->snd_holes = (struct sackhole *) 2629 pool_get(&sackhl_pool, PR_NOWAIT); 2630 if (tp->snd_holes == NULL) { 2631 /* ENOBUFS, so ignore SACKed block for now*/ 2632 goto done; 2633 } 2634 cur = tp->snd_holes; 2635 cur->start = th->th_ack; 2636 cur->end = sack.start; 2637 cur->rxmit = cur->start; 2638 cur->next = NULL; 2639 tp->snd_numholes = 1; 2640 tp->rcv_lastsack = sack.end; 2641 /* 2642 * dups is at least one. If more data has been 2643 * SACKed, it can be greater than one. 
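 * E.g. with t_maxseg 1460, a first SACK block spanning 4380 bytes (three
 * full segments beyond the hole's end) yields min(tcprexmtthresh,
 * 4380 / 1460) = 3, i.e. the block counts as three duplicate ACKs'
 * worth of evidence against the hole.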
2644 */ 2645 cur->dups = min(tcprexmtthresh, 2646 ((sack.end - cur->end)/tp->t_maxseg)); 2647 if (cur->dups < 1) 2648 cur->dups = 1; 2649 continue; /* with next sack block */ 2650 } 2651 /* Go thru list of holes: p = previous, cur = current */ 2652 p = cur = tp->snd_holes; 2653 while (cur) { 2654 if (SEQ_LEQ(sack.end, cur->start)) 2655 /* SACKs data before the current hole */ 2656 break; /* no use going through more holes */ 2657 if (SEQ_GEQ(sack.start, cur->end)) { 2658 /* SACKs data beyond the current hole */ 2659 cur->dups++; 2660 if (((sack.end - cur->end)/tp->t_maxseg) >= 2661 tcprexmtthresh) 2662 cur->dups = tcprexmtthresh; 2663 p = cur; 2664 cur = cur->next; 2665 continue; 2666 } 2667 if (SEQ_LEQ(sack.start, cur->start)) { 2668 /* Data acks at least the beginning of hole */ 2669 #if defined(TCP_SACK) && defined(TCP_FACK) 2670 if (SEQ_GT(sack.end, cur->rxmit)) 2671 tp->retran_data -= 2672 tcp_seq_subtract(cur->rxmit, 2673 cur->start); 2674 else 2675 tp->retran_data -= 2676 tcp_seq_subtract(sack.end, 2677 cur->start); 2678 #endif /* TCP_FACK */ 2679 if (SEQ_GEQ(sack.end, cur->end)) { 2680 /* Acks entire hole, so delete hole */ 2681 if (p != cur) { 2682 p->next = cur->next; 2683 pool_put(&sackhl_pool, cur); 2684 cur = p->next; 2685 } else { 2686 cur = cur->next; 2687 pool_put(&sackhl_pool, p); 2688 p = cur; 2689 tp->snd_holes = p; 2690 } 2691 tp->snd_numholes--; 2692 continue; 2693 } 2694 /* otherwise, move start of hole forward */ 2695 cur->start = sack.end; 2696 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2697 p = cur; 2698 cur = cur->next; 2699 continue; 2700 } 2701 /* move end of hole backward */ 2702 if (SEQ_GEQ(sack.end, cur->end)) { 2703 #if defined(TCP_SACK) && defined(TCP_FACK) 2704 if (SEQ_GT(cur->rxmit, sack.start)) 2705 tp->retran_data -= 2706 tcp_seq_subtract(cur->rxmit, 2707 sack.start); 2708 #endif /* TCP_FACK */ 2709 cur->end = sack.start; 2710 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2711 cur->dups++; 2712 if (((sack.end - cur->end)/tp->t_maxseg) >= 2713 tcprexmtthresh) 2714 cur->dups = tcprexmtthresh; 2715 p = cur; 2716 cur = cur->next; 2717 continue; 2718 } 2719 if (SEQ_LT(cur->start, sack.start) && 2720 SEQ_GT(cur->end, sack.end)) { 2721 /* 2722 * ACKs some data in middle of a hole; need to 2723 * split current hole 2724 */ 2725 temp = (struct sackhole *) 2726 pool_get(&sackhl_pool, PR_NOWAIT); 2727 if (temp == NULL) 2728 goto done; /* ENOBUFS */ 2729 #if defined(TCP_SACK) && defined(TCP_FACK) 2730 if (SEQ_GT(cur->rxmit, sack.end)) 2731 tp->retran_data -= 2732 tcp_seq_subtract(sack.end, 2733 sack.start); 2734 else if (SEQ_GT(cur->rxmit, sack.start)) 2735 tp->retran_data -= 2736 tcp_seq_subtract(cur->rxmit, 2737 sack.start); 2738 #endif /* TCP_FACK */ 2739 temp->next = cur->next; 2740 temp->start = sack.end; 2741 temp->end = cur->end; 2742 temp->dups = cur->dups; 2743 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2744 cur->end = sack.start; 2745 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2746 cur->dups++; 2747 if (((sack.end - cur->end)/tp->t_maxseg) >= 2748 tcprexmtthresh) 2749 cur->dups = tcprexmtthresh; 2750 cur->next = temp; 2751 p = temp; 2752 cur = p->next; 2753 tp->snd_numholes++; 2754 } 2755 } 2756 /* At this point, p points to the last hole on the list */ 2757 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2758 /* 2759 * Need to append new hole at end. 2760 * Last hole is p (and it's not NULL). 
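 * E.g. if everything below rcv_lastsack = 10000 has been accounted for
 * and a new block [20000,25000) arrives, the gap [10000,20000) becomes a
 * fresh hole appended after p, and rcv_lastsack advances to 25000.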
2761 */ 2762 temp = (struct sackhole *) 2763 pool_get(&sackhl_pool, PR_NOWAIT); 2764 if (temp == NULL) 2765 goto done; /* ENOBUFS */ 2766 temp->start = tp->rcv_lastsack; 2767 temp->end = sack.start; 2768 temp->dups = min(tcprexmtthresh, 2769 ((sack.end - sack.start)/tp->t_maxseg)); 2770 if (temp->dups < 1) 2771 temp->dups = 1; 2772 temp->rxmit = temp->start; 2773 temp->next = 0; 2774 p->next = temp; 2775 tp->rcv_lastsack = sack.end; 2776 tp->snd_numholes++; 2777 } 2778 } 2779 done: 2780 #if defined(TCP_SACK) && defined(TCP_FACK) 2781 /* 2782 * Update retran_data and snd_awnd. Go through the list of 2783 * holes. Increment retran_data by (hole->rxmit - hole->start). 2784 */ 2785 tp->retran_data = 0; 2786 cur = tp->snd_holes; 2787 while (cur) { 2788 tp->retran_data += cur->rxmit - cur->start; 2789 cur = cur->next; 2790 } 2791 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + 2792 tp->retran_data; 2793 #endif /* TCP_FACK */ 2794 2795 return; 2796 } 2797 2798 /* 2799 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if 2800 * it is completely acked; otherwise, tcp_sack_option(), called from 2801 * tcp_dooptions(), will fix up the hole. 2802 */ 2803 void 2804 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2805 { 2806 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2807 /* max because this could be an older ack just arrived */ 2808 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2809 th->th_ack : tp->snd_una; 2810 struct sackhole *cur = tp->snd_holes; 2811 struct sackhole *prev; 2812 while (cur) 2813 if (SEQ_LEQ(cur->end, lastack)) { 2814 prev = cur; 2815 cur = cur->next; 2816 pool_put(&sackhl_pool, prev); 2817 tp->snd_numholes--; 2818 } else if (SEQ_LT(cur->start, lastack)) { 2819 cur->start = lastack; 2820 if (SEQ_LT(cur->rxmit, cur->start)) 2821 cur->rxmit = cur->start; 2822 break; 2823 } else 2824 break; 2825 tp->snd_holes = cur; 2826 } 2827 } 2828 2829 /* 2830 * Delete all receiver-side SACK information. 2831 */ 2832 void 2833 tcp_clean_sackreport(struct tcpcb *tp) 2834 { 2835 int i; 2836 2837 tp->rcv_numsacks = 0; 2838 for (i = 0; i < MAX_SACK_BLKS; i++) 2839 tp->sackblks[i].start = tp->sackblks[i].end=0; 2840 2841 } 2842 2843 /* 2844 * Checks for partial ack. If partial ack arrives, turn off retransmission 2845 * timer, deflate the window, do not clear tp->t_dupacks, and return 1. 2846 * If the ack advances at least to tp->snd_last, return 0. 2847 */ 2848 int 2849 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2850 { 2851 if (SEQ_LT(th->th_ack, tp->snd_last)) { 2852 /* Turn off retx. timer (will start again next segment) */ 2853 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2854 tp->t_rtttime = 0; 2855 #ifndef TCP_FACK 2856 /* 2857 * Partial window deflation. This statement relies on the 2858 * fact that tp->snd_una has not been updated yet. In FACK 2859 * hold snd_cwnd constant during fast recovery. 2860 */ 2861 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2862 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2863 tp->snd_cwnd += tp->t_maxseg; 2864 } else 2865 tp->snd_cwnd = tp->t_maxseg; 2866 #endif 2867 return (1); 2868 } 2869 return (0); 2870 } 2871 #endif /* TCP_SACK */ 2872 2873 /* 2874 * Pull out of band byte out of a segment so 2875 * it doesn't appear in the user's data queue. 2876 * It is still reflected in the segment length for 2877 * sequencing purposes. 
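 * E.g. with a 20-byte header offset and th_urp of 3, the byte at chain
 * offset off + urgent - 1 = 22 is saved in t_iobc and spliced out of the
 * mbuf data below; the sequence space it occupied is left untouched.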
2878 */ 2879 void 2880 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2881 { 2882 int cnt = off + urgent - 1; 2883 2884 while (cnt >= 0) { 2885 if (m->m_len > cnt) { 2886 char *cp = mtod(m, caddr_t) + cnt; 2887 struct tcpcb *tp = sototcpcb(so); 2888 2889 tp->t_iobc = *cp; 2890 tp->t_oobflags |= TCPOOB_HAVEDATA; 2891 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2892 m->m_len--; 2893 return; 2894 } 2895 cnt -= m->m_len; 2896 m = m->m_next; 2897 if (m == 0) 2898 break; 2899 } 2900 panic("tcp_pulloutofband"); 2901 } 2902 2903 /* 2904 * Collect new round-trip time estimate 2905 * and update averages and current timeout. 2906 */ 2907 void 2908 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2909 { 2910 short delta; 2911 short rttmin; 2912 2913 if (rtt < 0) 2914 rtt = 0; 2915 else if (rtt > TCP_RTT_MAX) 2916 rtt = TCP_RTT_MAX; 2917 2918 tcpstat.tcps_rttupdated++; 2919 if (tp->t_srtt != 0) { 2920 /* 2921 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2922 * after the binary point (scaled by 4), whereas 2923 * srtt is stored as fixed point with 5 bits after the 2924 * binary point (i.e., scaled by 32). The following magic 2925 * is equivalent to the smoothing algorithm in rfc793 with 2926 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2927 * point). 2928 */ 2929 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2930 (tp->t_srtt >> TCP_RTT_SHIFT); 2931 if ((tp->t_srtt += delta) <= 0) 2932 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2933 /* 2934 * We accumulate a smoothed rtt variance (actually, a 2935 * smoothed mean difference), then set the retransmit 2936 * timer to smoothed rtt + 4 times the smoothed variance. 2937 * rttvar is stored as fixed point with 4 bits after the 2938 * binary point (scaled by 16). The following is 2939 * equivalent to rfc793 smoothing with an alpha of .75 2940 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2941 * rfc793's wired-in beta. 2942 */ 2943 if (delta < 0) 2944 delta = -delta; 2945 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2946 if ((tp->t_rttvar += delta) <= 0) 2947 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2948 } else { 2949 /* 2950 * No rtt measurement yet - use the unsmoothed rtt. 2951 * Set the variance to half the rtt (so our first 2952 * retransmit happens at 3*rtt). 2953 */ 2954 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2955 tp->t_rttvar = (rtt + 1) << 2956 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2957 } 2958 tp->t_rtttime = 0; 2959 tp->t_rxtshift = 0; 2960 2961 /* 2962 * the retransmit should happen at rtt + 4 * rttvar. 2963 * Because of the way we do the smoothing, srtt and rttvar 2964 * will each average +1/2 tick of bias. When we compute 2965 * the retransmit timer, we want 1/2 tick of rounding and 2966 * 1 extra tick because of +-1/2 tick uncertainty in the 2967 * firing of the timer. The bias will give us exactly the 2968 * 1.5 tick we need. But, because the bias is 2969 * statistical, we have to test that we don't drop below 2970 * the minimum feasible timer (which is 2 ticks). 2971 */ 2972 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2973 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2974 2975 /* 2976 * We received an ack for a packet that wasn't retransmitted; 2977 * it is probably safe to discard any error indications we've 2978 * received recently. This isn't quite right, but close enough 2979 * for now (a route might have failed after we sent a segment, 2980 * and the return path might not be symmetrical). 
2981 */
2982 tp->t_softerror = 0;
2983 }
2984
2985 /*
2986 * Determine a reasonable value for maxseg size.
2987 * If the route is known, check route for mtu.
2988 * If none, use an mss that can be handled on the outgoing
2989 * interface without forcing IP to fragment; if bigger than
2990 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2991 * to utilize large mbufs. If no route is found, route has no mtu,
2992 * or the destination isn't local, use a default, hopefully conservative
2993 * size (usually 512 or the default IP max size, but no more than the mtu
2994 * of the interface), as we can't discover anything about intervening
2995 * gateways or networks. We also initialize the congestion/slow start
2996 * window to be a single segment if the destination isn't local.
2997 * While looking at the routing entry, we also initialize other path-dependent
2998 * parameters from pre-set or cached values in the routing entry.
2999 *
3000 * Also take into account the space needed for options that we
3001 * send regularly. Make maxseg shorter by that amount to assure
3002 * that we can send maxseg amount of data even when the options
3003 * are present. Store the upper limit of the length of options plus
3004 * data in maxopd.
3005 *
3006 * NOTE: offer == -1 indicates that the maxseg size changed due to
3007 * Path MTU discovery.
3008 */
3009 int
3010 tcp_mss(struct tcpcb *tp, int offer)
3011 {
3012 struct rtentry *rt;
3013 struct ifnet *ifp;
3014 int mss, mssopt;
3015 int iphlen;
3016 struct inpcb *inp;
3017
3018 inp = tp->t_inpcb;
3019
3020 mssopt = mss = tcp_mssdflt;
3021
3022 rt = in_pcbrtentry(inp);
3023
3024 if (rt == NULL)
3025 goto out;
3026
3027 ifp = rt->rt_ifp;
3028
3029 switch (tp->pf) {
3030 #ifdef INET6
3031 case AF_INET6:
3032 iphlen = sizeof(struct ip6_hdr);
3033 break;
3034 #endif
3035 case AF_INET:
3036 iphlen = sizeof(struct ip);
3037 break;
3038 default:
3039 /* the family does not support path MTU discovery */
3040 goto out;
3041 }
3042
3043 #ifdef RTV_MTU
3044 /*
3045 * if there's an mtu associated with the route and we support
3046 * path MTU discovery for the underlying protocol family, use it.
3047 */
3048 if (rt->rt_rmx.rmx_mtu) {
3049 /*
3050 * One may wish to lower MSS to take into account options,
3051 * especially security-related options.
3052 */
3053 if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
3054 /*
3055 * RFC2460 section 5, last paragraph: if path MTU is
3056 * smaller than 1280, use 1280 as packet size and
3057 * attach fragment header.
3058 */
3059 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
3060 sizeof(struct tcphdr);
3061 } else
3062 mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
3063 } else
3064 #endif /* RTV_MTU */
3065 if (!ifp)
3066 /*
3067 * ifp may be null and rmx_mtu may be zero in certain
3068 * v6 cases (e.g., if ND wasn't able to resolve the
3069 * destination host).
3070 */
3071 goto out;
3072 else if (ifp->if_flags & IFF_LOOPBACK)
3073 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3074 else if (tp->pf == AF_INET) {
3075 if (ip_mtudisc)
3076 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3077 else if (inp && in_localaddr(inp->inp_faddr, inp->inp_rtableid))
3078 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3079 }
3080 #ifdef INET6
3081 else if (tp->pf == AF_INET6) {
3082 /*
3083 * for IPv6, path MTU discovery is always turned on,
3084 * or the node must use packet size <= 1280.
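 * E.g. on a standard 1500-byte MTU link, IN6_LINKMTU() gives
 * mss = 1500 - 40 (ip6_hdr) - 20 (tcphdr) = 1440.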
3085 */ 3086 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr); 3087 } 3088 #endif /* INET6 */ 3089 3090 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3091 if (offer != -1) { 3092 #ifndef INET6 3093 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3094 #else 3095 if (tp->pf == AF_INET6) 3096 mssopt = IN6_LINKMTU(ifp) - iphlen - 3097 sizeof(struct tcphdr); 3098 else 3099 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3100 #endif 3101 3102 mssopt = max(tcp_mssdflt, mssopt); 3103 } 3104 3105 out: 3106 /* 3107 * The current mss, t_maxseg, is initialized to the default value. 3108 * If we compute a smaller value, reduce the current mss. 3109 * If we compute a larger value, return it for use in sending 3110 * a max seg size option, but don't store it for use 3111 * unless we received an offer at least that large from peer. 3112 * 3113 * However, do not accept offers lower than the minimum of 3114 * the interface MTU and 216. 3115 */ 3116 if (offer > 0) 3117 tp->t_peermss = offer; 3118 if (tp->t_peermss) 3119 mss = min(mss, max(tp->t_peermss, 216)); 3120 3121 /* sanity - at least max opt. space */ 3122 mss = max(mss, 64); 3123 3124 /* 3125 * maxopd stores the maximum length of data AND options 3126 * in a segment; maxseg is the amount of data in a normal 3127 * segment. We need to store this value (maxopd) apart 3128 * from maxseg, because now every segment carries options 3129 * and thus we normally have somewhat less data in segments. 3130 */ 3131 tp->t_maxopd = mss; 3132 3133 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3134 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3135 mss -= TCPOLEN_TSTAMP_APPA; 3136 #ifdef TCP_SIGNATURE 3137 if (tp->t_flags & TF_SIGNATURE) 3138 mss -= TCPOLEN_SIGLEN; 3139 #endif 3140 3141 if (offer == -1) { 3142 /* mss changed due to Path MTU discovery */ 3143 tp->t_flags &= ~TF_PMTUD_PEND; 3144 tp->t_pmtud_mtu_sent = 0; 3145 tp->t_pmtud_mss_acked = 0; 3146 if (mss < tp->t_maxseg) { 3147 /* 3148 * Follow suggestion in RFC 2414 to reduce the 3149 * congestion window by the ratio of the old 3150 * segment size to the new segment size. 3151 */ 3152 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3153 mss, mss); 3154 } 3155 } else if (tcp_do_rfc3390 == 2) { 3156 /* increase initial window */ 3157 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 3158 } else if (tcp_do_rfc3390) { 3159 /* increase initial window */ 3160 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3161 } else 3162 tp->snd_cwnd = mss; 3163 3164 tp->t_maxseg = mss; 3165 3166 return (offer != -1 ? mssopt : mss); 3167 } 3168 3169 u_int 3170 tcp_hdrsz(struct tcpcb *tp) 3171 { 3172 u_int hlen; 3173 3174 switch (tp->pf) { 3175 #ifdef INET6 3176 case AF_INET6: 3177 hlen = sizeof(struct ip6_hdr); 3178 break; 3179 #endif 3180 case AF_INET: 3181 hlen = sizeof(struct ip); 3182 break; 3183 default: 3184 hlen = 0; 3185 break; 3186 } 3187 hlen += sizeof(struct tcphdr); 3188 3189 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3190 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3191 hlen += TCPOLEN_TSTAMP_APPA; 3192 #ifdef TCP_SIGNATURE 3193 if (tp->t_flags & TF_SIGNATURE) 3194 hlen += TCPOLEN_SIGLEN; 3195 #endif 3196 return (hlen); 3197 } 3198 3199 /* 3200 * Set connection variables based on the effective MSS. 3201 * We are passed the TCPCB for the actual connection. If we 3202 * are the server, we are called by the compressed state engine 3203 * when the 3-way handshake is complete. 
If we are the client, 3204 * we are called when we receive the SYN,ACK from the server. 3205 * 3206 * NOTE: The t_maxseg value must be initialized in the TCPCB 3207 * before this routine is called! 3208 */ 3209 void 3210 tcp_mss_update(struct tcpcb *tp) 3211 { 3212 int mss; 3213 u_long bufsize; 3214 struct rtentry *rt; 3215 struct socket *so; 3216 3217 so = tp->t_inpcb->inp_socket; 3218 mss = tp->t_maxseg; 3219 3220 rt = in_pcbrtentry(tp->t_inpcb); 3221 3222 if (rt == NULL) 3223 return; 3224 3225 bufsize = so->so_snd.sb_hiwat; 3226 if (bufsize < mss) { 3227 mss = bufsize; 3228 /* Update t_maxseg and t_maxopd */ 3229 tcp_mss(tp, mss); 3230 } else { 3231 bufsize = roundup(bufsize, mss); 3232 if (bufsize > sb_max) 3233 bufsize = sb_max; 3234 (void)sbreserve(&so->so_snd, bufsize); 3235 } 3236 3237 bufsize = so->so_rcv.sb_hiwat; 3238 if (bufsize > mss) { 3239 bufsize = roundup(bufsize, mss); 3240 if (bufsize > sb_max) 3241 bufsize = sb_max; 3242 (void)sbreserve(&so->so_rcv, bufsize); 3243 } 3244 3245 } 3246 3247 #if defined (TCP_SACK) 3248 /* 3249 * Checks for partial ack. If partial ack arrives, force the retransmission 3250 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 3251 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to 3252 * be started again. If the ack advances at least to tp->snd_last, return 0. 3253 */ 3254 int 3255 tcp_newreno(struct tcpcb *tp, struct tcphdr *th) 3256 { 3257 if (SEQ_LT(th->th_ack, tp->snd_last)) { 3258 /* 3259 * snd_una has not been updated and the socket send buffer 3260 * not yet drained of the acked data, so we have to leave 3261 * snd_una as it was to get the correct data offset in 3262 * tcp_output(). 3263 */ 3264 tcp_seq onxt = tp->snd_nxt; 3265 u_long ocwnd = tp->snd_cwnd; 3266 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3267 tp->t_rtttime = 0; 3268 tp->snd_nxt = th->th_ack; 3269 /* 3270 * Set snd_cwnd to one segment beyond acknowledged offset 3271 * (tp->snd_una not yet updated when this function is called) 3272 */ 3273 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3274 (void) tcp_output(tp); 3275 tp->snd_cwnd = ocwnd; 3276 if (SEQ_GT(onxt, tp->snd_nxt)) 3277 tp->snd_nxt = onxt; 3278 /* 3279 * Partial window deflation. Relies on fact that tp->snd_una 3280 * not updated yet. 3281 */ 3282 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3283 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3284 else 3285 tp->snd_cwnd = 0; 3286 tp->snd_cwnd += tp->t_maxseg; 3287 3288 return 1; 3289 } 3290 return 0; 3291 } 3292 #endif /* TCP_SACK */ 3293 3294 int 3295 tcp_mss_adv(struct ifnet *ifp, int af) 3296 { 3297 int mss = 0; 3298 int iphlen; 3299 3300 switch (af) { 3301 case AF_INET: 3302 if (ifp != NULL) 3303 mss = ifp->if_mtu; 3304 iphlen = sizeof(struct ip); 3305 break; 3306 #ifdef INET6 3307 case AF_INET6: 3308 if (ifp != NULL) 3309 mss = IN6_LINKMTU(ifp); 3310 iphlen = sizeof(struct ip6_hdr); 3311 break; 3312 #endif 3313 } 3314 mss = mss - iphlen - sizeof(struct tcphdr); 3315 return (max(mss, tcp_mssdflt)); 3316 } 3317 3318 /* 3319 * TCP compressed state engine. Currently used to hold compressed 3320 * state for SYN_RECEIVED. 
3321 */ 3322 3323 u_long syn_cache_count; 3324 u_int32_t syn_hash1, syn_hash2; 3325 3326 #define SYN_HASH(sa, sp, dp) \ 3327 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3328 ((u_int32_t)(sp)))^syn_hash2))) 3329 #ifndef INET6 3330 #define SYN_HASHALL(hash, src, dst) \ 3331 do { \ 3332 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3333 ((struct sockaddr_in *)(src))->sin_port, \ 3334 ((struct sockaddr_in *)(dst))->sin_port); \ 3335 } while (/*CONSTCOND*/ 0) 3336 #else 3337 #define SYN_HASH6(sa, sp, dp) \ 3338 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3339 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3340 & 0x7fffffff) 3341 3342 #define SYN_HASHALL(hash, src, dst) \ 3343 do { \ 3344 switch ((src)->sa_family) { \ 3345 case AF_INET: \ 3346 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3347 ((struct sockaddr_in *)(src))->sin_port, \ 3348 ((struct sockaddr_in *)(dst))->sin_port); \ 3349 break; \ 3350 case AF_INET6: \ 3351 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 3352 ((struct sockaddr_in6 *)(src))->sin6_port, \ 3353 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 3354 break; \ 3355 default: \ 3356 hash = 0; \ 3357 } \ 3358 } while (/*CONSTCOND*/0) 3359 #endif /* INET6 */ 3360 3361 void 3362 syn_cache_rm(struct syn_cache *sc) 3363 { 3364 sc->sc_flags |= SCF_DEAD; 3365 TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, 3366 sc, sc_bucketq); 3367 sc->sc_tp = NULL; 3368 LIST_REMOVE(sc, sc_tpq); 3369 tcp_syn_cache[sc->sc_bucketidx].sch_length--; 3370 timeout_del(&sc->sc_timer); 3371 syn_cache_count--; 3372 } 3373 3374 void 3375 syn_cache_put(struct syn_cache *sc) 3376 { 3377 if (sc->sc_ipopts) 3378 (void) m_free(sc->sc_ipopts); 3379 if (sc->sc_route4.ro_rt != NULL) 3380 RTFREE(sc->sc_route4.ro_rt); 3381 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3382 timeout_add(&sc->sc_timer, 0); 3383 } 3384 3385 struct pool syn_cache_pool; 3386 3387 /* 3388 * We don't estimate RTT with SYNs, so each packet starts with the default 3389 * RTT and each timer step has a fixed timeout value. 3390 */ 3391 #define SYN_CACHE_TIMER_ARM(sc) \ 3392 do { \ 3393 TCPT_RANGESET((sc)->sc_rxtcur, \ 3394 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3395 TCPTV_REXMTMAX); \ 3396 if (!timeout_initialized(&(sc)->sc_timer)) \ 3397 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3398 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3399 } while (/*CONSTCOND*/0) 3400 3401 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3402 3403 void 3404 syn_cache_init() 3405 { 3406 int i; 3407 3408 /* Initialize the hash buckets. */ 3409 for (i = 0; i < tcp_syn_cache_size; i++) 3410 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3411 3412 /* Initialize the syn cache pool. */ 3413 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3414 "synpl", NULL); 3415 } 3416 3417 void 3418 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3419 { 3420 struct syn_cache_head *scp; 3421 struct syn_cache *sc2; 3422 int s; 3423 3424 /* 3425 * If there are no entries in the hash table, reinitialize 3426 * the hash secrets. 
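 * Rekeying while the cache is empty costs nothing (there are no entries
 * to rehash) and keeps the secrets from being probed: the bucket index
 * is SYN_HASH(addr, ports) % tcp_syn_cache_size, so an attacker who
 * cannot guess syn_hash1/syn_hash2 cannot aim a flood at one bucket.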
3427 */ 3428 if (syn_cache_count == 0) { 3429 syn_hash1 = arc4random(); 3430 syn_hash2 = arc4random(); 3431 } 3432 3433 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3434 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3435 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3436 3437 /* 3438 * Make sure that we don't overflow the per-bucket 3439 * limit or the total cache size limit. 3440 */ 3441 s = splsoftnet(); 3442 if (scp->sch_length >= tcp_syn_bucket_limit) { 3443 tcpstat.tcps_sc_bucketoverflow++; 3444 /* 3445 * The bucket is full. Toss the oldest element in the 3446 * bucket. This will be the first entry in the bucket. 3447 */ 3448 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3449 #ifdef DIAGNOSTIC 3450 /* 3451 * This should never happen; we should always find an 3452 * entry in our bucket. 3453 */ 3454 if (sc2 == NULL) 3455 panic("syn_cache_insert: bucketoverflow: impossible"); 3456 #endif 3457 syn_cache_rm(sc2); 3458 syn_cache_put(sc2); 3459 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3460 struct syn_cache_head *scp2, *sce; 3461 3462 tcpstat.tcps_sc_overflowed++; 3463 /* 3464 * The cache is full. Toss the oldest entry in the 3465 * first non-empty bucket we can find. 3466 * 3467 * XXX We would really like to toss the oldest 3468 * entry in the cache, but we hope that this 3469 * condition doesn't happen very often. 3470 */ 3471 scp2 = scp; 3472 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3473 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3474 for (++scp2; scp2 != scp; scp2++) { 3475 if (scp2 >= sce) 3476 scp2 = &tcp_syn_cache[0]; 3477 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3478 break; 3479 } 3480 #ifdef DIAGNOSTIC 3481 /* 3482 * This should never happen; we should always find a 3483 * non-empty bucket. 3484 */ 3485 if (scp2 == scp) 3486 panic("syn_cache_insert: cacheoverflow: " 3487 "impossible"); 3488 #endif 3489 } 3490 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3491 syn_cache_rm(sc2); 3492 syn_cache_put(sc2); 3493 } 3494 3495 /* 3496 * Initialize the entry's timer. 3497 */ 3498 sc->sc_rxttot = 0; 3499 sc->sc_rxtshift = 0; 3500 SYN_CACHE_TIMER_ARM(sc); 3501 3502 /* Link it from tcpcb entry */ 3503 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3504 3505 /* Put it into the bucket. */ 3506 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3507 scp->sch_length++; 3508 syn_cache_count++; 3509 3510 tcpstat.tcps_sc_added++; 3511 splx(s); 3512 } 3513 3514 /* 3515 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3516 * If we have retransmitted an entry the maximum number of times, expire 3517 * that entry. 3518 */ 3519 void 3520 syn_cache_timer(void *arg) 3521 { 3522 struct syn_cache *sc = arg; 3523 int s; 3524 3525 s = splsoftnet(); 3526 if (sc->sc_flags & SCF_DEAD) { 3527 splx(s); 3528 return; 3529 } 3530 3531 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3532 /* Drop it -- too many retransmissions. */ 3533 goto dropit; 3534 } 3535 3536 /* 3537 * Compute the total amount of time this entry has 3538 * been on a queue. If this entry has been on longer 3539 * than the keep alive timer would allow, expire it. 3540 */ 3541 sc->sc_rxttot += sc->sc_rxtcur; 3542 if (sc->sc_rxttot >= tcptv_keep_init) 3543 goto dropit; 3544 3545 tcpstat.tcps_sc_retransmitted++; 3546 (void) syn_cache_respond(sc, NULL); 3547 3548 /* Advance the timer back-off. 
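 * Each pass below bumps sc_rxtshift, so the next retransmit waits
 * TCPTV_SRTTDFLT * tcp_backoff[sc_rxtshift], clamped to the range
 * TCPTV_MIN..TCPTV_REXMTMAX by SYN_CACHE_TIMER_ARM(); the entry is
 * dropped once sc_rxttot reaches tcptv_keep_init or after
 * TCP_MAXRXTSHIFT attempts.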
*/
3549 sc->sc_rxtshift++;
3550 SYN_CACHE_TIMER_ARM(sc);
3551
3552 splx(s);
3553 return;
3554
3555 dropit:
3556 tcpstat.tcps_sc_timed_out++;
3557 syn_cache_rm(sc);
3558 syn_cache_put(sc);
3559 splx(s);
3560 }
3561
3562 void
3563 syn_cache_reaper(void *arg)
3564 {
3565 struct syn_cache *sc = arg;
3566 int s;
3567
3568 s = splsoftnet();
3569 pool_put(&syn_cache_pool, (sc));
3570 splx(s);
3571 return;
3572 }
3573
3574 /*
3575 * Remove the syn cache entries created by the specified tcb entry,
3576 * since it makes no sense to keep them
3577 * (if there's no tcb entry, the syn cache entries will never be used)
3578 */
3579 void
3580 syn_cache_cleanup(struct tcpcb *tp)
3581 {
3582 struct syn_cache *sc, *nsc;
3583 int s;
3584
3585 s = splsoftnet();
3586
3587 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3588 nsc = LIST_NEXT(sc, sc_tpq);
3589
3590 #ifdef DIAGNOSTIC
3591 if (sc->sc_tp != tp)
3592 panic("invalid sc_tp in syn_cache_cleanup");
3593 #endif
3594 syn_cache_rm(sc);
3595 syn_cache_put(sc);
3596 }
3597 /* just for safety */
3598 LIST_INIT(&tp->t_sc);
3599
3600 splx(s);
3601 }
3602
3603 /*
3604 * Find an entry in the syn cache.
3605 */
3606 struct syn_cache *
3607 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3608 struct syn_cache_head **headp, u_int rtableid)
3609 {
3610 struct syn_cache *sc;
3611 struct syn_cache_head *scp;
3612 u_int32_t hash;
3613 int s;
3614
3615 SYN_HASHALL(hash, src, dst);
3616
3617 scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3618 *headp = scp;
3619 s = splsoftnet();
3620 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3621 sc = TAILQ_NEXT(sc, sc_bucketq)) {
3622 if (sc->sc_hash != hash)
3623 continue;
3624 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3625 !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3626 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) {
3627 splx(s);
3628 return (sc);
3629 }
3630 }
3631 splx(s);
3632 return (NULL);
3633 }
3634
3635 /*
3636 * This function gets called when we receive an ACK for a
3637 * socket in the LISTEN state. We look up the connection
3638 * in the syn cache, and if it's there, we pull it out of
3639 * the cache and turn it into a full-blown connection in
3640 * the SYN-RECEIVED state.
3641 *
3642 * The return values may not be immediately obvious, and their effects
3643 * can be subtle, so here they are:
3644 *
3645 * NULL SYN was not found in cache; caller should drop the
3646 * packet and send an RST.
3647 *
3648 * -1 We were unable to create the new connection, and are
3649 * aborting it. An ACK,RST is being sent to the peer
3650 * (unless we got screwy sequence numbers; see below),
3651 * because the 3-way handshake has been completed. Caller
3652 * should not free the mbuf, since we may be using it. If
3653 * we are not, we will free it.
3654 *
3655 * Otherwise, the return value is a pointer to the new socket
3656 * associated with the connection.
3657 */
3658 struct socket *
3659 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3660 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m)
3661 {
3662 struct syn_cache *sc;
3663 struct syn_cache_head *scp;
3664 struct inpcb *inp = NULL;
3665 struct tcpcb *tp = NULL;
3666 struct mbuf *am;
3667 int s;
3668 struct socket *oso;
3669 #if NPF > 0
3670 struct pf_divert *divert = NULL;
3671 #endif
3672
3673 s = splsoftnet();
3674 if ((sc = syn_cache_lookup(src, dst, &scp,
3675 sotoinpcb(so)->inp_rtableid)) == NULL) {
3676 splx(s);
3677 return (NULL);
3678 }
3679
3680 /*
3681 * Verify the sequence and ack numbers.
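 * With our ISS i and the peer's IRS r, the only acceptable ACK is i + 1,
 * and the segment's sequence must lie in (r, r + 1 + sc_win]; anything
 * else gets the SYN,ACK resent instead of a new connection.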
Try getting the correct 3682 * response again. 3683 */ 3684 if ((th->th_ack != sc->sc_iss + 1) || 3685 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3686 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3687 (void) syn_cache_respond(sc, m); 3688 splx(s); 3689 return ((struct socket *)(-1)); 3690 } 3691 3692 /* Remove this cache entry */ 3693 syn_cache_rm(sc); 3694 splx(s); 3695 3696 /* 3697 * Ok, create the full blown connection, and set things up 3698 * as they would have been set up if we had created the 3699 * connection when the SYN arrived. If we can't create 3700 * the connection, abort it. 3701 */ 3702 oso = so; 3703 so = sonewconn(so, SS_ISCONNECTED); 3704 if (so == NULL) 3705 goto resetandabort; 3706 3707 inp = sotoinpcb(oso); 3708 3709 #ifdef IPSEC 3710 /* 3711 * We need to copy the required security levels 3712 * from the old pcb. Ditto for any other 3713 * IPsec-related information. 3714 */ 3715 { 3716 struct inpcb *newinp = sotoinpcb(so); 3717 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3718 sizeof(inp->inp_seclevel)); 3719 newinp->inp_secrequire = inp->inp_secrequire; 3720 if (inp->inp_ipo != NULL) { 3721 newinp->inp_ipo = inp->inp_ipo; 3722 inp->inp_ipo->ipo_ref_count++; 3723 } 3724 if (inp->inp_ipsec_remotecred != NULL) { 3725 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 3726 inp->inp_ipsec_remotecred->ref_count++; 3727 } 3728 if (inp->inp_ipsec_remoteauth != NULL) { 3729 newinp->inp_ipsec_remoteauth 3730 = inp->inp_ipsec_remoteauth; 3731 inp->inp_ipsec_remoteauth->ref_count++; 3732 } 3733 } 3734 #endif /* IPSEC */ 3735 #ifdef INET6 3736 /* 3737 * inp still has the OLD in_pcb stuff, set the 3738 * v6-related flags on the new guy, too. 3739 */ 3740 { 3741 int flags = inp->inp_flags; 3742 struct inpcb *oldinpcb = inp; 3743 3744 inp = sotoinpcb(so); 3745 inp->inp_flags |= (flags & INP_IPV6); 3746 if ((inp->inp_flags & INP_IPV6) != 0) { 3747 inp->inp_ipv6.ip6_hlim = 3748 oldinpcb->inp_ipv6.ip6_hlim; 3749 } 3750 } 3751 #else /* INET6 */ 3752 inp = sotoinpcb(so); 3753 #endif /* INET6 */ 3754 3755 #if NPF > 0 3756 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED && 3757 (divert = pf_find_divert(m)) != NULL) 3758 inp->inp_rtableid = divert->rdomain; 3759 else 3760 #endif 3761 /* inherit rtable from listening socket */ 3762 inp->inp_rtableid = sc->sc_rtableid; 3763 3764 inp->inp_lport = th->th_dport; 3765 switch (src->sa_family) { 3766 #ifdef INET6 3767 case AF_INET6: 3768 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr; 3769 break; 3770 #endif /* INET6 */ 3771 case AF_INET: 3772 3773 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 3774 inp->inp_options = ip_srcroute(m); 3775 if (inp->inp_options == NULL) { 3776 inp->inp_options = sc->sc_ipopts; 3777 sc->sc_ipopts = NULL; 3778 } 3779 break; 3780 } 3781 in_pcbrehash(inp); 3782 3783 /* 3784 * Give the new socket our cached route reference. 
3785 */
3786 if (src->sa_family == AF_INET)
3787 inp->inp_route = sc->sc_route4; /* struct assignment */
3788 #ifdef INET6
3789 else
3790 inp->inp_route6 = sc->sc_route6;
3791 #endif
3792 sc->sc_route4.ro_rt = NULL;
3793
3794 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
3795 if (am == NULL)
3796 goto resetandabort;
3797 am->m_len = src->sa_len;
3798 bcopy(src, mtod(am, caddr_t), src->sa_len);
3799
3800 switch (src->sa_family) {
3801 case AF_INET:
3802 /* drop IPv4 packet to AF_INET6 socket */
3803 if (inp->inp_flags & INP_IPV6) {
3804 (void) m_free(am);
3805 goto resetandabort;
3806 }
3807 if (in_pcbconnect(inp, am)) {
3808 (void) m_free(am);
3809 goto resetandabort;
3810 }
3811 break;
3812 #ifdef INET6
3813 case AF_INET6:
3814 if (in6_pcbconnect(inp, am)) {
3815 (void) m_free(am);
3816 goto resetandabort;
3817 }
3818 break;
3819 #endif
3820 }
3821 (void) m_free(am);
3822
3823 tp = intotcpcb(inp);
3824 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
3825 if (sc->sc_request_r_scale != 15) {
3826 tp->requested_s_scale = sc->sc_requested_s_scale;
3827 tp->request_r_scale = sc->sc_request_r_scale;
3828 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3829 }
3830 if (sc->sc_flags & SCF_TIMESTAMP)
3831 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3832
3833 tp->t_template = tcp_template(tp);
3834 if (tp->t_template == 0) {
3835 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
3836 so = NULL;
3837 m_freem(m);
3838 goto abort;
3839 }
3840 #ifdef TCP_SACK
3841 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
3842 #endif
3843
3844 tp->ts_modulate = sc->sc_modulate;
3845 tp->ts_recent = sc->sc_timestamp;
3846 tp->iss = sc->sc_iss;
3847 tp->irs = sc->sc_irs;
3848 tcp_sendseqinit(tp);
3849 #if defined (TCP_SACK) || defined(TCP_ECN)
3850 tp->snd_last = tp->snd_una;
3851 #endif /* TCP_SACK */
3852 #if defined(TCP_SACK) && defined(TCP_FACK)
3853 tp->snd_fack = tp->snd_una;
3854 tp->retran_data = 0;
3855 tp->snd_awnd = 0;
3856 #endif /* TCP_FACK */
3857 #ifdef TCP_ECN
3858 if (sc->sc_flags & SCF_ECN_PERMIT) {
3859 tp->t_flags |= TF_ECN_PERMIT;
3860 tcpstat.tcps_ecn_accepts++;
3861 }
3862 #endif
3863 #ifdef TCP_SACK
3864 if (sc->sc_flags & SCF_SACK_PERMIT)
3865 tp->t_flags |= TF_SACK_PERMIT;
3866 #endif
3867 #ifdef TCP_SIGNATURE
3868 if (sc->sc_flags & SCF_SIGNATURE)
3869 tp->t_flags |= TF_SIGNATURE;
3870 #endif
3871 tcp_rcvseqinit(tp);
3872 tp->t_state = TCPS_SYN_RECEIVED;
3873 tp->t_rcvtime = tcp_now;
3874 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
3875 tcpstat.tcps_accepts++;
3876
3877 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
3878 if (sc->sc_peermaxseg)
3879 tcp_mss_update(tp);
3880 /* Reset initial window to 1 segment for retransmit */
3881 if (sc->sc_rxtshift > 0)
3882 tp->snd_cwnd = tp->t_maxseg;
3883 tp->snd_wl1 = sc->sc_irs;
3884 tp->rcv_up = sc->sc_irs + 1;
3885
3886 /*
3887 * This is what would have happened in tcp_output() when
3888 * the SYN,ACK was sent.
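 * In particular, iss + 1 accounts for the sequence number consumed by
 * the SYN,ACK's SYN bit, and rcv_adv records the window edge that the
 * SYN,ACK advertised so that tcp_output() will not shrink the
 * advertised window.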
3889 */ 3890 tp->snd_up = tp->snd_una; 3891 tp->snd_max = tp->snd_nxt = tp->iss+1; 3892 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3893 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3894 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3895 tp->last_ack_sent = tp->rcv_nxt; 3896 3897 tcpstat.tcps_sc_completed++; 3898 syn_cache_put(sc); 3899 return (so); 3900 3901 resetandabort: 3902 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3903 m->m_pkthdr.rdomain); 3904 m_freem(m); 3905 abort: 3906 if (so != NULL) 3907 (void) soabort(so); 3908 syn_cache_put(sc); 3909 tcpstat.tcps_sc_aborted++; 3910 return ((struct socket *)(-1)); 3911 } 3912 3913 /* 3914 * This function is called when we get a RST for a 3915 * non-existent connection, so that we can see if the 3916 * connection is in the syn cache. If it is, zap it. 3917 */ 3918 3919 void 3920 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3921 u_int rtableid) 3922 { 3923 struct syn_cache *sc; 3924 struct syn_cache_head *scp; 3925 int s = splsoftnet(); 3926 3927 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) { 3928 splx(s); 3929 return; 3930 } 3931 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3932 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3933 splx(s); 3934 return; 3935 } 3936 syn_cache_rm(sc); 3937 splx(s); 3938 tcpstat.tcps_sc_reset++; 3939 syn_cache_put(sc); 3940 } 3941 3942 void 3943 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3944 u_int rtableid) 3945 { 3946 struct syn_cache *sc; 3947 struct syn_cache_head *scp; 3948 int s; 3949 3950 s = splsoftnet(); 3951 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) { 3952 splx(s); 3953 return; 3954 } 3955 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3956 if (ntohl (th->th_seq) != sc->sc_iss) { 3957 splx(s); 3958 return; 3959 } 3960 3961 /* 3962 * If we've retransmitted 3 times and this is our second error, 3963 * we remove the entry. Otherwise, we allow it to continue on. 3964 * This prevents us from incorrectly nuking an entry during a 3965 * spurious network outage. 3966 * 3967 * See tcp_notify(). 3968 */ 3969 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3970 sc->sc_flags |= SCF_UNREACH; 3971 splx(s); 3972 return; 3973 } 3974 3975 syn_cache_rm(sc); 3976 splx(s); 3977 tcpstat.tcps_sc_unreach++; 3978 syn_cache_put(sc); 3979 } 3980 3981 /* 3982 * Given a LISTEN socket and an inbound SYN request, add 3983 * this to the syn cache, and send back a segment: 3984 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3985 * to the source. 3986 * 3987 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3988 * Doing so would require that we hold onto the data and deliver it 3989 * to the application. However, if we are the target of a SYN-flood 3990 * DoS attack, an attacker could send data which would eventually 3991 * consume all available buffer space if it were ACKed. By not ACKing 3992 * the data, we avoid this DoS scenario. 3993 */ 3994 3995 int 3996 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3997 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3998 struct tcp_opt_info *oi, tcp_seq *issp) 3999 { 4000 struct tcpcb tb, *tp; 4001 long win; 4002 struct syn_cache *sc; 4003 struct syn_cache_head *scp; 4004 struct mbuf *ipopts; 4005 4006 tp = sototcpcb(so); 4007 4008 /* 4009 * RFC1122 4.2.3.10, p. 
/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */

int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
    struct tcp_opt_info *oi, tcp_seq *issp)
{
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;

	tp = sototcpcb(so);

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 *
	 * Note this check is performed in tcp_input() very early on.
	 */

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

	bzero(&tb, sizeof(tb));
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE)) {
#else
	if (optp) {
#endif
		tb.pf = tp->pf;
#ifdef TCP_SACK
		tb.sack_enable = tp->sack_enable;
#endif
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		if (tp->t_flags & TF_SIGNATURE)
			tb.t_flags |= TF_SIGNATURE;
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
		    sotoinpcb(so)->inp_rtableid))
			return (-1);
	}

	switch (src->sa_family) {
#ifdef INET
	case AF_INET:
		/*
		 * Remember the IP options, if any.
		 */
		ipopts = ip_srcroute(m);
		break;
#endif
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	if ((sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid))
	    != NULL) {
		tcpstat.tcps_sc_dupesyn++;
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void) m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		sc->sc_timestamp = tb.ts_recent;
		if (syn_cache_respond(sc, m) == 0) {
			tcpstat.tcps_sndacks++;
			tcpstat.tcps_sndtotal++;
		}
		return (0);
	}

	sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
	if (sc == NULL) {
		if (ipopts)
			(void) m_free(ipopts);
		return (-1);
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	bcopy(src, &sc->sc_src, src->sa_len);
	bcopy(dst, &sc->sc_dst, dst->sa_len);
	sc->sc_rtableid = sotoinpcb(so)->inp_rtableid;
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;

	sc->sc_iss = issp ? *issp : arc4random();
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
	    m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timestamp = tb.ts_recent;
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
		sc->sc_flags |= SCF_TIMESTAMP;
		sc->sc_modulate = arc4random();
	}
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		/*
		 * Pick the smallest possible scaling factor that
		 * will still allow us to scale up to sb_max.
		 *
		 * We do this because there are broken firewalls that
		 * will corrupt the window scale option, leading to
		 * the other endpoint believing that our advertised
		 * window is unscaled.  At scale factors larger than
		 * 5 the unscaled window will drop below 1500 bytes,
		 * leading to serious problems when traversing these
		 * broken firewalls.
		 *
		 * With the default sbmax of 256K, a scale factor
		 * of 3 will be chosen by this algorithm.  Those who
		 * choose a larger sbmax should watch out
		 * for the compatibility problems mentioned above.
		 *
		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
		 * or <SYN,ACK>) segment itself is never scaled.
		 */
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
			sc->sc_request_r_scale++;
	} else {
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
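	/*
	 * Worked example of the loop above, assuming the default sb_max
	 * of 256K (262144 bytes) and TCP_MAXWIN == 65535:
	 *
	 *	scale 0: 65535 << 0 =  65535 <  262144, keep going
	 *	scale 1: 65535 << 1 = 131070 <  262144, keep going
	 *	scale 2: 65535 << 2 = 262140 <  262144, keep going
	 *	scale 3: 65535 << 3 = 524280 >= 262144, stop
	 *
	 * so sc_request_r_scale ends up as 3, matching the comment above.
	 */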
#ifdef TCP_ECN
	/*
	 * if both ECE and CWR flag bits are set, peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		syn_cache_put(sc);
		tcpstat.tcps_sc_dropped++;
	}

	return (0);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;
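	/*
	 * Worked example of the computation above, for the common case
	 * where the peer negotiated SACK, window scaling and timestamps
	 * and no signature is in use: 4 (MSS option) + 4 (NOP-padded
	 * window scale) + 4 (NOP,NOP,SACK-permitted) + 12 (NOP,NOP,
	 * timestamps, i.e. TCPOLEN_TSTAMP_APPA) = 24 bytes of options,
	 * so th_off below becomes (20 + 24) >> 2 = 11 32-bit words.
	 * With TCP_SIGNATURE another TCPOLEN_SIGLEN bytes are added,
	 * matching the kind/len/digest/pad bytes written further down.
	 */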
	/*
	 * Create the IP+TCP header from scratch.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.rdomain = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		th = NULL;	/* unreachable: family was validated above */
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}
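	/*
	 * For reference, the 12 bytes written above, per RFC 1323
	 * appendix A, assuming the usual definitions of
	 * TCPOPT_TSTAMP_HDR (0x0101080a) and SYN_CACHE_TIMESTAMP()
	 * (tcp_now offset by the per-entry sc_modulate):
	 *
	 *	+--------+--------+----------+----------+
	 *	|  NOP   |  NOP   | kind = 8 | len = 10 |
	 *	+--------+--------+----------+----------+
	 *	|  TSval: our timestamp clock           |
	 *	+---------------------------------------+
	 *	|  TSecr: sc_timestamp, the peer's      |
	 *	|  TSval echoed from its SYN            |
	 *	+---------------------------------------+
	 */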
#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    (struct ip_moptions *)NULL, inp);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    (struct ip6_moptions *)0, NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
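
/*
 * A minimal userland sketch of the one's-complement (RFC 1071) sum
 * that in_cksum()/in6_cksum() compute above.  Illustration only,
 * never compiled: it works on a flat buffer and glosses over the
 * mbuf-chain traversal and pseudo-header handling the kernel
 * routines perform.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uint16_t
cksum_sketch(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {		/* sum 16-bit big-endian words */
		sum += (uint32_t)buf[0] << 8 | buf[1];
		buf += 2;
		len -= 2;
	}
	if (len == 1)			/* pad an odd trailing byte with 0 */
		sum += (uint32_t)buf[0] << 8;
	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (~sum & 0xffff);		/* one's complement of the sum */
}
#endif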