/*	$OpenBSD: tcp_input.c,v 1.306 2015/10/24 16:08:48 mpi Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int	tcp_rst_ppslim = 100;		/* 100pps */
int	tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int	tcp_ackdrop_ppslim = 100;	/* 100pps */
int	tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, \
		    tp->t_inpcb->inp_rtableid); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif
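
/*
 * Illustrative note (added for exposition, not in the original file):
 * the TSTMP_LT/TSTMP_GEQ macros above compare 32-bit timestamps modulo
 * 2^32 by casting the difference to a signed int.  For example, with
 * a = 0x00000002 and b = 0xffffffff, (int)(a - b) == 3 >= 0, so
 * TSTMP_GEQ(a, b) holds: a timestamp that has wrapped past zero still
 * compares as the newer one.  TCP_PAWS_IDLE is simply 24 days expressed
 * in PR_SLOWHZ ticks, the idle time after which ts_recent is considered
 * too stale to trust (see the PAWS check in tcp_input() below).
 */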

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
	if_put(ifp); \
} while (0)

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;
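
	/*
	 * Illustrative example (added for exposition, not in the original
	 * file): if the preceding queued segment covers sequence space
	 * [100, 300) and the new segment arrives with th_seq == 200 and
	 * *tlen == 150, then i == 100 + 200 - 200 == 100, so m_adj()
	 * trims the 100 duplicated bytes and th_seq advances to 300.
	 * Had *tlen been <= 100, the segment would have been counted as
	 * a full duplicate and dropped above.
	 */
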
	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}
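
/*
 * Editorial note (added for exposition, not in the original file):
 * TF_BLOCKOUTPUT is set around sorwakeup() above so that a reader woken
 * on the socket cannot re-enter tcp_output() on this connection while
 * the input path still holds it in an intermediate state; any output
 * suppressed this way is deferred and requested later via TF_NEEDOUTPUT.
 * The same bracketing pattern appears around the wakeup calls in
 * tcp_input() below.
 */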

#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbound/unconnected pcb,
		 * an unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		tcpstat.tcps_inswcsum++;
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
	}
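
	/*
	 * Editorial note (added for exposition, not in the original
	 * file): the block above trusts M_TCP_CSUM_IN_OK set by drivers
	 * with receive checksum offload, and only falls back to a
	 * software pseudo-header checksum over the TCP header and tlen
	 * bytes of payload when the hardware has neither verified nor
	 * already rejected the segment.
	 */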

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.	XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	if (m->m_pkthdr.pf.statekey) {
		inp = m->m_pkthdr.pf.statekey->inp;
		if (inp && inp->inp_pf_sk)
			KASSERT(m->m_pkthdr.pf.statekey == inp->inp_pf_sk);
	}
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
#if NPF > 0
		if (m->m_pkthdr.pf.statekey && inp) {
			m->m_pkthdr.pf.statekey->inp = inp;
			inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
		}
#endif
	}
	if (inp == NULL) {
		int	inpl_reverse = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_reverse = 1;
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == NULL) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}
	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		goto drop;

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
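
	/*
	 * Illustrative example (added for exposition, not in the
	 * original file): with a negotiated snd_scale of 7, a window
	 * field of 1000 unscales to tiwin = 1000 << 7 = 128000 bytes.
	 * Segments carrying SYN are exempt because RFC 1323 specifies
	 * that the window field in a SYN is never scaled.
	 */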

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for peer to send a RST, and
				 * RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says like
				 * this:
				 *	1. use of deprecated addr with existing
				 *	   communication is okay - "SHOULD continue
				 *	   to be used"
				 *	2. use of it with new communication:
				 *	  (2a) "SHOULD NOT be used if alternate
				 *	       address with sufficient scope is
				 *	       available"
				 *	  (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple description text for
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1) {
					tcpstat.tcps_dropsyn++;
					goto drop;
				}
				return;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	if (m->m_pkthdr.pf.statekey && !m->m_pkthdr.pf.statekey->inp &&
	    !inp->inp_pf_sk) {
		m->m_pkthdr.pf.statekey->inp = inp;
		inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
	}
	/* The statekey has finished finding the inp, it is no longer needed. */
	m->m_pkthdr.pf.statekey = NULL;
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat.tcps_rcvnosec++;
		goto drop;
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(&so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;
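
	/*
	 * Illustrative example (added for exposition, not in the
	 * original file): taking the maximum above keeps us from
	 * reneging on window space already advertised.  E.g. if
	 * sbspace() momentarily drops to 0 while rcv_adv - rcv_nxt is
	 * still 8192, rcv_wnd stays at 8192, so sequence space the peer
	 * has already been offered remains usable.
	 */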

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif
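
		/*
		 * Illustrative note (added for exposition, not in the
		 * original file): an ECN-setup SYN carries ECE|CWR; the
		 * passive peer answers with ECE alone in its SYN-ACK,
		 * matching the TH_ACK|TH_ECE case above.  TH_ECE|TH_CWR
		 * covers simultaneous open, where the "reply" is the
		 * peer's own ECN-setup SYN rather than a SYN-ACK.
		 */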

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			if (inp->inp_pf_sk) {
				inp->inp_pf_sk->inp = NULL;
				inp->inp_pf_sk = NULL;
			}
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEG_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}
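
	/*
	 * Illustrative example (added for exposition, not in the
	 * original file): suppose ts_recent is 1000 and a delayed
	 * segment arrives carrying ts_val 990.  TSTMP_LT(990, 1000)
	 * holds, so the segment is dropped after an ACK as a duplicate,
	 * unless ts_recent itself is older than TCP_PAWS_IDLE (24 days),
	 * in which case it is invalidated rather than used to reject
	 * the segment.
	 */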

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = tlen;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * Cf fix from Braden, see Stevens p. 870
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
		    ((tiflags & (TH_SYN|TH_FIN)) != 0)))
			tp->ts_recent = opti.ts_val;
		else
			tp->ts_recent = 0;
		tp->ts_recent_age = tcp_now;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat.tcps_drops++;
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat.tcps_connects++;
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat.tcps_cwr_ecn++;
				}
			}
			tcpstat.tcps_ecn_rcvece++;
		}
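
		/*
		 * Illustrative example (added for exposition, not in the
		 * original file): with t_maxseg 1460, snd_cwnd 29200 and
		 * a larger snd_wnd, win = 29200 / 1460 = 20 segments, so
		 * ssthresh and cwnd both become 10 * 1460 = 14600: a
		 * halving without the slow-start a retransmit timeout
		 * would impose.  Setting snd_last = snd_max above defers
		 * any further reduction until this window is acked.
		 */
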
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat.tcps_ecn_rcvcwr++;
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat.tcps_rcvacktooold++;
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#ifdef TCP_SACK
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat.tcps_cwr_frecovery++;
						tcpstat.tcps_sack_recovery_episode++;
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh +
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat.tcps_cwr_frecovery++;
					tcpstat.tcps_sndrexmitfast++;
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						tp->t_flags |= TF_NEEDOUTPUT;
#else
					tp->snd_cwnd += tp->t_maxseg;
					tp->t_flags |= TF_NEEDOUTPUT;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;
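
		/*
		 * Illustrative example (added for exposition, not in the
		 * original file): on the third duplicate ACK above, with
		 * snd_cwnd 65536, a larger snd_wnd and t_maxseg 1460,
		 * win = 65536 / 2 / 1460 = 22, so ssthresh becomes
		 * 22 * 1460 = 32120 bytes (never less than 2 segments).
		 * Without SACK, cwnd then collapses to one segment for
		 * the retransmission and is re-inflated by one segment
		 * per further duplicate ACK: classic fast recovery.
		 */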

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(&so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
			    tp->snd_fack) + tp->retran_data;
		}
#endif
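
		/*
		 * Illustrative example (added for exposition, not in the
		 * original file): in the congestion window update above,
		 * below ssthresh every ACK grows cwnd by a full maxseg
		 * (slow-start); above it, with t_maxseg 1460 and snd_cwnd
		 * 14600, incr = 1460 * 1460 / 14600 = 146, so the ten
		 * ACKs of one window add about one segment per RTT, the
		 * linear congestion-avoidance schedule.
		 */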

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.
2028 * This can happen independent of advancing the URG pointer, 2029 * but if two URG's are pending at once, some out-of-band 2030 * data may creep in... ick. 2031 */ 2032 if (th->th_urp <= (u_int16_t) tlen && 2033 (so->so_options & SO_OOBINLINE) == 0) 2034 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2035 } else 2036 /* 2037 * If no out of band data is expected, 2038 * pull receive urgent pointer along 2039 * with the receive window. 2040 */ 2041 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2042 tp->rcv_up = tp->rcv_nxt; 2043 dodata: /* XXX */ 2044 2045 /* 2046 * Process the segment text, merging it into the TCP sequencing queue, 2047 * and arranging for acknowledgment of receipt if necessary. 2048 * This process logically involves adjusting tp->rcv_wnd as data 2049 * is presented to the user (this happens in tcp_usrreq.c, 2050 * case PRU_RCVD). If a FIN has already been received on this 2051 * connection then we just ignore the text. 2052 */ 2053 if ((tlen || (tiflags & TH_FIN)) && 2054 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2055 #ifdef TCP_SACK 2056 tcp_seq laststart = th->th_seq; 2057 tcp_seq lastend = th->th_seq + tlen; 2058 #endif 2059 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2060 tp->t_state == TCPS_ESTABLISHED) { 2061 TCP_SETUP_ACK(tp, tiflags, m); 2062 tp->rcv_nxt += tlen; 2063 tiflags = th->th_flags & TH_FIN; 2064 tcpstat.tcps_rcvpack++; 2065 tcpstat.tcps_rcvbyte += tlen; 2066 ND6_HINT(tp); 2067 if (so->so_state & SS_CANTRCVMORE) 2068 m_freem(m); 2069 else { 2070 m_adj(m, hdroptlen); 2071 sbappendstream(&so->so_rcv, m); 2072 } 2073 tp->t_flags |= TF_BLOCKOUTPUT; 2074 sorwakeup(so); 2075 tp->t_flags &= ~TF_BLOCKOUTPUT; 2076 } else { 2077 m_adj(m, hdroptlen); 2078 tiflags = tcp_reass(tp, th, m, &tlen); 2079 tp->t_flags |= TF_ACKNOW; 2080 } 2081 #ifdef TCP_SACK 2082 if (tp->sack_enable) 2083 tcp_update_sack_list(tp, laststart, lastend); 2084 #endif 2085 2086 /* 2087 * variable len never referenced again in modern BSD, 2088 * so why bother computing it ?? 2089 */ 2090 #if 0 2091 /* 2092 * Note the amount of data that peer has sent into 2093 * our window, in order to estimate the sender's 2094 * buffer size. 2095 */ 2096 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2097 #endif /* 0 */ 2098 } else { 2099 m_freem(m); 2100 tiflags &= ~TH_FIN; 2101 } 2102 2103 /* 2104 * If FIN is received ACK the FIN and let the user know 2105 * that the connection is closing. Ignore a FIN received before 2106 * the connection is fully established. 2107 */ 2108 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2109 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2110 socantrcvmore(so); 2111 tp->t_flags |= TF_ACKNOW; 2112 tp->rcv_nxt++; 2113 } 2114 switch (tp->t_state) { 2115 2116 /* 2117 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2118 */ 2119 case TCPS_ESTABLISHED: 2120 tp->t_state = TCPS_CLOSE_WAIT; 2121 break; 2122 2123 /* 2124 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2125 * enter the CLOSING state. 2126 */ 2127 case TCPS_FIN_WAIT_1: 2128 tp->t_state = TCPS_CLOSING; 2129 break; 2130 2131 /* 2132 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2133 * starting the time-wait timer, turning off the other 2134 * standard timers. 2135 */ 2136 case TCPS_FIN_WAIT_2: 2137 tp->t_state = TCPS_TIME_WAIT; 2138 tcp_canceltimers(tp); 2139 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2140 soisdisconnected(so); 2141 break; 2142 2143 /* 2144 * In TIME_WAIT state restart the 2 MSL time_wait timer. 
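 *
 * A recap of the receive-side FIN transitions handled above (an
 * editorial summary of the switch, not new behaviour):
 *
 *	ESTABLISHED -> CLOSE_WAIT	our FIN not yet sent
 *	FIN_WAIT_1  -> CLOSING		our FIN sent but not yet acked
 *	FIN_WAIT_2  -> TIME_WAIT	our FIN already acked
 *	TIME_WAIT   -> TIME_WAIT	restart the 2*MSL timer
 *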
2145 */ 2146 case TCPS_TIME_WAIT: 2147 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2148 break; 2149 } 2150 } 2151 if (so->so_options & SO_DEBUG) { 2152 switch (tp->pf) { 2153 #ifdef INET6 2154 case PF_INET6: 2155 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2156 0, tlen); 2157 break; 2158 #endif /* INET6 */ 2159 case PF_INET: 2160 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2161 0, tlen); 2162 break; 2163 } 2164 } 2165 2166 /* 2167 * Return any desired output. 2168 */ 2169 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2170 (void) tcp_output(tp); 2171 return; 2172 2173 badsyn: 2174 /* 2175 * Received a bad SYN. Increment counters and dropwithreset. 2176 */ 2177 tcpstat.tcps_badsyn++; 2178 tp = NULL; 2179 goto dropwithreset; 2180 2181 dropafterack_ratelim: 2182 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2183 tcp_ackdrop_ppslim) == 0) { 2184 /* XXX stat */ 2185 goto drop; 2186 } 2187 /* ...fall into dropafterack... */ 2188 2189 dropafterack: 2190 /* 2191 * Generate an ACK dropping incoming segment if it occupies 2192 * sequence space, where the ACK reflects our state. 2193 */ 2194 if (tiflags & TH_RST) 2195 goto drop; 2196 m_freem(m); 2197 tp->t_flags |= TF_ACKNOW; 2198 (void) tcp_output(tp); 2199 return; 2200 2201 dropwithreset_ratelim: 2202 /* 2203 * We may want to rate-limit RSTs in certain situations, 2204 * particularly if we are sending an RST in response to 2205 * an attempt to connect to or otherwise communicate with 2206 * a port for which we have no socket. 2207 */ 2208 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2209 tcp_rst_ppslim) == 0) { 2210 /* XXX stat */ 2211 goto drop; 2212 } 2213 /* ...fall into dropwithreset... */ 2214 2215 dropwithreset: 2216 /* 2217 * Generate a RST, dropping incoming segment. 2218 * Make ACK acceptable to originator of segment. 2219 * Don't bother to respond to RST. 2220 */ 2221 if (tiflags & TH_RST) 2222 goto drop; 2223 if (tiflags & TH_ACK) { 2224 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2225 TH_RST, m->m_pkthdr.ph_rtableid); 2226 } else { 2227 if (tiflags & TH_SYN) 2228 tlen++; 2229 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2230 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2231 } 2232 m_freem(m); 2233 return; 2234 2235 drop: 2236 /* 2237 * Drop space held by incoming segment and return. 
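 *
 * (Editorial note on the dropwithreset path above: per RFC 793 reset
 * generation, a segment carrying an ACK is answered with an RST whose
 * sequence number is that ACK value; a segment without an ACK is
 * answered with RST,ACK acknowledging th_seq + tlen, counting a SYN
 * as one octet.  Segments that are themselves RSTs are never
 * answered, and outgoing RSTs are rate-limited via ppsratecheck().)
 *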
2238 */ 2239 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2240 switch (tp->pf) { 2241 #ifdef INET6 2242 case PF_INET6: 2243 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2244 0, tlen); 2245 break; 2246 #endif /* INET6 */ 2247 case PF_INET: 2248 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2249 0, tlen); 2250 break; 2251 } 2252 } 2253 2254 m_freem(m); 2255 return; 2256 } 2257 2258 int 2259 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2260 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2261 u_int rtableid) 2262 { 2263 u_int16_t mss = 0; 2264 int opt, optlen; 2265 #ifdef TCP_SIGNATURE 2266 caddr_t sigp = NULL; 2267 struct tdb *tdb = NULL; 2268 #endif /* TCP_SIGNATURE */ 2269 2270 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2271 opt = cp[0]; 2272 if (opt == TCPOPT_EOL) 2273 break; 2274 if (opt == TCPOPT_NOP) 2275 optlen = 1; 2276 else { 2277 if (cnt < 2) 2278 break; 2279 optlen = cp[1]; 2280 if (optlen < 2 || optlen > cnt) 2281 break; 2282 } 2283 switch (opt) { 2284 2285 default: 2286 continue; 2287 2288 case TCPOPT_MAXSEG: 2289 if (optlen != TCPOLEN_MAXSEG) 2290 continue; 2291 if (!(th->th_flags & TH_SYN)) 2292 continue; 2293 if (TCPS_HAVERCVDSYN(tp->t_state)) 2294 continue; 2295 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 2296 mss = ntohs(mss); 2297 oi->maxseg = mss; 2298 break; 2299 2300 case TCPOPT_WINDOW: 2301 if (optlen != TCPOLEN_WINDOW) 2302 continue; 2303 if (!(th->th_flags & TH_SYN)) 2304 continue; 2305 if (TCPS_HAVERCVDSYN(tp->t_state)) 2306 continue; 2307 tp->t_flags |= TF_RCVD_SCALE; 2308 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2309 break; 2310 2311 case TCPOPT_TIMESTAMP: 2312 if (optlen != TCPOLEN_TIMESTAMP) 2313 continue; 2314 oi->ts_present = 1; 2315 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2316 oi->ts_val = ntohl(oi->ts_val); 2317 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2318 oi->ts_ecr = ntohl(oi->ts_ecr); 2319 2320 if (!(th->th_flags & TH_SYN)) 2321 continue; 2322 if (TCPS_HAVERCVDSYN(tp->t_state)) 2323 continue; 2324 /* 2325 * A timestamp received in a SYN makes 2326 * it ok to send timestamp requests and replies. 
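 *
 * (The option walk that the cases above and below hang off is
 * sketched here for reference; it is an editorial example, not part
 * of the original file.)
 */
#if 0
/*
 * Illustrative sketch: the parsing loop at the top of
 * tcp_dooptions().  EOL terminates the list, NOP is a one-byte pad,
 * and every other option carries a length byte that must be at least
 * 2 and must fit within the remaining option space.
 */
static void
walk_tcp_options(const u_char *cp, int cnt)
{
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
			/* cp[2]..cp[optlen-1] is the option payload */
		}
	}
}
#endif
/*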
2327 */ 2328 tp->t_flags |= TF_RCVD_TSTMP; 2329 tp->ts_recent = oi->ts_val; 2330 tp->ts_recent_age = tcp_now; 2331 break; 2332 2333 #ifdef TCP_SACK 2334 case TCPOPT_SACK_PERMITTED: 2335 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2336 continue; 2337 if (!(th->th_flags & TH_SYN)) 2338 continue; 2339 if (TCPS_HAVERCVDSYN(tp->t_state)) 2340 continue; 2341 /* MUST only be set on SYN */ 2342 tp->t_flags |= TF_SACK_PERMIT; 2343 break; 2344 case TCPOPT_SACK: 2345 tcp_sack_option(tp, th, cp, optlen); 2346 break; 2347 #endif 2348 #ifdef TCP_SIGNATURE 2349 case TCPOPT_SIGNATURE: 2350 if (optlen != TCPOLEN_SIGNATURE) 2351 continue; 2352 2353 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2354 return (-1); 2355 2356 sigp = cp + 2; 2357 break; 2358 #endif /* TCP_SIGNATURE */ 2359 } 2360 } 2361 2362 #ifdef TCP_SIGNATURE 2363 if (tp->t_flags & TF_SIGNATURE) { 2364 union sockaddr_union src, dst; 2365 2366 memset(&src, 0, sizeof(union sockaddr_union)); 2367 memset(&dst, 0, sizeof(union sockaddr_union)); 2368 2369 switch (tp->pf) { 2370 case 0: 2371 case AF_INET: 2372 src.sa.sa_len = sizeof(struct sockaddr_in); 2373 src.sa.sa_family = AF_INET; 2374 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2375 dst.sa.sa_len = sizeof(struct sockaddr_in); 2376 dst.sa.sa_family = AF_INET; 2377 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2378 break; 2379 #ifdef INET6 2380 case AF_INET6: 2381 src.sa.sa_len = sizeof(struct sockaddr_in6); 2382 src.sa.sa_family = AF_INET6; 2383 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2384 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2385 dst.sa.sa_family = AF_INET6; 2386 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2387 break; 2388 #endif /* INET6 */ 2389 } 2390 2391 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2392 0, &src, &dst, IPPROTO_TCP); 2393 2394 /* 2395 * We don't have an SA for this peer, so we turn off 2396 * TF_SIGNATURE on the listen socket 2397 */ 2398 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2399 tp->t_flags &= ~TF_SIGNATURE; 2400 2401 } 2402 2403 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2404 tcpstat.tcps_rcvbadsig++; 2405 return (-1); 2406 } 2407 2408 if (sigp) { 2409 char sig[16]; 2410 2411 if (tdb == NULL) { 2412 tcpstat.tcps_rcvbadsig++; 2413 return (-1); 2414 } 2415 2416 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2417 return (-1); 2418 2419 if (timingsafe_bcmp(sig, sigp, 16)) { 2420 tcpstat.tcps_rcvbadsig++; 2421 return (-1); 2422 } 2423 2424 tcpstat.tcps_rcvgoodsig++; 2425 } 2426 #endif /* TCP_SIGNATURE */ 2427 2428 return (0); 2429 } 2430 2431 #if defined(TCP_SACK) 2432 u_long 2433 tcp_seq_subtract(u_long a, u_long b) 2434 { 2435 return ((long)(a - b)); 2436 } 2437 #endif 2438 2439 2440 #ifdef TCP_SACK 2441 /* 2442 * This function is called upon receipt of new valid data (while not in header 2443 * prediction mode), and it updates the ordered list of sacks. 2444 */ 2445 void 2446 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2447 tcp_seq rcv_lastend) 2448 { 2449 /* 2450 * First reported block MUST be the most recent one. Subsequent 2451 * blocks SHOULD be in the order in which they arrived at the 2452 * receiver. These two conditions make the implementation fully 2453 * compliant with RFC 2018. 
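 *
 * Worked example (editorial, assumed numbers): with rcv_nxt = 100 and
 * an existing report { [200,300) }, an arriving segment [400,500)
 * yields { [400,500), [200,300) }, newest first.  A later segment
 * [300,400) overlaps both neighbours, so the list coalesces to the
 * single block { [200,500) }.  Blocks at or below rcv_nxt are
 * discarded by the cleanup pass below.
 *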
2454 */ 2455 int i, j = 0, count = 0, lastpos = -1; 2456 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2457 2458 /* First clean up current list of sacks */ 2459 for (i = 0; i < tp->rcv_numsacks; i++) { 2460 sack = tp->sackblks[i]; 2461 if (sack.start == 0 && sack.end == 0) { 2462 count++; /* count = number of blocks to be discarded */ 2463 continue; 2464 } 2465 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2466 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2467 count++; 2468 } else { 2469 temp[j].start = tp->sackblks[i].start; 2470 temp[j++].end = tp->sackblks[i].end; 2471 } 2472 } 2473 tp->rcv_numsacks -= count; 2474 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2475 tcp_clean_sackreport(tp); 2476 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2477 /* ==> need first sack block */ 2478 tp->sackblks[0].start = rcv_laststart; 2479 tp->sackblks[0].end = rcv_lastend; 2480 tp->rcv_numsacks = 1; 2481 } 2482 return; 2483 } 2484 /* Otherwise, sack blocks are already present. */ 2485 for (i = 0; i < tp->rcv_numsacks; i++) 2486 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2487 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2488 return; /* sack list remains unchanged */ 2489 /* 2490 * From here, segment just received should be (part of) the 1st sack. 2491 * Go through list, possibly coalescing sack block entries. 2492 */ 2493 firstsack.start = rcv_laststart; 2494 firstsack.end = rcv_lastend; 2495 for (i = 0; i < tp->rcv_numsacks; i++) { 2496 sack = tp->sackblks[i]; 2497 if (SEQ_LT(sack.end, firstsack.start) || 2498 SEQ_GT(sack.start, firstsack.end)) 2499 continue; /* no overlap */ 2500 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2501 /* 2502 * identical block; delete it here since we will 2503 * move it to the front of the list. 2504 */ 2505 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2506 lastpos = i; /* last posn with a zero entry */ 2507 continue; 2508 } 2509 if (SEQ_LEQ(sack.start, firstsack.start)) 2510 firstsack.start = sack.start; /* merge blocks */ 2511 if (SEQ_GEQ(sack.end, firstsack.end)) 2512 firstsack.end = sack.end; /* merge blocks */ 2513 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2514 lastpos = i; /* last posn with a zero entry */ 2515 } 2516 if (lastpos != -1) { /* at least one merge */ 2517 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2518 sack = tp->sackblks[i]; 2519 if (sack.start == 0 && sack.end == 0) 2520 continue; 2521 temp[j++] = sack; 2522 } 2523 tp->rcv_numsacks = j; /* including first blk (added later) */ 2524 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2525 tp->sackblks[i] = temp[i]; 2526 } else { /* no merges -- shift sacks by 1 */ 2527 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2528 tp->rcv_numsacks++; 2529 for (i = tp->rcv_numsacks-1; i > 0; i--) 2530 tp->sackblks[i] = tp->sackblks[i-1]; 2531 } 2532 tp->sackblks[0] = firstsack; 2533 return; 2534 } 2535 2536 /* 2537 * Process the TCP SACK option. tp->snd_holes is an ordered list 2538 * of holes (oldest to newest, in terms of the sequence space). 2539 */ 2540 void 2541 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2542 { 2543 int tmp_olen; 2544 u_char *tmp_cp; 2545 struct sackhole *cur, *p, *temp; 2546 2547 if (!tp->sack_enable) 2548 return; 2549 /* SACK without ACK doesn't make sense. */ 2550 if ((th->th_flags & TH_ACK) == 0) 2551 return; 2552 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
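 *
 * (The window check relies on the modular sequence-space macros; the
 * sketch below, an editorial example only, shows why they survive
 * 32-bit wraparound.)
 */
#if 0
/*
 * Illustrative sketch: TCP sequence numbers are compared modulo 2^32
 * by testing the sign of the difference, exactly as SEQ_LT() and
 * friends do.
 */
#include <assert.h>
#include <stdint.h>

#define EX_SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)

static void
seq_wrap_demo(void)
{
	uint32_t a = 0xfffffff0U;	/* just before the wrap */
	uint32_t b = 0x00000010U;	/* just after the wrap */

	assert(EX_SEQ_LT(a, b));	/* a precedes b although a > b */
	assert(!EX_SEQ_LT(b, a));
}
#endif
/*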
*/ 2553 if (SEQ_LT(th->th_ack, tp->snd_una) || 2554 SEQ_GT(th->th_ack, tp->snd_max)) 2555 return; 2556 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2557 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2558 return; 2559 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2560 tmp_cp = cp + 2; 2561 tmp_olen = optlen - 2; 2562 tcpstat.tcps_sack_rcv_opts++; 2563 if (tp->snd_numholes < 0) 2564 tp->snd_numholes = 0; 2565 if (tp->t_maxseg == 0) 2566 panic("tcp_sack_option"); /* Should never happen */ 2567 while (tmp_olen > 0) { 2568 struct sackblk sack; 2569 2570 bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq)); 2571 sack.start = ntohl(sack.start); 2572 bcopy(tmp_cp + sizeof(tcp_seq), 2573 (char *) &(sack.end), sizeof(tcp_seq)); 2574 sack.end = ntohl(sack.end); 2575 tmp_olen -= TCPOLEN_SACK; 2576 tmp_cp += TCPOLEN_SACK; 2577 if (SEQ_LEQ(sack.end, sack.start)) 2578 continue; /* bad SACK fields */ 2579 if (SEQ_LEQ(sack.end, tp->snd_una)) 2580 continue; /* old block */ 2581 #if defined(TCP_SACK) && defined(TCP_FACK) 2582 /* Updates snd_fack. */ 2583 if (SEQ_GT(sack.end, tp->snd_fack)) 2584 tp->snd_fack = sack.end; 2585 #endif /* TCP_FACK */ 2586 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2587 if (SEQ_LT(sack.start, th->th_ack)) 2588 continue; 2589 } 2590 if (SEQ_GT(sack.end, tp->snd_max)) 2591 continue; 2592 if (tp->snd_holes == NULL) { /* first hole */ 2593 tp->snd_holes = (struct sackhole *) 2594 pool_get(&sackhl_pool, PR_NOWAIT); 2595 if (tp->snd_holes == NULL) { 2596 /* ENOBUFS, so ignore SACKed block for now*/ 2597 goto done; 2598 } 2599 cur = tp->snd_holes; 2600 cur->start = th->th_ack; 2601 cur->end = sack.start; 2602 cur->rxmit = cur->start; 2603 cur->next = NULL; 2604 tp->snd_numholes = 1; 2605 tp->rcv_lastsack = sack.end; 2606 /* 2607 * dups is at least one. If more data has been 2608 * SACKed, it can be greater than one. 
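 *
 * Worked example (editorial, assumed numbers): with snd_una = th_ack
 * = 1000, t_maxseg = 1000 and a first SACK block [3000,4000), the
 * hole created above is [1000,3000) with rxmit = 1000, and dups =
 * min(3, (4000-3000)/1000) = 1.
 *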
2609 */ 2610 cur->dups = min(tcprexmtthresh, 2611 ((sack.end - cur->end)/tp->t_maxseg)); 2612 if (cur->dups < 1) 2613 cur->dups = 1; 2614 continue; /* with next sack block */ 2615 } 2616 /* Go thru list of holes: p = previous, cur = current */ 2617 p = cur = tp->snd_holes; 2618 while (cur) { 2619 if (SEQ_LEQ(sack.end, cur->start)) 2620 /* SACKs data before the current hole */ 2621 break; /* no use going through more holes */ 2622 if (SEQ_GEQ(sack.start, cur->end)) { 2623 /* SACKs data beyond the current hole */ 2624 cur->dups++; 2625 if (((sack.end - cur->end)/tp->t_maxseg) >= 2626 tcprexmtthresh) 2627 cur->dups = tcprexmtthresh; 2628 p = cur; 2629 cur = cur->next; 2630 continue; 2631 } 2632 if (SEQ_LEQ(sack.start, cur->start)) { 2633 /* Data acks at least the beginning of hole */ 2634 #if defined(TCP_SACK) && defined(TCP_FACK) 2635 if (SEQ_GT(sack.end, cur->rxmit)) 2636 tp->retran_data -= 2637 tcp_seq_subtract(cur->rxmit, 2638 cur->start); 2639 else 2640 tp->retran_data -= 2641 tcp_seq_subtract(sack.end, 2642 cur->start); 2643 #endif /* TCP_FACK */ 2644 if (SEQ_GEQ(sack.end, cur->end)) { 2645 /* Acks entire hole, so delete hole */ 2646 if (p != cur) { 2647 p->next = cur->next; 2648 pool_put(&sackhl_pool, cur); 2649 cur = p->next; 2650 } else { 2651 cur = cur->next; 2652 pool_put(&sackhl_pool, p); 2653 p = cur; 2654 tp->snd_holes = p; 2655 } 2656 tp->snd_numholes--; 2657 continue; 2658 } 2659 /* otherwise, move start of hole forward */ 2660 cur->start = sack.end; 2661 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2662 p = cur; 2663 cur = cur->next; 2664 continue; 2665 } 2666 /* move end of hole backward */ 2667 if (SEQ_GEQ(sack.end, cur->end)) { 2668 #if defined(TCP_SACK) && defined(TCP_FACK) 2669 if (SEQ_GT(cur->rxmit, sack.start)) 2670 tp->retran_data -= 2671 tcp_seq_subtract(cur->rxmit, 2672 sack.start); 2673 #endif /* TCP_FACK */ 2674 cur->end = sack.start; 2675 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2676 cur->dups++; 2677 if (((sack.end - cur->end)/tp->t_maxseg) >= 2678 tcprexmtthresh) 2679 cur->dups = tcprexmtthresh; 2680 p = cur; 2681 cur = cur->next; 2682 continue; 2683 } 2684 if (SEQ_LT(cur->start, sack.start) && 2685 SEQ_GT(cur->end, sack.end)) { 2686 /* 2687 * ACKs some data in middle of a hole; need to 2688 * split current hole 2689 */ 2690 temp = (struct sackhole *) 2691 pool_get(&sackhl_pool, PR_NOWAIT); 2692 if (temp == NULL) 2693 goto done; /* ENOBUFS */ 2694 #if defined(TCP_SACK) && defined(TCP_FACK) 2695 if (SEQ_GT(cur->rxmit, sack.end)) 2696 tp->retran_data -= 2697 tcp_seq_subtract(sack.end, 2698 sack.start); 2699 else if (SEQ_GT(cur->rxmit, sack.start)) 2700 tp->retran_data -= 2701 tcp_seq_subtract(cur->rxmit, 2702 sack.start); 2703 #endif /* TCP_FACK */ 2704 temp->next = cur->next; 2705 temp->start = sack.end; 2706 temp->end = cur->end; 2707 temp->dups = cur->dups; 2708 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2709 cur->end = sack.start; 2710 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2711 cur->dups++; 2712 if (((sack.end - cur->end)/tp->t_maxseg) >= 2713 tcprexmtthresh) 2714 cur->dups = tcprexmtthresh; 2715 cur->next = temp; 2716 p = temp; 2717 cur = p->next; 2718 tp->snd_numholes++; 2719 } 2720 } 2721 /* At this point, p points to the last hole on the list */ 2722 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2723 /* 2724 * Need to append new hole at end. 2725 * Last hole is p (and it's not NULL). 
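 *
 * (Editorial worked example, assumed numbers: with rcv_lastsack =
 * 5000, a new block [7000,8000) appends the hole [5000,7000) and
 * advances rcv_lastsack to 8000.)
 *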
2726 */ 2727 temp = (struct sackhole *) 2728 pool_get(&sackhl_pool, PR_NOWAIT); 2729 if (temp == NULL) 2730 goto done; /* ENOBUFS */ 2731 temp->start = tp->rcv_lastsack; 2732 temp->end = sack.start; 2733 temp->dups = min(tcprexmtthresh, 2734 ((sack.end - sack.start)/tp->t_maxseg)); 2735 if (temp->dups < 1) 2736 temp->dups = 1; 2737 temp->rxmit = temp->start; 2738 temp->next = 0; 2739 p->next = temp; 2740 tp->rcv_lastsack = sack.end; 2741 tp->snd_numholes++; 2742 } 2743 } 2744 done: 2745 #if defined(TCP_SACK) && defined(TCP_FACK) 2746 /* 2747 * Update retran_data and snd_awnd. Go through the list of 2748 * holes. Increment retran_data by (hole->rxmit - hole->start). 2749 */ 2750 tp->retran_data = 0; 2751 cur = tp->snd_holes; 2752 while (cur) { 2753 tp->retran_data += cur->rxmit - cur->start; 2754 cur = cur->next; 2755 } 2756 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + 2757 tp->retran_data; 2758 #endif /* TCP_FACK */ 2759 2760 return; 2761 } 2762 2763 /* 2764 * Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if 2765 * it is completely acked; otherwise, tcp_sack_option(), called from 2766 * tcp_dooptions(), will fix up the hole. 2767 */ 2768 void 2769 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2770 { 2771 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2772 /* max because this could be an older ack just arrived */ 2773 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2774 th->th_ack : tp->snd_una; 2775 struct sackhole *cur = tp->snd_holes; 2776 struct sackhole *prev; 2777 while (cur) 2778 if (SEQ_LEQ(cur->end, lastack)) { 2779 prev = cur; 2780 cur = cur->next; 2781 pool_put(&sackhl_pool, prev); 2782 tp->snd_numholes--; 2783 } else if (SEQ_LT(cur->start, lastack)) { 2784 cur->start = lastack; 2785 if (SEQ_LT(cur->rxmit, cur->start)) 2786 cur->rxmit = cur->start; 2787 break; 2788 } else 2789 break; 2790 tp->snd_holes = cur; 2791 } 2792 } 2793 2794 /* 2795 * Delete all receiver-side SACK information. 2796 */ 2797 void 2798 tcp_clean_sackreport(struct tcpcb *tp) 2799 { 2800 int i; 2801 2802 tp->rcv_numsacks = 0; 2803 for (i = 0; i < MAX_SACK_BLKS; i++) 2804 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2805 2806 } 2807 2808 /* 2809 * Checks for partial ack. If a partial ack arrives, turn off the retransmission 2810 * timer, deflate the window, do not clear tp->t_dupacks, and return 1. 2811 * If the ack advances at least to tp->snd_last, return 0. 2812 */ 2813 int 2814 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2815 { 2816 if (SEQ_LT(th->th_ack, tp->snd_last)) { 2817 /* Turn off retx. timer (will start again next segment) */ 2818 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2819 tp->t_rtttime = 0; 2820 #ifndef TCP_FACK 2821 /* 2822 * Partial window deflation. This statement relies on the 2823 * fact that tp->snd_una has not been updated yet. In FACK 2824 * hold snd_cwnd constant during fast recovery. 2825 */ 2826 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2827 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2828 tp->snd_cwnd += tp->t_maxseg; 2829 } else 2830 tp->snd_cwnd = tp->t_maxseg; 2831 #endif 2832 return (1); 2833 } 2834 return (0); 2835 } 2836 #endif /* TCP_SACK */ 2837 2838 /* 2839 * Pull the out-of-band byte out of a segment so 2840 * it doesn't appear in the user's data queue. 2841 * It is still reflected in the segment length for 2842 * sequencing purposes.
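 *
 * (Editorial example: with hdroptlen = off and th_urp = 3, cnt =
 * off + 3 - 1 indexes the urgent octet within the mbuf chain; it is
 * saved in t_iobc and the bytes behind it are shifted down with
 * bcopy(), shrinking that mbuf by one.)
 *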
2843 */ 2844 void 2845 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2846 { 2847 int cnt = off + urgent - 1; 2848 2849 while (cnt >= 0) { 2850 if (m->m_len > cnt) { 2851 char *cp = mtod(m, caddr_t) + cnt; 2852 struct tcpcb *tp = sototcpcb(so); 2853 2854 tp->t_iobc = *cp; 2855 tp->t_oobflags |= TCPOOB_HAVEDATA; 2856 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2857 m->m_len--; 2858 return; 2859 } 2860 cnt -= m->m_len; 2861 m = m->m_next; 2862 if (m == NULL) 2863 break; 2864 } 2865 panic("tcp_pulloutofband"); 2866 } 2867 2868 /* 2869 * Collect new round-trip time estimate 2870 * and update averages and current timeout. 2871 */ 2872 void 2873 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2874 { 2875 short delta; 2876 short rttmin; 2877 2878 if (rtt < 0) 2879 rtt = 0; 2880 else if (rtt > TCP_RTT_MAX) 2881 rtt = TCP_RTT_MAX; 2882 2883 tcpstat.tcps_rttupdated++; 2884 if (tp->t_srtt != 0) { 2885 /* 2886 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2887 * after the binary point (scaled by 4), whereas 2888 * srtt is stored as fixed point with 5 bits after the 2889 * binary point (i.e., scaled by 32). The following magic 2890 * is equivalent to the smoothing algorithm in rfc793 with 2891 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2892 * point). 2893 */ 2894 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2895 (tp->t_srtt >> TCP_RTT_SHIFT); 2896 if ((tp->t_srtt += delta) <= 0) 2897 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2898 /* 2899 * We accumulate a smoothed rtt variance (actually, a 2900 * smoothed mean difference), then set the retransmit 2901 * timer to smoothed rtt + 4 times the smoothed variance. 2902 * rttvar is stored as fixed point with 4 bits after the 2903 * binary point (scaled by 16). The following is 2904 * equivalent to rfc793 smoothing with an alpha of .75 2905 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2906 * rfc793's wired-in beta. 2907 */ 2908 if (delta < 0) 2909 delta = -delta; 2910 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2911 if ((tp->t_rttvar += delta) <= 0) 2912 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2913 } else { 2914 /* 2915 * No rtt measurement yet - use the unsmoothed rtt. 2916 * Set the variance to half the rtt (so our first 2917 * retransmit happens at 3*rtt). 2918 */ 2919 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2920 tp->t_rttvar = (rtt + 1) << 2921 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2922 } 2923 tp->t_rtttime = 0; 2924 tp->t_rxtshift = 0; 2925 2926 /* 2927 * the retransmit should happen at rtt + 4 * rttvar. 2928 * Because of the way we do the smoothing, srtt and rttvar 2929 * will each average +1/2 tick of bias. When we compute 2930 * the retransmit timer, we want 1/2 tick of rounding and 2931 * 1 extra tick because of +-1/2 tick uncertainty in the 2932 * firing of the timer. The bias will give us exactly the 2933 * 1.5 tick we need. But, because the bias is 2934 * statistical, we have to test that we don't drop below 2935 * the minimum feasible timer (which is 2 ticks). 2936 */ 2937 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2938 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2939 2940 /* 2941 * We received an ack for a packet that wasn't retransmitted; 2942 * it is probably safe to discard any error indications we've 2943 * received recently. This isn't quite right, but close enough 2944 * for now (a route might have failed after we sent a segment, 2945 * and the return path might not be symmetrical). 
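 *
 * (Aside: the fixed-point smoothing in tcp_xmit_timer() above
 * implements the estimator sketched below; the sketch is an
 * editorial example, not part of the original file.)
 */
#if 0
/*
 * Illustrative sketch: the Van Jacobson / RFC 6298 estimator in plain
 * integer ticks.  The gains of 1/8 (srtt) and 1/4 (rttvar) match the
 * shifts used above; the real code merely keeps extra fraction bits
 * (5 for srtt, 4 for rttvar) so these divisions stay precise.
 */
static int ex_srtt;		/* smoothed RTT, ticks */
static int ex_rttvar;		/* smoothed mean deviation, ticks */

static int
ex_rtt_sample(int rtt)
{
	int delta = rtt - ex_srtt;

	ex_srtt += delta / 8;			/* alpha = 1/8 */
	if (delta < 0)
		delta = -delta;
	ex_rttvar += (delta - ex_rttvar) / 4;	/* beta = 1/4 */
	return (ex_srtt + 4 * ex_rttvar);	/* retransmit timeout */
}
#endif
/*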
2946 */ 2947 tp->t_softerror = 0; 2948 } 2949 2950 /* 2951 * Determine a reasonable value for maxseg size. 2952 * If the route is known, check route for mtu. 2953 * If none, use an mss that can be handled on the outgoing 2954 * interface without forcing IP to fragment; if bigger than 2955 * an mbuf cluster (MCLBYTES), round down to the nearest multiple of MCLBYTES 2956 * to utilize large mbufs. If no route is found, route has no mtu, 2957 * or the destination isn't local, use a default, hopefully conservative 2958 * size (usually 512 or the default IP max size, but no more than the mtu 2959 * of the interface), as we can't discover anything about intervening 2960 * gateways or networks. We also initialize the congestion/slow start 2961 * window to be a single segment if the destination isn't local. 2962 * While looking at the routing entry, we also initialize other path-dependent 2963 * parameters from pre-set or cached values in the routing entry. 2964 * 2965 * Also take into account the space needed for options that we 2966 * send regularly. Make maxseg shorter by that amount to assure 2967 * that we can send maxseg amount of data even when the options 2968 * are present. Store the upper limit of the length of options plus 2969 * data in maxopd. 2970 * 2971 * NOTE: offer == -1 indicates that the maxseg size changed due to 2972 * Path MTU discovery. 2973 */ 2974 int 2975 tcp_mss(struct tcpcb *tp, int offer) 2976 { 2977 struct rtentry *rt; 2978 struct ifnet *ifp; 2979 int mss, mssopt; 2980 int iphlen; 2981 struct inpcb *inp; 2982 2983 inp = tp->t_inpcb; 2984 2985 mssopt = mss = tcp_mssdflt; 2986 2987 rt = in_pcbrtentry(inp); 2988 2989 if (rt == NULL) 2990 goto out; 2991 2992 ifp = rt->rt_ifp; 2993 2994 switch (tp->pf) { 2995 #ifdef INET6 2996 case AF_INET6: 2997 iphlen = sizeof(struct ip6_hdr); 2998 break; 2999 #endif 3000 case AF_INET: 3001 iphlen = sizeof(struct ip); 3002 break; 3003 default: 3004 /* the family does not support path MTU discovery */ 3005 goto out; 3006 } 3007 3008 /* 3009 * if there's an mtu associated with the route and we support 3010 * path MTU discovery for the underlying protocol family, use it. 3011 */ 3012 if (rt->rt_rmx.rmx_mtu) { 3013 /* 3014 * One may wish to lower MSS to take into account options, 3015 * especially security-related options. 3016 */ 3017 if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) { 3018 /* 3019 * RFC2460 section 5, last paragraph: if path MTU is 3020 * smaller than 1280, use 1280 as packet size and 3021 * attach fragment header. 3022 */ 3023 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 3024 sizeof(struct tcphdr); 3025 } else { 3026 mss = rt->rt_rmx.rmx_mtu - iphlen - 3027 sizeof(struct tcphdr); 3028 } 3029 } else if (!ifp) { 3030 /* 3031 * ifp may be null and rmx_mtu may be zero in certain 3032 * v6 cases (e.g., if ND wasn't able to resolve the 3033 * destination host). 3034 */ 3035 goto out; 3036 } else if (ifp->if_flags & IFF_LOOPBACK) { 3037 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3038 } else if (tp->pf == AF_INET) { 3039 if (ip_mtudisc) 3040 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3041 } 3042 #ifdef INET6 3043 else if (tp->pf == AF_INET6) { 3044 /* 3045 * for IPv6, path MTU discovery is always turned on, 3046 * or the node must use packet size <= 1280.
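 *
 * (Editorial worked example: an Ethernet route with rmx_mtu = 1500
 * gives mss = 1500 - 20 - 20 = 1460 for IPv4 and 1500 - 40 - 20 =
 * 1440 for IPv6; an IPv6 path MTU below IPV6_MMTU is clamped above
 * to 1280 - 40 - 8 - 20 = 1212 so a fragment header still fits.)
 *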
3047 */ 3048 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr); 3049 } 3050 #endif /* INET6 */ 3051 3052 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3053 if (offer != -1) { 3054 #ifndef INET6 3055 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3056 #else 3057 if (tp->pf == AF_INET6) 3058 mssopt = IN6_LINKMTU(ifp) - iphlen - 3059 sizeof(struct tcphdr); 3060 else 3061 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3062 #endif 3063 3064 mssopt = max(tcp_mssdflt, mssopt); 3065 } 3066 3067 out: 3068 /* 3069 * The current mss, t_maxseg, is initialized to the default value. 3070 * If we compute a smaller value, reduce the current mss. 3071 * If we compute a larger value, return it for use in sending 3072 * a max seg size option, but don't store it for use 3073 * unless we received an offer at least that large from peer. 3074 * 3075 * However, do not accept offers lower than the minimum of 3076 * the interface MTU and 216. 3077 */ 3078 if (offer > 0) 3079 tp->t_peermss = offer; 3080 if (tp->t_peermss) 3081 mss = min(mss, max(tp->t_peermss, 216)); 3082 3083 /* sanity - at least max opt. space */ 3084 mss = max(mss, 64); 3085 3086 /* 3087 * maxopd stores the maximum length of data AND options 3088 * in a segment; maxseg is the amount of data in a normal 3089 * segment. We need to store this value (maxopd) apart 3090 * from maxseg, because now every segment carries options 3091 * and thus we normally have somewhat less data in segments. 3092 */ 3093 tp->t_maxopd = mss; 3094 3095 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3096 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3097 mss -= TCPOLEN_TSTAMP_APPA; 3098 #ifdef TCP_SIGNATURE 3099 if (tp->t_flags & TF_SIGNATURE) 3100 mss -= TCPOLEN_SIGLEN; 3101 #endif 3102 3103 if (offer == -1) { 3104 /* mss changed due to Path MTU discovery */ 3105 tp->t_flags &= ~TF_PMTUD_PEND; 3106 tp->t_pmtud_mtu_sent = 0; 3107 tp->t_pmtud_mss_acked = 0; 3108 if (mss < tp->t_maxseg) { 3109 /* 3110 * Follow suggestion in RFC 2414 to reduce the 3111 * congestion window by the ratio of the old 3112 * segment size to the new segment size. 3113 */ 3114 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3115 mss, mss); 3116 } 3117 } else if (tcp_do_rfc3390 == 2) { 3118 /* increase initial window */ 3119 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 3120 } else if (tcp_do_rfc3390) { 3121 /* increase initial window */ 3122 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3123 } else 3124 tp->snd_cwnd = mss; 3125 3126 tp->t_maxseg = mss; 3127 3128 return (offer != -1 ? mssopt : mss); 3129 } 3130 3131 u_int 3132 tcp_hdrsz(struct tcpcb *tp) 3133 { 3134 u_int hlen; 3135 3136 switch (tp->pf) { 3137 #ifdef INET6 3138 case AF_INET6: 3139 hlen = sizeof(struct ip6_hdr); 3140 break; 3141 #endif 3142 case AF_INET: 3143 hlen = sizeof(struct ip); 3144 break; 3145 default: 3146 hlen = 0; 3147 break; 3148 } 3149 hlen += sizeof(struct tcphdr); 3150 3151 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3152 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3153 hlen += TCPOLEN_TSTAMP_APPA; 3154 #ifdef TCP_SIGNATURE 3155 if (tp->t_flags & TF_SIGNATURE) 3156 hlen += TCPOLEN_SIGLEN; 3157 #endif 3158 return (hlen); 3159 } 3160 3161 /* 3162 * Set connection variables based on the effective MSS. 3163 * We are passed the TCPCB for the actual connection. If we 3164 * are the server, we are called by the compressed state engine 3165 * when the 3-way handshake is complete. 
If we are the client, 3166 * we are called when we receive the SYN,ACK from the server. 3167 * 3168 * NOTE: The t_maxseg value must be initialized in the TCPCB 3169 * before this routine is called! 3170 */ 3171 void 3172 tcp_mss_update(struct tcpcb *tp) 3173 { 3174 int mss; 3175 u_long bufsize; 3176 struct rtentry *rt; 3177 struct socket *so; 3178 3179 so = tp->t_inpcb->inp_socket; 3180 mss = tp->t_maxseg; 3181 3182 rt = in_pcbrtentry(tp->t_inpcb); 3183 3184 if (rt == NULL) 3185 return; 3186 3187 bufsize = so->so_snd.sb_hiwat; 3188 if (bufsize < mss) { 3189 mss = bufsize; 3190 /* Update t_maxseg and t_maxopd */ 3191 tcp_mss(tp, mss); 3192 } else { 3193 bufsize = roundup(bufsize, mss); 3194 if (bufsize > sb_max) 3195 bufsize = sb_max; 3196 (void)sbreserve(&so->so_snd, bufsize); 3197 } 3198 3199 bufsize = so->so_rcv.sb_hiwat; 3200 if (bufsize > mss) { 3201 bufsize = roundup(bufsize, mss); 3202 if (bufsize > sb_max) 3203 bufsize = sb_max; 3204 (void)sbreserve(&so->so_rcv, bufsize); 3205 } 3206 3207 } 3208 3209 #if defined (TCP_SACK) 3210 /* 3211 * Checks for partial ack. If partial ack arrives, force the retransmission 3212 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 3213 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to 3214 * be started again. If the ack advances at least to tp->snd_last, return 0. 3215 */ 3216 int 3217 tcp_newreno(struct tcpcb *tp, struct tcphdr *th) 3218 { 3219 if (SEQ_LT(th->th_ack, tp->snd_last)) { 3220 /* 3221 * snd_una has not been updated and the socket send buffer 3222 * not yet drained of the acked data, so we have to leave 3223 * snd_una as it was to get the correct data offset in 3224 * tcp_output(). 3225 */ 3226 tcp_seq onxt = tp->snd_nxt; 3227 u_long ocwnd = tp->snd_cwnd; 3228 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3229 tp->t_rtttime = 0; 3230 tp->snd_nxt = th->th_ack; 3231 /* 3232 * Set snd_cwnd to one segment beyond acknowledged offset 3233 * (tp->snd_una not yet updated when this function is called) 3234 */ 3235 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3236 (void) tcp_output(tp); 3237 tp->snd_cwnd = ocwnd; 3238 if (SEQ_GT(onxt, tp->snd_nxt)) 3239 tp->snd_nxt = onxt; 3240 /* 3241 * Partial window deflation. Relies on fact that tp->snd_una 3242 * not updated yet. 3243 */ 3244 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3245 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3246 else 3247 tp->snd_cwnd = 0; 3248 tp->snd_cwnd += tp->t_maxseg; 3249 3250 return 1; 3251 } 3252 return 0; 3253 } 3254 #endif /* TCP_SACK */ 3255 3256 int 3257 tcp_mss_adv(struct mbuf *m, int af) 3258 { 3259 int mss = 0; 3260 int iphlen; 3261 struct ifnet *ifp = NULL; 3262 3263 if (m && (m->m_flags & M_PKTHDR)) 3264 ifp = if_get(m->m_pkthdr.ph_ifidx); 3265 3266 switch (af) { 3267 case AF_INET: 3268 if (ifp != NULL) 3269 mss = ifp->if_mtu; 3270 iphlen = sizeof(struct ip); 3271 break; 3272 #ifdef INET6 3273 case AF_INET6: 3274 if (ifp != NULL) 3275 mss = IN6_LINKMTU(ifp); 3276 iphlen = sizeof(struct ip6_hdr); 3277 break; 3278 #endif 3279 default: 3280 unhandled_af(af); 3281 } 3282 if_put(ifp); 3283 mss = mss - iphlen - sizeof(struct tcphdr); 3284 return (max(mss, tcp_mssdflt)); 3285 } 3286 3287 /* 3288 * TCP compressed state engine. Currently used to hold compressed 3289 * state for SYN_RECEIVED. 
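 *
 * (Editorial note on the geometry defined below: 293 hash buckets,
 * a per-bucket limit of 3*35 = 105 entries and an overall limit of
 * 293*35 = 10255 entries; the hash covers the source address and the
 * port pair, keyed by the tcp_syn_hash[] secret, which is reseeded
 * with arc4random_buf() whenever the cache drains empty.)
 *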
3290 */ 3291 3292 /* syn hash parameters */ 3293 #define TCP_SYN_HASH_SIZE 293 3294 #define TCP_SYN_BUCKET_SIZE 35 3295 int tcp_syn_cache_size = TCP_SYN_HASH_SIZE; 3296 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3297 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3298 int tcp_syn_cache_count; 3299 struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE]; 3300 u_int32_t tcp_syn_hash[5]; 3301 3302 #define SYN_HASH(sa, sp, dp) \ 3303 (((sa)->s_addr ^ tcp_syn_hash[0]) * \ 3304 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ tcp_syn_hash[4])) 3305 #ifndef INET6 3306 #define SYN_HASHALL(hash, src, dst) \ 3307 do { \ 3308 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3309 satosin(src)->sin_port, \ 3310 satosin(dst)->sin_port); \ 3311 } while (/*CONSTCOND*/ 0) 3312 #else 3313 #define SYN_HASH6(sa, sp, dp) \ 3314 (((sa)->s6_addr32[0] ^ tcp_syn_hash[0]) * \ 3315 ((sa)->s6_addr32[1] ^ tcp_syn_hash[1]) * \ 3316 ((sa)->s6_addr32[2] ^ tcp_syn_hash[2]) * \ 3317 ((sa)->s6_addr32[3] ^ tcp_syn_hash[3]) * \ 3318 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ tcp_syn_hash[4])) 3319 3320 #define SYN_HASHALL(hash, src, dst) \ 3321 do { \ 3322 switch ((src)->sa_family) { \ 3323 case AF_INET: \ 3324 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3325 satosin(src)->sin_port, \ 3326 satosin(dst)->sin_port); \ 3327 break; \ 3328 case AF_INET6: \ 3329 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3330 satosin6(src)->sin6_port, \ 3331 satosin6(dst)->sin6_port); \ 3332 break; \ 3333 default: \ 3334 hash = 0; \ 3335 } \ 3336 } while (/*CONSTCOND*/0) 3337 #endif /* INET6 */ 3338 3339 void 3340 syn_cache_rm(struct syn_cache *sc) 3341 { 3342 sc->sc_flags |= SCF_DEAD; 3343 TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, 3344 sc, sc_bucketq); 3345 sc->sc_tp = NULL; 3346 LIST_REMOVE(sc, sc_tpq); 3347 tcp_syn_cache[sc->sc_bucketidx].sch_length--; 3348 timeout_del(&sc->sc_timer); 3349 tcp_syn_cache_count--; 3350 } 3351 3352 void 3353 syn_cache_put(struct syn_cache *sc) 3354 { 3355 if (sc->sc_ipopts) 3356 (void) m_free(sc->sc_ipopts); 3357 if (sc->sc_route4.ro_rt != NULL) { 3358 rtfree(sc->sc_route4.ro_rt); 3359 sc->sc_route4.ro_rt = NULL; 3360 } 3361 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3362 timeout_add(&sc->sc_timer, 0); 3363 } 3364 3365 struct pool syn_cache_pool; 3366 3367 /* 3368 * We don't estimate RTT with SYNs, so each packet starts with the default 3369 * RTT and each timer step has a fixed timeout value. 3370 */ 3371 #define SYN_CACHE_TIMER_ARM(sc) \ 3372 do { \ 3373 TCPT_RANGESET((sc)->sc_rxtcur, \ 3374 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3375 TCPTV_REXMTMAX); \ 3376 if (!timeout_initialized(&(sc)->sc_timer)) \ 3377 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3378 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3379 } while (/*CONSTCOND*/0) 3380 3381 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3382 3383 void 3384 syn_cache_init() 3385 { 3386 int i; 3387 3388 /* Initialize the hash buckets. */ 3389 for (i = 0; i < tcp_syn_cache_size; i++) 3390 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3391 3392 /* Initialize the syn cache pool. 
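 *
 * (An editorial sketch of the bucket selection performed by
 * SYN_HASH()/SYN_HASHALL() above follows; it is an example only, not
 * part of the original file.)
 */
#if 0
/*
 * Illustrative sketch: mapping a v4 endpoint pair to a bucket with
 * plain types.  secret[] stands in for the arc4random-seeded
 * tcp_syn_hash[].
 */
#include <stdint.h>

static uint32_t
ex_syn_bucket(uint32_t src_addr, uint16_t sport, uint16_t dport,
    const uint32_t secret[5], uint32_t nbuckets)
{
	uint32_t h;

	h = (src_addr ^ secret[0]) *
	    ((((uint32_t)dport << 16) + sport) ^ secret[4]);
	return (h % nbuckets);
}
#endif
/*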
*/ 3393 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3394 "syncache", NULL); 3395 pool_setipl(&syn_cache_pool, IPL_SOFTNET); 3396 } 3397 3398 void 3399 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3400 { 3401 struct syn_cache_head *scp; 3402 struct syn_cache *sc2; 3403 int s; 3404 3405 /* 3406 * If there are no entries in the hash table, reinitialize 3407 * the hash secrets. 3408 */ 3409 if (tcp_syn_cache_count == 0) 3410 arc4random_buf(tcp_syn_hash, sizeof(tcp_syn_hash)); 3411 3412 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3413 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3414 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3415 3416 /* 3417 * Make sure that we don't overflow the per-bucket 3418 * limit or the total cache size limit. 3419 */ 3420 s = splsoftnet(); 3421 if (scp->sch_length >= tcp_syn_bucket_limit) { 3422 tcpstat.tcps_sc_bucketoverflow++; 3423 /* 3424 * The bucket is full. Toss the oldest element in the 3425 * bucket. This will be the first entry in the bucket. 3426 */ 3427 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3428 #ifdef DIAGNOSTIC 3429 /* 3430 * This should never happen; we should always find an 3431 * entry in our bucket. 3432 */ 3433 if (sc2 == NULL) 3434 panic("syn_cache_insert: bucketoverflow: impossible"); 3435 #endif 3436 syn_cache_rm(sc2); 3437 syn_cache_put(sc2); 3438 } else if (tcp_syn_cache_count >= tcp_syn_cache_limit) { 3439 struct syn_cache_head *scp2, *sce; 3440 3441 tcpstat.tcps_sc_overflowed++; 3442 /* 3443 * The cache is full. Toss the oldest entry in the 3444 * first non-empty bucket we can find. 3445 * 3446 * XXX We would really like to toss the oldest 3447 * entry in the cache, but we hope that this 3448 * condition doesn't happen very often. 3449 */ 3450 scp2 = scp; 3451 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3452 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3453 for (++scp2; scp2 != scp; scp2++) { 3454 if (scp2 >= sce) 3455 scp2 = &tcp_syn_cache[0]; 3456 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3457 break; 3458 } 3459 #ifdef DIAGNOSTIC 3460 /* 3461 * This should never happen; we should always find a 3462 * non-empty bucket. 3463 */ 3464 if (scp2 == scp) 3465 panic("syn_cache_insert: cacheoverflow: " 3466 "impossible"); 3467 #endif 3468 } 3469 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3470 syn_cache_rm(sc2); 3471 syn_cache_put(sc2); 3472 } 3473 3474 /* 3475 * Initialize the entry's timer. 3476 */ 3477 sc->sc_rxttot = 0; 3478 sc->sc_rxtshift = 0; 3479 SYN_CACHE_TIMER_ARM(sc); 3480 3481 /* Link it from tcpcb entry */ 3482 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3483 3484 /* Put it into the bucket. */ 3485 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3486 scp->sch_length++; 3487 tcp_syn_cache_count++; 3488 3489 tcpstat.tcps_sc_added++; 3490 splx(s); 3491 } 3492 3493 /* 3494 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3495 * If we have retransmitted an entry the maximum number of times, expire 3496 * that entry. 3497 */ 3498 void 3499 syn_cache_timer(void *arg) 3500 { 3501 struct syn_cache *sc = arg; 3502 int s; 3503 3504 s = splsoftnet(); 3505 if (sc->sc_flags & SCF_DEAD) { 3506 splx(s); 3507 return; 3508 } 3509 3510 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3511 /* Drop it -- too many retransmissions. */ 3512 goto dropit; 3513 } 3514 3515 /* 3516 * Compute the total amount of time this entry has 3517 * been on a queue. If this entry has been on longer 3518 * than the keep alive timer would allow, expire it. 
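 *
 * (Editorial example: with the stock exponential tcp_backoff[] table
 * of 1, 2, 4, 8, ... the interval between SYN,ACK retransmissions
 * doubles at each step, clamped to [TCPTV_MIN, TCPTV_REXMTMAX]; the
 * entry dies once sc_rxttot passes tcptv_keep_init or sc_rxtshift
 * reaches TCP_MAXRXTSHIFT.)
 *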
3519 */ 3520 sc->sc_rxttot += sc->sc_rxtcur; 3521 if (sc->sc_rxttot >= tcptv_keep_init) 3522 goto dropit; 3523 3524 tcpstat.tcps_sc_retransmitted++; 3525 (void) syn_cache_respond(sc, NULL); 3526 3527 /* Advance the timer back-off. */ 3528 sc->sc_rxtshift++; 3529 SYN_CACHE_TIMER_ARM(sc); 3530 3531 splx(s); 3532 return; 3533 3534 dropit: 3535 tcpstat.tcps_sc_timed_out++; 3536 syn_cache_rm(sc); 3537 syn_cache_put(sc); 3538 splx(s); 3539 } 3540 3541 void 3542 syn_cache_reaper(void *arg) 3543 { 3544 struct syn_cache *sc = arg; 3545 3546 pool_put(&syn_cache_pool, (sc)); 3547 return; 3548 } 3549 3550 /* 3551 * Remove syn cache entries created by the specified tcb entry, 3552 * because it does not make sense to keep them 3553 * (if there's no tcb entry, a syn cache entry will never be used) 3554 */ 3555 void 3556 syn_cache_cleanup(struct tcpcb *tp) 3557 { 3558 struct syn_cache *sc, *nsc; 3559 int s; 3560 3561 s = splsoftnet(); 3562 3563 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) { 3564 #ifdef DIAGNOSTIC 3565 if (sc->sc_tp != tp) 3566 panic("invalid sc_tp in syn_cache_cleanup"); 3567 #endif 3568 syn_cache_rm(sc); 3569 syn_cache_put(sc); 3570 } 3571 /* just for safety */ 3572 LIST_INIT(&tp->t_sc); 3573 3574 splx(s); 3575 } 3576 3577 /* 3578 * Find an entry in the syn cache. 3579 */ 3580 struct syn_cache * 3581 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst, 3582 struct syn_cache_head **headp, u_int rtableid) 3583 { 3584 struct syn_cache *sc; 3585 struct syn_cache_head *scp; 3586 u_int32_t hash; 3587 3588 splsoftassert(IPL_SOFTNET); 3589 3590 if (tcp_syn_cache_count == 0) 3591 return (NULL); 3592 3593 SYN_HASHALL(hash, src, dst); 3594 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 3595 *headp = scp; 3596 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) { 3597 if (sc->sc_hash != hash) 3598 continue; 3599 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3600 !bcmp(&sc->sc_dst, dst, dst->sa_len) && 3601 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) 3602 return (sc); 3603 } 3604 return (NULL); 3605 } 3606 3607 /* 3608 * This function gets called when we receive an ACK for a 3609 * socket in the LISTEN state. We look up the connection 3610 * in the syn cache, and if it's there, we pull it out of 3611 * the cache and turn it into a full-blown connection in 3612 * the SYN-RECEIVED state. 3613 * 3614 * The return values may not be immediately obvious, and their effects 3615 * can be subtle, so here they are: 3616 * 3617 * NULL SYN was not found in cache; caller should drop the 3618 * packet and send an RST. 3619 * 3620 * -1 We were unable to create the new connection, and are 3621 * aborting it. An ACK,RST is being sent to the peer 3622 * (unless we got screwy sequence numbers; see below), 3623 * because the 3-way handshake has been completed. Caller 3624 * should not free the mbuf, since we may be using it. If 3625 * we are not, we will free it. 3626 * 3627 * Otherwise, the return value is a pointer to the new socket 3628 * associated with the connection.
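 *
 * (A sketch of that caller-side convention follows; it is an
 * editorial example only, not part of the original file.)
 */
#if 0
	/*
	 * Illustrative call site: interpreting syn_cache_get()'s
	 * three-way return, per the contract above.
	 */
	so = syn_cache_get(src, dst, th, iphlen, tlen, so, m);
	if (so == NULL)
		goto dropwithreset;	/* no entry: answer with RST */
	if (so == (struct socket *)(-1))
		return;			/* aborted; the mbuf is now
					 * owned by the syn cache code */
	/* otherwise `so' is the new socket in SYN_RECEIVED */
#endif
/*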
3629 */ 3630 struct socket * 3631 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3632 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m) 3633 { 3634 struct syn_cache *sc; 3635 struct syn_cache_head *scp; 3636 struct inpcb *inp = NULL; 3637 struct tcpcb *tp = NULL; 3638 struct mbuf *am; 3639 int s; 3640 struct socket *oso; 3641 #if NPF > 0 3642 struct pf_divert *divert = NULL; 3643 #endif 3644 3645 s = splsoftnet(); 3646 if ((sc = syn_cache_lookup(src, dst, &scp, 3647 sotoinpcb(so)->inp_rtableid)) == NULL) { 3648 splx(s); 3649 return (NULL); 3650 } 3651 3652 /* 3653 * Verify the sequence and ack numbers. Try getting the correct 3654 * response again. 3655 */ 3656 if ((th->th_ack != sc->sc_iss + 1) || 3657 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3658 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3659 (void) syn_cache_respond(sc, m); 3660 splx(s); 3661 return ((struct socket *)(-1)); 3662 } 3663 3664 /* Remove this cache entry */ 3665 syn_cache_rm(sc); 3666 splx(s); 3667 3668 /* 3669 * Ok, create the full blown connection, and set things up 3670 * as they would have been set up if we had created the 3671 * connection when the SYN arrived. If we can't create 3672 * the connection, abort it. 3673 */ 3674 oso = so; 3675 so = sonewconn(so, SS_ISCONNECTED); 3676 if (so == NULL) 3677 goto resetandabort; 3678 3679 inp = sotoinpcb(oso); 3680 3681 #ifdef IPSEC 3682 /* 3683 * We need to copy the required security levels 3684 * from the old pcb. Ditto for any other 3685 * IPsec-related information. 3686 */ 3687 { 3688 struct inpcb *newinp = sotoinpcb(so); 3689 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3690 sizeof(inp->inp_seclevel)); 3691 } 3692 #endif /* IPSEC */ 3693 #ifdef INET6 3694 /* 3695 * inp still has the OLD in_pcb stuff, set the 3696 * v6-related flags on the new guy, too. 3697 */ 3698 { 3699 int flags = inp->inp_flags; 3700 struct inpcb *oldinpcb = inp; 3701 3702 inp = sotoinpcb(so); 3703 inp->inp_flags |= (flags & INP_IPV6); 3704 if ((inp->inp_flags & INP_IPV6) != 0) { 3705 inp->inp_ipv6.ip6_hlim = 3706 oldinpcb->inp_ipv6.ip6_hlim; 3707 } 3708 } 3709 #else /* INET6 */ 3710 inp = sotoinpcb(so); 3711 #endif /* INET6 */ 3712 3713 #if NPF > 0 3714 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED && 3715 (divert = pf_find_divert(m)) != NULL) 3716 inp->inp_rtableid = divert->rdomain; 3717 else 3718 #endif 3719 /* inherit rtable from listening socket */ 3720 inp->inp_rtableid = sc->sc_rtableid; 3721 3722 inp->inp_lport = th->th_dport; 3723 switch (src->sa_family) { 3724 #ifdef INET6 3725 case AF_INET6: 3726 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3727 break; 3728 #endif /* INET6 */ 3729 case AF_INET: 3730 inp->inp_laddr = satosin(dst)->sin_addr; 3731 inp->inp_options = ip_srcroute(m); 3732 if (inp->inp_options == NULL) { 3733 inp->inp_options = sc->sc_ipopts; 3734 sc->sc_ipopts = NULL; 3735 } 3736 break; 3737 } 3738 in_pcbrehash(inp); 3739 3740 /* 3741 * Give the new socket our cached route reference. 
3742 */ 3743 if (src->sa_family == AF_INET) 3744 inp->inp_route = sc->sc_route4; /* struct assignment */ 3745 #ifdef INET6 3746 else 3747 inp->inp_route6 = sc->sc_route6; 3748 #endif 3749 sc->sc_route4.ro_rt = NULL; 3750 3751 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3752 if (am == NULL) 3753 goto resetandabort; 3754 am->m_len = src->sa_len; 3755 bcopy(src, mtod(am, caddr_t), src->sa_len); 3756 3757 switch (src->sa_family) { 3758 case AF_INET: 3759 /* drop IPv4 packet to AF_INET6 socket */ 3760 if (inp->inp_flags & INP_IPV6) { 3761 (void) m_free(am); 3762 goto resetandabort; 3763 } 3764 if (in_pcbconnect(inp, am)) { 3765 (void) m_free(am); 3766 goto resetandabort; 3767 } 3768 break; 3769 #ifdef INET6 3770 case AF_INET6: 3771 if (in6_pcbconnect(inp, am)) { 3772 (void) m_free(am); 3773 goto resetandabort; 3774 } 3775 break; 3776 #endif 3777 } 3778 (void) m_free(am); 3779 3780 tp = intotcpcb(inp); 3781 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; 3782 if (sc->sc_request_r_scale != 15) { 3783 tp->requested_s_scale = sc->sc_requested_s_scale; 3784 tp->request_r_scale = sc->sc_request_r_scale; 3785 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3786 } 3787 if (sc->sc_flags & SCF_TIMESTAMP) 3788 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3789 3790 tp->t_template = tcp_template(tp); 3791 if (tp->t_template == 0) { 3792 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3793 so = NULL; 3794 m_freem(m); 3795 goto abort; 3796 } 3797 #ifdef TCP_SACK 3798 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3799 #endif 3800 3801 tp->ts_modulate = sc->sc_modulate; 3802 tp->ts_recent = sc->sc_timestamp; 3803 tp->iss = sc->sc_iss; 3804 tp->irs = sc->sc_irs; 3805 tcp_sendseqinit(tp); 3806 #if defined (TCP_SACK) || defined(TCP_ECN) 3807 tp->snd_last = tp->snd_una; 3808 #endif /* TCP_SACK */ 3809 #if defined(TCP_SACK) && defined(TCP_FACK) 3810 tp->snd_fack = tp->snd_una; 3811 tp->retran_data = 0; 3812 tp->snd_awnd = 0; 3813 #endif /* TCP_FACK */ 3814 #ifdef TCP_ECN 3815 if (sc->sc_flags & SCF_ECN_PERMIT) { 3816 tp->t_flags |= TF_ECN_PERMIT; 3817 tcpstat.tcps_ecn_accepts++; 3818 } 3819 #endif 3820 #ifdef TCP_SACK 3821 if (sc->sc_flags & SCF_SACK_PERMIT) 3822 tp->t_flags |= TF_SACK_PERMIT; 3823 #endif 3824 #ifdef TCP_SIGNATURE 3825 if (sc->sc_flags & SCF_SIGNATURE) 3826 tp->t_flags |= TF_SIGNATURE; 3827 #endif 3828 tcp_rcvseqinit(tp); 3829 tp->t_state = TCPS_SYN_RECEIVED; 3830 tp->t_rcvtime = tcp_now; 3831 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3832 tcpstat.tcps_accepts++; 3833 3834 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3835 if (sc->sc_peermaxseg) 3836 tcp_mss_update(tp); 3837 /* Reset initial window to 1 segment for retransmit */ 3838 if (sc->sc_rxtshift > 0) 3839 tp->snd_cwnd = tp->t_maxseg; 3840 tp->snd_wl1 = sc->sc_irs; 3841 tp->rcv_up = sc->sc_irs + 1; 3842 3843 /* 3844 * This is what would have happened in tcp_output() when 3845 * the SYN,ACK was sent.
3846 */ 3847 tp->snd_up = tp->snd_una; 3848 tp->snd_max = tp->snd_nxt = tp->iss+1; 3849 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3850 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3851 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3852 tp->last_ack_sent = tp->rcv_nxt; 3853 3854 tcpstat.tcps_sc_completed++; 3855 syn_cache_put(sc); 3856 return (so); 3857 3858 resetandabort: 3859 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3860 m->m_pkthdr.ph_rtableid); 3861 m_freem(m); 3862 abort: 3863 if (so != NULL) 3864 (void) soabort(so); 3865 syn_cache_put(sc); 3866 tcpstat.tcps_sc_aborted++; 3867 return ((struct socket *)(-1)); 3868 } 3869 3870 /* 3871 * This function is called when we get a RST for a 3872 * non-existent connection, so that we can see if the 3873 * connection is in the syn cache. If it is, zap it. 3874 */ 3875 3876 void 3877 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3878 u_int rtableid) 3879 { 3880 struct syn_cache *sc; 3881 struct syn_cache_head *scp; 3882 int s = splsoftnet(); 3883 3884 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) { 3885 splx(s); 3886 return; 3887 } 3888 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3889 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3890 splx(s); 3891 return; 3892 } 3893 syn_cache_rm(sc); 3894 splx(s); 3895 tcpstat.tcps_sc_reset++; 3896 syn_cache_put(sc); 3897 } 3898 3899 void 3900 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3901 u_int rtableid) 3902 { 3903 struct syn_cache *sc; 3904 struct syn_cache_head *scp; 3905 int s; 3906 3907 s = splsoftnet(); 3908 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) { 3909 splx(s); 3910 return; 3911 } 3912 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3913 if (ntohl (th->th_seq) != sc->sc_iss) { 3914 splx(s); 3915 return; 3916 } 3917 3918 /* 3919 * If we've retransmitted 3 times and this is our second error, 3920 * we remove the entry. Otherwise, we allow it to continue on. 3921 * This prevents us from incorrectly nuking an entry during a 3922 * spurious network outage. 3923 * 3924 * See tcp_notify(). 3925 */ 3926 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3927 sc->sc_flags |= SCF_UNREACH; 3928 splx(s); 3929 return; 3930 } 3931 3932 syn_cache_rm(sc); 3933 splx(s); 3934 tcpstat.tcps_sc_unreach++; 3935 syn_cache_put(sc); 3936 } 3937 3938 /* 3939 * Given a LISTEN socket and an inbound SYN request, add 3940 * this to the syn cache, and send back a segment: 3941 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3942 * to the source. 3943 * 3944 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3945 * Doing so would require that we hold onto the data and deliver it 3946 * to the application. However, if we are the target of a SYN-flood 3947 * DoS attack, an attacker could send data which would eventually 3948 * consume all available buffer space if it were ACKed. By not ACKing 3949 * the data, we avoid this DoS scenario. 3950 */ 3951 3952 int 3953 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3954 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3955 struct tcp_opt_info *oi, tcp_seq *issp) 3956 { 3957 struct tcpcb tb, *tp; 3958 long win; 3959 struct syn_cache *sc; 3960 struct syn_cache_head *scp; 3961 struct mbuf *ipopts; 3962 3963 tp = sototcpcb(so); 3964 3965 /* 3966 * RFC1122 4.2.3.10, p. 
104: discard bcast/mcast SYN 3967 * 3968 * Note this check is performed in tcp_input() very early on. 3969 */ 3970 3971 /* 3972 * Initialize some local state. 3973 */ 3974 win = sbspace(&so->so_rcv); 3975 if (win > TCP_MAXWIN) 3976 win = TCP_MAXWIN; 3977 3978 bzero(&tb, sizeof(tb)); 3979 #ifdef TCP_SIGNATURE 3980 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3981 #else 3982 if (optp) { 3983 #endif 3984 tb.pf = tp->pf; 3985 #ifdef TCP_SACK 3986 tb.sack_enable = tp->sack_enable; 3987 #endif 3988 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3989 #ifdef TCP_SIGNATURE 3990 if (tp->t_flags & TF_SIGNATURE) 3991 tb.t_flags |= TF_SIGNATURE; 3992 #endif 3993 tb.t_state = TCPS_LISTEN; 3994 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3995 sotoinpcb(so)->inp_rtableid)) 3996 return (-1); 3997 } 3998 3999 switch (src->sa_family) { 4000 case AF_INET: 4001 /* 4002 * Remember the IP options, if any. 4003 */ 4004 ipopts = ip_srcroute(m); 4005 break; 4006 default: 4007 ipopts = NULL; 4008 } 4009 4010 /* 4011 * See if we already have an entry for this connection. 4012 * If we do, resend the SYN,ACK. We do not count this 4013 * as a retransmission (XXX though maybe we should). 4014 */ 4015 if ((sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid)) 4016 != NULL) { 4017 tcpstat.tcps_sc_dupesyn++; 4018 if (ipopts) { 4019 /* 4020 * If we were remembering a previous source route, 4021 * forget it and use the new one we've been given. 4022 */ 4023 if (sc->sc_ipopts) 4024 (void) m_free(sc->sc_ipopts); 4025 sc->sc_ipopts = ipopts; 4026 } 4027 sc->sc_timestamp = tb.ts_recent; 4028 if (syn_cache_respond(sc, m) == 0) { 4029 tcpstat.tcps_sndacks++; 4030 tcpstat.tcps_sndtotal++; 4031 } 4032 return (0); 4033 } 4034 4035 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 4036 if (sc == NULL) { 4037 if (ipopts) 4038 (void) m_free(ipopts); 4039 return (-1); 4040 } 4041 4042 /* 4043 * Fill in the cache, and put the necessary IP and TCP 4044 * options into the reply. 4045 */ 4046 bcopy(src, &sc->sc_src, src->sa_len); 4047 bcopy(dst, &sc->sc_dst, dst->sa_len); 4048 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 4049 sc->sc_flags = 0; 4050 sc->sc_ipopts = ipopts; 4051 sc->sc_irs = th->th_seq; 4052 4053 sc->sc_iss = issp ? *issp : arc4random(); 4054 sc->sc_peermaxseg = oi->maxseg; 4055 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 4056 sc->sc_win = win; 4057 sc->sc_timestamp = tb.ts_recent; 4058 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 4059 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 4060 sc->sc_flags |= SCF_TIMESTAMP; 4061 sc->sc_modulate = arc4random(); 4062 } 4063 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 4064 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 4065 sc->sc_requested_s_scale = tb.requested_s_scale; 4066 sc->sc_request_r_scale = 0; 4067 /* 4068 * Pick the smallest possible scaling factor that 4069 * will still allow us to scale up to sb_max. 4070 * 4071 * We do this because there are broken firewalls that 4072 * will corrupt the window scale option, leading to 4073 * the other endpoint believing that our advertised 4074 * window is unscaled. At scale factors larger than 4075 * 5 the unscaled window will drop below 1500 bytes, 4076 * leading to serious problems when traversing these 4077 * broken firewalls. 4078 * 4079 * With the default sb_max of 256K, a scale factor 4080 * of 3 will be chosen by this algorithm. Those who 4081 * choose a larger sb_max should watch out 4082 * for the compatibility problems mentioned above.
#ifdef TCP_ECN
	/*
	 * if both ECE and CWR flag bits are set, peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		syn_cache_put(sc);
		tcpstat.tcps_sc_dropped++;
	}

	return (0);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;
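
	/*
	 * Example of the computation above for a common SYN,ACK
	 * carrying MSS, window scale, SACK_PERMITTED and timestamps
	 * (no signature):
	 *
	 *	MSS                        4 bytes
	 *	NOP|WINDOW|len|shift       4 bytes
	 *	NOP|NOP|SACK_PERMITTED     4 bytes
	 *	NOP|NOP|TIMESTAMPS        12 bytes (TCPOLEN_TSTAMP_APPA)
	 *	                          --------
	 *	                          24 bytes
	 *
	 * Every term is already a multiple of 4, so th_off below
	 * needs no extra padding.  With SCF_SIGNATURE the signature
	 * option adds TCPOLEN_SIGLEN more: kind, length, the 16-byte
	 * digest and NOP/EOL padding, as written out below.
	 */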
	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		unhandled_af(sc->sc_src.sa.sa_family);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}
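
	/*
	 * The three words written above form the option shown in
	 * RFC 1323 appendix A, assuming the usual TCPOPT_TSTAMP_HDR
	 * encoding of 0x0101080a:
	 *
	 *	NOP | NOP | kind=8 | len=10
	 *	TSval = SYN_CACHE_TIMESTAMP(sc)  (our modulated clock)
	 *	TSecr = sc->sc_timestamp         (peer's ts_recent)
	 *
	 * for TCPOLEN_TSTAMP_APPA (12) bytes in total.
	 */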
#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* Use the IPsec policy and ttl from the listening socket for the SYN,ACK. */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}