/*	$OpenBSD: tcp_input.c,v 1.219 2008/06/14 22:15:30 jsing Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <dev/rndvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#include "faith.h"

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct ifnet *, int);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

extern u_long sb_max;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif
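
/*
 * Illustrative note (added, not part of the original sources): the
 * TSTMP_* and SEQ_* macros compare values in 32-bit modular arithmetic,
 * so they remain correct across counter wraparound.  For example, with
 * a = 2 and b = 0xfffffffe, (int)((a)-(b)) == 4, so TSTMP_GEQ(a, b)
 * holds even though a < b as plain unsigned values.
 */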

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define	TCP_SETUP_ACK(tp, tiflags) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after becoming established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}
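
	/*
	 * Illustrative note (added, hypothetical numbers): if the preceding
	 * queue entry covers sequence space [100, 300) and the arriving
	 * segment starts at 200 with 150 bytes, then in the check above
	 * i = 100, the duplicate first 100 bytes are trimmed with m_adj()
	 * and th_seq advances to 300; the second loop handles the mirror
	 * case of overlap with the segments that follow.
	 */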

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	return (flags);
}

#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

#if NFAITH > 0
	if (m->m_pkthdr.rcvif) {
		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
			/* XXX send icmp6 host/port unreach? */
			m_freem(m);
			return IPPROTO_DONE;
		}
	}
#endif

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	if (m->m_flags & M_ANYCAST6) {
		if (m->m_len >= sizeof(struct ip6_hdr)) {
			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		} else
			m_freem(m);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = 0;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error, s;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 * See below for AF specific multicast.
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}
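
	/*
	 * Illustrative note (added): the version field occupies the first
	 * four bits of both the IPv4 and IPv6 headers, so peeking at ip_v
	 * above is safe before we know which header is present; only after
	 * this dispatch is the mbuf interpreted as a specific header type.
	 */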

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		if (IN_MULTICAST(ip->ip_dst.s_addr) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
			goto drop;
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		/*
		 * Checksum extended TCP header and data.
		 */
		if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
			if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
				tcpstat.tcps_inhwcsum++;
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
			if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
		} else {
			m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
			tcpstat.tcps_inhwcsum++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Checksum extended TCP header and data.
		 */
		if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		break;
#endif
	}
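
	/*
	 * Illustrative note (added): M_TCP_CSUM_IN_OK and M_TCP_CSUM_IN_BAD
	 * above are set by drivers that verify TCP checksums in hardware,
	 * so the software in4_cksum() runs only when neither flag is set;
	 * the IPv6 path here always verifies in software.
	 */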

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
findpcb:
	switch (af) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, th->th_sport,
		    &ip6->ip6_dst, th->th_dport);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport);
		break;
	}
	if (inp == 0) {
		int inpl_flags = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_flags = INPLOOKUP_WILDCARD;
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_flags, m);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_flags, m);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		goto drop;

	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;
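
	/*
	 * Illustrative note (added): per RFC 1323 the window field of a
	 * segment carrying SYN is never scaled, which is why the shift by
	 * snd_scale below is applied only when TH_SYN is clear.
	 */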

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				if (tiflags & TH_RST) {
					syn_cache_reset(&src.sa, &dst.sa, th);
				} else if ((tiflags & (TH_ACK|TH_SYN)) ==
				    (TH_ACK|TH_SYN)) {
					/*
					 * Received a SYN,ACK.  This should
					 * never happen while we are in
					 * LISTEN.  Send an RST.
					 */
					goto badsyn;
				} else if (tiflags & TH_ACK) {
					so = syn_cache_get(&src.sa, &dst.sa,
					    th, iphlen, tlen, so, m);
					if (so == NULL) {
						/*
						 * We don't have a SYN for
						 * this ACK; send an RST.
						 */
						goto badsyn;
					} else if (so ==
					    (struct socket *)(-1)) {
						/*
						 * We were unable to create
						 * the connection.  If the
						 * 3-way handshake was
						 * completed, an RST has
						 * been sent to the peer.
						 * Since the mbuf might be
						 * in use for the reply,
						 * do not free it.
						 */
						m = NULL;
					} else {
						/*
						 * We have created a
						 * full-blown connection.
						 */
						tp = NULL;
						inp = (struct inpcb *)so->so_pcb;
						tp = intotcpcb(inp);
						if (tp == NULL)
							goto badsyn;	/*XXX*/

						/*
						 * Compute proper scaling
						 * value from buffer space
						 */
						tcp_rscale(tp, so->so_rcv.sb_hiwat);
						goto after_listen;
					}
				} else {
					/*
					 * None of RST, SYN or ACK was set.
					 * This is an invalid packet for a
					 * TCB in LISTEN state.  Send a RST.
					 */
					goto badsyn;
				}
			} else {
				/*
				 * Received a SYN.
				 */
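
				/*
				 * Illustrative note (added): handshake
				 * state for LISTEN sockets lives in the
				 * compressed syn cache (syn_cache_add()
				 * below); a full socket and tcpcb are
				 * only created by syn_cache_get() when
				 * the final ACK of the handshake comes
				 * back.
				 */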
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise on this, as
				 * it is much better for the peer to send an
				 * RST, and the RST will be the final packet
				 * for the exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says like
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;

					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
						tp = NULL;
						goto dropwithreset;
					}
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen <= so->so_qlimit &&
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse))
					m = NULL;
			}
			goto drop;
		}
	}

after_listen:
#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	s = splnet();
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp);
	if (error) {
		splx(s);
		goto drop;
	}

	/* Latch SA */
	if (inp->inp_tdb_in != tdb) {
		if (tdb) {
			tdb_add_inp(tdb, inp, 1);
			if (inp->inp_ipo == NULL) {
				inp->inp_ipo = ipsec_add_policy(inp, af,
				    IPSP_DIRECTION_OUT);
				if (inp->inp_ipo == NULL) {
					splx(s);
					goto drop;
				}
			}
			if (inp->inp_ipo->ipo_dstid == NULL &&
			    tdb->tdb_srcid != NULL) {
				inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
				tdb->tdb_srcid->ref_count++;
			}
			if (inp->inp_ipsec_remotecred == NULL &&
			    tdb->tdb_remote_cred != NULL) {
				inp->inp_ipsec_remotecred =
				    tdb->tdb_remote_cred;
				tdb->tdb_remote_cred->ref_count++;
			}
			if (inp->inp_ipsec_remoteauth == NULL &&
			    tdb->tdb_remote_auth != NULL) {
				inp->inp_ipsec_remoteauth =
				    tdb->tdb_remote_auth;
				tdb->tdb_remote_auth->ref_count++;
			}
		} else { /* Just reset */
			TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
			    inp_tdb_in_next);
			inp->inp_tdb_in = NULL;
		}
	}
	splx(s);
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);
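
				/*
				 * Illustrative note (added): the RTT sample
				 * above prefers the timestamp echo
				 * (tcp_now - ts_ecr, in slow-timeout ticks)
				 * and otherwise times the segment recorded
				 * in t_rtseq, only when the ack covers it,
				 * in the spirit of Karn's algorithm.
				 */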

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				if (sb_notify(&so->so_snd))
					sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			sorwakeup(so);
			TCP_SETUP_ACK(tp, tiflags);
			if (tp->t_flags & TF_ACKNOW)
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;
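
	/*
	 * Illustrative note (added, hypothetical numbers): an acceptable
	 * ACK in SYN_RECEIVED must satisfy snd_una < th_ack <= snd_max;
	 * if our SYN,ACK carried sequence 1000, only th_ack == 1001
	 * acknowledges it, and anything outside the range above draws
	 * an RST.
	 */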

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ACK|TH_ECE) ||
			    (tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ECE|TH_CWR)) {
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			(void) tcp_reass(tp, (struct tcphdr *)0,
			    (struct mbuf *)0, &tlen);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = tlen;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}
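
	/*
	 * Illustrative note (added, hypothetical numbers): with
	 * rcv_nxt = 1000, th_seq = 900 and tlen = 300, todrop above is
	 * 100, so the first 100 bytes count as partial duplicates,
	 * hdroptlen grows by 100 and the segment is processed as if it
	 * began at sequence 1000 with 200 bytes.
	 */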

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * Cf fix from Braden, see Stevens p. 870
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
		    ((tiflags & (TH_SYN|TH_FIN)) != 0)))
			tp->ts_recent = opti.ts_val;
		else
			tp->ts_recent = 0;
		tp->ts_recent_age = tcp_now;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat.tcps_drops++;
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}
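
	/*
	 * Illustrative note (added): accepting an RST only at
	 * last_ack_sent, rcv_nxt or rcv_nxt + 1 above narrows the
	 * sequence space an off-path attacker must hit, mitigating
	 * blind reset attacks at the cost of ignoring RSTs that land
	 * elsewhere in the receive window.
	 */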

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat.tcps_connects++;
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		(void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0,
		    &tlen);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat.tcps_cwr_ecn++;
				}
			}
			tcpstat.tcps_ecn_rcvece++;
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat.tcps_ecn_rcvcwr++;
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat.tcps_rcvacktooold++;
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#ifdef TCP_SACK
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat.tcps_cwr_frecovery++;
						tcpstat.tcps_sack_recovery_episode++;
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat.tcps_cwr_frecovery++;
					tcpstat.tcps_sndrexmitfast++;
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
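
		/*
		 * Illustrative note (added, hypothetical numbers, non-SACK
		 * path): with t_maxseg = 1460 and min(snd_wnd, snd_cwnd)
		 * worth 16 segments, the third duplicate ACK above sets
		 * ssthresh to 8 * 1460, retransmits one segment with
		 * snd_cwnd = 1460, then inflates snd_cwnd to ssthresh +
		 * 3 * 1460; each further dup ACK adds another maxseg.
		 */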
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						needoutput = 1;
#else
					tp->snd_cwnd += tp->t_maxseg;
					needoutput = 1;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			needoutput = 1;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
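
		/*
		 * Illustrative note (added, hypothetical numbers): with
		 * t_maxseg = 1460 and snd_cwnd = 14600 above ssthresh,
		 * incr is 1460 * 1460 / 14600 = 146 bytes per ACK, i.e.
		 * about one maxseg per window of ACKs (linear growth);
		 * at or below ssthresh every ACK adds a full maxseg
		 * (exponential growth).
		 */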
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}
		if (sb_notify(&so->so_snd))
			sowwakeup(so);

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
			    tp->snd_fack) + tp->retran_data;
		}
#endif

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
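
	/*
	 * Illustrative note (added): snd_wl1 and snd_wl2 record the seq
	 * and ack of the segment last used to update snd_wnd below, so
	 * only segments that are at least as recent may change the send
	 * window; this keeps reordered old segments from shrinking it.
	 */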
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		needoutput = 1;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented to
		 * the user.  This can happen independent of advancing the
		 * URG pointer, but if two URG's are pending at once, some
		 * out-of-band data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen
#ifdef SO_OOBINLINE
		     && (so->so_options & SO_OOBINLINE) == 0
#endif
		     )
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */
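
	/*
	 * Illustrative note (added, hypothetical numbers): in the URG
	 * handling above, th_seq = 1000 with th_urp = 3 puts rcv_up at
	 * 1003; with rcv_nxt = 1000 and 50 bytes already queued,
	 * so_oobmark becomes 50 + 3 - 1 = 52, the offset of the last
	 * urgent octet in the receive buffer.
	 */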
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
#ifdef TCP_SACK
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;
#endif
		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(&so->so_rcv, m);
			}
			sorwakeup(so);
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
#ifdef TCP_SACK
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);
#endif

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know
	 * that the connection is closing.  Ignore a FIN received before
	 * the connection is fully established.
	 */
	if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
			socantrcvmore(so);
			tp->t_flags |= TF_ACKNOW;
			tp->rcv_nxt++;
		}
		switch (tp->t_state) {

		/*
		 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
		 */
		case TCPS_ESTABLISHED:
			tp->t_state = TCPS_CLOSE_WAIT;
			break;

		/*
		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
		 * enter the CLOSING state.
		 */
		case TCPS_FIN_WAIT_1:
			tp->t_state = TCPS_CLOSING;
			break;

		/*
		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
		 * starting the time-wait timer, turning off the other
		 * standard timers.
		 */
		case TCPS_FIN_WAIT_2:
			tp->t_state = TCPS_TIME_WAIT;
			tcp_canceltimers(tp);
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			soisdisconnected(so);
			break;

		/*
		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			break;
		}
	}
	if (so->so_options & SO_DEBUG) {
		switch (tp->pf) {
#ifdef INET6
		case PF_INET6:
			tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6,
			    0, tlen);
			break;
#endif /* INET6 */
		case PF_INET:
			tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti,
			    0, tlen);
			break;
		}
	}

	/*
	 * Return any desired output.
	 */
	if (needoutput || (tp->t_flags & TF_ACKNOW)) {
		(void) tcp_output(tp);
	}
	return;

badsyn:
	/*
	 * Received a bad SYN.  Increment counters and dropwithreset.
	 */
	tcpstat.tcps_badsyn++;
	tp = NULL;
	goto dropwithreset;

dropafterack_ratelim:
	if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
	    tcp_ackdrop_ppslim) == 0) {
		/* XXX stat */
		goto drop;
	}
	/* ...fall into dropafterack... */

dropafterack:
	/*
	 * Generate an ACK dropping incoming segment if it occupies
	 * sequence space, where the ACK reflects our state.
	 */
	if (tiflags & TH_RST)
		goto drop;
	m_freem(m);
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	return;

dropwithreset_ratelim:
	/*
	 * We may want to rate-limit RSTs in certain situations,
	 * particularly if we are sending an RST in response to
	 * an attempt to connect to or otherwise communicate with
	 * a port for which we have no socket.
	 */
	if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
	    tcp_rst_ppslim) == 0) {
		/* XXX stat */
		goto drop;
	}
	/* ...fall into dropwithreset... */

dropwithreset:
	/*
	 * Generate a RST, dropping incoming segment.
	 * Make ACK acceptable to originator of segment.
	 * Don't bother to respond to RST.
	 */
	if (tiflags & TH_RST)
		goto drop;
	if (tiflags & TH_ACK) {
		tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack,
		    TH_RST);
	} else {
		if (tiflags & TH_SYN)
			tlen++;
		tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen,
		    (tcp_seq)0, TH_RST|TH_ACK);
	}
	m_freem(m);
	return;

drop:
	/*
	 * Drop space held by incoming segment and return.
	 */
	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
		switch (tp->pf) {
#ifdef INET6
		case PF_INET6:
			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6,
			    0, tlen);
			break;
#endif /* INET6 */
		case PF_INET:
			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti,
			    0, tlen);
			break;
		}
	}

	m_freem(m);
	return;
}

int
tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
    struct mbuf *m, int iphlen, struct tcp_opt_info *oi)
{
	u_int16_t mss = 0;
	int opt, optlen;
#ifdef TCP_SIGNATURE
	caddr_t sigp = NULL;
	struct tdb *tdb = NULL;
#endif /* TCP_SIGNATURE */

	for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
			NTOHS(mss);
			oi->maxseg = mss;
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			oi->ts_present = 1;
			bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
			NTOHL(oi->ts_val);
			bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
			NTOHL(oi->ts_ecr);

			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
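			/*
			 * Explanatory note: ts_recent saved here is echoed
			 * back in the TSecr field of our segments, and it
			 * is also the value against which PAWS compares
			 * the timestamps of later segments (RFC 1323).
			 */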
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = oi->ts_val;
			tp->ts_recent_age = tcp_now;
			break;

#ifdef TCP_SACK
		case TCPOPT_SACK_PERMITTED:
			if (!tp->sack_enable || optlen != TCPOLEN_SACK_PERMITTED)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/* MUST only be set on SYN */
			tp->t_flags |= TF_SACK_PERMIT;
			break;
		case TCPOPT_SACK:
			tcp_sack_option(tp, th, cp, optlen);
			break;
#endif
#ifdef TCP_SIGNATURE
		case TCPOPT_SIGNATURE:
			if (optlen != TCPOLEN_SIGNATURE)
				continue;

			if (sigp && bcmp(sigp, cp + 2, 16))
				return (-1);

			sigp = cp + 2;
			break;
#endif /* TCP_SIGNATURE */
		}
	}

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		union sockaddr_union src, dst;

		memset(&src, 0, sizeof(union sockaddr_union));
		memset(&dst, 0, sizeof(union sockaddr_union));

		switch (tp->pf) {
		case 0:
#ifdef INET
		case AF_INET:
			src.sa.sa_len = sizeof(struct sockaddr_in);
			src.sa.sa_family = AF_INET;
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sa.sa_len = sizeof(struct sockaddr_in);
			dst.sa.sa_family = AF_INET;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sa.sa_len = sizeof(struct sockaddr_in6);
			src.sa.sa_family = AF_INET6;
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sa.sa_len = sizeof(struct sockaddr_in6);
			dst.sa.sa_family = AF_INET6;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);

		/*
		 * We don't have an SA for this peer, so we turn off
		 * TF_SIGNATURE on the listen socket
		 */
		if (tdb == NULL && tp->t_state == TCPS_LISTEN)
			tp->t_flags &= ~TF_SIGNATURE;

	}

	if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
		tcpstat.tcps_rcvbadsig++;
		return (-1);
	}

	if (sigp) {
		char sig[16];

		if (tdb == NULL) {
			tcpstat.tcps_rcvbadsig++;
			return (-1);
		}

		if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0)
			return (-1);

		if (bcmp(sig, sigp, 16)) {
			tcpstat.tcps_rcvbadsig++;
			return (-1);
		}

		tcpstat.tcps_rcvgoodsig++;
	}
#endif /* TCP_SIGNATURE */

	return (0);
}

#if defined(TCP_SACK)
u_long
tcp_seq_subtract(u_long a, u_long b)
{
	return ((long)(a - b));
}
#endif

#ifdef TCP_SACK
/*
 * This function is called upon receipt of new valid data (while not in header
 * prediction mode), and it updates the ordered list of sacks.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart,
    tcp_seq rcv_lastend)
{
	/*
	 * First reported block MUST be the most recent one.  Subsequent
	 * blocks SHOULD be in the order in which they arrived at the
	 * receiver.  These two conditions make the implementation fully
	 * compliant with RFC 2018.
	 */
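	/*
	 * Illustrative example (hypothetical sequence numbers): with
	 * rcv_nxt at 100 and an earlier out-of-order block 300-399 on
	 * record, a newly arrived segment 500-599 must be reported as
	 * the first SACK block in the next ACK, with 300-399 following
	 * it, so the sender learns about the freshest data first.
	 */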
	int i, j = 0, count = 0, lastpos = -1;
	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];

	/* First clean up current list of sacks */
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (sack.start == 0 && sack.end == 0) {
			count++; /* count = number of blocks to be discarded */
			continue;
		}
		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			count++;
		} else {
			temp[j].start = tp->sackblks[i].start;
			temp[j++].end = tp->sackblks[i].end;
		}
	}
	tp->rcv_numsacks -= count;
	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
		tcp_clean_sackreport(tp);
		if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) {
			/* ==> need first sack block */
			tp->sackblks[0].start = rcv_laststart;
			tp->sackblks[0].end = rcv_lastend;
			tp->rcv_numsacks = 1;
		}
		return;
	}
	/* Otherwise, sack blocks are already present. */
	for (i = 0; i < tp->rcv_numsacks; i++)
		tp->sackblks[i] = temp[i]; /* first copy back sack list */
	if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend))
		return; /* sack list remains unchanged */
	/*
	 * From here, segment just received should be (part of) the 1st sack.
	 * Go through list, possibly coalescing sack block entries.
	 */
	firstsack.start = rcv_laststart;
	firstsack.end = rcv_lastend;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (SEQ_LT(sack.end, firstsack.start) ||
		    SEQ_GT(sack.start, firstsack.end))
			continue; /* no overlap */
		if (sack.start == firstsack.start && sack.end == firstsack.end) {
			/*
			 * identical block; delete it here since we will
			 * move it to the front of the list.
			 */
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			lastpos = i; /* last posn with a zero entry */
			continue;
		}
		if (SEQ_LEQ(sack.start, firstsack.start))
			firstsack.start = sack.start; /* merge blocks */
		if (SEQ_GEQ(sack.end, firstsack.end))
			firstsack.end = sack.end; /* merge blocks */
		tp->sackblks[i].start = tp->sackblks[i].end = 0;
		lastpos = i; /* last posn with a zero entry */
	}
	if (lastpos != -1) { /* at least one merge */
		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
			sack = tp->sackblks[i];
			if (sack.start == 0 && sack.end == 0)
				continue;
			temp[j++] = sack;
		}
		tp->rcv_numsacks = j; /* including first blk (added later) */
		for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
			tp->sackblks[i] = temp[i];
	} else {	/* no merges -- shift sacks by 1 */
		if (tp->rcv_numsacks < MAX_SACK_BLKS)
			tp->rcv_numsacks++;
		for (i = tp->rcv_numsacks-1; i > 0; i--)
			tp->sackblks[i] = tp->sackblks[i-1];
	}
	tp->sackblks[0] = firstsack;
	return;
}

/*
 * Process the TCP SACK option.  tp->snd_holes is an ordered list
 * of holes (oldest to newest, in terms of the sequence space).
 */
void
tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
{
	int tmp_olen;
	u_char *tmp_cp;
	struct sackhole *cur, *p, *temp;
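
	/*
	 * Explanatory note on the wire format parsed below: after its
	 * 2-byte kind/length header, the SACK option carries a list of
	 * sequence-number pairs, each naming the left and right edge of
	 * a block the receiver holds.  TCPOLEN_SACK is the size of one
	 * such pair (two 32-bit sequence numbers).
	 */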
	if (!tp->sack_enable)
		return;
	/* SACK without ACK doesn't make sense. */
	if ((th->th_flags & TH_ACK) == 0)
		return;
	/* Make sure the ACK on this segment is in [snd_una, snd_max]. */
	if (SEQ_LT(th->th_ack, tp->snd_una) ||
	    SEQ_GT(th->th_ack, tp->snd_max))
		return;
	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
		return;
	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
	tmp_cp = cp + 2;
	tmp_olen = optlen - 2;
	tcpstat.tcps_sack_rcv_opts++;
	if (tp->snd_numholes < 0)
		tp->snd_numholes = 0;
	if (tp->t_maxseg == 0)
		panic("tcp_sack_option"); /* Should never happen */
	while (tmp_olen > 0) {
		struct sackblk sack;

		bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
		NTOHL(sack.start);
		bcopy(tmp_cp + sizeof(tcp_seq),
		    (char *) &(sack.end), sizeof(tcp_seq));
		NTOHL(sack.end);
		tmp_olen -= TCPOLEN_SACK;
		tmp_cp += TCPOLEN_SACK;
		if (SEQ_LEQ(sack.end, sack.start))
			continue; /* bad SACK fields */
		if (SEQ_LEQ(sack.end, tp->snd_una))
			continue; /* old block */
#if defined(TCP_SACK) && defined(TCP_FACK)
		/* Updates snd_fack. */
		if (SEQ_GT(sack.end, tp->snd_fack))
			tp->snd_fack = sack.end;
#endif /* TCP_FACK */
		if (SEQ_GT(th->th_ack, tp->snd_una)) {
			if (SEQ_LT(sack.start, th->th_ack))
				continue;
		}
		if (SEQ_GT(sack.end, tp->snd_max))
			continue;
		if (tp->snd_holes == NULL) { /* first hole */
			tp->snd_holes = (struct sackhole *)
			    pool_get(&sackhl_pool, PR_NOWAIT);
			if (tp->snd_holes == NULL) {
				/* ENOBUFS, so ignore SACKed block for now */
				goto done;
			}
			cur = tp->snd_holes;
			cur->start = th->th_ack;
			cur->end = sack.start;
			cur->rxmit = cur->start;
			cur->next = NULL;
			tp->snd_numholes = 1;
			tp->rcv_lastsack = sack.end;
			/*
			 * dups is at least one.  If more data has been
			 * SACKed, it can be greater than one.
			 */
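			/*
			 * Worked example (hypothetical numbers): with
			 * t_maxseg 1000 and a SACKed block ending 3000
			 * bytes past cur->end, dups becomes
			 * min(tcprexmtthresh, 3), i.e. 3 with the default
			 * threshold of 3.
			 */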
			cur->dups = min(tcprexmtthresh,
			    ((sack.end - cur->end)/tp->t_maxseg));
			if (cur->dups < 1)
				cur->dups = 1;
			continue; /* with next sack block */
		}
		/* Go thru list of holes:  p = previous,  cur = current */
		p = cur = tp->snd_holes;
		while (cur) {
			if (SEQ_LEQ(sack.end, cur->start))
				/* SACKs data before the current hole */
				break; /* no use going through more holes */
			if (SEQ_GEQ(sack.start, cur->end)) {
				/* SACKs data beyond the current hole */
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LEQ(sack.start, cur->start)) {
				/* Data acks at least the beginning of hole */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(sack.end, cur->rxmit))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    cur->start);
				else
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    cur->start);
#endif /* TCP_FACK */
				if (SEQ_GEQ(sack.end, cur->end)) {
					/* Acks entire hole, so delete hole */
					if (p != cur) {
						p->next = cur->next;
						pool_put(&sackhl_pool, cur);
						cur = p->next;
					} else {
						cur = cur->next;
						pool_put(&sackhl_pool, p);
						p = cur;
						tp->snd_holes = p;
					}
					tp->snd_numholes--;
					continue;
				}
				/* otherwise, move start of hole forward */
				cur->start = sack.end;
				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
				p = cur;
				cur = cur->next;
				continue;
			}
			/* move end of hole backward */
			if (SEQ_GEQ(sack.end, cur->end)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				cur->end = sack.start;
				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LT(cur->start, sack.start) &&
			    SEQ_GT(cur->end, sack.end)) {
				/*
				 * ACKs some data in middle of a hole; need to
				 * split current hole
				 */
				temp = (struct sackhole *)
				    pool_get(&sackhl_pool, PR_NOWAIT);
				if (temp == NULL)
					goto done; /* ENOBUFS */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.end))
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    sack.start);
				else if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				temp->next = cur->next;
				temp->start = sack.end;
				temp->end = cur->end;
				temp->dups = cur->dups;
				temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
				cur->end = sack.start;
				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				cur->next = temp;
				p = temp;
				cur = p->next;
				tp->snd_numholes++;
			}
		}
		/* At this point, p points to the last hole on the list */
		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
			/*
			 * Need to append new hole at end.
			 * Last hole is p (and it's not NULL).
			 */
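			/*
			 * Explanatory note: rcv_lastsack is the highest
			 * right edge reported so far, so a block starting
			 * beyond it cannot touch any existing hole; the
			 * span between rcv_lastsack and sack.start is the
			 * new hole appended here.
			 */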
			temp = (struct sackhole *)
			    pool_get(&sackhl_pool, PR_NOWAIT);
			if (temp == NULL)
				goto done; /* ENOBUFS */
			temp->start = tp->rcv_lastsack;
			temp->end = sack.start;
			temp->dups = min(tcprexmtthresh,
			    ((sack.end - sack.start)/tp->t_maxseg));
			if (temp->dups < 1)
				temp->dups = 1;
			temp->rxmit = temp->start;
			temp->next = 0;
			p->next = temp;
			tp->rcv_lastsack = sack.end;
			tp->snd_numholes++;
		}
	}
done:
#if defined(TCP_SACK) && defined(TCP_FACK)
	/*
	 * Update retran_data and snd_awnd.  Go through the list of
	 * holes.  Increment retran_data by (hole->rxmit - hole->start).
	 */
	tp->retran_data = 0;
	cur = tp->snd_holes;
	while (cur) {
		tp->retran_data += cur->rxmit - cur->start;
		cur = cur->next;
	}
	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
	    tp->retran_data;
#endif /* TCP_FACK */

	return;
}

/*
 * Delete stale (i.e., cumulatively ack'd) holes.  Hole is deleted only if
 * it is completely acked; otherwise, tcp_sack_option(), called from
 * tcp_dooptions(), will fix up the hole.
 */
void
tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
{
	if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
		/* max because this could be an older ack just arrived */
		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
		    th->th_ack : tp->snd_una;
		struct sackhole *cur = tp->snd_holes;
		struct sackhole *prev;
		while (cur)
			if (SEQ_LEQ(cur->end, lastack)) {
				prev = cur;
				cur = cur->next;
				pool_put(&sackhl_pool, prev);
				tp->snd_numholes--;
			} else if (SEQ_LT(cur->start, lastack)) {
				cur->start = lastack;
				if (SEQ_LT(cur->rxmit, cur->start))
					cur->rxmit = cur->start;
				break;
			} else
				break;
		tp->snd_holes = cur;
	}
}

/*
 * Delete all receiver-side SACK information.
 */
void
tcp_clean_sackreport(struct tcpcb *tp)
{
	int i;

	tp->rcv_numsacks = 0;
	for (i = 0; i < MAX_SACK_BLKS; i++)
		tp->sackblks[i].start = tp->sackblks[i].end = 0;
}

/*
 * Checks for partial ack.  If partial ack arrives, turn off retransmission
 * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
 * If the ack advances at least to tp->snd_last, return 0.
 */
int
tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
{
	if (SEQ_LT(th->th_ack, tp->snd_last)) {
		/* Turn off retx. timer (will start again next segment) */
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
#ifndef TCP_FACK
		/*
		 * Partial window deflation.  This statement relies on the
		 * fact that tp->snd_una has not been updated yet.  In FACK
		 * hold snd_cwnd constant during fast recovery.
		 */
		if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
			tp->snd_cwnd -= th->th_ack - tp->snd_una;
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->snd_cwnd = tp->t_maxseg;
#endif
		return (1);
	}
	return (0);
}
#endif /* TCP_SACK */

/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 */
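/*
 * Illustrative note (hypothetical numbers): with a data offset "off"
 * of 40 bytes and an urgent pointer of 3, the urgent byte is the third
 * data octet, so cnt below becomes 40 + 3 - 1 = 42, the offset of that
 * octet within the mbuf chain.
 */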
void
tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off)
{
	int cnt = off + urgent - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == 0)
			break;
	}
	panic("tcp_pulloutofband");
}

/*
 * Collect new round-trip time estimate
 * and update averages and current timeout.
 */
void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
	short delta;
	short rttmin;

	if (rtt < 0)
		rtt = 0;
	else if (rtt > TCP_RTT_MAX)
		rtt = TCP_RTT_MAX;

	tcpstat.tcps_rttupdated++;
	if (tp->t_srtt != 0) {
		/*
		 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits
		 * after the binary point (scaled by 4), whereas
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 32).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).
		 */
		delta = (rtt << TCP_RTT_BASE_SHIFT) -
		    (tp->t_srtt >> TCP_RTT_SHIFT);
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT;
		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
		tp->t_rttvar = (rtt + 1) <<
		    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1);
	}
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX);
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);
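
	/*
	 * Worked example (hypothetical values): per the comment above
	 * the timer aims for rtt + 4 * rttvar, so with a smoothed rtt
	 * of 10 ticks and a smoothed variance of 2 ticks the result is
	 * about 18 ticks, clamped into [rttmin, TCPTV_REXMTMAX] by
	 * TCPT_RANGESET().
	 */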

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing
 * interface without forcing IP to fragment; if bigger than
 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 * to utilize large mbufs.  If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * Also take into account the space needed for options that we
 * send regularly.  Make maxseg shorter by that amount to assure
 * that we can send maxseg amount of data even when the options
 * are present.  Store the upper limit of the length of options plus
 * data in maxopd.
 *
 * NOTE: offer == -1 indicates that the maxseg size changed due to
 * Path MTU discovery.
 */
int
tcp_mss(struct tcpcb *tp, int offer)
{
	struct rtentry *rt;
	struct ifnet *ifp;
	int mss, mssopt;
	int iphlen;
	struct inpcb *inp;

	inp = tp->t_inpcb;

	mssopt = mss = tcp_mssdflt;

	rt = in_pcbrtentry(inp);

	if (rt == NULL)
		goto out;

	ifp = rt->rt_ifp;

	switch (tp->pf) {
#ifdef INET6
	case AF_INET6:
		iphlen = sizeof(struct ip6_hdr);
		break;
#endif
	case AF_INET:
		iphlen = sizeof(struct ip);
		break;
	default:
		/* the family does not support path MTU discovery */
		goto out;
	}

#ifdef RTV_MTU
	/*
	 * if there's an mtu associated with the route and we support
	 * path MTU discovery for the underlying protocol family, use it.
	 */
	if (rt->rt_rmx.rmx_mtu) {
		/*
		 * One may wish to lower MSS to take into account options,
		 * especially security-related options.
		 */
		if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
			/*
			 * RFC2460 section 5, last paragraph: if path MTU is
			 * smaller than 1280, use 1280 as packet size and
			 * attach fragment header.
			 */
			mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
			    sizeof(struct tcphdr);
		} else
			mss = rt->rt_rmx.rmx_mtu - iphlen -
			    sizeof(struct tcphdr);
	} else
#endif /* RTV_MTU */
	if (!ifp)
		/*
		 * ifp may be null and rmx_mtu may be zero in certain
		 * v6 cases (e.g., if ND wasn't able to resolve the
		 * destination host).
		 */
		goto out;
	else if (ifp->if_flags & IFF_LOOPBACK)
		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	else if (tp->pf == AF_INET) {
		if (ip_mtudisc)
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
		else if (inp && in_localaddr(inp->inp_faddr))
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	}
#ifdef INET6
	else if (tp->pf == AF_INET6) {
		/*
		 * for IPv6, path MTU discovery is always turned on,
		 * or the node must use packet size <= 1280.
		 */
		mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr);
	}
#endif /* INET6 */

	/* Calculate the value that we offer in TCPOPT_MAXSEG */
	if (offer != -1) {
#ifndef INET6
		mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
#else
		if (tp->pf == AF_INET6)
			mssopt = IN6_LINKMTU(ifp) - iphlen -
			    sizeof(struct tcphdr);
		else
			mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
#endif

		mssopt = max(tcp_mssdflt, mssopt);
	}

out:
	/*
	 * The current mss, t_maxseg, is initialized to the default value.
	 * If we compute a smaller value, reduce the current mss.
	 * If we compute a larger value, return it for use in sending
	 * a max seg size option, but don't store it for use
	 * unless we received an offer at least that large from peer.
	 *
	 * However, do not accept offers lower than the minimum of
	 * the interface MTU and 216.
	 */
	if (offer > 0)
		tp->t_peermss = offer;
	if (tp->t_peermss)
		mss = min(mss, max(tp->t_peermss, 216));

	/* sanity - at least max opt. space */
	mss = max(mss, 64);

	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
		mss -= TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE)
		mss -= TCPOLEN_SIGLEN;
#endif

	if (offer == -1) {
		/* mss changed due to Path MTU discovery */
		tp->t_flags &= ~TF_PMTUD_PEND;
		tp->t_pmtud_mtu_sent = 0;
		tp->t_pmtud_mss_acked = 0;
		if (mss < tp->t_maxseg) {
			/*
			 * Follow suggestion in RFC 2414 to reduce the
			 * congestion window by the ratio of the old
			 * segment size to the new segment size.
			 */
			tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
			    mss, mss);
		}
	} else if (tcp_do_rfc3390) {
		/* increase initial window */
		tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
	} else
		tp->snd_cwnd = mss;

	tp->t_maxseg = mss;

	return (offer != -1 ? mssopt : mss);
}

u_int
tcp_hdrsz(struct tcpcb *tp)
{
	u_int hlen;

	switch (tp->pf) {
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
	default:
		hlen = 0;
		break;
	}
	hlen += sizeof(struct tcphdr);

	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
		hlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE)
		hlen += TCPOLEN_SIGLEN;
#endif
	return (hlen);
}
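
/*
 * Worked example (hypothetical connection): for an IPv4 connection
 * with timestamps negotiated and no signature, tcp_hdrsz() returns
 * sizeof(struct ip) + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA,
 * i.e. 20 + 20 + 12 = 52 bytes.
 */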

/*
 * Set connection variables based on the effective MSS.
 * We are passed the TCPCB for the actual connection.  If we
 * are the server, we are called by the compressed state engine
 * when the 3-way handshake is complete.  If we are the client,
 * we are called when we receive the SYN,ACK from the server.
 *
 * NOTE: The t_maxseg value must be initialized in the TCPCB
 * before this routine is called!
 */
void
tcp_mss_update(struct tcpcb *tp)
{
	int mss;
	u_long bufsize;
	struct rtentry *rt;
	struct socket *so;

	so = tp->t_inpcb->inp_socket;
	mss = tp->t_maxseg;

	rt = in_pcbrtentry(tp->t_inpcb);

	if (rt == NULL)
		return;

	bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss) {
		mss = bufsize;
		/* Update t_maxseg and t_maxopd */
		tcp_mss(tp, mss);
	} else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_snd, bufsize);
	}

	bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_rcv, bufsize);
	}
}

#if defined (TCP_SACK)
/*
 * Checks for partial ack.  If partial ack arrives, force the retransmission
 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
 * 1.  By setting snd_nxt to th_ack, this forces the retransmission timer to
 * be started again.  If the ack advances at least to tp->snd_last, return 0.
 */
int
tcp_newreno(struct tcpcb *tp, struct tcphdr *th)
{
	if (SEQ_LT(th->th_ack, tp->snd_last)) {
		/*
		 * snd_una has not been updated and the socket send buffer
		 * not yet drained of the acked data, so we have to leave
		 * snd_una as it was to get the correct data offset in
		 * tcp_output().
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long ocwnd = tp->snd_cwnd;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
		tp->snd_nxt = th->th_ack;
		/*
		 * Set snd_cwnd to one segment beyond acknowledged offset
		 * (tp->snd_una not yet updated when this function is called)
		 */
		tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
		(void) tcp_output(tp);
		tp->snd_cwnd = ocwnd;
		if (SEQ_GT(onxt, tp->snd_nxt))
			tp->snd_nxt = onxt;
		/*
		 * Partial window deflation.  Relies on fact that tp->snd_una
		 * not updated yet.
		 */
		if (tp->snd_cwnd > th->th_ack - tp->snd_una)
			tp->snd_cwnd -= th->th_ack - tp->snd_una;
		else
			tp->snd_cwnd = 0;
		tp->snd_cwnd += tp->t_maxseg;

		return 1;
	}
	return 0;
}
#endif /* TCP_SACK */

int
tcp_mss_adv(struct ifnet *ifp, int af)
{
	int mss = 0;
	int iphlen;

	switch (af) {
	case AF_INET:
		if (ifp != NULL)
			mss = ifp->if_mtu;
		iphlen = sizeof(struct ip);
		break;
#ifdef INET6
	case AF_INET6:
		if (ifp != NULL)
			mss = IN6_LINKMTU(ifp);
		iphlen = sizeof(struct ip6_hdr);
		break;
#endif
	}
	mss = mss - iphlen - sizeof(struct tcphdr);
	return (max(mss, tcp_mssdflt));
}
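
/*
 * Illustrative example (hypothetical interface): for an AF_INET
 * interface with a 1500-byte MTU, tcp_mss_adv() advertises
 * 1500 - 20 - 20 = 1460, assuming tcp_mssdflt is not larger.
 */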

/*
 * TCP compressed state engine.  Currently used to hold compressed
 * state for SYN_RECEIVED.
 */

u_long	syn_cache_count;
u_int32_t syn_hash1, syn_hash2;

#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
	((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
#define	SYN_HASHALL(hash, src, dst) \
do { \
	hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
	    ((struct sockaddr_in *)(src))->sin_port, \
	    ((struct sockaddr_in *)(dst))->sin_port); \
} while (/*CONSTCOND*/ 0)
#else
#define SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
	(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
	& 0x7fffffff)

#define SYN_HASHALL(hash, src, dst) \
do { \
	switch ((src)->sa_family) { \
	case AF_INET: \
		hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
		    ((struct sockaddr_in *)(src))->sin_port, \
		    ((struct sockaddr_in *)(dst))->sin_port); \
		break; \
	case AF_INET6: \
		hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \
		    ((struct sockaddr_in6 *)(src))->sin6_port, \
		    ((struct sockaddr_in6 *)(dst))->sin6_port); \
		break; \
	default: \
		hash = 0; \
	} \
} while (/*CONSTCOND*/0)
#endif /* INET6 */

#define	SYN_CACHE_RM(sc) \
do { \
	(sc)->sc_flags |= SCF_DEAD; \
	TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \
	    (sc), sc_bucketq); \
	(sc)->sc_tp = NULL; \
	LIST_REMOVE((sc), sc_tpq); \
	tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \
	timeout_del(&(sc)->sc_timer); \
	syn_cache_count--; \
} while (/*CONSTCOND*/0)

#define	SYN_CACHE_PUT(sc) \
do { \
	if ((sc)->sc_ipopts) \
		(void) m_free((sc)->sc_ipopts); \
	if ((sc)->sc_route4.ro_rt != NULL) \
		RTFREE((sc)->sc_route4.ro_rt); \
	timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc)); \
	timeout_add(&(sc)->sc_timer, 0); \
} while (/*CONSTCOND*/0)

struct pool syn_cache_pool;

/*
 * We don't estimate RTT with SYNs, so each packet starts with the default
 * RTT and each timer step has a fixed timeout value.
 */
#define	SYN_CACHE_TIMER_ARM(sc) \
do { \
	TCPT_RANGESET((sc)->sc_rxtcur, \
	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
	    TCPTV_REXMTMAX); \
	if (!timeout_initialized(&(sc)->sc_timer)) \
		timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \
	timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
} while (/*CONSTCOND*/0)

#define	SYN_CACHE_TIMESTAMP(sc)	tcp_now + (sc)->sc_modulate

void
syn_cache_init()
{
	int i;

	/* Initialize the hash buckets. */
	for (i = 0; i < tcp_syn_cache_size; i++)
		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);

	/* Initialize the syn cache pool. */
	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
	    "synpl", NULL);
}

void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
	struct syn_cache_head *scp;
	struct syn_cache *sc2;
	int s;
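
	/*
	 * Explanatory note: the hash secrets below exist so that a
	 * remote attacker cannot predict which bucket a given address
	 * and port pair lands in and deliberately overflow it; they
	 * can only be rekeyed safely while the cache is empty.
	 */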
3358 */ 3359 if (syn_cache_count == 0) { 3360 syn_hash1 = arc4random(); 3361 syn_hash2 = arc4random(); 3362 } 3363 3364 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3365 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3366 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3367 3368 /* 3369 * Make sure that we don't overflow the per-bucket 3370 * limit or the total cache size limit. 3371 */ 3372 s = splsoftnet(); 3373 if (scp->sch_length >= tcp_syn_bucket_limit) { 3374 tcpstat.tcps_sc_bucketoverflow++; 3375 /* 3376 * The bucket is full. Toss the oldest element in the 3377 * bucket. This will be the first entry in the bucket. 3378 */ 3379 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3380 #ifdef DIAGNOSTIC 3381 /* 3382 * This should never happen; we should always find an 3383 * entry in our bucket. 3384 */ 3385 if (sc2 == NULL) 3386 panic("syn_cache_insert: bucketoverflow: impossible"); 3387 #endif 3388 SYN_CACHE_RM(sc2); 3389 SYN_CACHE_PUT(sc2); 3390 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3391 struct syn_cache_head *scp2, *sce; 3392 3393 tcpstat.tcps_sc_overflowed++; 3394 /* 3395 * The cache is full. Toss the oldest entry in the 3396 * first non-empty bucket we can find. 3397 * 3398 * XXX We would really like to toss the oldest 3399 * entry in the cache, but we hope that this 3400 * condition doesn't happen very often. 3401 */ 3402 scp2 = scp; 3403 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3404 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3405 for (++scp2; scp2 != scp; scp2++) { 3406 if (scp2 >= sce) 3407 scp2 = &tcp_syn_cache[0]; 3408 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3409 break; 3410 } 3411 #ifdef DIAGNOSTIC 3412 /* 3413 * This should never happen; we should always find a 3414 * non-empty bucket. 3415 */ 3416 if (scp2 == scp) 3417 panic("syn_cache_insert: cacheoverflow: " 3418 "impossible"); 3419 #endif 3420 } 3421 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3422 SYN_CACHE_RM(sc2); 3423 SYN_CACHE_PUT(sc2); 3424 } 3425 3426 /* 3427 * Initialize the entry's timer. 3428 */ 3429 sc->sc_rxttot = 0; 3430 sc->sc_rxtshift = 0; 3431 SYN_CACHE_TIMER_ARM(sc); 3432 3433 /* Link it from tcpcb entry */ 3434 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3435 3436 /* Put it into the bucket. */ 3437 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3438 scp->sch_length++; 3439 syn_cache_count++; 3440 3441 tcpstat.tcps_sc_added++; 3442 splx(s); 3443 } 3444 3445 /* 3446 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3447 * If we have retransmitted an entry the maximum number of times, expire 3448 * that entry. 3449 */ 3450 void 3451 syn_cache_timer(void *arg) 3452 { 3453 struct syn_cache *sc = arg; 3454 int s; 3455 3456 s = splsoftnet(); 3457 if (sc->sc_flags & SCF_DEAD) { 3458 splx(s); 3459 return; 3460 } 3461 3462 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3463 /* Drop it -- too many retransmissions. */ 3464 goto dropit; 3465 } 3466 3467 /* 3468 * Compute the total amount of time this entry has 3469 * been on a queue. If this entry has been on longer 3470 * than the keep alive timer would allow, expire it. 3471 */ 3472 sc->sc_rxttot += sc->sc_rxtcur; 3473 if (sc->sc_rxttot >= tcptv_keep_init) 3474 goto dropit; 3475 3476 tcpstat.tcps_sc_retransmitted++; 3477 (void) syn_cache_respond(sc, NULL); 3478 3479 /* Advance the timer back-off. 
	sc->sc_rxtshift++;
	SYN_CACHE_TIMER_ARM(sc);

	splx(s);
	return;

dropit:
	tcpstat.tcps_sc_timed_out++;
	SYN_CACHE_RM(sc);
	SYN_CACHE_PUT(sc);
	splx(s);
}

void
syn_cache_reaper(void *arg)
{
	struct syn_cache *sc = arg;
	int s;

	s = splsoftnet();
	pool_put(&syn_cache_pool, (sc));
	splx(s);
	return;
}

/*
 * Remove syn cache created by the specified tcb entry,
 * because this does not make sense to keep them
 * (if there's no tcb entry, syn cache entry will never be used)
 */
void
syn_cache_cleanup(struct tcpcb *tp)
{
	struct syn_cache *sc, *nsc;
	int s;

	s = splsoftnet();

	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
		nsc = LIST_NEXT(sc, sc_tpq);

#ifdef DIAGNOSTIC
		if (sc->sc_tp != tp)
			panic("invalid sc_tp in syn_cache_cleanup");
#endif
		SYN_CACHE_RM(sc);
		SYN_CACHE_PUT(sc);
	}
	/* just for safety */
	LIST_INIT(&tp->t_sc);

	splx(s);
}

/*
 * Find an entry in the syn cache.
 */
struct syn_cache *
syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
    struct syn_cache_head **headp)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	u_int32_t hash;
	int s;

	SYN_HASHALL(hash, src, dst);

	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
	*headp = scp;
	s = splsoftnet();
	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
	    sc = TAILQ_NEXT(sc, sc_bucketq)) {
		if (sc->sc_hash != hash)
			continue;
		if (!bcmp(&sc->sc_src, src, src->sa_len) &&
		    !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
			splx(s);
			return (sc);
		}
	}
	splx(s);
	return (NULL);
}

/*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state.  We look up the connection
 * in the syn cache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 *
 * The return values may not be immediately obvious, and their effects
 * can be subtle, so here they are:
 *
 *	NULL	SYN was not found in cache; caller should drop the
 *		packet and send an RST.
 *
 *	-1	We were unable to create the new connection, and are
 *		aborting it.  An ACK,RST is being sent to the peer
 *		(unless we got screwy sequence numbers; see below),
 *		because the 3-way handshake has been completed.  Caller
 *		should not free the mbuf, since we may be using it.  If
 *		we are not, we will free it.
 *
 * Otherwise, the return value is a pointer to the new socket
 * associated with the connection.
 */
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int hlen, u_int tlen, struct socket *so, struct mbuf *m)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = 0;
	struct mbuf *am;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return (NULL);
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.
	 */
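	/*
	 * Explanatory note: for the handshake to complete, the peer's
	 * ACK must cover exactly our ISS plus one (the SYN occupies one
	 * sequence number), and its sequence number must fall just past
	 * the IRS, within the window we advertised.
	 */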
3609 */ 3610 if ((th->th_ack != sc->sc_iss + 1) || 3611 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3612 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3613 (void) syn_cache_respond(sc, m); 3614 splx(s); 3615 return ((struct socket *)(-1)); 3616 } 3617 3618 /* Remove this cache entry */ 3619 SYN_CACHE_RM(sc); 3620 splx(s); 3621 3622 /* 3623 * Ok, create the full blown connection, and set things up 3624 * as they would have been set up if we had created the 3625 * connection when the SYN arrived. If we can't create 3626 * the connection, abort it. 3627 */ 3628 oso = so; 3629 so = sonewconn(so, SS_ISCONNECTED); 3630 if (so == NULL) 3631 goto resetandabort; 3632 3633 inp = sotoinpcb(oso); 3634 #ifdef IPSEC 3635 /* 3636 * We need to copy the required security levels 3637 * from the old pcb. Ditto for any other 3638 * IPsec-related information. 3639 */ 3640 { 3641 struct inpcb *newinp = (struct inpcb *)so->so_pcb; 3642 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3643 sizeof(inp->inp_seclevel)); 3644 newinp->inp_secrequire = inp->inp_secrequire; 3645 if (inp->inp_ipo != NULL) { 3646 newinp->inp_ipo = inp->inp_ipo; 3647 inp->inp_ipo->ipo_ref_count++; 3648 } 3649 if (inp->inp_ipsec_remotecred != NULL) { 3650 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 3651 inp->inp_ipsec_remotecred->ref_count++; 3652 } 3653 if (inp->inp_ipsec_remoteauth != NULL) { 3654 newinp->inp_ipsec_remoteauth 3655 = inp->inp_ipsec_remoteauth; 3656 inp->inp_ipsec_remoteauth->ref_count++; 3657 } 3658 } 3659 #endif /* IPSEC */ 3660 #ifdef INET6 3661 /* 3662 * inp still has the OLD in_pcb stuff, set the 3663 * v6-related flags on the new guy, too. 3664 */ 3665 { 3666 int flags = inp->inp_flags; 3667 struct inpcb *oldinpcb = inp; 3668 3669 inp = (struct inpcb *)so->so_pcb; 3670 inp->inp_flags |= (flags & INP_IPV6); 3671 if ((inp->inp_flags & INP_IPV6) != 0) { 3672 inp->inp_ipv6.ip6_hlim = 3673 oldinpcb->inp_ipv6.ip6_hlim; 3674 } 3675 } 3676 #else /* INET6 */ 3677 inp = (struct inpcb *)so->so_pcb; 3678 #endif /* INET6 */ 3679 3680 inp->inp_lport = th->th_dport; 3681 switch (src->sa_family) { 3682 #ifdef INET6 3683 case AF_INET6: 3684 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr; 3685 break; 3686 #endif /* INET6 */ 3687 case AF_INET: 3688 3689 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 3690 inp->inp_options = ip_srcroute(); 3691 if (inp->inp_options == NULL) { 3692 inp->inp_options = sc->sc_ipopts; 3693 sc->sc_ipopts = NULL; 3694 } 3695 break; 3696 } 3697 in_pcbrehash(inp); 3698 3699 /* 3700 * Give the new socket our cached route reference. 
3701 */ 3702 if (src->sa_family == AF_INET) 3703 inp->inp_route = sc->sc_route4; /* struct assignment */ 3704 #ifdef INET6 3705 else 3706 inp->inp_route6 = sc->sc_route6; 3707 #endif 3708 sc->sc_route4.ro_rt = NULL; 3709 3710 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3711 if (am == NULL) 3712 goto resetandabort; 3713 am->m_len = src->sa_len; 3714 bcopy(src, mtod(am, caddr_t), src->sa_len); 3715 3716 switch (src->sa_family) { 3717 case AF_INET: 3718 /* drop IPv4 packet to AF_INET6 socket */ 3719 if (inp->inp_flags & INP_IPV6) { 3720 (void) m_free(am); 3721 goto resetandabort; 3722 } 3723 if (in_pcbconnect(inp, am)) { 3724 (void) m_free(am); 3725 goto resetandabort; 3726 } 3727 break; 3728 #ifdef INET6 3729 case AF_INET6: 3730 if (in6_pcbconnect(inp, am)) { 3731 (void) m_free(am); 3732 goto resetandabort; 3733 } 3734 break; 3735 #endif 3736 } 3737 (void) m_free(am); 3738 3739 tp = intotcpcb(inp); 3740 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; 3741 if (sc->sc_request_r_scale != 15) { 3742 tp->requested_s_scale = sc->sc_requested_s_scale; 3743 tp->request_r_scale = sc->sc_request_r_scale; 3744 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3745 } 3746 if (sc->sc_flags & SCF_TIMESTAMP) 3747 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3748 3749 tp->t_template = tcp_template(tp); 3750 if (tp->t_template == 0) { 3751 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3752 so = NULL; 3753 m_freem(m); 3754 goto abort; 3755 } 3756 #ifdef TCP_SACK 3757 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3758 #endif 3759 3760 tp->ts_modulate = sc->sc_modulate; 3761 tp->iss = sc->sc_iss; 3762 tp->irs = sc->sc_irs; 3763 tcp_sendseqinit(tp); 3764 #if defined (TCP_SACK) || defined(TCP_ECN) 3765 tp->snd_last = tp->snd_una; 3766 #endif /* TCP_SACK */ 3767 #if defined(TCP_SACK) && defined(TCP_FACK) 3768 tp->snd_fack = tp->snd_una; 3769 tp->retran_data = 0; 3770 tp->snd_awnd = 0; 3771 #endif /* TCP_FACK */ 3772 #ifdef TCP_ECN 3773 if (sc->sc_flags & SCF_ECN_PERMIT) { 3774 tp->t_flags |= TF_ECN_PERMIT; 3775 tcpstat.tcps_ecn_accepts++; 3776 } 3777 #endif 3778 #ifdef TCP_SACK 3779 if (sc->sc_flags & SCF_SACK_PERMIT) 3780 tp->t_flags |= TF_SACK_PERMIT; 3781 #endif 3782 #ifdef TCP_SIGNATURE 3783 if (sc->sc_flags & SCF_SIGNATURE) 3784 tp->t_flags |= TF_SIGNATURE; 3785 #endif 3786 tcp_rcvseqinit(tp); 3787 tp->t_state = TCPS_SYN_RECEIVED; 3788 tp->t_rcvtime = tcp_now; 3789 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3790 tcpstat.tcps_accepts++; 3791 3792 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3793 if (sc->sc_peermaxseg) 3794 tcp_mss_update(tp); 3795 /* Reset initial window to 1 segment for retransmit */ 3796 if (sc->sc_rxtshift > 0) 3797 tp->snd_cwnd = tp->t_maxseg; 3798 tp->snd_wl1 = sc->sc_irs; 3799 tp->rcv_up = sc->sc_irs + 1; 3800 3801 /* 3802 * This is what whould have happened in tcp_output() when 3803 * the SYN,ACK was sent. 
3804 */ 3805 tp->snd_up = tp->snd_una; 3806 tp->snd_max = tp->snd_nxt = tp->iss+1; 3807 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3808 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3809 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3810 tp->last_ack_sent = tp->rcv_nxt; 3811 3812 tcpstat.tcps_sc_completed++; 3813 SYN_CACHE_PUT(sc); 3814 return (so); 3815 3816 resetandabort: 3817 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST); 3818 m_freem(m); 3819 abort: 3820 if (so != NULL) 3821 (void) soabort(so); 3822 SYN_CACHE_PUT(sc); 3823 tcpstat.tcps_sc_aborted++; 3824 return ((struct socket *)(-1)); 3825 } 3826 3827 /* 3828 * This function is called when we get a RST for a 3829 * non-existent connection, so that we can see if the 3830 * connection is in the syn cache. If it is, zap it. 3831 */ 3832 3833 void 3834 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th) 3835 { 3836 struct syn_cache *sc; 3837 struct syn_cache_head *scp; 3838 int s = splsoftnet(); 3839 3840 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3841 splx(s); 3842 return; 3843 } 3844 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3845 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3846 splx(s); 3847 return; 3848 } 3849 SYN_CACHE_RM(sc); 3850 splx(s); 3851 tcpstat.tcps_sc_reset++; 3852 SYN_CACHE_PUT(sc); 3853 } 3854 3855 void 3856 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th) 3857 { 3858 struct syn_cache *sc; 3859 struct syn_cache_head *scp; 3860 int s; 3861 3862 s = splsoftnet(); 3863 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3864 splx(s); 3865 return; 3866 } 3867 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3868 if (ntohl (th->th_seq) != sc->sc_iss) { 3869 splx(s); 3870 return; 3871 } 3872 3873 /* 3874 * If we've retransmitted 3 times and this is our second error, 3875 * we remove the entry. Otherwise, we allow it to continue on. 3876 * This prevents us from incorrectly nuking an entry during a 3877 * spurious network outage. 3878 * 3879 * See tcp_notify(). 3880 */ 3881 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3882 sc->sc_flags |= SCF_UNREACH; 3883 splx(s); 3884 return; 3885 } 3886 3887 SYN_CACHE_RM(sc); 3888 splx(s); 3889 tcpstat.tcps_sc_unreach++; 3890 SYN_CACHE_PUT(sc); 3891 } 3892 3893 /* 3894 * Given a LISTEN socket and an inbound SYN request, add 3895 * this to the syn cache, and send back a segment: 3896 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3897 * to the source. 3898 * 3899 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3900 * Doing so would require that we hold onto the data and deliver it 3901 * to the application. However, if we are the target of a SYN-flood 3902 * DoS attack, an attacker could send data which would eventually 3903 * consume all available buffer space if it were ACKed. By not ACKing 3904 * the data, we avoid this DoS scenario. 3905 */ 3906 3907 int 3908 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3909 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3910 struct tcp_opt_info *oi, tcp_seq *issp) 3911 { 3912 struct tcpcb tb, *tp; 3913 long win; 3914 struct syn_cache *sc; 3915 struct syn_cache_head *scp; 3916 struct mbuf *ipopts; 3917 3918 tp = sototcpcb(so); 3919 3920 /* 3921 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3922 * 3923 * Note this check is performed in tcp_input() very early on. 3924 */ 3925 3926 /* 3927 * Initialize some local state. 
3928 */ 3929 win = sbspace(&so->so_rcv); 3930 if (win > TCP_MAXWIN) 3931 win = TCP_MAXWIN; 3932 3933 #ifdef TCP_SIGNATURE 3934 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3935 #else 3936 if (optp) { 3937 #endif 3938 tb.pf = tp->pf; 3939 #ifdef TCP_SACK 3940 tb.sack_enable = tp->sack_enable; 3941 #endif 3942 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3943 #ifdef TCP_SIGNATURE 3944 if (tp->t_flags & TF_SIGNATURE) 3945 tb.t_flags |= TF_SIGNATURE; 3946 #endif 3947 tb.t_state = TCPS_LISTEN; 3948 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi)) 3949 return (0); 3950 } else 3951 tb.t_flags = 0; 3952 3953 switch (src->sa_family) { 3954 #ifdef INET 3955 case AF_INET: 3956 /* 3957 * Remember the IP options, if any. 3958 */ 3959 ipopts = ip_srcroute(); 3960 break; 3961 #endif 3962 default: 3963 ipopts = NULL; 3964 } 3965 3966 /* 3967 * See if we already have an entry for this connection. 3968 * If we do, resend the SYN,ACK. We do not count this 3969 * as a retransmission (XXX though maybe we should). 3970 */ 3971 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 3972 tcpstat.tcps_sc_dupesyn++; 3973 if (ipopts) { 3974 /* 3975 * If we were remembering a previous source route, 3976 * forget it and use the new one we've been given. 3977 */ 3978 if (sc->sc_ipopts) 3979 (void) m_free(sc->sc_ipopts); 3980 sc->sc_ipopts = ipopts; 3981 } 3982 sc->sc_timestamp = tb.ts_recent; 3983 if (syn_cache_respond(sc, m) == 0) { 3984 tcpstat.tcps_sndacks++; 3985 tcpstat.tcps_sndtotal++; 3986 } 3987 return (1); 3988 } 3989 3990 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 3991 if (sc == NULL) { 3992 if (ipopts) 3993 (void) m_free(ipopts); 3994 return (0); 3995 } 3996 3997 /* 3998 * Fill in the cache, and put the necessary IP and TCP 3999 * options into the reply. 4000 */ 4001 bzero(sc, sizeof(struct syn_cache)); 4002 bzero(&sc->sc_timer, sizeof(sc->sc_timer)); 4003 bcopy(src, &sc->sc_src, src->sa_len); 4004 bcopy(dst, &sc->sc_dst, dst->sa_len); 4005 sc->sc_flags = 0; 4006 sc->sc_ipopts = ipopts; 4007 sc->sc_irs = th->th_seq; 4008 4009 sc->sc_iss = issp ? *issp : arc4random(); 4010 sc->sc_peermaxseg = oi->maxseg; 4011 sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ? 4012 m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family); 4013 sc->sc_win = win; 4014 sc->sc_timestamp = tb.ts_recent; 4015 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 4016 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 4017 sc->sc_flags |= SCF_TIMESTAMP; 4018 sc->sc_modulate = arc4random(); 4019 } 4020 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 4021 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 4022 sc->sc_requested_s_scale = tb.requested_s_scale; 4023 sc->sc_request_r_scale = 0; 4024 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 4025 TCP_MAXWIN << sc->sc_request_r_scale < 4026 so->so_rcv.sb_hiwat) 4027 sc->sc_request_r_scale++; 4028 } else { 4029 sc->sc_requested_s_scale = 15; 4030 sc->sc_request_r_scale = 15; 4031 } 4032 #ifdef TCP_ECN 4033 /* 4034 * if both ECE and CWR flag bits are set, peer is ECN capable. 4035 */ 4036 if (tcp_do_ecn && 4037 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) 4038 sc->sc_flags |= SCF_ECN_PERMIT; 4039 #endif 4040 #ifdef TCP_SACK 4041 /* 4042 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option 4043 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		SYN_CACHE_PUT(sc);
		tcpstat.tcps_sc_dropped++;
	}
	return (1);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.  The incoming mbuf,
	 * if any, is not reused; free it and allocate a fresh one.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		/* Unreachable: the family was validated above. */
		th = NULL;
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */
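	/*
	 * Illustrative layout of the option block built below, assuming
	 * every option except the signature was negotiated:
	 *
	 *	MSS		kind 2, len 4			 4 bytes
	 *	SACK permitted	2 NOPs + kind 4, len 2		 4 bytes
	 *	window scale	NOP + kind 3, len 3		 4 bytes
	 *	timestamps	2 NOPs + kind 8, len 10		12 bytes
	 *
	 * for optlen = 24, matching the computation above; each option
	 * is kept 32-bit aligned so that th_off remains a whole number
	 * of words.
	 */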
	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
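	/*
	 * Note on the AF_INET checksum above: ip_len was temporarily
	 * set to the TCP length (tlen - hlen) while every other IP
	 * header field except ip_p, ip_src and ip_dst is still zero
	 * from the memset, so summing all tlen bytes makes the IP
	 * header double as the TCP pseudo-header for in_cksum().
	 * The real ip_len is filled in just below.
	 */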
	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in network byte order, as set above.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		/* XXX tos? */
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    (struct ip_moptions *)NULL, inp);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    (struct ip6_moptions *)0, NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
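/*
 * Note: a nonzero return from syn_cache_respond() propagates back to
 * syn_cache_add(), which then drops the half-open entry (see the
 * SYN_CACHE_PUT() and tcps_sc_dropped accounting above) rather than
 * caching a connection whose SYN,ACK was never sent.
 */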