/*	$OpenBSD: tcp_input.c,v 1.224 2008/11/02 10:37:29 claudio Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <dev/rndvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#include "faith.h"

#include "pf.h"
#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int tcp_mss_adv(struct ifnet *, int);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

extern u_long sb_max;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
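
/*
 * The casts above rely on two's complement wraparound of the 32-bit
 * difference: e.g., for a = 0x00000010 and b = 0xfffffff0,
 * (int)(a - b) == 0x20 > 0, so TSTMP_GEQ(a, b) holds even though
 * a < b as unsigned values.  This orders values correctly as long as
 * they are less than 2^31 apart.
 */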

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define	TCP_SETUP_ACK(tp, tiflags) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)
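
/*
 * Example: for two back-to-back in-order data segments, the first one
 * sets TF_DELACK via TCP_SET_DELACK(); the second one then sees
 * TF_DELACK and forces TF_ACKNOW, so at most every other segment goes
 * unacknowledged, satisfying RFC 1122's ack-at-least-every-second-
 * segment requirement.
 */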

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after becoming established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	return (flags);
}
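
/*
 * Worked example for the trimming logic above: with a queued segment
 * covering [100,200) and an arriving segment covering [150,250), the
 * preceding-segment check computes i = 100 + 100 - 150 = 50 and drops
 * the first 50 bytes of the new segment, which is then queued as
 * [200,250).  Had the new segment been [120,180), i >= *tlen would
 * hold and the whole segment would be counted as a duplicate and
 * freed.
 */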

#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

#if NFAITH > 0
	if (m->m_pkthdr.rcvif) {
		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
			/* XXX send icmp6 host/port unreach? */
			m_freem(m);
			return IPPROTO_DONE;
		}
	}
#endif

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = 0;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error, s;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 * See below for AF specific multicast.
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		if (IN_MULTICAST(ip->ip_dst.s_addr) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
			goto drop;
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		/*
		 * Checksum extended TCP header and data.
		 */
		if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
			if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
				tcpstat.tcps_inhwcsum++;
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
			if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
		} else {
			m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
			tcpstat.tcps_inhwcsum++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Checksum extended TCP header and data.
		 */
		if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
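
	/*
	 * TCPOPT_TSTAMP_HDR is the RFC 1323 appendix A layout: two NOPs
	 * followed by the timestamp kind and length, i.e. the single
	 * 32-bit word 0x0101080a.  When the option block matches it, the
	 * two timestamps sit at fixed offsets 4 and 8, which is what the
	 * fast path above reads directly.
	 */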
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	if (m->m_pkthdr.pf.statekey)
		inp = ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->inp;
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport);
			break;
		}
#if NPF > 0
		if (m->m_pkthdr.pf.statekey && inp) {
			((struct pf_state_key *)m->m_pkthdr.pf.statekey)->inp =
			    inp;
			inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
		}
#endif
	}
	if (inp == NULL) {
		int	inpl_flags = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_flags = INPLOOKUP_WILDCARD;
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_flags, m);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_flags, m);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		goto drop;

	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
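
	/*
	 * Example: with snd_scale = 7, an advertised th_win of 0x8000
	 * yields tiwin = 0x8000 << 7 = 4 MB.  The shift is skipped on SYN
	 * segments because the scale factor is not in effect until both
	 * sides have offered the window scale option (RFC 1323).
	 */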

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				if (tiflags & TH_RST) {
					syn_cache_reset(&src.sa, &dst.sa, th);
				} else if ((tiflags & (TH_ACK|TH_SYN)) ==
				    (TH_ACK|TH_SYN)) {
					/*
					 * Received a SYN,ACK.  This should
					 * never happen while we are in
					 * LISTEN.  Send an RST.
					 */
					goto badsyn;
				} else if (tiflags & TH_ACK) {
					so = syn_cache_get(&src.sa, &dst.sa,
					    th, iphlen, tlen, so, m);
					if (so == NULL) {
						/*
						 * We don't have a SYN for
						 * this ACK; send an RST.
						 */
						goto badsyn;
					} else if (so ==
					    (struct socket *)(-1)) {
						/*
						 * We were unable to create
						 * the connection.  If the
						 * 3-way handshake was
						 * completed, an RST has
						 * been sent to the peer.
						 * Since the mbuf might be
						 * in use for the reply,
						 * do not free it.
						 */
						m = NULL;
					} else {
						/*
						 * We have created a
						 * full-blown connection.
						 */
						tp = NULL;
						inp = (struct inpcb *)so->so_pcb;
						tp = intotcpcb(inp);
						if (tp == NULL)
							goto badsyn;	/*XXX*/

						/*
						 * Compute proper scaling
						 * value from buffer space
						 */
						tcp_rscale(tp, so->so_rcv.sb_hiwat);
						goto after_listen;
					}
				} else {
					/*
					 * None of RST, SYN or ACK was set.
					 * This is an invalid packet for a
					 * TCB in LISTEN state.  Send a RST.
					 */
					goto badsyn;
				}
			} else {
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise because it
				 * is much better for the peer to receive an
				 * RST, and the RST will be the final packet
				 * for the exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says
				 * roughly this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;

					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
						tp = NULL;
						goto dropwithreset;
					}
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen <= so->so_qlimit &&
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse))
					m = NULL;
			}
			goto drop;
		}
	}

after_listen:
#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	if (m->m_pkthdr.pf.statekey) {
		((struct pf_state_key *)m->m_pkthdr.pf.statekey)->inp =
		    inp;
		inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
	}
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	s = splnet();
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp);
	if (error) {
		splx(s);
		goto drop;
	}

	/* Latch SA */
	if (inp->inp_tdb_in != tdb) {
		if (tdb) {
			tdb_add_inp(tdb, inp, 1);
			if (inp->inp_ipo == NULL) {
				inp->inp_ipo = ipsec_add_policy(inp, af,
				    IPSP_DIRECTION_OUT);
				if (inp->inp_ipo == NULL) {
					splx(s);
					goto drop;
				}
			}
			if (inp->inp_ipo->ipo_dstid == NULL &&
			    tdb->tdb_srcid != NULL) {
				inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
				tdb->tdb_srcid->ref_count++;
			}
			if (inp->inp_ipsec_remotecred == NULL &&
			    tdb->tdb_remote_cred != NULL) {
				inp->inp_ipsec_remotecred =
				    tdb->tdb_remote_cred;
				tdb->tdb_remote_cred->ref_count++;
			}
			if (inp->inp_ipsec_remoteauth == NULL &&
			    tdb->tdb_remote_auth != NULL) {
				inp->inp_ipsec_remoteauth =
				    tdb->tdb_remote_auth;
				tdb->tdb_remote_auth->ref_count++;
			}
		} else { /* Just reset */
			TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
			    inp_tdb_in_next);
			inp->inp_tdb_in = NULL;
		}
	}
	splx(s);
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}
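
	/*
	 * Example: if tcp_now is 1000 and the echoed ts_ecr (after
	 * removing ts_modulate) claims 1500, rtt_test is negative and
	 * the echo is ignored rather than producing a bogus RTT sample;
	 * likewise for echoes older than TCP_RTT_MAX.
	 */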

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
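	/*
	 * Concretely, the tests below require: ESTABLISHED state, a bare
	 * ACK (plus, with TCP_ECN, no ECE/CWR), a timestamp that does
	 * not step backwards, th_seq == rcv_nxt (in sequence), an
	 * unchanged non-zero window, and snd_nxt == snd_max (nothing
	 * being retransmitted).  Anything else falls through to the full
	 * processing path below.
	 */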
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				if (sb_notify(&so->so_snd))
					sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			sorwakeup(so);
			TCP_SETUP_ACK(tp, tiflags);
			if (tp->t_flags & TF_ACKNOW)
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}
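
	/*
	 * Example: with 8 KB free in the receive buffer but rcv_adv
	 * 16 KB beyond rcv_nxt, the window is held at 16 KB rather than
	 * being retracted; a window already advertised to the peer is
	 * never taken back (shrinking the window is discouraged by
	 * RFC 1122, 4.2.2.16).
	 */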

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 * 	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ACK|TH_ECE) ||
			    (tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ECE|TH_CWR)) {
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			(void) tcp_reass(tp, (struct tcphdr *)0,
			    (struct mbuf *)0, &tlen);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than ts_recent, drop it.
	 */
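	/*
	 * PAWS example: if ts_recent is 5000 and a segment arrives
	 * carrying ts_val 4000, TSTMP_LT(4000, 5000) is true and the
	 * segment is dropped (and acked) as a leftover from a previous
	 * incarnation of the connection, unless ts_recent is older than
	 * 24 days and therefore invalidated below.
	 */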
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = tlen;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * Cf fix from Braden, see Stevens p. 870
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
		    ((tiflags & (TH_SYN|TH_FIN)) != 0)))
			tp->ts_recent = opti.ts_val;
		else
			tp->ts_recent = 0;
		tp->ts_recent_age = tcp_now;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat.tcps_drops++;
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat.tcps_connects++;
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		(void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0,
		    &tlen);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat.tcps_cwr_ecn++;
				}
			}
			tcpstat.tcps_ecn_rcvece++;
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat.tcps_ecn_rcvcwr++;
		}
#endif /* TCP_ECN */
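
		/*
		 * ECE example: with snd_wnd = snd_cwnd = 10 segments,
		 * win = 10, so ssthresh and cwnd drop to 5 segments,
		 * snd_last is advanced to snd_max, and TF_SEND_CWR makes
		 * the next outgoing segment carry CWR to tell the peer
		 * the window has been reduced.
		 */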
		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat.tcps_rcvacktooold++;
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#ifdef TCP_SACK
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat.tcps_cwr_frecovery++;
						tcpstat.tcps_sack_recovery_episode++;
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						   tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat.tcps_cwr_frecovery++;
					tcpstat.tcps_sndrexmitfast++;
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						needoutput = 1;
#else
					tp->snd_cwnd += tp->t_maxseg;
					needoutput = 1;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			needoutput = 1;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
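
		/*
		 * Example: in slow start (cw <= ssthresh) each ACK grows
		 * cwnd by a full t_maxseg.  In congestion avoidance with
		 * t_maxseg = 1460 and cw = 14600 (10 segments),
		 * incr = 1460 * 1460 / 14600 = 146, i.e. roughly one
		 * extra segment per round trip (one cwnd's worth of ACKs).
		 */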
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}
		if (sb_notify(&so->so_snd))
			sowwakeup(so);

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
				tp->snd_fack) + tp->retran_data;
		}
#endif

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}
1951  */
1952 	if ((tiflags & TH_ACK) &&
1953 	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
1954 	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
1955 	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
1956 		/* keep track of pure window updates */
1957 		if (tlen == 0 &&
1958 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1959 			tcpstat.tcps_rcvwinupd++;
1960 		tp->snd_wnd = tiwin;
1961 		tp->snd_wl1 = th->th_seq;
1962 		tp->snd_wl2 = th->th_ack;
1963 		if (tp->snd_wnd > tp->max_sndwnd)
1964 			tp->max_sndwnd = tp->snd_wnd;
1965 		needoutput = 1;
1966 	}
1967 
1968 	/*
1969 	 * Process segments with URG.
1970 	 */
1971 	if ((tiflags & TH_URG) && th->th_urp &&
1972 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1973 		/*
1974 		 * This is a kludge, but if we receive and accept
1975 		 * random urgent pointers, we'll crash in
1976 		 * soreceive.  It's hard to imagine someone
1977 		 * actually wanting to send this much urgent data.
1978 		 */
1979 		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
1980 			th->th_urp = 0;			/* XXX */
1981 			tiflags &= ~TH_URG;		/* XXX */
1982 			goto dodata;			/* XXX */
1983 		}
1984 		/*
1985 		 * If this segment advances the known urgent pointer,
1986 		 * then mark the data stream.  This should not happen
1987 		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1988 		 * a FIN has been received from the remote side.
1989 		 * In these states we ignore the URG.
1990 		 *
1991 		 * According to RFC961 (Assigned Protocols),
1992 		 * the urgent pointer points to the last octet
1993 		 * of urgent data.  We continue, however,
1994 		 * to consider it to indicate the first octet
1995 		 * of data past the urgent section as the original
1996 		 * spec states (in one of two places).
1997 		 */
1998 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
1999 			tp->rcv_up = th->th_seq + th->th_urp;
2000 			so->so_oobmark = so->so_rcv.sb_cc +
2001 			    (tp->rcv_up - tp->rcv_nxt) - 1;
2002 			if (so->so_oobmark == 0)
2003 				so->so_state |= SS_RCVATMARK;
2004 			sohasoutofband(so);
2005 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2006 		}
2007 		/*
2008 		 * Remove out of band data so it doesn't get presented
2009 		 * to the user.  This can happen independent of advancing
2010 		 * the URG pointer, but if two URG's are pending at once,
2011 		 * some out-of-band data may creep in... ick.
2012 		 */
2013 		if (th->th_urp <= (u_int16_t) tlen
2014 #ifdef SO_OOBINLINE
2015 		     && (so->so_options & SO_OOBINLINE) == 0
2016 #endif
2017 		     )
2018 			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
2019 	} else
2020 		/*
2021 		 * If no out of band data is expected,
2022 		 * pull receive urgent pointer along
2023 		 * with the receive window.
2024 		 */
2025 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2026 			tp->rcv_up = tp->rcv_nxt;
2027 dodata:							/* XXX */
2028 
2029 	/*
2030 	 * Process the segment text, merging it into the TCP sequencing queue,
2031 	 * and arranging for acknowledgment of receipt if necessary.
2032 	 * This process logically involves adjusting tp->rcv_wnd as data
2033 	 * is presented to the user (this happens in tcp_usrreq.c,
2034 	 * case PRU_RCVD).  If a FIN has already been received on this
2035 	 * connection then we just ignore the text.
2036 */ 2037 if ((tlen || (tiflags & TH_FIN)) && 2038 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2039 #ifdef TCP_SACK 2040 tcp_seq laststart = th->th_seq; 2041 tcp_seq lastend = th->th_seq + tlen; 2042 #endif 2043 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2044 tp->t_state == TCPS_ESTABLISHED) { 2045 TCP_SETUP_ACK(tp, tiflags); 2046 tp->rcv_nxt += tlen; 2047 tiflags = th->th_flags & TH_FIN; 2048 tcpstat.tcps_rcvpack++; 2049 tcpstat.tcps_rcvbyte += tlen; 2050 ND6_HINT(tp); 2051 if (so->so_state & SS_CANTRCVMORE) 2052 m_freem(m); 2053 else { 2054 m_adj(m, hdroptlen); 2055 sbappendstream(&so->so_rcv, m); 2056 } 2057 sorwakeup(so); 2058 } else { 2059 m_adj(m, hdroptlen); 2060 tiflags = tcp_reass(tp, th, m, &tlen); 2061 tp->t_flags |= TF_ACKNOW; 2062 } 2063 #ifdef TCP_SACK 2064 if (tp->sack_enable) 2065 tcp_update_sack_list(tp, laststart, lastend); 2066 #endif 2067 2068 /* 2069 * variable len never referenced again in modern BSD, 2070 * so why bother computing it ?? 2071 */ 2072 #if 0 2073 /* 2074 * Note the amount of data that peer has sent into 2075 * our window, in order to estimate the sender's 2076 * buffer size. 2077 */ 2078 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2079 #endif /* 0 */ 2080 } else { 2081 m_freem(m); 2082 tiflags &= ~TH_FIN; 2083 } 2084 2085 /* 2086 * If FIN is received ACK the FIN and let the user know 2087 * that the connection is closing. Ignore a FIN received before 2088 * the connection is fully established. 2089 */ 2090 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2091 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2092 socantrcvmore(so); 2093 tp->t_flags |= TF_ACKNOW; 2094 tp->rcv_nxt++; 2095 } 2096 switch (tp->t_state) { 2097 2098 /* 2099 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2100 */ 2101 case TCPS_ESTABLISHED: 2102 tp->t_state = TCPS_CLOSE_WAIT; 2103 break; 2104 2105 /* 2106 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2107 * enter the CLOSING state. 2108 */ 2109 case TCPS_FIN_WAIT_1: 2110 tp->t_state = TCPS_CLOSING; 2111 break; 2112 2113 /* 2114 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2115 * starting the time-wait timer, turning off the other 2116 * standard timers. 2117 */ 2118 case TCPS_FIN_WAIT_2: 2119 tp->t_state = TCPS_TIME_WAIT; 2120 tcp_canceltimers(tp); 2121 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2122 soisdisconnected(so); 2123 break; 2124 2125 /* 2126 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2127 */ 2128 case TCPS_TIME_WAIT: 2129 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2130 break; 2131 } 2132 } 2133 if (so->so_options & SO_DEBUG) { 2134 switch (tp->pf) { 2135 #ifdef INET6 2136 case PF_INET6: 2137 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2138 0, tlen); 2139 break; 2140 #endif /* INET6 */ 2141 case PF_INET: 2142 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2143 0, tlen); 2144 break; 2145 } 2146 } 2147 2148 /* 2149 * Return any desired output. 2150 */ 2151 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 2152 (void) tcp_output(tp); 2153 } 2154 return; 2155 2156 badsyn: 2157 /* 2158 * Received a bad SYN. Increment counters and dropwithreset. 2159 */ 2160 tcpstat.tcps_badsyn++; 2161 tp = NULL; 2162 goto dropwithreset; 2163 2164 dropafterack_ratelim: 2165 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2166 tcp_ackdrop_ppslim) == 0) { 2167 /* XXX stat */ 2168 goto drop; 2169 } 2170 /* ...fall into dropafterack... 
*/ 2171 2172 dropafterack: 2173 /* 2174 * Generate an ACK dropping incoming segment if it occupies 2175 * sequence space, where the ACK reflects our state. 2176 */ 2177 if (tiflags & TH_RST) 2178 goto drop; 2179 m_freem(m); 2180 tp->t_flags |= TF_ACKNOW; 2181 (void) tcp_output(tp); 2182 return; 2183 2184 dropwithreset_ratelim: 2185 /* 2186 * We may want to rate-limit RSTs in certain situations, 2187 * particularly if we are sending an RST in response to 2188 * an attempt to connect to or otherwise communicate with 2189 * a port for which we have no socket. 2190 */ 2191 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2192 tcp_rst_ppslim) == 0) { 2193 /* XXX stat */ 2194 goto drop; 2195 } 2196 /* ...fall into dropwithreset... */ 2197 2198 dropwithreset: 2199 /* 2200 * Generate a RST, dropping incoming segment. 2201 * Make ACK acceptable to originator of segment. 2202 * Don't bother to respond to RST. 2203 */ 2204 if (tiflags & TH_RST) 2205 goto drop; 2206 if (tiflags & TH_ACK) { 2207 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2208 TH_RST); 2209 } else { 2210 if (tiflags & TH_SYN) 2211 tlen++; 2212 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2213 (tcp_seq)0, TH_RST|TH_ACK); 2214 } 2215 m_freem(m); 2216 return; 2217 2218 drop: 2219 /* 2220 * Drop space held by incoming segment and return. 2221 */ 2222 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2223 switch (tp->pf) { 2224 #ifdef INET6 2225 case PF_INET6: 2226 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2227 0, tlen); 2228 break; 2229 #endif /* INET6 */ 2230 case PF_INET: 2231 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2232 0, tlen); 2233 break; 2234 } 2235 } 2236 2237 m_freem(m); 2238 return; 2239 } 2240 2241 int 2242 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2243 struct mbuf *m, int iphlen, struct tcp_opt_info *oi) 2244 { 2245 u_int16_t mss = 0; 2246 int opt, optlen; 2247 #ifdef TCP_SIGNATURE 2248 caddr_t sigp = NULL; 2249 struct tdb *tdb = NULL; 2250 #endif /* TCP_SIGNATURE */ 2251 2252 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2253 opt = cp[0]; 2254 if (opt == TCPOPT_EOL) 2255 break; 2256 if (opt == TCPOPT_NOP) 2257 optlen = 1; 2258 else { 2259 if (cnt < 2) 2260 break; 2261 optlen = cp[1]; 2262 if (optlen < 2 || optlen > cnt) 2263 break; 2264 } 2265 switch (opt) { 2266 2267 default: 2268 continue; 2269 2270 case TCPOPT_MAXSEG: 2271 if (optlen != TCPOLEN_MAXSEG) 2272 continue; 2273 if (!(th->th_flags & TH_SYN)) 2274 continue; 2275 if (TCPS_HAVERCVDSYN(tp->t_state)) 2276 continue; 2277 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 2278 NTOHS(mss); 2279 oi->maxseg = mss; 2280 break; 2281 2282 case TCPOPT_WINDOW: 2283 if (optlen != TCPOLEN_WINDOW) 2284 continue; 2285 if (!(th->th_flags & TH_SYN)) 2286 continue; 2287 if (TCPS_HAVERCVDSYN(tp->t_state)) 2288 continue; 2289 tp->t_flags |= TF_RCVD_SCALE; 2290 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2291 break; 2292 2293 case TCPOPT_TIMESTAMP: 2294 if (optlen != TCPOLEN_TIMESTAMP) 2295 continue; 2296 oi->ts_present = 1; 2297 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2298 NTOHL(oi->ts_val); 2299 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2300 NTOHL(oi->ts_ecr); 2301 2302 if (!(th->th_flags & TH_SYN)) 2303 continue; 2304 if (TCPS_HAVERCVDSYN(tp->t_state)) 2305 continue; 2306 /* 2307 * A timestamp received in a SYN makes 2308 * it ok to send timestamp requests and replies. 
2309 */ 2310 tp->t_flags |= TF_RCVD_TSTMP; 2311 tp->ts_recent = oi->ts_val; 2312 tp->ts_recent_age = tcp_now; 2313 break; 2314 2315 #ifdef TCP_SACK 2316 case TCPOPT_SACK_PERMITTED: 2317 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2318 continue; 2319 if (!(th->th_flags & TH_SYN)) 2320 continue; 2321 if (TCPS_HAVERCVDSYN(tp->t_state)) 2322 continue; 2323 /* MUST only be set on SYN */ 2324 tp->t_flags |= TF_SACK_PERMIT; 2325 break; 2326 case TCPOPT_SACK: 2327 tcp_sack_option(tp, th, cp, optlen); 2328 break; 2329 #endif 2330 #ifdef TCP_SIGNATURE 2331 case TCPOPT_SIGNATURE: 2332 if (optlen != TCPOLEN_SIGNATURE) 2333 continue; 2334 2335 if (sigp && bcmp(sigp, cp + 2, 16)) 2336 return (-1); 2337 2338 sigp = cp + 2; 2339 break; 2340 #endif /* TCP_SIGNATURE */ 2341 } 2342 } 2343 2344 #ifdef TCP_SIGNATURE 2345 if (tp->t_flags & TF_SIGNATURE) { 2346 union sockaddr_union src, dst; 2347 2348 memset(&src, 0, sizeof(union sockaddr_union)); 2349 memset(&dst, 0, sizeof(union sockaddr_union)); 2350 2351 switch (tp->pf) { 2352 case 0: 2353 #ifdef INET 2354 case AF_INET: 2355 src.sa.sa_len = sizeof(struct sockaddr_in); 2356 src.sa.sa_family = AF_INET; 2357 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2358 dst.sa.sa_len = sizeof(struct sockaddr_in); 2359 dst.sa.sa_family = AF_INET; 2360 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2361 break; 2362 #endif 2363 #ifdef INET6 2364 case AF_INET6: 2365 src.sa.sa_len = sizeof(struct sockaddr_in6); 2366 src.sa.sa_family = AF_INET6; 2367 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2368 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2369 dst.sa.sa_family = AF_INET6; 2370 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2371 break; 2372 #endif /* INET6 */ 2373 } 2374 2375 tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP); 2376 2377 /* 2378 * We don't have an SA for this peer, so we turn off 2379 * TF_SIGNATURE on the listen socket 2380 */ 2381 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2382 tp->t_flags &= ~TF_SIGNATURE; 2383 2384 } 2385 2386 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2387 tcpstat.tcps_rcvbadsig++; 2388 return (-1); 2389 } 2390 2391 if (sigp) { 2392 char sig[16]; 2393 2394 if (tdb == NULL) { 2395 tcpstat.tcps_rcvbadsig++; 2396 return (-1); 2397 } 2398 2399 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2400 return (-1); 2401 2402 if (bcmp(sig, sigp, 16)) { 2403 tcpstat.tcps_rcvbadsig++; 2404 return (-1); 2405 } 2406 2407 tcpstat.tcps_rcvgoodsig++; 2408 } 2409 #endif /* TCP_SIGNATURE */ 2410 2411 return (0); 2412 } 2413 2414 #if defined(TCP_SACK) 2415 u_long 2416 tcp_seq_subtract(u_long a, u_long b) 2417 { 2418 return ((long)(a - b)); 2419 } 2420 #endif 2421 2422 2423 #ifdef TCP_SACK 2424 /* 2425 * This function is called upon receipt of new valid data (while not in header 2426 * prediction mode), and it updates the ordered list of sacks. 2427 */ 2428 void 2429 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2430 tcp_seq rcv_lastend) 2431 { 2432 /* 2433 * First reported block MUST be the most recent one. Subsequent 2434 * blocks SHOULD be in the order in which they arrived at the 2435 * receiver. These two conditions make the implementation fully 2436 * compliant with RFC 2018. 
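	 *
	 * Illustration (hypothetical sequence numbers): with blocks
	 * 5-10 and 15-20 queued, newly arrived data 12-15 is coalesced
	 * with 15-20 and reported first as a single block 12-20,
	 * followed by 5-10.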
2437 */ 2438 int i, j = 0, count = 0, lastpos = -1; 2439 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2440 2441 /* First clean up current list of sacks */ 2442 for (i = 0; i < tp->rcv_numsacks; i++) { 2443 sack = tp->sackblks[i]; 2444 if (sack.start == 0 && sack.end == 0) { 2445 count++; /* count = number of blocks to be discarded */ 2446 continue; 2447 } 2448 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2449 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2450 count++; 2451 } else { 2452 temp[j].start = tp->sackblks[i].start; 2453 temp[j++].end = tp->sackblks[i].end; 2454 } 2455 } 2456 tp->rcv_numsacks -= count; 2457 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2458 tcp_clean_sackreport(tp); 2459 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2460 /* ==> need first sack block */ 2461 tp->sackblks[0].start = rcv_laststart; 2462 tp->sackblks[0].end = rcv_lastend; 2463 tp->rcv_numsacks = 1; 2464 } 2465 return; 2466 } 2467 /* Otherwise, sack blocks are already present. */ 2468 for (i = 0; i < tp->rcv_numsacks; i++) 2469 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2470 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2471 return; /* sack list remains unchanged */ 2472 /* 2473 * From here, segment just received should be (part of) the 1st sack. 2474 * Go through list, possibly coalescing sack block entries. 2475 */ 2476 firstsack.start = rcv_laststart; 2477 firstsack.end = rcv_lastend; 2478 for (i = 0; i < tp->rcv_numsacks; i++) { 2479 sack = tp->sackblks[i]; 2480 if (SEQ_LT(sack.end, firstsack.start) || 2481 SEQ_GT(sack.start, firstsack.end)) 2482 continue; /* no overlap */ 2483 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2484 /* 2485 * identical block; delete it here since we will 2486 * move it to the front of the list. 2487 */ 2488 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2489 lastpos = i; /* last posn with a zero entry */ 2490 continue; 2491 } 2492 if (SEQ_LEQ(sack.start, firstsack.start)) 2493 firstsack.start = sack.start; /* merge blocks */ 2494 if (SEQ_GEQ(sack.end, firstsack.end)) 2495 firstsack.end = sack.end; /* merge blocks */ 2496 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2497 lastpos = i; /* last posn with a zero entry */ 2498 } 2499 if (lastpos != -1) { /* at least one merge */ 2500 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2501 sack = tp->sackblks[i]; 2502 if (sack.start == 0 && sack.end == 0) 2503 continue; 2504 temp[j++] = sack; 2505 } 2506 tp->rcv_numsacks = j; /* including first blk (added later) */ 2507 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2508 tp->sackblks[i] = temp[i]; 2509 } else { /* no merges -- shift sacks by 1 */ 2510 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2511 tp->rcv_numsacks++; 2512 for (i = tp->rcv_numsacks-1; i > 0; i--) 2513 tp->sackblks[i] = tp->sackblks[i-1]; 2514 } 2515 tp->sackblks[0] = firstsack; 2516 return; 2517 } 2518 2519 /* 2520 * Process the TCP SACK option. tp->snd_holes is an ordered list 2521 * of holes (oldest to newest, in terms of the sequence space). 2522 */ 2523 void 2524 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2525 { 2526 int tmp_olen; 2527 u_char *tmp_cp; 2528 struct sackhole *cur, *p, *temp; 2529 2530 if (!tp->sack_enable) 2531 return; 2532 /* SACK without ACK doesn't make sense. */ 2533 if ((th->th_flags & TH_ACK) == 0) 2534 return; 2535 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
 */
2536 	if (SEQ_LT(th->th_ack, tp->snd_una) ||
2537 	    SEQ_GT(th->th_ack, tp->snd_max))
2538 		return;
2539 	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
2540 	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2541 		return;
2543 	tmp_cp = cp + 2;
2544 	tmp_olen = optlen - 2;
2545 	tcpstat.tcps_sack_rcv_opts++;
2546 	if (tp->snd_numholes < 0)
2547 		tp->snd_numholes = 0;
2548 	if (tp->t_maxseg == 0)
2549 		panic("tcp_sack_option"); /* Should never happen */
2550 	while (tmp_olen > 0) {
2551 		struct sackblk sack;
2552 
2553 		bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
2554 		NTOHL(sack.start);
2555 		bcopy(tmp_cp + sizeof(tcp_seq),
2556 		    (char *) &(sack.end), sizeof(tcp_seq));
2557 		NTOHL(sack.end);
2558 		tmp_olen -= TCPOLEN_SACK;
2559 		tmp_cp += TCPOLEN_SACK;
2560 		if (SEQ_LEQ(sack.end, sack.start))
2561 			continue; /* bad SACK fields */
2562 		if (SEQ_LEQ(sack.end, tp->snd_una))
2563 			continue; /* old block */
2564 #if defined(TCP_SACK) && defined(TCP_FACK)
2565 		/* Updates snd_fack. */
2566 		if (SEQ_GT(sack.end, tp->snd_fack))
2567 			tp->snd_fack = sack.end;
2568 #endif /* TCP_FACK */
2569 		if (SEQ_GT(th->th_ack, tp->snd_una)) {
2570 			if (SEQ_LT(sack.start, th->th_ack))
2571 				continue;
2572 		}
2573 		if (SEQ_GT(sack.end, tp->snd_max))
2574 			continue;
2575 		if (tp->snd_holes == NULL) { /* first hole */
2576 			tp->snd_holes = (struct sackhole *)
2577 			    pool_get(&sackhl_pool, PR_NOWAIT);
2578 			if (tp->snd_holes == NULL) {
2579 				/* ENOBUFS, so ignore SACKed block for now */
2580 				goto done;
2581 			}
2582 			cur = tp->snd_holes;
2583 			cur->start = th->th_ack;
2584 			cur->end = sack.start;
2585 			cur->rxmit = cur->start;
2586 			cur->next = NULL;
2587 			tp->snd_numholes = 1;
2588 			tp->rcv_lastsack = sack.end;
2589 			/*
2590 			 * dups is at least one.  If more data has been
2591 			 * SACKed, it can be greater than one.
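			 * For example (hypothetical values): with
			 * t_maxseg = 1460 and a SACK block covering
			 * 4380 bytes past the hole, dups becomes
			 * min(tcprexmtthresh, 4380 / 1460) = 3.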
2592 */ 2593 cur->dups = min(tcprexmtthresh, 2594 ((sack.end - cur->end)/tp->t_maxseg)); 2595 if (cur->dups < 1) 2596 cur->dups = 1; 2597 continue; /* with next sack block */ 2598 } 2599 /* Go thru list of holes: p = previous, cur = current */ 2600 p = cur = tp->snd_holes; 2601 while (cur) { 2602 if (SEQ_LEQ(sack.end, cur->start)) 2603 /* SACKs data before the current hole */ 2604 break; /* no use going through more holes */ 2605 if (SEQ_GEQ(sack.start, cur->end)) { 2606 /* SACKs data beyond the current hole */ 2607 cur->dups++; 2608 if (((sack.end - cur->end)/tp->t_maxseg) >= 2609 tcprexmtthresh) 2610 cur->dups = tcprexmtthresh; 2611 p = cur; 2612 cur = cur->next; 2613 continue; 2614 } 2615 if (SEQ_LEQ(sack.start, cur->start)) { 2616 /* Data acks at least the beginning of hole */ 2617 #if defined(TCP_SACK) && defined(TCP_FACK) 2618 if (SEQ_GT(sack.end, cur->rxmit)) 2619 tp->retran_data -= 2620 tcp_seq_subtract(cur->rxmit, 2621 cur->start); 2622 else 2623 tp->retran_data -= 2624 tcp_seq_subtract(sack.end, 2625 cur->start); 2626 #endif /* TCP_FACK */ 2627 if (SEQ_GEQ(sack.end, cur->end)) { 2628 /* Acks entire hole, so delete hole */ 2629 if (p != cur) { 2630 p->next = cur->next; 2631 pool_put(&sackhl_pool, cur); 2632 cur = p->next; 2633 } else { 2634 cur = cur->next; 2635 pool_put(&sackhl_pool, p); 2636 p = cur; 2637 tp->snd_holes = p; 2638 } 2639 tp->snd_numholes--; 2640 continue; 2641 } 2642 /* otherwise, move start of hole forward */ 2643 cur->start = sack.end; 2644 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2645 p = cur; 2646 cur = cur->next; 2647 continue; 2648 } 2649 /* move end of hole backward */ 2650 if (SEQ_GEQ(sack.end, cur->end)) { 2651 #if defined(TCP_SACK) && defined(TCP_FACK) 2652 if (SEQ_GT(cur->rxmit, sack.start)) 2653 tp->retran_data -= 2654 tcp_seq_subtract(cur->rxmit, 2655 sack.start); 2656 #endif /* TCP_FACK */ 2657 cur->end = sack.start; 2658 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2659 cur->dups++; 2660 if (((sack.end - cur->end)/tp->t_maxseg) >= 2661 tcprexmtthresh) 2662 cur->dups = tcprexmtthresh; 2663 p = cur; 2664 cur = cur->next; 2665 continue; 2666 } 2667 if (SEQ_LT(cur->start, sack.start) && 2668 SEQ_GT(cur->end, sack.end)) { 2669 /* 2670 * ACKs some data in middle of a hole; need to 2671 * split current hole 2672 */ 2673 temp = (struct sackhole *) 2674 pool_get(&sackhl_pool, PR_NOWAIT); 2675 if (temp == NULL) 2676 goto done; /* ENOBUFS */ 2677 #if defined(TCP_SACK) && defined(TCP_FACK) 2678 if (SEQ_GT(cur->rxmit, sack.end)) 2679 tp->retran_data -= 2680 tcp_seq_subtract(sack.end, 2681 sack.start); 2682 else if (SEQ_GT(cur->rxmit, sack.start)) 2683 tp->retran_data -= 2684 tcp_seq_subtract(cur->rxmit, 2685 sack.start); 2686 #endif /* TCP_FACK */ 2687 temp->next = cur->next; 2688 temp->start = sack.end; 2689 temp->end = cur->end; 2690 temp->dups = cur->dups; 2691 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2692 cur->end = sack.start; 2693 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2694 cur->dups++; 2695 if (((sack.end - cur->end)/tp->t_maxseg) >= 2696 tcprexmtthresh) 2697 cur->dups = tcprexmtthresh; 2698 cur->next = temp; 2699 p = temp; 2700 cur = p->next; 2701 tp->snd_numholes++; 2702 } 2703 } 2704 /* At this point, p points to the last hole on the list */ 2705 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2706 /* 2707 * Need to append new hole at end. 2708 * Last hole is p (and it's not NULL). 
2709  */
2710 		temp = (struct sackhole *)
2711 		    pool_get(&sackhl_pool, PR_NOWAIT);
2712 		if (temp == NULL)
2713 			goto done; /* ENOBUFS */
2714 		temp->start = tp->rcv_lastsack;
2715 		temp->end = sack.start;
2716 		temp->dups = min(tcprexmtthresh,
2717 		    ((sack.end - sack.start)/tp->t_maxseg));
2718 		if (temp->dups < 1)
2719 			temp->dups = 1;
2720 		temp->rxmit = temp->start;
2721 		temp->next = 0;
2722 		p->next = temp;
2723 		tp->rcv_lastsack = sack.end;
2724 		tp->snd_numholes++;
2725 		}
2726 	}
2727 done:
2728 #if defined(TCP_SACK) && defined(TCP_FACK)
2729 	/*
2730 	 * Update retran_data and snd_awnd.  Go through the list of
2731 	 * holes.  Increment retran_data by (hole->rxmit - hole->start).
2732 	 */
2733 	tp->retran_data = 0;
2734 	cur = tp->snd_holes;
2735 	while (cur) {
2736 		tp->retran_data += cur->rxmit - cur->start;
2737 		cur = cur->next;
2738 	}
2739 	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
2740 	    tp->retran_data;
2741 #endif /* TCP_FACK */
2742 
2743 	return;
2744 }
2745 
2746 /*
2747  * Delete stale (i.e., cumulatively ack'd) holes.  A hole is deleted only if
2748  * it is completely acked; otherwise, tcp_sack_option(), called from
2749  * tcp_dooptions(), will fix up the hole.
2750  */
2751 void
2752 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
2753 {
2754 	if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2755 		/* max because this could be an older ack just arrived */
2756 		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2757 		    th->th_ack : tp->snd_una;
2758 		struct sackhole *cur = tp->snd_holes;
2759 		struct sackhole *prev;
2760 		while (cur)
2761 			if (SEQ_LEQ(cur->end, lastack)) {
2762 				prev = cur;
2763 				cur = cur->next;
2764 				pool_put(&sackhl_pool, prev);
2765 				tp->snd_numholes--;
2766 			} else if (SEQ_LT(cur->start, lastack)) {
2767 				cur->start = lastack;
2768 				if (SEQ_LT(cur->rxmit, cur->start))
2769 					cur->rxmit = cur->start;
2770 				break;
2771 			} else
2772 				break;
2773 		tp->snd_holes = cur;
2774 	}
2775 }
2776 
2777 /*
2778  * Delete all receiver-side SACK information.
2779  */
2780 void
2781 tcp_clean_sackreport(struct tcpcb *tp)
2782 {
2783 	int i;
2784 
2785 	tp->rcv_numsacks = 0;
2786 	for (i = 0; i < MAX_SACK_BLKS; i++)
2787 		tp->sackblks[i].start = tp->sackblks[i].end = 0;
2788 
2789 }
2790 
2791 /*
2792  * Checks for partial ack.  If partial ack arrives, turn off retransmission
2793  * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
2794  * If the ack advances at least to tp->snd_last, return 0.
2795  */
2796 int
2797 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
2798 {
2799 	if (SEQ_LT(th->th_ack, tp->snd_last)) {
2800 		/* Turn off retx. timer (will start again next segment) */
2801 		TCP_TIMER_DISARM(tp, TCPT_REXMT);
2802 		tp->t_rtttime = 0;
2803 #ifndef TCP_FACK
2804 		/*
2805 		 * Partial window deflation.  This statement relies on the
2806 		 * fact that tp->snd_una has not been updated yet.  In FACK
2807 		 * hold snd_cwnd constant during fast recovery.
2808 		 */
2809 		if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
2810 			tp->snd_cwnd -= th->th_ack - tp->snd_una;
2811 			tp->snd_cwnd += tp->t_maxseg;
2812 		} else
2813 			tp->snd_cwnd = tp->t_maxseg;
2814 #endif
2815 		return (1);
2816 	}
2817 	return (0);
2818 }
2819 #endif /* TCP_SACK */
2820 
2821 /*
2822  * Pull out of band byte out of a segment so
2823  * it doesn't appear in the user's data queue.
2824  * It is still reflected in the segment length for
2825  * sequencing purposes.
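 *
 * Concretely: the single byte at offset off + urgent - 1 in the mbuf
 * chain is saved in t_iobc and flagged via TCPOOB_HAVEDATA, then
 * spliced out by copying the rest of that mbuf down over it.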
2826 */ 2827 void 2828 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2829 { 2830 int cnt = off + urgent - 1; 2831 2832 while (cnt >= 0) { 2833 if (m->m_len > cnt) { 2834 char *cp = mtod(m, caddr_t) + cnt; 2835 struct tcpcb *tp = sototcpcb(so); 2836 2837 tp->t_iobc = *cp; 2838 tp->t_oobflags |= TCPOOB_HAVEDATA; 2839 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2840 m->m_len--; 2841 return; 2842 } 2843 cnt -= m->m_len; 2844 m = m->m_next; 2845 if (m == 0) 2846 break; 2847 } 2848 panic("tcp_pulloutofband"); 2849 } 2850 2851 /* 2852 * Collect new round-trip time estimate 2853 * and update averages and current timeout. 2854 */ 2855 void 2856 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2857 { 2858 short delta; 2859 short rttmin; 2860 2861 if (rtt < 0) 2862 rtt = 0; 2863 else if (rtt > TCP_RTT_MAX) 2864 rtt = TCP_RTT_MAX; 2865 2866 tcpstat.tcps_rttupdated++; 2867 if (tp->t_srtt != 0) { 2868 /* 2869 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2870 * after the binary point (scaled by 4), whereas 2871 * srtt is stored as fixed point with 5 bits after the 2872 * binary point (i.e., scaled by 32). The following magic 2873 * is equivalent to the smoothing algorithm in rfc793 with 2874 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2875 * point). 2876 */ 2877 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2878 (tp->t_srtt >> TCP_RTT_SHIFT); 2879 if ((tp->t_srtt += delta) <= 0) 2880 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2881 /* 2882 * We accumulate a smoothed rtt variance (actually, a 2883 * smoothed mean difference), then set the retransmit 2884 * timer to smoothed rtt + 4 times the smoothed variance. 2885 * rttvar is stored as fixed point with 4 bits after the 2886 * binary point (scaled by 16). The following is 2887 * equivalent to rfc793 smoothing with an alpha of .75 2888 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2889 * rfc793's wired-in beta. 2890 */ 2891 if (delta < 0) 2892 delta = -delta; 2893 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2894 if ((tp->t_rttvar += delta) <= 0) 2895 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2896 } else { 2897 /* 2898 * No rtt measurement yet - use the unsmoothed rtt. 2899 * Set the variance to half the rtt (so our first 2900 * retransmit happens at 3*rtt). 2901 */ 2902 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2903 tp->t_rttvar = (rtt + 1) << 2904 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2905 } 2906 tp->t_rtttime = 0; 2907 tp->t_rxtshift = 0; 2908 2909 /* 2910 * the retransmit should happen at rtt + 4 * rttvar. 2911 * Because of the way we do the smoothing, srtt and rttvar 2912 * will each average +1/2 tick of bias. When we compute 2913 * the retransmit timer, we want 1/2 tick of rounding and 2914 * 1 extra tick because of +-1/2 tick uncertainty in the 2915 * firing of the timer. The bias will give us exactly the 2916 * 1.5 tick we need. But, because the bias is 2917 * statistical, we have to test that we don't drop below 2918 * the minimum feasible timer (which is 2 ticks). 2919 */ 2920 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2921 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2922 2923 /* 2924 * We received an ack for a packet that wasn't retransmitted; 2925 * it is probably safe to discard any error indications we've 2926 * received recently. This isn't quite right, but close enough 2927 * for now (a route might have failed after we sent a segment, 2928 * and the return path might not be symmetrical). 
2929  */
2930 	tp->t_softerror = 0;
2931 }
2932 
2933 /*
2934  * Determine a reasonable value for maxseg size.
2935  * If the route is known, check route for mtu.
2936  * If none, use an mss that can be handled on the outgoing
2937  * interface without forcing IP to fragment; if bigger than
2938  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2939  * to utilize large mbufs.  If no route is found, route has no mtu,
2940  * or the destination isn't local, use a default, hopefully conservative
2941  * size (usually 512 or the default IP max size, but no more than the mtu
2942  * of the interface), as we can't discover anything about intervening
2943  * gateways or networks.  We also initialize the congestion/slow start
2944  * window to be a single segment if the destination isn't local.
2945  * While looking at the routing entry, we also initialize other path-dependent
2946  * parameters from pre-set or cached values in the routing entry.
2947  *
2948  * Also take into account the space needed for options that we
2949  * send regularly.  Make maxseg shorter by that amount to assure
2950  * that we can send maxseg amount of data even when the options
2951  * are present.  Store the upper limit of the length of options plus
2952  * data in maxopd.
2953  *
2954  * NOTE: offer == -1 indicates that the maxseg size changed due to
2955  * Path MTU discovery.
2956  */
2957 int
2958 tcp_mss(struct tcpcb *tp, int offer)
2959 {
2960 	struct rtentry *rt;
2961 	struct ifnet *ifp;
2962 	int mss, mssopt;
2963 	int iphlen;
2964 	struct inpcb *inp;
2965 
2966 	inp = tp->t_inpcb;
2967 
2968 	mssopt = mss = tcp_mssdflt;
2969 
2970 	rt = in_pcbrtentry(inp);
2971 
2972 	if (rt == NULL)
2973 		goto out;
2974 
2975 	ifp = rt->rt_ifp;
2976 
2977 	switch (tp->pf) {
2978 #ifdef INET6
2979 	case AF_INET6:
2980 		iphlen = sizeof(struct ip6_hdr);
2981 		break;
2982 #endif
2983 	case AF_INET:
2984 		iphlen = sizeof(struct ip);
2985 		break;
2986 	default:
2987 		/* the family does not support path MTU discovery */
2988 		goto out;
2989 	}
2990 
2991 #ifdef RTV_MTU
2992 	/*
2993 	 * if there's an mtu associated with the route and we support
2994 	 * path MTU discovery for the underlying protocol family, use it.
2995 	 */
2996 	if (rt->rt_rmx.rmx_mtu) {
2997 		/*
2998 		 * One may wish to lower MSS to take into account options,
2999 		 * especially security-related options.
3000 		 */
3001 		if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
3002 			/*
3003 			 * RFC2460 section 5, last paragraph: if path MTU is
3004 			 * smaller than 1280, use 1280 as packet size and
3005 			 * attach fragment header.
3006 			 */
3007 			mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
3008 			    sizeof(struct tcphdr);
3009 		} else
3010 			mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
3011 	} else
3012 #endif /* RTV_MTU */
3013 	if (!ifp)
3014 		/*
3015 		 * ifp may be null and rmx_mtu may be zero in certain
3016 		 * v6 cases (e.g., if ND wasn't able to resolve the
3017 		 * destination host).
3018 		 */
3019 		goto out;
3020 	else if (ifp->if_flags & IFF_LOOPBACK)
3021 		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3022 	else if (tp->pf == AF_INET) {
3023 		if (ip_mtudisc)
3024 			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3025 		else if (inp && in_localaddr(inp->inp_faddr))
3026 			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3027 	}
3028 #ifdef INET6
3029 	else if (tp->pf == AF_INET6) {
3030 		/*
3031 		 * for IPv6, path MTU discovery is always turned on,
3032 		 * or the node must use packet size <= 1280.
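		 * (IPV6_MMTU, the minimum IPv6 link MTU, is 1280
		 * octets; IN6_LINKMTU(ifp) is expected to be at least
		 * that on a functioning IPv6 interface.)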
3033 */ 3034 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr); 3035 } 3036 #endif /* INET6 */ 3037 3038 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3039 if (offer != -1) { 3040 #ifndef INET6 3041 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3042 #else 3043 if (tp->pf == AF_INET6) 3044 mssopt = IN6_LINKMTU(ifp) - iphlen - 3045 sizeof(struct tcphdr); 3046 else 3047 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3048 #endif 3049 3050 mssopt = max(tcp_mssdflt, mssopt); 3051 } 3052 3053 out: 3054 /* 3055 * The current mss, t_maxseg, is initialized to the default value. 3056 * If we compute a smaller value, reduce the current mss. 3057 * If we compute a larger value, return it for use in sending 3058 * a max seg size option, but don't store it for use 3059 * unless we received an offer at least that large from peer. 3060 * 3061 * However, do not accept offers lower than the minimum of 3062 * the interface MTU and 216. 3063 */ 3064 if (offer > 0) 3065 tp->t_peermss = offer; 3066 if (tp->t_peermss) 3067 mss = min(mss, max(tp->t_peermss, 216)); 3068 3069 /* sanity - at least max opt. space */ 3070 mss = max(mss, 64); 3071 3072 /* 3073 * maxopd stores the maximum length of data AND options 3074 * in a segment; maxseg is the amount of data in a normal 3075 * segment. We need to store this value (maxopd) apart 3076 * from maxseg, because now every segment carries options 3077 * and thus we normally have somewhat less data in segments. 3078 */ 3079 tp->t_maxopd = mss; 3080 3081 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3082 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3083 mss -= TCPOLEN_TSTAMP_APPA; 3084 #ifdef TCP_SIGNATURE 3085 if (tp->t_flags & TF_SIGNATURE) 3086 mss -= TCPOLEN_SIGLEN; 3087 #endif 3088 3089 if (offer == -1) { 3090 /* mss changed due to Path MTU discovery */ 3091 tp->t_flags &= ~TF_PMTUD_PEND; 3092 tp->t_pmtud_mtu_sent = 0; 3093 tp->t_pmtud_mss_acked = 0; 3094 if (mss < tp->t_maxseg) { 3095 /* 3096 * Follow suggestion in RFC 2414 to reduce the 3097 * congestion window by the ratio of the old 3098 * segment size to the new segment size. 3099 */ 3100 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3101 mss, mss); 3102 } 3103 } else if (tcp_do_rfc3390) { 3104 /* increase initial window */ 3105 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3106 } else 3107 tp->snd_cwnd = mss; 3108 3109 tp->t_maxseg = mss; 3110 3111 return (offer != -1 ? mssopt : mss); 3112 } 3113 3114 u_int 3115 tcp_hdrsz(struct tcpcb *tp) 3116 { 3117 u_int hlen; 3118 3119 switch (tp->pf) { 3120 #ifdef INET6 3121 case AF_INET6: 3122 hlen = sizeof(struct ip6_hdr); 3123 break; 3124 #endif 3125 case AF_INET: 3126 hlen = sizeof(struct ip); 3127 break; 3128 default: 3129 hlen = 0; 3130 break; 3131 } 3132 hlen += sizeof(struct tcphdr); 3133 3134 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3135 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3136 hlen += TCPOLEN_TSTAMP_APPA; 3137 #ifdef TCP_SIGNATURE 3138 if (tp->t_flags & TF_SIGNATURE) 3139 hlen += TCPOLEN_SIGLEN; 3140 #endif 3141 return (hlen); 3142 } 3143 3144 /* 3145 * Set connection variables based on the effective MSS. 3146 * We are passed the TCPCB for the actual connection. If we 3147 * are the server, we are called by the compressed state engine 3148 * when the 3-way handshake is complete. If we are the client, 3149 * we are called when we receive the SYN,ACK from the server. 
3150  *
3151  * NOTE: The t_maxseg value must be initialized in the TCPCB
3152  * before this routine is called!
3153  */
3154 void
3155 tcp_mss_update(struct tcpcb *tp)
3156 {
3157 	int mss;
3158 	u_long bufsize;
3159 	struct rtentry *rt;
3160 	struct socket *so;
3161 
3162 	so = tp->t_inpcb->inp_socket;
3163 	mss = tp->t_maxseg;
3164 
3165 	rt = in_pcbrtentry(tp->t_inpcb);
3166 
3167 	if (rt == NULL)
3168 		return;
3169 
3170 	bufsize = so->so_snd.sb_hiwat;
3171 	if (bufsize < mss) {
3172 		mss = bufsize;
3173 		/* Update t_maxseg and t_maxopd */
3174 		tcp_mss(tp, mss);
3175 	} else {
3176 		bufsize = roundup(bufsize, mss);
3177 		if (bufsize > sb_max)
3178 			bufsize = sb_max;
3179 		(void)sbreserve(&so->so_snd, bufsize);
3180 	}
3181 
3182 	bufsize = so->so_rcv.sb_hiwat;
3183 	if (bufsize > mss) {
3184 		bufsize = roundup(bufsize, mss);
3185 		if (bufsize > sb_max)
3186 			bufsize = sb_max;
3187 		(void)sbreserve(&so->so_rcv, bufsize);
3188 	}
3189 
3190 }
3191 
3192 #if defined (TCP_SACK)
3193 /*
3194  * Checks for partial ack.  If partial ack arrives, force the retransmission
3195  * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3196  * 1.  By setting snd_nxt to th_ack, this forces the retransmission timer to
3197  * be started again.  If the ack advances at least to tp->snd_last, return 0.
3198  */
3199 int
3200 tcp_newreno(struct tcpcb *tp, struct tcphdr *th)
3201 {
3202 	if (SEQ_LT(th->th_ack, tp->snd_last)) {
3203 		/*
3204 		 * snd_una has not been updated and the socket send buffer
3205 		 * not yet drained of the acked data, so we have to leave
3206 		 * snd_una as it was to get the correct data offset in
3207 		 * tcp_output().
3208 		 */
3209 		tcp_seq onxt = tp->snd_nxt;
3210 		u_long ocwnd = tp->snd_cwnd;
3211 		TCP_TIMER_DISARM(tp, TCPT_REXMT);
3212 		tp->t_rtttime = 0;
3213 		tp->snd_nxt = th->th_ack;
3214 		/*
3215 		 * Set snd_cwnd to one segment beyond acknowledged offset
3216 		 * (tp->snd_una not yet updated when this function is called)
3217 		 */
3218 		tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3219 		(void) tcp_output(tp);
3220 		tp->snd_cwnd = ocwnd;
3221 		if (SEQ_GT(onxt, tp->snd_nxt))
3222 			tp->snd_nxt = onxt;
3223 		/*
3224 		 * Partial window deflation.  Relies on fact that tp->snd_una
3225 		 * not updated yet.
3226 		 */
3227 		if (tp->snd_cwnd > th->th_ack - tp->snd_una)
3228 			tp->snd_cwnd -= th->th_ack - tp->snd_una;
3229 		else
3230 			tp->snd_cwnd = 0;
3231 		tp->snd_cwnd += tp->t_maxseg;
3232 
3233 		return 1;
3234 	}
3235 	return 0;
3236 }
3237 #endif /* TCP_SACK */
3238 
3239 int
3240 tcp_mss_adv(struct ifnet *ifp, int af)
3241 {
3242 	int mss = 0;
3243 	int iphlen;
3244 
3245 	switch (af) {
3246 	case AF_INET:
3247 		if (ifp != NULL)
3248 			mss = ifp->if_mtu;
3249 		iphlen = sizeof(struct ip);
3250 		break;
3251 #ifdef INET6
3252 	case AF_INET6:
3253 		if (ifp != NULL)
3254 			mss = IN6_LINKMTU(ifp);
3255 		iphlen = sizeof(struct ip6_hdr);
3256 		break;
3257 #endif
3258 	}
3259 	mss = mss - iphlen - sizeof(struct tcphdr);
3260 	return (max(mss, tcp_mssdflt));
3261 }
3262 
3263 /*
3264  * TCP compressed state engine.  Currently used to hold compressed
3265  * state for SYN_RECEIVED.
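 *
 * Rather than allocating a full tcpcb and socket per embryonic
 * connection, a small syn_cache entry records just what the
 * handshake needs (ISS, IRS, window, negotiated options) until the
 * final ACK arrives and syn_cache_get() turns it into a real
 * connection.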
3266 */ 3267 3268 u_long syn_cache_count; 3269 u_int32_t syn_hash1, syn_hash2; 3270 3271 #define SYN_HASH(sa, sp, dp) \ 3272 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3273 ((u_int32_t)(sp)))^syn_hash2))) 3274 #ifndef INET6 3275 #define SYN_HASHALL(hash, src, dst) \ 3276 do { \ 3277 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3278 ((struct sockaddr_in *)(src))->sin_port, \ 3279 ((struct sockaddr_in *)(dst))->sin_port); \ 3280 } while (/*CONSTCOND*/ 0) 3281 #else 3282 #define SYN_HASH6(sa, sp, dp) \ 3283 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3284 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3285 & 0x7fffffff) 3286 3287 #define SYN_HASHALL(hash, src, dst) \ 3288 do { \ 3289 switch ((src)->sa_family) { \ 3290 case AF_INET: \ 3291 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3292 ((struct sockaddr_in *)(src))->sin_port, \ 3293 ((struct sockaddr_in *)(dst))->sin_port); \ 3294 break; \ 3295 case AF_INET6: \ 3296 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 3297 ((struct sockaddr_in6 *)(src))->sin6_port, \ 3298 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 3299 break; \ 3300 default: \ 3301 hash = 0; \ 3302 } \ 3303 } while (/*CONSTCOND*/0) 3304 #endif /* INET6 */ 3305 3306 #define SYN_CACHE_RM(sc) \ 3307 do { \ 3308 (sc)->sc_flags |= SCF_DEAD; \ 3309 TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \ 3310 (sc), sc_bucketq); \ 3311 (sc)->sc_tp = NULL; \ 3312 LIST_REMOVE((sc), sc_tpq); \ 3313 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 3314 timeout_del(&(sc)->sc_timer); \ 3315 syn_cache_count--; \ 3316 } while (/*CONSTCOND*/0) 3317 3318 #define SYN_CACHE_PUT(sc) \ 3319 do { \ 3320 if ((sc)->sc_ipopts) \ 3321 (void) m_free((sc)->sc_ipopts); \ 3322 if ((sc)->sc_route4.ro_rt != NULL) \ 3323 RTFREE((sc)->sc_route4.ro_rt); \ 3324 timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc)); \ 3325 timeout_add(&(sc)->sc_timer, 0); \ 3326 } while (/*CONSTCOND*/0) 3327 3328 struct pool syn_cache_pool; 3329 3330 /* 3331 * We don't estimate RTT with SYNs, so each packet starts with the default 3332 * RTT and each timer step has a fixed timeout value. 3333 */ 3334 #define SYN_CACHE_TIMER_ARM(sc) \ 3335 do { \ 3336 TCPT_RANGESET((sc)->sc_rxtcur, \ 3337 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3338 TCPTV_REXMTMAX); \ 3339 if (!timeout_initialized(&(sc)->sc_timer)) \ 3340 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3341 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3342 } while (/*CONSTCOND*/0) 3343 3344 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3345 3346 void 3347 syn_cache_init() 3348 { 3349 int i; 3350 3351 /* Initialize the hash buckets. */ 3352 for (i = 0; i < tcp_syn_cache_size; i++) 3353 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3354 3355 /* Initialize the syn cache pool. */ 3356 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3357 "synpl", NULL); 3358 } 3359 3360 void 3361 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3362 { 3363 struct syn_cache_head *scp; 3364 struct syn_cache *sc2; 3365 int s; 3366 3367 /* 3368 * If there are no entries in the hash table, reinitialize 3369 * the hash secrets. 
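	 *
	 * Rekeying here is safe precisely because the table is empty:
	 * no existing entry can be left filed under a stale hash.
	 * Fresh arc4random() secrets keep remote hosts from predicting
	 * bucket placement and deliberately overflowing one bucket.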
3370 */ 3371 if (syn_cache_count == 0) { 3372 syn_hash1 = arc4random(); 3373 syn_hash2 = arc4random(); 3374 } 3375 3376 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3377 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3378 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3379 3380 /* 3381 * Make sure that we don't overflow the per-bucket 3382 * limit or the total cache size limit. 3383 */ 3384 s = splsoftnet(); 3385 if (scp->sch_length >= tcp_syn_bucket_limit) { 3386 tcpstat.tcps_sc_bucketoverflow++; 3387 /* 3388 * The bucket is full. Toss the oldest element in the 3389 * bucket. This will be the first entry in the bucket. 3390 */ 3391 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3392 #ifdef DIAGNOSTIC 3393 /* 3394 * This should never happen; we should always find an 3395 * entry in our bucket. 3396 */ 3397 if (sc2 == NULL) 3398 panic("syn_cache_insert: bucketoverflow: impossible"); 3399 #endif 3400 SYN_CACHE_RM(sc2); 3401 SYN_CACHE_PUT(sc2); 3402 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3403 struct syn_cache_head *scp2, *sce; 3404 3405 tcpstat.tcps_sc_overflowed++; 3406 /* 3407 * The cache is full. Toss the oldest entry in the 3408 * first non-empty bucket we can find. 3409 * 3410 * XXX We would really like to toss the oldest 3411 * entry in the cache, but we hope that this 3412 * condition doesn't happen very often. 3413 */ 3414 scp2 = scp; 3415 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3416 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3417 for (++scp2; scp2 != scp; scp2++) { 3418 if (scp2 >= sce) 3419 scp2 = &tcp_syn_cache[0]; 3420 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3421 break; 3422 } 3423 #ifdef DIAGNOSTIC 3424 /* 3425 * This should never happen; we should always find a 3426 * non-empty bucket. 3427 */ 3428 if (scp2 == scp) 3429 panic("syn_cache_insert: cacheoverflow: " 3430 "impossible"); 3431 #endif 3432 } 3433 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3434 SYN_CACHE_RM(sc2); 3435 SYN_CACHE_PUT(sc2); 3436 } 3437 3438 /* 3439 * Initialize the entry's timer. 3440 */ 3441 sc->sc_rxttot = 0; 3442 sc->sc_rxtshift = 0; 3443 SYN_CACHE_TIMER_ARM(sc); 3444 3445 /* Link it from tcpcb entry */ 3446 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3447 3448 /* Put it into the bucket. */ 3449 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3450 scp->sch_length++; 3451 syn_cache_count++; 3452 3453 tcpstat.tcps_sc_added++; 3454 splx(s); 3455 } 3456 3457 /* 3458 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3459 * If we have retransmitted an entry the maximum number of times, expire 3460 * that entry. 3461 */ 3462 void 3463 syn_cache_timer(void *arg) 3464 { 3465 struct syn_cache *sc = arg; 3466 int s; 3467 3468 s = splsoftnet(); 3469 if (sc->sc_flags & SCF_DEAD) { 3470 splx(s); 3471 return; 3472 } 3473 3474 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3475 /* Drop it -- too many retransmissions. */ 3476 goto dropit; 3477 } 3478 3479 /* 3480 * Compute the total amount of time this entry has 3481 * been on a queue. If this entry has been on longer 3482 * than the keep alive timer would allow, expire it. 3483 */ 3484 sc->sc_rxttot += sc->sc_rxtcur; 3485 if (sc->sc_rxttot >= tcptv_keep_init) 3486 goto dropit; 3487 3488 tcpstat.tcps_sc_retransmitted++; 3489 (void) syn_cache_respond(sc, NULL); 3490 3491 /* Advance the timer back-off. 
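	 * Each retransmission bumps sc_rxtshift, so the next timeout
	 * from SYN_CACHE_TIMER_ARM() is TCPTV_SRTTDFLT scaled by
	 * tcp_backoff[sc_rxtshift]; the total time an entry may spend
	 * here is still capped by tcptv_keep_init via sc_rxttot above.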
 */
3492 	sc->sc_rxtshift++;
3493 	SYN_CACHE_TIMER_ARM(sc);
3494 
3495 	splx(s);
3496 	return;
3497 
3498 dropit:
3499 	tcpstat.tcps_sc_timed_out++;
3500 	SYN_CACHE_RM(sc);
3501 	SYN_CACHE_PUT(sc);
3502 	splx(s);
3503 }
3504 
3505 void
3506 syn_cache_reaper(void *arg)
3507 {
3508 	struct syn_cache *sc = arg;
3509 	int s;
3510 
3511 	s = splsoftnet();
3512 	pool_put(&syn_cache_pool, (sc));
3513 	splx(s);
3514 	return;
3515 }
3516 
3517 /*
3518  * Remove the syn cache entries created by the specified tcb entry,
3519  * since it makes no sense to keep them
3520  * (if there's no tcb entry, a syn cache entry will never be used)
3521  */
3522 void
3523 syn_cache_cleanup(struct tcpcb *tp)
3524 {
3525 	struct syn_cache *sc, *nsc;
3526 	int s;
3527 
3528 	s = splsoftnet();
3529 
3530 	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3531 		nsc = LIST_NEXT(sc, sc_tpq);
3532 
3533 #ifdef DIAGNOSTIC
3534 		if (sc->sc_tp != tp)
3535 			panic("invalid sc_tp in syn_cache_cleanup");
3536 #endif
3537 		SYN_CACHE_RM(sc);
3538 		SYN_CACHE_PUT(sc);
3539 	}
3540 	/* just for safety */
3541 	LIST_INIT(&tp->t_sc);
3542 
3543 	splx(s);
3544 }
3545 
3546 /*
3547  * Find an entry in the syn cache.
3548  */
3549 struct syn_cache *
3550 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3551     struct syn_cache_head **headp)
3552 {
3553 	struct syn_cache *sc;
3554 	struct syn_cache_head *scp;
3555 	u_int32_t hash;
3556 	int s;
3557 
3558 	SYN_HASHALL(hash, src, dst);
3559 
3560 	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3561 	*headp = scp;
3562 	s = splsoftnet();
3563 	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3564 	    sc = TAILQ_NEXT(sc, sc_bucketq)) {
3565 		if (sc->sc_hash != hash)
3566 			continue;
3567 		if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3568 		    !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
3569 			splx(s);
3570 			return (sc);
3571 		}
3572 	}
3573 	splx(s);
3574 	return (NULL);
3575 }
3576 
3577 /*
3578  * This function gets called when we receive an ACK for a
3579  * socket in the LISTEN state.  We look up the connection
3580  * in the syn cache, and if it's there, we pull it out of
3581  * the cache and turn it into a full-blown connection in
3582  * the SYN-RECEIVED state.
3583  *
3584  * The return values may not be immediately obvious, and their effects
3585  * can be subtle, so here they are:
3586  *
3587  *	NULL	SYN was not found in cache; caller should drop the
3588  *		packet and send an RST.
3589  *
3590  *	-1	We were unable to create the new connection, and are
3591  *		aborting it.  An ACK,RST is being sent to the peer
3592  *		(unless we got screwy sequence numbers; see below),
3593  *		because the 3-way handshake has been completed.  Caller
3594  *		should not free the mbuf, since we may be using it.  If
3595  *		we are not, we will free it.
3596  *
3597  *	Otherwise, the return value is a pointer to the new socket
3598  *	associated with the connection.
3599  */
3600 struct socket *
3601 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3602     u_int hlen, u_int tlen, struct socket *so, struct mbuf *m)
3603 {
3604 	struct syn_cache *sc;
3605 	struct syn_cache_head *scp;
3606 	struct inpcb *inp = NULL;
3607 	struct tcpcb *tp = 0;
3608 	struct mbuf *am;
3609 	int s;
3610 	struct socket *oso;
3611 
3612 	s = splsoftnet();
3613 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3614 		splx(s);
3615 		return (NULL);
3616 	}
3617 
3618 	/*
3619 	 * Verify the sequence and ack numbers.  Try getting the correct
3620 	 * response again.
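	 * To complete the handshake, th_ack must equal sc_iss + 1
	 * (exactly our SYN,ACK consumed) and th_seq must lie in the
	 * window we advertised, (sc_irs, sc_irs + 1 + sc_win];
	 * anything else just provokes another SYN,ACK below.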
3621 */ 3622 if ((th->th_ack != sc->sc_iss + 1) || 3623 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3624 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3625 (void) syn_cache_respond(sc, m); 3626 splx(s); 3627 return ((struct socket *)(-1)); 3628 } 3629 3630 /* Remove this cache entry */ 3631 SYN_CACHE_RM(sc); 3632 splx(s); 3633 3634 /* 3635 * Ok, create the full blown connection, and set things up 3636 * as they would have been set up if we had created the 3637 * connection when the SYN arrived. If we can't create 3638 * the connection, abort it. 3639 */ 3640 oso = so; 3641 so = sonewconn(so, SS_ISCONNECTED); 3642 if (so == NULL) 3643 goto resetandabort; 3644 3645 inp = sotoinpcb(oso); 3646 #ifdef IPSEC 3647 /* 3648 * We need to copy the required security levels 3649 * from the old pcb. Ditto for any other 3650 * IPsec-related information. 3651 */ 3652 { 3653 struct inpcb *newinp = (struct inpcb *)so->so_pcb; 3654 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3655 sizeof(inp->inp_seclevel)); 3656 newinp->inp_secrequire = inp->inp_secrequire; 3657 if (inp->inp_ipo != NULL) { 3658 newinp->inp_ipo = inp->inp_ipo; 3659 inp->inp_ipo->ipo_ref_count++; 3660 } 3661 if (inp->inp_ipsec_remotecred != NULL) { 3662 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 3663 inp->inp_ipsec_remotecred->ref_count++; 3664 } 3665 if (inp->inp_ipsec_remoteauth != NULL) { 3666 newinp->inp_ipsec_remoteauth 3667 = inp->inp_ipsec_remoteauth; 3668 inp->inp_ipsec_remoteauth->ref_count++; 3669 } 3670 } 3671 #endif /* IPSEC */ 3672 #ifdef INET6 3673 /* 3674 * inp still has the OLD in_pcb stuff, set the 3675 * v6-related flags on the new guy, too. 3676 */ 3677 { 3678 int flags = inp->inp_flags; 3679 struct inpcb *oldinpcb = inp; 3680 3681 inp = (struct inpcb *)so->so_pcb; 3682 inp->inp_flags |= (flags & INP_IPV6); 3683 if ((inp->inp_flags & INP_IPV6) != 0) { 3684 inp->inp_ipv6.ip6_hlim = 3685 oldinpcb->inp_ipv6.ip6_hlim; 3686 } 3687 } 3688 #else /* INET6 */ 3689 inp = (struct inpcb *)so->so_pcb; 3690 #endif /* INET6 */ 3691 3692 inp->inp_lport = th->th_dport; 3693 switch (src->sa_family) { 3694 #ifdef INET6 3695 case AF_INET6: 3696 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr; 3697 break; 3698 #endif /* INET6 */ 3699 case AF_INET: 3700 3701 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 3702 inp->inp_options = ip_srcroute(); 3703 if (inp->inp_options == NULL) { 3704 inp->inp_options = sc->sc_ipopts; 3705 sc->sc_ipopts = NULL; 3706 } 3707 break; 3708 } 3709 in_pcbrehash(inp); 3710 3711 /* 3712 * Give the new socket our cached route reference. 
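	 * The struct assignment below hands the rtentry reference to
	 * the new inpcb; sc_route4.ro_rt is then cleared so that
	 * SYN_CACHE_PUT() will not RTFREE() a route we no longer own.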
3713  */
3714 	if (src->sa_family == AF_INET)
3715 		inp->inp_route = sc->sc_route4;		/* struct assignment */
3716 #ifdef INET6
3717 	else
3718 		inp->inp_route6 = sc->sc_route6;
3719 #endif
3720 	sc->sc_route4.ro_rt = NULL;
3721 
3722 	am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
3723 	if (am == NULL)
3724 		goto resetandabort;
3725 	am->m_len = src->sa_len;
3726 	bcopy(src, mtod(am, caddr_t), src->sa_len);
3727 
3728 	switch (src->sa_family) {
3729 	case AF_INET:
3730 		/* drop IPv4 packet to AF_INET6 socket */
3731 		if (inp->inp_flags & INP_IPV6) {
3732 			(void) m_free(am);
3733 			goto resetandabort;
3734 		}
3735 		if (in_pcbconnect(inp, am)) {
3736 			(void) m_free(am);
3737 			goto resetandabort;
3738 		}
3739 		break;
3740 #ifdef INET6
3741 	case AF_INET6:
3742 		if (in6_pcbconnect(inp, am)) {
3743 			(void) m_free(am);
3744 			goto resetandabort;
3745 		}
3746 		break;
3747 #endif
3748 	}
3749 	(void) m_free(am);
3750 
3751 	tp = intotcpcb(inp);
3752 	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
3753 	if (sc->sc_request_r_scale != 15) {
3754 		tp->requested_s_scale = sc->sc_requested_s_scale;
3755 		tp->request_r_scale = sc->sc_request_r_scale;
3756 		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3757 	}
3758 	if (sc->sc_flags & SCF_TIMESTAMP)
3759 		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3760 
3761 	tp->t_template = tcp_template(tp);
3762 	if (tp->t_template == 0) {
3763 		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
3764 		so = NULL;
3765 		m_freem(m);
3766 		goto abort;
3767 	}
3768 #ifdef TCP_SACK
3769 	tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
3770 #endif
3771 
3772 	tp->ts_modulate = sc->sc_modulate;
3773 	tp->iss = sc->sc_iss;
3774 	tp->irs = sc->sc_irs;
3775 	tcp_sendseqinit(tp);
3776 #if defined (TCP_SACK) || defined(TCP_ECN)
3777 	tp->snd_last = tp->snd_una;
3778 #endif /* TCP_SACK */
3779 #if defined(TCP_SACK) && defined(TCP_FACK)
3780 	tp->snd_fack = tp->snd_una;
3781 	tp->retran_data = 0;
3782 	tp->snd_awnd = 0;
3783 #endif /* TCP_FACK */
3784 #ifdef TCP_ECN
3785 	if (sc->sc_flags & SCF_ECN_PERMIT) {
3786 		tp->t_flags |= TF_ECN_PERMIT;
3787 		tcpstat.tcps_ecn_accepts++;
3788 	}
3789 #endif
3790 #ifdef TCP_SACK
3791 	if (sc->sc_flags & SCF_SACK_PERMIT)
3792 		tp->t_flags |= TF_SACK_PERMIT;
3793 #endif
3794 #ifdef TCP_SIGNATURE
3795 	if (sc->sc_flags & SCF_SIGNATURE)
3796 		tp->t_flags |= TF_SIGNATURE;
3797 #endif
3798 	tcp_rcvseqinit(tp);
3799 	tp->t_state = TCPS_SYN_RECEIVED;
3800 	tp->t_rcvtime = tcp_now;
3801 	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
3802 	tcpstat.tcps_accepts++;
3803 
3804 	tcp_mss(tp, sc->sc_peermaxseg);	 /* sets t_maxseg */
3805 	if (sc->sc_peermaxseg)
3806 		tcp_mss_update(tp);
3807 	/* Reset initial window to 1 segment for retransmit */
3808 	if (sc->sc_rxtshift > 0)
3809 		tp->snd_cwnd = tp->t_maxseg;
3810 	tp->snd_wl1 = sc->sc_irs;
3811 	tp->rcv_up = sc->sc_irs + 1;
3812 
3813 	/*
3814 	 * This is what would have happened in tcp_output() when
3815 	 * the SYN,ACK was sent.
3816 */ 3817 tp->snd_up = tp->snd_una; 3818 tp->snd_max = tp->snd_nxt = tp->iss+1; 3819 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3820 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3821 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3822 tp->last_ack_sent = tp->rcv_nxt; 3823 3824 tcpstat.tcps_sc_completed++; 3825 SYN_CACHE_PUT(sc); 3826 return (so); 3827 3828 resetandabort: 3829 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST); 3830 m_freem(m); 3831 abort: 3832 if (so != NULL) 3833 (void) soabort(so); 3834 SYN_CACHE_PUT(sc); 3835 tcpstat.tcps_sc_aborted++; 3836 return ((struct socket *)(-1)); 3837 } 3838 3839 /* 3840 * This function is called when we get a RST for a 3841 * non-existent connection, so that we can see if the 3842 * connection is in the syn cache. If it is, zap it. 3843 */ 3844 3845 void 3846 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th) 3847 { 3848 struct syn_cache *sc; 3849 struct syn_cache_head *scp; 3850 int s = splsoftnet(); 3851 3852 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3853 splx(s); 3854 return; 3855 } 3856 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3857 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3858 splx(s); 3859 return; 3860 } 3861 SYN_CACHE_RM(sc); 3862 splx(s); 3863 tcpstat.tcps_sc_reset++; 3864 SYN_CACHE_PUT(sc); 3865 } 3866 3867 void 3868 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th) 3869 { 3870 struct syn_cache *sc; 3871 struct syn_cache_head *scp; 3872 int s; 3873 3874 s = splsoftnet(); 3875 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3876 splx(s); 3877 return; 3878 } 3879 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3880 if (ntohl (th->th_seq) != sc->sc_iss) { 3881 splx(s); 3882 return; 3883 } 3884 3885 /* 3886 * If we've retransmitted 3 times and this is our second error, 3887 * we remove the entry. Otherwise, we allow it to continue on. 3888 * This prevents us from incorrectly nuking an entry during a 3889 * spurious network outage. 3890 * 3891 * See tcp_notify(). 3892 */ 3893 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3894 sc->sc_flags |= SCF_UNREACH; 3895 splx(s); 3896 return; 3897 } 3898 3899 SYN_CACHE_RM(sc); 3900 splx(s); 3901 tcpstat.tcps_sc_unreach++; 3902 SYN_CACHE_PUT(sc); 3903 } 3904 3905 /* 3906 * Given a LISTEN socket and an inbound SYN request, add 3907 * this to the syn cache, and send back a segment: 3908 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3909 * to the source. 3910 * 3911 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3912 * Doing so would require that we hold onto the data and deliver it 3913 * to the application. However, if we are the target of a SYN-flood 3914 * DoS attack, an attacker could send data which would eventually 3915 * consume all available buffer space if it were ACKed. By not ACKing 3916 * the data, we avoid this DoS scenario. 3917 */ 3918 3919 int 3920 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3921 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3922 struct tcp_opt_info *oi, tcp_seq *issp) 3923 { 3924 struct tcpcb tb, *tp; 3925 long win; 3926 struct syn_cache *sc; 3927 struct syn_cache_head *scp; 3928 struct mbuf *ipopts; 3929 3930 tp = sototcpcb(so); 3931 3932 /* 3933 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3934 * 3935 * Note this check is performed in tcp_input() very early on. 3936 */ 3937 3938 /* 3939 * Initialize some local state. 

void
syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	int s;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return;
	}
	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
	if (ntohl(th->th_seq) != sc->sc_iss) {
		splx(s);
		return;
	}

	/*
	 * If we've retransmitted 3 times and this is our second error,
	 * we remove the entry.  Otherwise, we allow it to continue on.
	 * This prevents us from incorrectly nuking an entry during a
	 * spurious network outage.
	 *
	 * See tcp_notify().
	 */
	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
		sc->sc_flags |= SCF_UNREACH;
		splx(s);
		return;
	}

	SYN_CACHE_RM(sc);
	splx(s);
	tcpstat.tcps_sc_unreach++;
	SYN_CACHE_PUT(sc);
}

/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */

int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
    struct tcp_opt_info *oi, tcp_seq *issp)
{
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;

	tp = sototcpcb(so);

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 *
	 * Note this check is performed in tcp_input() very early on.
	 */

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE)) {
#else
	if (optp) {
#endif
		tb.pf = tp->pf;
#ifdef TCP_SACK
		tb.sack_enable = tp->sack_enable;
#endif
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		if (tp->t_flags & TF_SIGNATURE)
			tb.t_flags |= TF_SIGNATURE;
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi))
			return (0);
	} else
		tb.t_flags = 0;

	switch (src->sa_family) {
#ifdef INET
	case AF_INET:
		/*
		 * Remember the IP options, if any.
		 */
		ipopts = ip_srcroute();
		break;
#endif
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
		tcpstat.tcps_sc_dupesyn++;
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void) m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		sc->sc_timestamp = tb.ts_recent;
		if (syn_cache_respond(sc, m) == 0) {
			tcpstat.tcps_sndacks++;
			tcpstat.tcps_sndtotal++;
		}
		return (1);
	}

	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
	if (sc == NULL) {
		if (ipopts)
			(void) m_free(ipopts);
		return (0);
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	bzero(sc, sizeof(struct syn_cache));
	bzero(&sc->sc_timer, sizeof(sc->sc_timer));
	bcopy(src, &sc->sc_src, src->sa_len);
	bcopy(dst, &sc->sc_dst, dst->sa_len);
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;

	sc->sc_iss = issp ? *issp : arc4random();
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
	    m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timestamp = tb.ts_recent;
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
		sc->sc_flags |= SCF_TIMESTAMP;
		sc->sc_modulate = arc4random();
	}
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    TCP_MAXWIN << sc->sc_request_r_scale <
		    so->so_rcv.sb_hiwat)
			sc->sc_request_r_scale++;
	} else {
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
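
	/*
	 * Worked example (annotation, not original code): the loop above
	 * picks the smallest shift whose scaled maximum window covers the
	 * listening socket's receive buffer.  With sb_hiwat = 262144:
	 *
	 *	TCP_MAXWIN << 0 =  65535 < 262144	keep going
	 *	TCP_MAXWIN << 1 = 131070 < 262144	keep going
	 *	TCP_MAXWIN << 2 = 262140 < 262144	keep going (barely)
	 *	TCP_MAXWIN << 3 = 524280		stop; request shift 3
	 *
	 * 15 is not a valid shift (TCP_MAX_WINSHIFT is 14), so it doubles
	 * as the "no window scaling negotiated" sentinel tested elsewhere
	 * in this file.
	 */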
#ifdef TCP_ECN
	/*
	 * If both ECE and CWR flag bits are set, peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		SYN_CACHE_PUT(sc);
		tcpstat.tcps_sc_dropped++;
	}
	return (1);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;
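
	/*
	 * Annotation (not in the original source): with every option
	 * negotiated, the option block built below adds up as
	 *
	 *	MSS				 4 bytes
	 *	SACK permitted (NOP-padded)	 4 bytes
	 *	window scale (NOP-padded)	 4 bytes
	 *	timestamps (RFC 1323 app. A)	12 bytes (TCPOLEN_TSTAMP_APPA)
	 *	MD5 signature + NOP/EOL pad	20 bytes (TCPOLEN_SIGLEN,
	 *					 the 18-byte option padded)
	 *
	 * for 44 bytes total.  Each entry is already padded to a 32-bit
	 * boundary, so optlen is always the multiple of 4 that th_off
	 * requires.
	 */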

	/*
	 * Create the IP+TCP header from scratch.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		/* unreachable: sa_family was validated above */
		th = NULL;
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;	/* skip the 16-byte MD5 digest just written */

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}
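
	/*
	 * Annotation (not in the original source): the AF_INET branch
	 * above builds the checksum pseudo-header in place.  The packet
	 * was zeroed after allocation, and ip_ttl is still unset, so at
	 * this point the IP header holds only ip_src, ip_dst, ip_p and
	 * ip_len = the TCP length -- exactly the RFC 793 pseudo-header --
	 * which lets in_cksum() run over the whole packet.  ip_len is
	 * then overwritten with the real total length just below.
	 */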

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note that ip_len and
	 * ip6_plen are passed to the output routines in network
	 * byte order.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		/* XXX tos? */
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    (struct ip_moptions *)NULL, inp);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    (struct ip6_moptions *)0, NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
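
/*
 * Annotation (not in the original source): syn_cache_respond() serves
 * two callers.  syn_cache_add() above passes the inbound SYN's mbuf,
 * which is freed and replaced with a freshly built SYN,ACK, while the
 * syn cache retransmit timer earlier in this file passes m == NULL --
 * hence the "if (m)" guards before each m_freem() in the error paths.
 */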