1 /* $OpenBSD: tcp_input.c,v 1.132 2003/07/09 22:03:16 itojun Exp $ */ 2 /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#ifndef TUBA_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

/*
 * Scratch copies of an inbound header, filled in for SO_DEBUG tracing
 * (see the SO_DEBUG branch in tcp_input()).
 */
struct tcpiphdr tcp_saveti;
struct tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

/* Number of duplicate ACKs that triggers fast retransmit. */
int	tcprexmtthresh = 3;
/*
 * NOTE(review): duplicate tentative definition -- tcp_saveti is also
 * declared inside the INET6 block above.  The linker merges tentative
 * definitions so this is harmless, but one of the two could be removed.
 */
struct	tcpiphdr tcp_saveti;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

extern u_long sb_max;

/* Rate limit for generated RST segments, enforced via ppsratecheck(). */
int	tcp_rst_ppslim = 100;		/* 100pps */
int	tcp_rst_ppslim_count = 0;
struct	timeval tcp_rst_ppslim_last;

#endif /* TUBA_INCLUDE */

/* PAWS idle limit: 24 days expressed in slow-timeout ticks (RFC 1323). */
#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
/* Tell ND that the peer is reachable: forward progress on this route. */
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define TCP_SETUP_ACK(tp, tiflags) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

#ifndef TUBA_INCLUDE

int
tcp_reass(tp, th, m, tlen)
	struct tcpcb *tp;
	struct tcphdr *th;	/* NULL to flush queued pre-ESTABLISHED data */
	struct mbuf *m;		/* segment data; consumed on all paths */
	int *tlen;		/* in/out: data length, trimmed of overlap */
{
	struct ipqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&ipqent_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		return (0);
	}

	/*
	 * Find a segment which begins after this one does.
	 * On exit, p is the entry just before the insertion point
	 * (or NULL for head insertion) and q is the first successor.
	 */
	for (p = NULL, q = tp->segq.lh_first; q != NULL;
	    p = q, q = q->ipqe_q.le_next)
		if (SEQ_GT(q->ipqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->ipqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				/* completely duplicate segment: count and drop */
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&ipqent_pool, tiqe);
				return (0);
			}
			/* trim the duplicated prefix from the front */
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->ipqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			/* partial overlap: trim front of queued segment */
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->ipqe_m, i);
			break;
		}
		/* queued segment fully covered by the new one: dequeue it */
		nq = q->ipqe_q.le_next;
		m_freem(q->ipqe_m);
		LIST_REMOVE(q, ipqe_q);
		pool_put(&ipqent_pool, q);
	}

	/*
	 * Insert the new fragment queue entry into place.
	 * th_reseqlen stores the (trimmed) data length for this entry.
	 */
	tiqe->ipqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->ipqe_tcp = th;
	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
	} else {
		LIST_INSERT_AFTER(p, tiqe, ipqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = tp->segq.lh_first;
	if (q == NULL || q->ipqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->ipqe_tcp->th_reseqlen;
		flags = q->ipqe_tcp->th_flags & TH_FIN;

		nq = q->ipqe_q.le_next;
		LIST_REMOVE(q, ipqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->ipqe_m);
		else
			sbappendstream(&so->so_rcv, q->ipqe_m);
		pool_put(&ipqent_pool, q);
		q = nq;
	} while (q != NULL && q->ipqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	/* TH_FIN from the last delivered segment, 0 otherwise */
	return (flags);
}

/*
 * First check for a port-specific bomb.  We do not want to drop half-opens
 * for other ports if this is the only port being bombed.  We only check
 * the bottom 40 half open connections, to avoid wasting too much time.
 *
 * Or, otherwise it is more likely a generic syn bomb, so delete the oldest
 * half-open connection.
 */
void
tcpdropoldhalfopen(avoidtp, port)
	struct tcpcb *avoidtp;	/* never drop this connection (the new one) */
	u_int16_t port;		/* local port under suspected SYN bombing */
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int ncheck = 40;	/* first pass looks at only 40 PCBs */
	int s;

	s = splnet();
	/*
	 * Pass 1: walk the PCB queue backwards (most recent first) and
	 * drop a SYN_RECEIVED connection on the bombed port, if any.
	 */
	inp = tcbtable.inpt_queue.cqh_first;
	if (inp)						/* XXX */
	for (; inp != (struct inpcb *)&tcbtable.inpt_queue && --ncheck;
	    inp = inp->inp_queue.cqe_prev) {
		if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
		    tp != avoidtp &&
		    tp->t_state == TCPS_SYN_RECEIVED &&
		    port == inp->inp_lport) {
			tcp_close(tp);
			goto done;
		}
	}

	/*
	 * Pass 2: no port-specific victim found; assume a generic SYN
	 * bomb and drop any half-open connection, searching the whole
	 * queue this time.
	 */
	inp = tcbtable.inpt_queue.cqh_first;
	if (inp)						/* XXX */
	for (; inp != (struct inpcb *)&tcbtable.inpt_queue;
	    inp = inp->inp_queue.cqe_prev) {
		if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
		    tp != avoidtp &&
		    tp->t_state == TCPS_SYN_RECEIVED) {
			tcp_close(tp);
			goto done;
		}
	}
done:
	splx(s);
}

#ifdef INET6
/*
 * IPv6 protocol-switch entry point: reject faith-interface and anycast
 * destinations, then hand the segment to the common tcp_input().
 * Always returns IPPROTO_DONE (the mbuf is consumed here or below).
 */
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	struct mbuf *m = *mp;

#if defined(NFAITH) && 0 < NFAITH
	if (m->m_pkthdr.rcvif) {
		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
			/* XXX send icmp6 host/port unreach? */
			m_freem(m);
			return IPPROTO_DONE;
		}
	}
#endif

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	if (m->m_flags & M_ANYCAST6) {
		if (m->m_len >= sizeof(struct ip6_hdr)) {
			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		} else
			m_freem(m);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
398 { 399 struct ip *ip; 400 struct inpcb *inp; 401 u_int8_t *optp = NULL; 402 int optlen = 0; 403 int len, tlen, off; 404 struct tcpcb *tp = 0; 405 int tiflags; 406 struct socket *so = NULL; 407 int todrop, acked, ourfinisacked, needoutput = 0; 408 int hdroptlen = 0; 409 short ostate = 0; 410 struct in_addr laddr; 411 int dropsocket = 0; 412 int iss = 0; 413 u_long tiwin; 414 u_int32_t ts_val, ts_ecr; 415 int ts_present = 0; 416 int iphlen; 417 va_list ap; 418 struct tcphdr *th; 419 #ifdef INET6 420 struct in6_addr laddr6; 421 struct ip6_hdr *ip6 = NULL; 422 #endif /* INET6 */ 423 #ifdef IPSEC 424 struct m_tag *mtag; 425 struct tdb_ident *tdbi; 426 struct tdb *tdb; 427 int error, s; 428 #endif /* IPSEC */ 429 int af; 430 #ifdef TCP_ECN 431 u_char iptos; 432 #endif 433 434 va_start(ap, m); 435 iphlen = va_arg(ap, int); 436 va_end(ap); 437 438 tcpstat.tcps_rcvtotal++; 439 440 /* 441 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or 442 * TCP/IPv4. 443 */ 444 switch (mtod(m, struct ip *)->ip_v) { 445 #ifdef INET6 446 case 6: 447 af = AF_INET6; 448 break; 449 #endif 450 case 4: 451 af = AF_INET; 452 break; 453 default: 454 m_freem(m); 455 return; /*EAFNOSUPPORT*/ 456 } 457 458 /* 459 * Get IP and TCP header together in first mbuf. 460 * Note: IP leaves IP header in first mbuf. 
461 */ 462 switch (af) { 463 case AF_INET: 464 #ifdef DIAGNOSTIC 465 if (iphlen < sizeof(struct ip)) { 466 m_freem(m); 467 return; 468 } 469 #endif /* DIAGNOSTIC */ 470 if (iphlen > sizeof(struct ip)) { 471 #if 0 /*XXX*/ 472 ip_stripoptions(m, (struct mbuf *)0); 473 iphlen = sizeof(struct ip); 474 #else 475 m_freem(m); 476 return; 477 #endif 478 } 479 break; 480 #ifdef INET6 481 case AF_INET6: 482 #ifdef DIAGNOSTIC 483 if (iphlen < sizeof(struct ip6_hdr)) { 484 m_freem(m); 485 return; 486 } 487 #endif /* DIAGNOSTIC */ 488 if (iphlen > sizeof(struct ip6_hdr)) { 489 #if 0 /*XXX*/ 490 ipv6_stripoptions(m, iphlen); 491 iphlen = sizeof(struct ip6_hdr); 492 #else 493 m_freem(m); 494 return; 495 #endif 496 } 497 break; 498 #endif 499 default: 500 m_freem(m); 501 return; 502 } 503 504 if (m->m_len < iphlen + sizeof(struct tcphdr)) { 505 m = m_pullup2(m, iphlen + sizeof(struct tcphdr)); 506 if (m == NULL) { 507 tcpstat.tcps_rcvshort++; 508 return; 509 } 510 } 511 512 ip = NULL; 513 #ifdef INET6 514 ip6 = NULL; 515 #endif 516 switch (af) { 517 case AF_INET: 518 { 519 struct tcpiphdr *ti; 520 521 ip = mtod(m, struct ip *); 522 tlen = m->m_pkthdr.len - iphlen; 523 ti = mtod(m, struct tcpiphdr *); 524 525 #ifdef TCP_ECN 526 /* save ip_tos before clearing it for checksum */ 527 iptos = ip->ip_tos; 528 #endif 529 /* 530 * Checksum extended TCP header and data. 
531 */ 532 len = sizeof(struct ip) + tlen; 533 bzero(ti->ti_x1, sizeof ti->ti_x1); 534 ti->ti_len = (u_int16_t)tlen; 535 HTONS(ti->ti_len); 536 if ((m->m_pkthdr.csum & M_TCP_CSUM_IN_OK) == 0) { 537 if (m->m_pkthdr.csum & M_TCP_CSUM_IN_BAD) { 538 tcpstat.tcps_inhwcsum++; 539 tcpstat.tcps_rcvbadsum++; 540 goto drop; 541 } 542 if ((ti->ti_sum = in_cksum(m, len)) != 0) { 543 tcpstat.tcps_rcvbadsum++; 544 goto drop; 545 } 546 } else { 547 m->m_pkthdr.csum &= ~M_TCP_CSUM_IN_OK; 548 tcpstat.tcps_inhwcsum++; 549 } 550 break; 551 } 552 #ifdef INET6 553 case AF_INET6: 554 ip6 = mtod(m, struct ip6_hdr *); 555 tlen = m->m_pkthdr.len - iphlen; 556 #ifdef TCP_ECN 557 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 558 #endif 559 560 /* Be proactive about malicious use of IPv4 mapped address */ 561 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 562 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 563 /* XXX stat */ 564 goto drop; 565 } 566 567 /* 568 * Be proactive about unspecified IPv6 address in source. 569 * As we use all-zero to indicate unbounded/unconnected pcb, 570 * unspecified IPv6 address can be used to confuse us. 571 * 572 * Note that packets with unspecified IPv6 destination is 573 * already dropped in ip6_input. 574 */ 575 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 576 /* XXX stat */ 577 goto drop; 578 } 579 580 /* 581 * Checksum extended TCP header and data. 582 */ 583 if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) { 584 tcpstat.tcps_rcvbadsum++; 585 goto drop; 586 } 587 break; 588 #endif 589 } 590 #endif /* TUBA_INCLUDE */ 591 592 th = (struct tcphdr *)(mtod(m, caddr_t) + iphlen); 593 594 /* 595 * Check that TCP offset makes sense, 596 * pull out TCP options and adjust length. 
XXX 597 */ 598 off = th->th_off << 2; 599 if (off < sizeof(struct tcphdr) || off > tlen) { 600 tcpstat.tcps_rcvbadoff++; 601 goto drop; 602 } 603 tlen -= off; 604 if (off > sizeof(struct tcphdr)) { 605 if (m->m_len < iphlen + off) { 606 if ((m = m_pullup2(m, iphlen + off)) == NULL) { 607 tcpstat.tcps_rcvshort++; 608 return; 609 } 610 switch (af) { 611 case AF_INET: 612 ip = mtod(m, struct ip *); 613 break; 614 #ifdef INET6 615 case AF_INET6: 616 ip6 = mtod(m, struct ip6_hdr *); 617 break; 618 #endif 619 } 620 th = (struct tcphdr *)(mtod(m, caddr_t) + iphlen); 621 } 622 optlen = off - sizeof(struct tcphdr); 623 optp = mtod(m, u_int8_t *) + iphlen + sizeof(struct tcphdr); 624 /* 625 * Do quick retrieval of timestamp options ("options 626 * prediction?"). If timestamp is the only option and it's 627 * formatted as recommended in RFC 1323 appendix A, we 628 * quickly get the values now and not bother calling 629 * tcp_dooptions(), etc. 630 */ 631 if ((optlen == TCPOLEN_TSTAMP_APPA || 632 (optlen > TCPOLEN_TSTAMP_APPA && 633 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 634 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 635 (th->th_flags & TH_SYN) == 0) { 636 ts_present = 1; 637 ts_val = ntohl(*(u_int32_t *)(optp + 4)); 638 ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 639 optp = NULL; /* we've parsed the options */ 640 } 641 } 642 tiflags = th->th_flags; 643 644 /* 645 * Convert TCP protocol specific fields to host format. 646 */ 647 NTOHL(th->th_seq); 648 NTOHL(th->th_ack); 649 NTOHS(th->th_win); 650 NTOHS(th->th_urp); 651 652 /* 653 * Locate pcb for segment. 
654 */ 655 findpcb: 656 switch (af) { 657 #ifdef INET6 658 case AF_INET6: 659 inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, th->th_sport, 660 &ip6->ip6_dst, th->th_dport); 661 break; 662 #endif 663 case AF_INET: 664 inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport, 665 ip->ip_dst, th->th_dport); 666 break; 667 } 668 if (inp == 0) { 669 ++tcpstat.tcps_pcbhashmiss; 670 switch (af) { 671 #ifdef INET6 672 case AF_INET6: 673 inp = in_pcblookup(&tcbtable, &ip6->ip6_src, 674 th->th_sport, &ip6->ip6_dst, th->th_dport, 675 INPLOOKUP_WILDCARD | INPLOOKUP_IPV6); 676 break; 677 #endif /* INET6 */ 678 case AF_INET: 679 inp = in_pcblookup(&tcbtable, &ip->ip_src, th->th_sport, 680 &ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD); 681 break; 682 } 683 /* 684 * If the state is CLOSED (i.e., TCB does not exist) then 685 * all data in the incoming segment is discarded. 686 * If the TCB exists but is in CLOSED state, it is embryonic, 687 * but should either do a listen or a connect soon. 688 */ 689 if (inp == 0) { 690 ++tcpstat.tcps_noport; 691 goto dropwithreset_ratelim; 692 } 693 } 694 695 tp = intotcpcb(inp); 696 if (tp == 0) 697 goto dropwithreset_ratelim; 698 if (tp->t_state == TCPS_CLOSED) 699 goto drop; 700 701 /* Unscale the window into a 32-bit value. 
*/ 702 if ((tiflags & TH_SYN) == 0) 703 tiwin = th->th_win << tp->snd_scale; 704 else 705 tiwin = th->th_win; 706 707 so = inp->inp_socket; 708 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 709 if (so->so_options & SO_DEBUG) { 710 ostate = tp->t_state; 711 switch (af) { 712 #ifdef INET6 713 case AF_INET6: 714 tcp_saveti6 = *(mtod(m, struct tcpipv6hdr *)); 715 break; 716 #endif 717 case AF_INET: 718 tcp_saveti = *(mtod(m, struct tcpiphdr *)); 719 break; 720 } 721 } 722 if (so->so_options & SO_ACCEPTCONN) { 723 struct socket *so1; 724 725 #ifdef INET6 726 /* 727 * If deprecated address is forbidden, 728 * we do not accept SYN to deprecated interface 729 * address to prevent any new inbound connection from 730 * getting established. So drop the SYN packet. 731 * When we do not accept SYN, we send a TCP RST, 732 * with deprecated source address (instead of dropping 733 * it). We compromise it as it is much better for peer 734 * to send a RST, and RST will be the final packet 735 * for the exchange. 736 * 737 * If we do not forbid deprecated addresses, we accept 738 * the SYN packet. RFC2462 does not suggest dropping 739 * SYN in this case. 740 * If we decipher RFC2462 5.5.4, it says like this: 741 * 1. use of deprecated addr with existing 742 * communication is okay - "SHOULD continue to be 743 * used" 744 * 2. use of it with new communication: 745 * (2a) "SHOULD NOT be used if alternate address 746 * with sufficient scope is available" 747 * (2b) nothing mentioned otherwise. 748 * Here we fall into (2b) case as we have no choice in 749 * our source address selection - we must obey the peer. 750 * 751 * The wording in RFC2462 is confusing, and there are 752 * multiple description text for deprecated address 753 * handling - worse, they are not exactly the same. 754 * I believe 5.5.4 is the best one, so we follow 5.5.4. 
755 */ 756 if (ip6 && !ip6_use_deprecated) { 757 struct in6_ifaddr *ia6; 758 759 if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif, &ip6->ip6_dst)) && 760 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 761 tp = NULL; 762 goto dropwithreset; 763 } 764 } 765 #endif 766 767 so1 = sonewconn(so, 0); 768 if (so1 == NULL) { 769 tcpdropoldhalfopen(tp, th->th_dport); 770 so1 = sonewconn(so, 0); 771 if (so1 == NULL) 772 goto drop; 773 } 774 so = so1; 775 /* 776 * This is ugly, but .... 777 * 778 * Mark socket as temporary until we're 779 * committed to keeping it. The code at 780 * ``drop'' and ``dropwithreset'' check the 781 * flag dropsocket to see if the temporary 782 * socket created here should be discarded. 783 * We mark the socket as discardable until 784 * we're committed to it below in TCPS_LISTEN. 785 */ 786 dropsocket++; 787 #ifdef IPSEC 788 /* 789 * We need to copy the required security levels 790 * from the old pcb. Ditto for any other 791 * IPsec-related information. 792 */ 793 { 794 struct inpcb *newinp = (struct inpcb *)so->so_pcb; 795 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 796 sizeof(inp->inp_seclevel)); 797 newinp->inp_secrequire = inp->inp_secrequire; 798 if (inp->inp_ipo != NULL) { 799 newinp->inp_ipo = inp->inp_ipo; 800 inp->inp_ipo->ipo_ref_count++; 801 } 802 if (inp->inp_ipsec_remotecred != NULL) { 803 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 804 inp->inp_ipsec_remotecred->ref_count++; 805 } 806 if (inp->inp_ipsec_remoteauth != NULL) { 807 newinp->inp_ipsec_remoteauth 808 = inp->inp_ipsec_remoteauth; 809 inp->inp_ipsec_remoteauth->ref_count++; 810 } 811 } 812 #endif /* IPSEC */ 813 #ifdef INET6 814 /* 815 * inp still has the OLD in_pcb stuff, set the 816 * v6-related flags on the new guy, too. This is 817 * done particularly for the case where an AF_INET6 818 * socket is bound only to a port, and a v4 connection 819 * comes in on that port. 820 * we also copy the flowinfo from the original pcb 821 * to the new one. 
822 */ 823 { 824 int flags = inp->inp_flags; 825 struct inpcb *oldinpcb = inp; 826 827 inp = (struct inpcb *)so->so_pcb; 828 inp->inp_flags |= (flags & INP_IPV6); 829 if ((inp->inp_flags & INP_IPV6) != 0) { 830 inp->inp_ipv6.ip6_hlim = 831 oldinpcb->inp_ipv6.ip6_hlim; 832 inp->inp_ipv6.ip6_flow = 833 oldinpcb->inp_ipv6.ip6_flow; 834 } 835 } 836 #else /* INET6 */ 837 inp = (struct inpcb *)so->so_pcb; 838 #endif /* INET6 */ 839 inp->inp_lport = th->th_dport; 840 switch (af) { 841 #ifdef INET6 842 case AF_INET6: 843 inp->inp_laddr6 = ip6->ip6_dst; 844 845 /*inp->inp_options = ip6_srcroute();*/ /* soon. */ 846 /* 847 * still need to tweak outbound options 848 * processing to include this mbuf in 849 * the right place and put the correct 850 * NextHdr values in the right places. 851 * XXX rja 852 */ 853 break; 854 #endif /* INET6 */ 855 case AF_INET: 856 inp->inp_laddr = ip->ip_dst; 857 inp->inp_options = ip_srcroute(); 858 break; 859 } 860 in_pcbrehash(inp); 861 tp = intotcpcb(inp); 862 tp->t_state = TCPS_LISTEN; 863 864 /* Compute proper scaling value from buffer space */ 865 tcp_rscale(tp, so->so_rcv.sb_hiwat); 866 } 867 } 868 869 #ifdef IPSEC 870 /* Find most recent IPsec tag */ 871 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); 872 s = splnet(); 873 if (mtag != NULL) { 874 tdbi = (struct tdb_ident *)(mtag + 1); 875 tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto); 876 } else 877 tdb = NULL; 878 ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN, 879 tdb, inp); 880 if (error) { 881 splx(s); 882 goto drop; 883 } 884 885 /* Latch SA */ 886 if (inp->inp_tdb_in != tdb) { 887 if (tdb) { 888 tdb_add_inp(tdb, inp, 1); 889 if (inp->inp_ipo == NULL) { 890 inp->inp_ipo = ipsec_add_policy(inp, af, 891 IPSP_DIRECTION_OUT); 892 if (inp->inp_ipo == NULL) { 893 splx(s); 894 goto drop; 895 } 896 } 897 if (inp->inp_ipo->ipo_dstid == NULL && 898 tdb->tdb_srcid != NULL) { 899 inp->inp_ipo->ipo_dstid = tdb->tdb_srcid; 900 tdb->tdb_srcid->ref_count++; 901 } 902 if 
(inp->inp_ipsec_remotecred == NULL && 903 tdb->tdb_remote_cred != NULL) { 904 inp->inp_ipsec_remotecred = 905 tdb->tdb_remote_cred; 906 tdb->tdb_remote_cred->ref_count++; 907 } 908 if (inp->inp_ipsec_remoteauth == NULL && 909 tdb->tdb_remote_auth != NULL) { 910 inp->inp_ipsec_remoteauth = 911 tdb->tdb_remote_auth; 912 tdb->tdb_remote_auth->ref_count++; 913 } 914 } else { /* Just reset */ 915 TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp, 916 inp_tdb_in_next); 917 inp->inp_tdb_in = NULL; 918 } 919 } 920 splx(s); 921 #endif /* IPSEC */ 922 923 /* 924 * Segment received on connection. 925 * Reset idle time and keep-alive timer. 926 */ 927 tp->t_rcvtime = tcp_now; 928 if (tp->t_state != TCPS_SYN_RECEIVED) 929 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 930 931 #ifdef TCP_SACK 932 if (!tp->sack_disable) 933 tcp_del_sackholes(tp, th); /* Delete stale SACK holes */ 934 #endif /* TCP_SACK */ 935 936 /* 937 * Process options if not in LISTEN state, 938 * else do it below (after getting remote address). 939 */ 940 if (optp && tp->t_state != TCPS_LISTEN) 941 tcp_dooptions(tp, optp, optlen, th, 942 &ts_present, &ts_val, &ts_ecr); 943 944 #ifdef TCP_SACK 945 if (!tp->sack_disable) { 946 tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/ 947 tp->rcv_lastend = th->th_seq + tlen; 948 } 949 #endif /* TCP_SACK */ 950 #ifdef TCP_ECN 951 /* if congestion experienced, set ECE bit in subsequent packets. */ 952 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) { 953 tp->t_flags |= TF_RCVD_CE; 954 tcpstat.tcps_ecn_rcvce++; 955 } 956 #endif 957 /* 958 * Header prediction: check for the two common cases 959 * of a uni-directional data xfer. If the packet has 960 * no control flags, is in-sequence, the window didn't 961 * change and we're not retransmitting, it's a 962 * candidate. If the length is zero and the ack moved 963 * forward, we're the sender side of the xfer. Just 964 * free the data acked & wake any higher level process 965 * that was blocked waiting for space. 
If the length 966 * is non-zero and the ack didn't move, we're the 967 * receiver side. If we're getting packets in-order 968 * (the reassembly queue is empty), add the data to 969 * the socket buffer and note that we need a delayed ack. 970 */ 971 if (tp->t_state == TCPS_ESTABLISHED && 972 #ifdef TCP_ECN 973 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK && 974 #else 975 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 976 #endif 977 (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && 978 th->th_seq == tp->rcv_nxt && 979 tiwin && tiwin == tp->snd_wnd && 980 tp->snd_nxt == tp->snd_max) { 981 982 /* 983 * If last ACK falls within this segment's sequence numbers, 984 * record the timestamp. 985 * Fix from Braden, see Stevens p. 870 986 */ 987 if (ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 988 tp->ts_recent_age = tcp_now; 989 tp->ts_recent = ts_val; 990 } 991 992 if (tlen == 0) { 993 if (SEQ_GT(th->th_ack, tp->snd_una) && 994 SEQ_LEQ(th->th_ack, tp->snd_max) && 995 tp->snd_cwnd >= tp->snd_wnd && 996 tp->t_dupacks == 0) { 997 /* 998 * this is a pure ack for outstanding data. 999 */ 1000 ++tcpstat.tcps_predack; 1001 if (ts_present) 1002 tcp_xmit_timer(tp, tcp_now-ts_ecr+1); 1003 else if (tp->t_rtttime && 1004 SEQ_GT(th->th_ack, tp->t_rtseq)) 1005 tcp_xmit_timer(tp, 1006 tcp_now - tp->t_rtttime); 1007 acked = th->th_ack - tp->snd_una; 1008 tcpstat.tcps_rcvackpack++; 1009 tcpstat.tcps_rcvackbyte += acked; 1010 ND6_HINT(tp); 1011 sbdrop(&so->so_snd, acked); 1012 tp->snd_una = th->th_ack; 1013 #if defined(TCP_SACK) || defined(TCP_ECN) 1014 /* 1015 * We want snd_last to track snd_una so 1016 * as to avoid sequence wraparound problems 1017 * for very large transfers. 
1018 */ 1019 #ifdef TCP_ECN 1020 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1021 #endif 1022 tp->snd_last = tp->snd_una; 1023 #endif /* TCP_SACK */ 1024 #if defined(TCP_SACK) && defined(TCP_FACK) 1025 tp->snd_fack = tp->snd_una; 1026 tp->retran_data = 0; 1027 #endif /* TCP_FACK */ 1028 m_freem(m); 1029 1030 /* 1031 * If all outstanding data are acked, stop 1032 * retransmit timer, otherwise restart timer 1033 * using current (possibly backed-off) value. 1034 * If process is waiting for space, 1035 * wakeup/selwakeup/signal. If data 1036 * are ready to send, let tcp_output 1037 * decide between more output or persist. 1038 */ 1039 if (tp->snd_una == tp->snd_max) 1040 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1041 else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1042 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1043 1044 if (sb_notify(&so->so_snd)) 1045 sowwakeup(so); 1046 if (so->so_snd.sb_cc) 1047 (void) tcp_output(tp); 1048 return; 1049 } 1050 } else if (th->th_ack == tp->snd_una && 1051 tp->segq.lh_first == NULL && 1052 tlen <= sbspace(&so->so_rcv)) { 1053 /* 1054 * This is a pure, in-sequence data packet 1055 * with nothing on the reassembly queue and 1056 * we have enough buffer space to take it. 1057 */ 1058 #ifdef TCP_SACK 1059 /* Clean receiver SACK report if present */ 1060 if (!tp->sack_disable && tp->rcv_numsacks) 1061 tcp_clean_sackreport(tp); 1062 #endif /* TCP_SACK */ 1063 ++tcpstat.tcps_preddat; 1064 tp->rcv_nxt += tlen; 1065 tcpstat.tcps_rcvpack++; 1066 tcpstat.tcps_rcvbyte += tlen; 1067 ND6_HINT(tp); 1068 /* 1069 * Drop TCP, IP headers and TCP options then add data 1070 * to socket buffer. 1071 */ 1072 if (so->so_state & SS_CANTRCVMORE) 1073 m_freem(m); 1074 else { 1075 m_adj(m, iphlen + off); 1076 sbappendstream(&so->so_rcv, m); 1077 } 1078 sorwakeup(so); 1079 TCP_SETUP_ACK(tp, tiflags); 1080 if (tp->t_flags & TF_ACKNOW) 1081 (void) tcp_output(tp); 1082 return; 1083 } 1084 } 1085 1086 /* 1087 * Compute mbuf offset to TCP data segment. 
1088 */ 1089 hdroptlen = iphlen + off; 1090 1091 /* 1092 * Calculate amount of space in receive window, 1093 * and then do TCP input processing. 1094 * Receive window is amount of space in rcv queue, 1095 * but not less than advertised window. 1096 */ 1097 { int win; 1098 1099 win = sbspace(&so->so_rcv); 1100 if (win < 0) 1101 win = 0; 1102 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1103 } 1104 1105 switch (tp->t_state) { 1106 1107 /* 1108 * If the state is LISTEN then ignore segment if it contains an RST. 1109 * If the segment contains an ACK then it is bad and send a RST. 1110 * If it does not contain a SYN then it is not interesting; drop it. 1111 * If it is from this socket, drop it, it must be forged. 1112 * Don't bother responding if the destination was a broadcast. 1113 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial 1114 * tp->iss, and send a segment: 1115 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 1116 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. 1117 * Fill in remote peer address fields if not previously specified. 1118 * Enter SYN_RECEIVED state, and process any other fields of this 1119 * segment in this state. 1120 */ 1121 case TCPS_LISTEN: { 1122 struct mbuf *am; 1123 struct sockaddr_in *sin; 1124 #ifdef INET6 1125 struct sockaddr_in6 *sin6; 1126 #endif /* INET6 */ 1127 1128 if (tiflags & TH_RST) 1129 goto drop; 1130 if (tiflags & TH_ACK) 1131 goto dropwithreset; 1132 if ((tiflags & TH_SYN) == 0) 1133 goto drop; 1134 if (th->th_dport == th->th_sport) { 1135 switch (af) { 1136 #ifdef INET6 1137 case AF_INET6: 1138 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, 1139 &ip6->ip6_dst)) 1140 goto drop; 1141 break; 1142 #endif /* INET6 */ 1143 case AF_INET: 1144 if (ip->ip_dst.s_addr == ip->ip_src.s_addr) 1145 goto drop; 1146 break; 1147 } 1148 } 1149 1150 /* 1151 * RFC1122 4.2.3.10, p. 
104: discard bcast/mcast SYN 1152 */ 1153 if (m->m_flags & (M_BCAST|M_MCAST)) 1154 goto drop; 1155 switch (af) { 1156 #ifdef INET6 1157 case AF_INET6: 1158 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) 1159 goto drop; 1160 break; 1161 #endif /* INET6 */ 1162 case AF_INET: 1163 if (IN_MULTICAST(ip->ip_dst.s_addr) || 1164 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 1165 goto drop; 1166 break; 1167 } 1168 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 1169 if (am == NULL) 1170 goto drop; 1171 switch (af) { 1172 #ifdef INET6 1173 case AF_INET6: 1174 /* 1175 * This is probably the place to set the tp->pf value. 1176 * (Don't forget to do it in the v4 code as well!) 1177 * 1178 * Also, remember to blank out things like flowlabel, or 1179 * set flowlabel for accepted sockets in v6. 1180 * 1181 * FURTHERMORE, this is PROBABLY the place where the 1182 * whole business of key munging is set up for passive 1183 * connections. 1184 */ 1185 am->m_len = sizeof(struct sockaddr_in6); 1186 sin6 = mtod(am, struct sockaddr_in6 *); 1187 bzero(sin6, sizeof(*sin6)); 1188 sin6->sin6_family = AF_INET6; 1189 sin6->sin6_len = sizeof(struct sockaddr_in6); 1190 sin6->sin6_addr = ip6->ip6_src; 1191 sin6->sin6_port = th->th_sport; 1192 sin6->sin6_flowinfo = htonl(0x0fffffff) & 1193 inp->inp_ipv6.ip6_flow; 1194 laddr6 = inp->inp_laddr6; 1195 if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) 1196 inp->inp_laddr6 = ip6->ip6_dst; 1197 /* This is a good optimization. 
*/ 1198 if (in6_pcbconnect(inp, am)) { 1199 inp->inp_laddr6 = laddr6; 1200 (void) m_free(am); 1201 goto drop; 1202 } 1203 break; 1204 #endif 1205 case AF_INET: 1206 /* drop IPv4 packet to AF_INET6 socket */ 1207 if (inp->inp_flags & INP_IPV6) { 1208 (void) m_free(am); 1209 goto drop; 1210 } 1211 am->m_len = sizeof(struct sockaddr_in); 1212 sin = mtod(am, struct sockaddr_in *); 1213 bzero(sin, sizeof(*sin)); 1214 sin->sin_family = AF_INET; 1215 sin->sin_len = sizeof(*sin); 1216 sin->sin_addr = ip->ip_src; 1217 sin->sin_port = th->th_sport; 1218 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); 1219 laddr = inp->inp_laddr; 1220 if (inp->inp_laddr.s_addr == INADDR_ANY) 1221 inp->inp_laddr = ip->ip_dst; 1222 if (in_pcbconnect(inp, am)) { 1223 inp->inp_laddr = laddr; 1224 (void) m_free(am); 1225 goto drop; 1226 } 1227 break; 1228 } 1229 (void) m_free(am); 1230 tp->t_template = tcp_template(tp); 1231 if (tp->t_template == 0) { 1232 tp = tcp_drop(tp, ENOBUFS); 1233 dropsocket = 0; /* socket is already gone */ 1234 goto drop; 1235 } 1236 if (optp) 1237 tcp_dooptions(tp, optp, optlen, th, 1238 &ts_present, &ts_val, &ts_ecr); 1239 #ifdef TCP_SACK 1240 /* 1241 * If peer did not send a SACK_PERMITTED option (i.e., if 1242 * tcp_dooptions() did not set TF_SACK_PERMIT), set 1243 * sack_disable to 1 if it is currently 0. 
1244 */ 1245 if (!tp->sack_disable) 1246 if ((tp->t_flags & TF_SACK_PERMIT) == 0) 1247 tp->sack_disable = 1; 1248 #endif 1249 1250 if (iss) 1251 tp->iss = iss; 1252 else { 1253 #ifdef TCP_COMPAT_42 1254 tcp_iss += TCP_ISSINCR/2; 1255 tp->iss = tcp_iss; 1256 #else /* TCP_COMPAT_42 */ 1257 tp->iss = tcp_rndiss_next(); 1258 #endif /* !TCP_COMPAT_42 */ 1259 } 1260 tp->irs = th->th_seq; 1261 tcp_sendseqinit(tp); 1262 #if defined (TCP_SACK) || defined(TCP_ECN) 1263 tp->snd_last = tp->snd_una; 1264 #endif /* TCP_SACK */ 1265 #if defined(TCP_SACK) && defined(TCP_FACK) 1266 tp->snd_fack = tp->snd_una; 1267 tp->retran_data = 0; 1268 tp->snd_awnd = 0; 1269 #endif /* TCP_FACK */ 1270 #ifdef TCP_ECN 1271 /* 1272 * if both ECE and CWR flag bits are set, peer is ECN capable. 1273 */ 1274 if (tcp_do_ecn && 1275 (tiflags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 1276 tp->t_flags |= TF_ECN_PERMIT; 1277 tcpstat.tcps_ecn_accepts++; 1278 } 1279 #endif 1280 tcp_rcvseqinit(tp); 1281 tp->t_flags |= TF_ACKNOW; 1282 tp->t_state = TCPS_SYN_RECEIVED; 1283 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 1284 dropsocket = 0; /* committed to socket */ 1285 tcpstat.tcps_accepts++; 1286 goto trimthenstep6; 1287 } 1288 1289 /* 1290 * If the state is SYN_RECEIVED: 1291 * if seg contains SYN/ACK, send an RST. 1292 * if seg contains an ACK, but not for our SYN/ACK, send an RST 1293 */ 1294 1295 case TCPS_SYN_RECEIVED: 1296 if (tiflags & TH_ACK) { 1297 if (tiflags & TH_SYN) { 1298 tcpstat.tcps_badsyn++; 1299 goto dropwithreset; 1300 } 1301 if (SEQ_LEQ(th->th_ack, tp->snd_una) || 1302 SEQ_GT(th->th_ack, tp->snd_max)) 1303 goto dropwithreset; 1304 } 1305 break; 1306 1307 /* 1308 * If the state is SYN_SENT: 1309 * if seg contains an ACK, but not for our SYN, drop the input. 1310 * if seg contains a RST, then drop the connection. 1311 * if seg does not contain SYN, then drop it. 
1312 * Otherwise this is an acceptable SYN segment 1313 * initialize tp->rcv_nxt and tp->irs 1314 * if seg contains ack then advance tp->snd_una 1315 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1316 * arrange for segment to be acked (eventually) 1317 * continue processing rest of data/controls, beginning with URG 1318 */ 1319 case TCPS_SYN_SENT: 1320 if ((tiflags & TH_ACK) && 1321 (SEQ_LEQ(th->th_ack, tp->iss) || 1322 SEQ_GT(th->th_ack, tp->snd_max))) 1323 goto dropwithreset; 1324 if (tiflags & TH_RST) { 1325 #ifdef TCP_ECN 1326 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1327 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1328 goto drop; 1329 #endif 1330 if (tiflags & TH_ACK) 1331 tp = tcp_drop(tp, ECONNREFUSED); 1332 goto drop; 1333 } 1334 if ((tiflags & TH_SYN) == 0) 1335 goto drop; 1336 if (tiflags & TH_ACK) { 1337 tp->snd_una = th->th_ack; 1338 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1339 tp->snd_nxt = tp->snd_una; 1340 } 1341 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1342 tp->irs = th->th_seq; 1343 tcp_rcvseqinit(tp); 1344 tp->t_flags |= TF_ACKNOW; 1345 #ifdef TCP_SACK 1346 /* 1347 * If we've sent a SACK_PERMITTED option, and the peer 1348 * also replied with one, then TF_SACK_PERMIT should have 1349 * been set in tcp_dooptions(). If it was not, disable SACKs. 1350 */ 1351 if (!tp->sack_disable) 1352 if ((tp->t_flags & TF_SACK_PERMIT) == 0) 1353 tp->sack_disable = 1; 1354 #endif 1355 #ifdef TCP_ECN 1356 /* 1357 * if ECE is set but CWR is not set for SYN-ACK, or 1358 * both ECE and CWR are set for simultaneous open, 1359 * peer is ECN capable. 
1360 */ 1361 if (tcp_do_ecn) { 1362 if ((tiflags & (TH_ACK|TH_ECE|TH_CWR)) 1363 == (TH_ACK|TH_ECE) || 1364 (tiflags & (TH_ACK|TH_ECE|TH_CWR)) 1365 == (TH_ECE|TH_CWR)) { 1366 tp->t_flags |= TF_ECN_PERMIT; 1367 tiflags &= ~(TH_ECE|TH_CWR); 1368 tcpstat.tcps_ecn_accepts++; 1369 } 1370 } 1371 #endif 1372 1373 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { 1374 tcpstat.tcps_connects++; 1375 soisconnected(so); 1376 tp->t_state = TCPS_ESTABLISHED; 1377 /* Do window scaling on this connection? */ 1378 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1379 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1380 tp->snd_scale = tp->requested_s_scale; 1381 tp->rcv_scale = tp->request_r_scale; 1382 } 1383 (void) tcp_reass(tp, (struct tcphdr *)0, 1384 (struct mbuf *)0, &tlen); 1385 /* 1386 * if we didn't have to retransmit the SYN, 1387 * use its rtt as our initial srtt & rtt var. 1388 */ 1389 if (tp->t_rtttime) 1390 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1391 /* 1392 * Since new data was acked (the SYN), open the 1393 * congestion window by one MSS. We do this 1394 * here, because we won't go through the normal 1395 * ACK processing below. And since this is the 1396 * start of the connection, we know we are in 1397 * the exponential phase of slow-start. 1398 */ 1399 tp->snd_cwnd += tp->t_maxseg; 1400 } else 1401 tp->t_state = TCPS_SYN_RECEIVED; 1402 1403 trimthenstep6: 1404 /* 1405 * Advance th->th_seq to correspond to first data byte. 1406 * If data, trim to stay within window, 1407 * dropping FIN if necessary. 1408 */ 1409 th->th_seq++; 1410 if (tlen > tp->rcv_wnd) { 1411 todrop = tlen - tp->rcv_wnd; 1412 m_adj(m, -todrop); 1413 tlen = tp->rcv_wnd; 1414 tiflags &= ~TH_FIN; 1415 tcpstat.tcps_rcvpackafterwin++; 1416 tcpstat.tcps_rcvbyteafterwin += todrop; 1417 } 1418 tp->snd_wl1 = th->th_seq - 1; 1419 tp->rcv_up = th->th_seq; 1420 goto step6; 1421 } 1422 1423 /* 1424 * States other than LISTEN or SYN_SENT. 1425 * First check timestamp, if present. 
1426 * Then check that at least some bytes of segment are within 1427 * receive window. If segment begins before rcv_nxt, 1428 * drop leading data (and SYN); if nothing left, just ack. 1429 * 1430 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1431 * and it's less than ts_recent, drop it. 1432 */ 1433 if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1434 TSTMP_LT(ts_val, tp->ts_recent)) { 1435 1436 /* Check to see if ts_recent is over 24 days old. */ 1437 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1438 /* 1439 * Invalidate ts_recent. If this segment updates 1440 * ts_recent, the age will be reset later and ts_recent 1441 * will get a valid value. If it does not, setting 1442 * ts_recent to zero will at least satisfy the 1443 * requirement that zero be placed in the timestamp 1444 * echo reply when ts_recent isn't valid. The 1445 * age isn't reset until we get a valid ts_recent 1446 * because we don't want out-of-order segments to be 1447 * dropped when ts_recent is old. 1448 */ 1449 tp->ts_recent = 0; 1450 } else { 1451 tcpstat.tcps_rcvduppack++; 1452 tcpstat.tcps_rcvdupbyte += tlen; 1453 tcpstat.tcps_pawsdrop++; 1454 goto dropafterack; 1455 } 1456 } 1457 1458 todrop = tp->rcv_nxt - th->th_seq; 1459 if (todrop > 0) { 1460 if (tiflags & TH_SYN) { 1461 tiflags &= ~TH_SYN; 1462 th->th_seq++; 1463 if (th->th_urp > 1) 1464 th->th_urp--; 1465 else 1466 tiflags &= ~TH_URG; 1467 todrop--; 1468 } 1469 if (todrop > tlen || 1470 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1471 /* 1472 * Any valid FIN must be to the left of the 1473 * window. At this point, FIN must be a 1474 * duplicate or out-of-sequence, so drop it. 1475 */ 1476 tiflags &= ~TH_FIN; 1477 /* 1478 * Send ACK to resynchronize, and drop any data, 1479 * but keep on processing for RST or ACK. 
1480 */ 1481 tp->t_flags |= TF_ACKNOW; 1482 tcpstat.tcps_rcvdupbyte += todrop = tlen; 1483 tcpstat.tcps_rcvduppack++; 1484 } else { 1485 tcpstat.tcps_rcvpartduppack++; 1486 tcpstat.tcps_rcvpartdupbyte += todrop; 1487 } 1488 hdroptlen += todrop; /* drop from head afterwards */ 1489 th->th_seq += todrop; 1490 tlen -= todrop; 1491 if (th->th_urp > todrop) 1492 th->th_urp -= todrop; 1493 else { 1494 tiflags &= ~TH_URG; 1495 th->th_urp = 0; 1496 } 1497 } 1498 1499 /* 1500 * If new data are received on a connection after the 1501 * user processes are gone, then RST the other end. 1502 */ 1503 if ((so->so_state & SS_NOFDREF) && 1504 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1505 tp = tcp_close(tp); 1506 tcpstat.tcps_rcvafterclose++; 1507 goto dropwithreset; 1508 } 1509 1510 /* 1511 * If segment ends after window, drop trailing data 1512 * (and PUSH and FIN); if nothing left, just ACK. 1513 */ 1514 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1515 if (todrop > 0) { 1516 tcpstat.tcps_rcvpackafterwin++; 1517 if (todrop >= tlen) { 1518 tcpstat.tcps_rcvbyteafterwin += tlen; 1519 /* 1520 * If a new connection request is received 1521 * while in TIME_WAIT, drop the old connection 1522 * and start over if the sequence numbers 1523 * are above the previous ones. 1524 */ 1525 if (tiflags & TH_SYN && 1526 tp->t_state == TCPS_TIME_WAIT && 1527 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1528 iss = tp->snd_nxt + TCP_ISSINCR; 1529 tp = tcp_close(tp); 1530 goto findpcb; 1531 } 1532 /* 1533 * If window is closed can only take segments at 1534 * window edge, and have to drop data and PUSH from 1535 * incoming segments. Continue processing, but 1536 * remember to ack. Otherwise, drop segment 1537 * and ack. 
1538 */ 1539 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1540 tp->t_flags |= TF_ACKNOW; 1541 tcpstat.tcps_rcvwinprobe++; 1542 } else 1543 goto dropafterack; 1544 } else 1545 tcpstat.tcps_rcvbyteafterwin += todrop; 1546 m_adj(m, -todrop); 1547 tlen -= todrop; 1548 tiflags &= ~(TH_PUSH|TH_FIN); 1549 } 1550 1551 /* 1552 * If last ACK falls within this segment's sequence numbers, 1553 * record its timestamp. 1554 * Fix from Braden, see Stevens p. 870 1555 */ 1556 if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) && 1557 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1558 tp->ts_recent_age = tcp_now; 1559 tp->ts_recent = ts_val; 1560 } 1561 1562 /* 1563 * If the RST bit is set examine the state: 1564 * SYN_RECEIVED STATE: 1565 * If passive open, return to LISTEN state. 1566 * If active open, inform user that connection was refused. 1567 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1568 * Inform user that connection was reset, and close tcb. 1569 * CLOSING, LAST_ACK, TIME_WAIT STATES 1570 * Close the tcb. 1571 */ 1572 if (tiflags & TH_RST) { 1573 if (th->th_seq != tp->last_ack_sent) 1574 goto drop; 1575 1576 switch (tp->t_state) { 1577 case TCPS_SYN_RECEIVED: 1578 #ifdef TCP_ECN 1579 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1580 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1581 goto drop; 1582 #endif 1583 so->so_error = ECONNREFUSED; 1584 goto close; 1585 1586 case TCPS_ESTABLISHED: 1587 case TCPS_FIN_WAIT_1: 1588 case TCPS_FIN_WAIT_2: 1589 case TCPS_CLOSE_WAIT: 1590 so->so_error = ECONNRESET; 1591 close: 1592 tp->t_state = TCPS_CLOSED; 1593 tcpstat.tcps_drops++; 1594 tp = tcp_close(tp); 1595 goto drop; 1596 case TCPS_CLOSING: 1597 case TCPS_LAST_ACK: 1598 case TCPS_TIME_WAIT: 1599 tp = tcp_close(tp); 1600 goto drop; 1601 } 1602 } 1603 1604 /* 1605 * If a SYN is in the window, then this is an 1606 * error and we send an RST and drop the connection. 
1607 */ 1608 if (tiflags & TH_SYN) { 1609 tp = tcp_drop(tp, ECONNRESET); 1610 goto dropwithreset; 1611 } 1612 1613 /* 1614 * If the ACK bit is off we drop the segment and return. 1615 */ 1616 if ((tiflags & TH_ACK) == 0) { 1617 if (tp->t_flags & TF_ACKNOW) 1618 goto dropafterack; 1619 else 1620 goto drop; 1621 } 1622 1623 /* 1624 * Ack processing. 1625 */ 1626 switch (tp->t_state) { 1627 1628 /* 1629 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1630 * ESTABLISHED state and continue processing. 1631 * The ACK was checked above. 1632 */ 1633 case TCPS_SYN_RECEIVED: 1634 tcpstat.tcps_connects++; 1635 soisconnected(so); 1636 tp->t_state = TCPS_ESTABLISHED; 1637 /* Do window scaling? */ 1638 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1639 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1640 tp->snd_scale = tp->requested_s_scale; 1641 tp->rcv_scale = tp->request_r_scale; 1642 } 1643 (void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0, 1644 &tlen); 1645 tp->snd_wl1 = th->th_seq - 1; 1646 /* fall into ... */ 1647 1648 /* 1649 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1650 * ACKs. If the ack is in the range 1651 * tp->snd_una < th->th_ack <= tp->snd_max 1652 * then advance tp->snd_una to th->th_ack and drop 1653 * data from the retransmission queue. If this ACK reflects 1654 * more up to date window information we update our window information. 1655 */ 1656 case TCPS_ESTABLISHED: 1657 case TCPS_FIN_WAIT_1: 1658 case TCPS_FIN_WAIT_2: 1659 case TCPS_CLOSE_WAIT: 1660 case TCPS_CLOSING: 1661 case TCPS_LAST_ACK: 1662 case TCPS_TIME_WAIT: 1663 #ifdef TCP_ECN 1664 /* 1665 * if we receive ECE and are not already in recovery phase, 1666 * reduce cwnd by half but don't slow-start. 1667 * advance snd_last to snd_max not to reduce cwnd again 1668 * until all outstanding packets are acked. 
1669 */ 1670 if (tcp_do_ecn && (tiflags & TH_ECE)) { 1671 if ((tp->t_flags & TF_ECN_PERMIT) && 1672 SEQ_GEQ(tp->snd_una, tp->snd_last)) { 1673 u_int win; 1674 1675 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg; 1676 if (win > 1) { 1677 tp->snd_ssthresh = win / 2 * tp->t_maxseg; 1678 tp->snd_cwnd = tp->snd_ssthresh; 1679 tp->snd_last = tp->snd_max; 1680 tp->t_flags |= TF_SEND_CWR; 1681 tcpstat.tcps_cwr_ecn++; 1682 } 1683 } 1684 tcpstat.tcps_ecn_rcvece++; 1685 } 1686 /* 1687 * if we receive CWR, we know that the peer has reduced 1688 * its congestion window. stop sending ecn-echo. 1689 */ 1690 if ((tiflags & TH_CWR)) { 1691 tp->t_flags &= ~TF_RCVD_CE; 1692 tcpstat.tcps_ecn_rcvcwr++; 1693 } 1694 #endif /* TCP_ECN */ 1695 1696 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1697 /* 1698 * Duplicate/old ACK processing. 1699 * Increments t_dupacks: 1700 * Pure duplicate (same seq/ack/window, no data) 1701 * Doesn't affect t_dupacks: 1702 * Data packets. 1703 * Normal window updates (window opens) 1704 * Resets t_dupacks: 1705 * New data ACKed. 1706 * Window shrinks 1707 * Old ACK 1708 */ 1709 if (tlen) 1710 break; 1711 /* 1712 * If we get an old ACK, there is probably packet 1713 * reordering going on. Be conservative and reset 1714 * t_dupacks so that we are less agressive in 1715 * doing a fast retransmit. 1716 */ 1717 if (th->th_ack != tp->snd_una) { 1718 tp->t_dupacks = 0; 1719 break; 1720 } 1721 if (tiwin == tp->snd_wnd) { 1722 tcpstat.tcps_rcvdupack++; 1723 /* 1724 * If we have outstanding data (other than 1725 * a window probe), this is a completely 1726 * duplicate ack (ie, window info didn't 1727 * change), the ack is the biggest we've 1728 * seen and we've seen exactly our rexmt 1729 * threshhold of them, assume a packet 1730 * has been dropped and retransmit it. 1731 * Kludge snd_nxt & the congestion 1732 * window so we send only this one 1733 * packet. 
1734 * 1735 * We know we're losing at the current 1736 * window size so do congestion avoidance 1737 * (set ssthresh to half the current window 1738 * and pull our congestion window back to 1739 * the new ssthresh). 1740 * 1741 * Dup acks mean that packets have left the 1742 * network (they're now cached at the receiver) 1743 * so bump cwnd by the amount in the receiver 1744 * to keep a constant cwnd packets in the 1745 * network. 1746 */ 1747 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) 1748 tp->t_dupacks = 0; 1749 #if defined(TCP_SACK) && defined(TCP_FACK) 1750 /* 1751 * In FACK, can enter fast rec. if the receiver 1752 * reports a reass. queue longer than 3 segs. 1753 */ 1754 else if (++tp->t_dupacks == tcprexmtthresh || 1755 ((SEQ_GT(tp->snd_fack, tcprexmtthresh * 1756 tp->t_maxseg + tp->snd_una)) && 1757 SEQ_GT(tp->snd_una, tp->snd_last))) { 1758 #else 1759 else if (++tp->t_dupacks == tcprexmtthresh) { 1760 #endif /* TCP_FACK */ 1761 tcp_seq onxt = tp->snd_nxt; 1762 u_long win = 1763 ulmin(tp->snd_wnd, tp->snd_cwnd) / 1764 2 / tp->t_maxseg; 1765 1766 #if defined(TCP_SACK) || defined(TCP_ECN) 1767 if (SEQ_LT(th->th_ack, tp->snd_last)){ 1768 /* 1769 * False fast retx after 1770 * timeout. Do not cut window. 1771 */ 1772 tp->t_dupacks = 0; 1773 goto drop; 1774 } 1775 #endif 1776 if (win < 2) 1777 win = 2; 1778 tp->snd_ssthresh = win * tp->t_maxseg; 1779 #if defined(TCP_SACK) 1780 tp->snd_last = tp->snd_max; 1781 #endif 1782 #ifdef TCP_SACK 1783 if (!tp->sack_disable) { 1784 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1785 tp->t_rtttime = 0; 1786 #ifdef TCP_ECN 1787 tp->t_flags |= TF_SEND_CWR; 1788 #endif 1789 #if 1 /* TCP_ECN */ 1790 tcpstat.tcps_cwr_frecovery++; 1791 #endif 1792 tcpstat.tcps_sndrexmitfast++; 1793 #if defined(TCP_SACK) && defined(TCP_FACK) 1794 tp->t_dupacks = tcprexmtthresh; 1795 (void) tcp_output(tp); 1796 /* 1797 * During FR, snd_cwnd is held 1798 * constant for FACK. 
1799 */ 1800 tp->snd_cwnd = tp->snd_ssthresh; 1801 #else 1802 /* 1803 * tcp_output() will send 1804 * oldest SACK-eligible rtx. 1805 */ 1806 (void) tcp_output(tp); 1807 tp->snd_cwnd = tp->snd_ssthresh+ 1808 tp->t_maxseg * tp->t_dupacks; 1809 #endif /* TCP_FACK */ 1810 goto drop; 1811 } 1812 #endif /* TCP_SACK */ 1813 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1814 tp->t_rtttime = 0; 1815 tp->snd_nxt = th->th_ack; 1816 tp->snd_cwnd = tp->t_maxseg; 1817 #ifdef TCP_ECN 1818 tp->t_flags |= TF_SEND_CWR; 1819 #endif 1820 #if 1 /* TCP_ECN */ 1821 tcpstat.tcps_cwr_frecovery++; 1822 #endif 1823 tcpstat.tcps_sndrexmitfast++; 1824 (void) tcp_output(tp); 1825 1826 tp->snd_cwnd = tp->snd_ssthresh + 1827 tp->t_maxseg * tp->t_dupacks; 1828 if (SEQ_GT(onxt, tp->snd_nxt)) 1829 tp->snd_nxt = onxt; 1830 goto drop; 1831 } else if (tp->t_dupacks > tcprexmtthresh) { 1832 #if defined(TCP_SACK) && defined(TCP_FACK) 1833 /* 1834 * while (awnd < cwnd) 1835 * sendsomething(); 1836 */ 1837 if (!tp->sack_disable) { 1838 if (tp->snd_awnd < tp->snd_cwnd) 1839 tcp_output(tp); 1840 goto drop; 1841 } 1842 #endif /* TCP_FACK */ 1843 tp->snd_cwnd += tp->t_maxseg; 1844 (void) tcp_output(tp); 1845 goto drop; 1846 } 1847 } else if (tiwin < tp->snd_wnd) { 1848 /* 1849 * The window was retracted! Previous dup 1850 * ACKs may have been due to packets arriving 1851 * after the shrunken window, not a missing 1852 * packet, so play it safe and reset t_dupacks 1853 */ 1854 tp->t_dupacks = 0; 1855 } 1856 break; 1857 } 1858 /* 1859 * If the congestion window was inflated to account 1860 * for the other side's cached packets, retract it. 
1861 */ 1862 #if defined(TCP_SACK) 1863 if (!tp->sack_disable) { 1864 if (tp->t_dupacks >= tcprexmtthresh) { 1865 /* Check for a partial ACK */ 1866 if (tcp_sack_partialack(tp, th)) { 1867 #if defined(TCP_SACK) && defined(TCP_FACK) 1868 /* Force call to tcp_output */ 1869 if (tp->snd_awnd < tp->snd_cwnd) 1870 needoutput = 1; 1871 #else 1872 tp->snd_cwnd += tp->t_maxseg; 1873 needoutput = 1; 1874 #endif /* TCP_FACK */ 1875 } else { 1876 /* Out of fast recovery */ 1877 tp->snd_cwnd = tp->snd_ssthresh; 1878 if (tcp_seq_subtract(tp->snd_max, 1879 th->th_ack) < tp->snd_ssthresh) 1880 tp->snd_cwnd = 1881 tcp_seq_subtract(tp->snd_max, 1882 th->th_ack); 1883 tp->t_dupacks = 0; 1884 #if defined(TCP_SACK) && defined(TCP_FACK) 1885 if (SEQ_GT(th->th_ack, tp->snd_fack)) 1886 tp->snd_fack = th->th_ack; 1887 #endif /* TCP_FACK */ 1888 } 1889 } 1890 } else { 1891 if (tp->t_dupacks >= tcprexmtthresh && 1892 !tcp_newreno(tp, th)) { 1893 /* Out of fast recovery */ 1894 tp->snd_cwnd = tp->snd_ssthresh; 1895 if (tcp_seq_subtract(tp->snd_max, th->th_ack) < 1896 tp->snd_ssthresh) 1897 tp->snd_cwnd = 1898 tcp_seq_subtract(tp->snd_max, 1899 th->th_ack); 1900 tp->t_dupacks = 0; 1901 } 1902 } 1903 if (tp->t_dupacks < tcprexmtthresh) 1904 tp->t_dupacks = 0; 1905 #else /* else no TCP_SACK */ 1906 if (tp->t_dupacks >= tcprexmtthresh && 1907 tp->snd_cwnd > tp->snd_ssthresh) 1908 tp->snd_cwnd = tp->snd_ssthresh; 1909 tp->t_dupacks = 0; 1910 #endif 1911 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1912 tcpstat.tcps_rcvacktoomuch++; 1913 goto dropafterack; 1914 } 1915 acked = th->th_ack - tp->snd_una; 1916 tcpstat.tcps_rcvackpack++; 1917 tcpstat.tcps_rcvackbyte += acked; 1918 1919 /* 1920 * If we have a timestamp reply, update smoothed 1921 * round trip time. If no timestamp is present but 1922 * transmit timer is running and timed sequence 1923 * number was acked, update smoothed round trip time. 
1924 * Since we now have an rtt measurement, cancel the 1925 * timer backoff (cf., Phil Karn's retransmit alg.). 1926 * Recompute the initial retransmit timer. 1927 */ 1928 if (ts_present) 1929 tcp_xmit_timer(tp, tcp_now-ts_ecr+1); 1930 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1931 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1932 1933 /* 1934 * If all outstanding data is acked, stop retransmit 1935 * timer and remember to restart (more output or persist). 1936 * If there is more data to be acked, restart retransmit 1937 * timer, using current (possibly backed-off) value. 1938 */ 1939 if (th->th_ack == tp->snd_max) { 1940 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1941 needoutput = 1; 1942 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1943 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1944 /* 1945 * When new data is acked, open the congestion window. 1946 * If the window gives us less than ssthresh packets 1947 * in flight, open exponentially (maxseg per packet). 1948 * Otherwise open linearly: maxseg per window 1949 * (maxseg^2 / cwnd per packet). 
1950 */ 1951 { 1952 u_int cw = tp->snd_cwnd; 1953 u_int incr = tp->t_maxseg; 1954 1955 if (cw > tp->snd_ssthresh) 1956 incr = incr * incr / cw; 1957 #if defined (TCP_SACK) 1958 if (tp->t_dupacks < tcprexmtthresh) 1959 #endif 1960 tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale); 1961 } 1962 ND6_HINT(tp); 1963 if (acked > so->so_snd.sb_cc) { 1964 tp->snd_wnd -= so->so_snd.sb_cc; 1965 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1966 ourfinisacked = 1; 1967 } else { 1968 sbdrop(&so->so_snd, acked); 1969 tp->snd_wnd -= acked; 1970 ourfinisacked = 0; 1971 } 1972 if (sb_notify(&so->so_snd)) 1973 sowwakeup(so); 1974 tp->snd_una = th->th_ack; 1975 #ifdef TCP_ECN 1976 /* sync snd_last with snd_una */ 1977 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1978 tp->snd_last = tp->snd_una; 1979 #endif 1980 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1981 tp->snd_nxt = tp->snd_una; 1982 #if defined (TCP_SACK) && defined (TCP_FACK) 1983 if (SEQ_GT(tp->snd_una, tp->snd_fack)) { 1984 tp->snd_fack = tp->snd_una; 1985 /* Update snd_awnd for partial ACK 1986 * without any SACK blocks. 1987 */ 1988 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, 1989 tp->snd_fack) + tp->retran_data; 1990 } 1991 #endif 1992 1993 switch (tp->t_state) { 1994 1995 /* 1996 * In FIN_WAIT_1 STATE in addition to the processing 1997 * for the ESTABLISHED state if our FIN is now acknowledged 1998 * then enter FIN_WAIT_2. 1999 */ 2000 case TCPS_FIN_WAIT_1: 2001 if (ourfinisacked) { 2002 /* 2003 * If we can't receive any more 2004 * data, then closing user can proceed. 2005 * Starting the timer is contrary to the 2006 * specification, but if we don't get a FIN 2007 * we'll hang forever. 
2008 */ 2009 if (so->so_state & SS_CANTRCVMORE) { 2010 soisdisconnected(so); 2011 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 2012 } 2013 tp->t_state = TCPS_FIN_WAIT_2; 2014 } 2015 break; 2016 2017 /* 2018 * In CLOSING STATE in addition to the processing for 2019 * the ESTABLISHED state if the ACK acknowledges our FIN 2020 * then enter the TIME-WAIT state, otherwise ignore 2021 * the segment. 2022 */ 2023 case TCPS_CLOSING: 2024 if (ourfinisacked) { 2025 tp->t_state = TCPS_TIME_WAIT; 2026 tcp_canceltimers(tp); 2027 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2028 soisdisconnected(so); 2029 } 2030 break; 2031 2032 /* 2033 * In LAST_ACK, we may still be waiting for data to drain 2034 * and/or to be acked, as well as for the ack of our FIN. 2035 * If our FIN is now acknowledged, delete the TCB, 2036 * enter the closed state and return. 2037 */ 2038 case TCPS_LAST_ACK: 2039 if (ourfinisacked) { 2040 tp = tcp_close(tp); 2041 goto drop; 2042 } 2043 break; 2044 2045 /* 2046 * In TIME_WAIT state the only thing that should arrive 2047 * is a retransmission of the remote FIN. Acknowledge 2048 * it and restart the finack timer. 2049 */ 2050 case TCPS_TIME_WAIT: 2051 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2052 goto dropafterack; 2053 } 2054 } 2055 2056 step6: 2057 /* 2058 * Update window information. 2059 * Don't look at window if no ACK: TAC's send garbage on first SYN. 2060 */ 2061 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 2062 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) || 2063 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) { 2064 /* keep track of pure window updates */ 2065 if (tlen == 0 && 2066 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2067 tcpstat.tcps_rcvwinupd++; 2068 tp->snd_wnd = tiwin; 2069 tp->snd_wl1 = th->th_seq; 2070 tp->snd_wl2 = th->th_ack; 2071 if (tp->snd_wnd > tp->max_sndwnd) 2072 tp->max_sndwnd = tp->snd_wnd; 2073 needoutput = 1; 2074 } 2075 2076 /* 2077 * Process segments with URG. 
2078 */ 2079 if ((tiflags & TH_URG) && th->th_urp && 2080 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2081 /* 2082 * This is a kludge, but if we receive and accept 2083 * random urgent pointers, we'll crash in 2084 * soreceive. It's hard to imagine someone 2085 * actually wanting to send this much urgent data. 2086 */ 2087 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2088 th->th_urp = 0; /* XXX */ 2089 tiflags &= ~TH_URG; /* XXX */ 2090 goto dodata; /* XXX */ 2091 } 2092 /* 2093 * If this segment advances the known urgent pointer, 2094 * then mark the data stream. This should not happen 2095 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2096 * a FIN has been received from the remote side. 2097 * In these states we ignore the URG. 2098 * 2099 * According to RFC961 (Assigned Protocols), 2100 * the urgent pointer points to the last octet 2101 * of urgent data. We continue, however, 2102 * to consider it to indicate the first octet 2103 * of data past the urgent section as the original 2104 * spec states (in one of two places). 2105 */ 2106 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2107 tp->rcv_up = th->th_seq + th->th_urp; 2108 so->so_oobmark = so->so_rcv.sb_cc + 2109 (tp->rcv_up - tp->rcv_nxt) - 1; 2110 if (so->so_oobmark == 0) 2111 so->so_state |= SS_RCVATMARK; 2112 sohasoutofband(so); 2113 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2114 } 2115 /* 2116 * Remove out of band data so doesn't get presented to user. 2117 * This can happen independent of advancing the URG pointer, 2118 * but if two URG's are pending at once, some out-of-band 2119 * data may creep in... ick. 2120 */ 2121 if (th->th_urp <= (u_int16_t) tlen 2122 #ifdef SO_OOBINLINE 2123 && (so->so_options & SO_OOBINLINE) == 0 2124 #endif 2125 ) 2126 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2127 } else 2128 /* 2129 * If no out of band data is expected, 2130 * pull receive urgent pointer along 2131 * with the receive window. 
2132 */ 2133 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2134 tp->rcv_up = tp->rcv_nxt; 2135 dodata: /* XXX */ 2136 2137 /* 2138 * Process the segment text, merging it into the TCP sequencing queue, 2139 * and arranging for acknowledgment of receipt if necessary. 2140 * This process logically involves adjusting tp->rcv_wnd as data 2141 * is presented to the user (this happens in tcp_usrreq.c, 2142 * case PRU_RCVD). If a FIN has already been received on this 2143 * connection then we just ignore the text. 2144 */ 2145 if ((tlen || (tiflags & TH_FIN)) && 2146 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2147 if (th->th_seq == tp->rcv_nxt && tp->segq.lh_first == NULL && 2148 tp->t_state == TCPS_ESTABLISHED) { 2149 TCP_SETUP_ACK(tp, tiflags); 2150 tp->rcv_nxt += tlen; 2151 tiflags = th->th_flags & TH_FIN; 2152 tcpstat.tcps_rcvpack++; 2153 tcpstat.tcps_rcvbyte += tlen; 2154 ND6_HINT(tp); 2155 if (so->so_state & SS_CANTRCVMORE) 2156 m_freem(m); 2157 else { 2158 m_adj(m, hdroptlen); 2159 sbappendstream(&so->so_rcv, m); 2160 } 2161 sorwakeup(so); 2162 } else { 2163 m_adj(m, hdroptlen); 2164 tiflags = tcp_reass(tp, th, m, &tlen); 2165 tp->t_flags |= TF_ACKNOW; 2166 } 2167 #ifdef TCP_SACK 2168 if (!tp->sack_disable) 2169 tcp_update_sack_list(tp); 2170 #endif 2171 2172 /* 2173 * variable len never referenced again in modern BSD, 2174 * so why bother computing it ?? 2175 */ 2176 #if 0 2177 /* 2178 * Note the amount of data that peer has sent into 2179 * our window, in order to estimate the sender's 2180 * buffer size. 2181 */ 2182 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2183 #endif /* 0 */ 2184 } else { 2185 m_freem(m); 2186 tiflags &= ~TH_FIN; 2187 } 2188 2189 /* 2190 * If FIN is received ACK the FIN and let the user know 2191 * that the connection is closing. Ignore a FIN received before 2192 * the connection is fully established. 
2193 */ 2194 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2195 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2196 socantrcvmore(so); 2197 tp->t_flags |= TF_ACKNOW; 2198 tp->rcv_nxt++; 2199 } 2200 switch (tp->t_state) { 2201 2202 /* 2203 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2204 */ 2205 case TCPS_ESTABLISHED: 2206 tp->t_state = TCPS_CLOSE_WAIT; 2207 break; 2208 2209 /* 2210 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2211 * enter the CLOSING state. 2212 */ 2213 case TCPS_FIN_WAIT_1: 2214 tp->t_state = TCPS_CLOSING; 2215 break; 2216 2217 /* 2218 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2219 * starting the time-wait timer, turning off the other 2220 * standard timers. 2221 */ 2222 case TCPS_FIN_WAIT_2: 2223 tp->t_state = TCPS_TIME_WAIT; 2224 tcp_canceltimers(tp); 2225 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2226 soisdisconnected(so); 2227 break; 2228 2229 /* 2230 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2231 */ 2232 case TCPS_TIME_WAIT: 2233 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2234 break; 2235 } 2236 } 2237 if (so->so_options & SO_DEBUG) { 2238 switch (tp->pf) { 2239 #ifdef INET6 2240 case PF_INET6: 2241 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2242 0, tlen); 2243 break; 2244 #endif /* INET6 */ 2245 case PF_INET: 2246 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2247 0, tlen); 2248 break; 2249 } 2250 } 2251 2252 /* 2253 * Return any desired output. 2254 */ 2255 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 2256 (void) tcp_output(tp); 2257 } 2258 return; 2259 2260 dropafterack: 2261 /* 2262 * Generate an ACK dropping incoming segment if it occupies 2263 * sequence space, where the ACK reflects our state. 
2264 */ 2265 if (tiflags & TH_RST) 2266 goto drop; 2267 m_freem(m); 2268 tp->t_flags |= TF_ACKNOW; 2269 (void) tcp_output(tp); 2270 return; 2271 2272 dropwithreset_ratelim: 2273 /* 2274 * We may want to rate-limit RSTs in certain situations, 2275 * particularly if we are sending an RST in response to 2276 * an attempt to connect to or otherwise communicate with 2277 * a port for which we have no socket. 2278 */ 2279 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2280 tcp_rst_ppslim) == 0) { 2281 /* XXX stat */ 2282 goto drop; 2283 } 2284 /* ...fall into dropwithreset... */ 2285 2286 dropwithreset: 2287 /* 2288 * Generate a RST, dropping incoming segment. 2289 * Make ACK acceptable to originator of segment. 2290 * Don't bother to respond if destination was broadcast/multicast. 2291 */ 2292 if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 2293 goto drop; 2294 switch (af) { 2295 #ifdef INET6 2296 case AF_INET6: 2297 /* For following calls to tcp_respond */ 2298 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) 2299 goto drop; 2300 break; 2301 #endif /* INET6 */ 2302 case AF_INET: 2303 if (IN_MULTICAST(ip->ip_dst.s_addr) || 2304 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 2305 goto drop; 2306 } 2307 if (tiflags & TH_ACK) { 2308 tcp_respond(tp, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack, 2309 TH_RST); 2310 } else { 2311 if (tiflags & TH_SYN) 2312 tlen++; 2313 tcp_respond(tp, mtod(m, caddr_t), m, th->th_seq + tlen, 2314 (tcp_seq)0, TH_RST|TH_ACK); 2315 } 2316 /* destroy temporarily created socket */ 2317 if (dropsocket) 2318 (void) soabort(so); 2319 return; 2320 2321 drop: 2322 /* 2323 * Drop space held by incoming segment and return. 
 */
	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
		switch (tp->pf) {
#ifdef INET6
		case PF_INET6:
			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6,
			    0, tlen);
			break;
#endif /* INET6 */
		case PF_INET:
			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti,
			    0, tlen);
			break;
		}
	}

	m_freem(m);
	/* destroy temporarily created socket */
	if (dropsocket)
		(void) soabort(so);
	return;
#ifndef TUBA_INCLUDE
}

/*
 * Parse the TCP options that follow the fixed header of an incoming
 * segment.  cp points at the first option byte and cnt is the total
 * option length; the loop walks one option per iteration, stopping at
 * EOL or on any malformed length field (optlen < 2 or running past cnt).
 * Recognized options:
 *	MAXSEG (SYN only)	-- peer's offered mss, applied via tcp_mss()
 *	WINDOW (SYN only)	-- peer's window-scale request
 *	TIMESTAMP		-- returned through *ts_present/*ts_val/*ts_ecr
 *	SACK_PERMITTED/SACK	-- handed to the TCP_SACK machinery
 * Unknown options are skipped.  All multi-byte fields are bcopy'd out
 * because the option block has no alignment guarantee.
 */
void
tcp_dooptions(tp, cp, cnt, th, ts_present, ts_val, ts_ecr)
	struct tcpcb *tp;
	u_char *cp;
	int cnt;
	struct tcphdr *th;
	int *ts_present;
	u_int32_t *ts_val, *ts_ecr;
{
	u_int16_t mss = 0;
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			/* Need at least kind+length bytes for any other option. */
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
			NTOHS(mss);
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			/* Clamp the peer's shift count to our maximum. */
			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			*ts_present = 1;
			bcopy((char *)cp + 2, (char *) ts_val, sizeof(*ts_val));
			NTOHL(*ts_val);
			bcopy((char *)cp + 6, (char *) ts_ecr, sizeof(*ts_ecr));
			NTOHL(*ts_ecr);

			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
			if (th->th_flags & TH_SYN) {
				tp->t_flags |= TF_RCVD_TSTMP;
				tp->ts_recent = *ts_val;
				tp->ts_recent_age = tcp_now;
			}
			break;

#ifdef TCP_SACK
		case TCPOPT_SACK_PERMITTED:
			if (tp->sack_disable || optlen!=TCPOLEN_SACK_PERMITTED)
				continue;
			if (th->th_flags & TH_SYN)
				/* MUST only be set on SYN */
				tp->t_flags |= TF_SACK_PERMIT;
			break;
		case TCPOPT_SACK:
			if (tcp_sack_option(tp, th, cp, optlen))
				continue;
			break;
#endif
		}
	}
	/* Update t_maxopd and t_maxseg after all options are processed */
	if (th->th_flags & TH_SYN) {
		(void) tcp_mss(tp, mss);	/* sets t_maxseg */

		/*
		 * A peer-supplied mss of 0 means the MAXSEG option was
		 * absent or malformed; only update connection variables
		 * when a real offer arrived.
		 */
		if (mss)
			tcp_mss_update(tp);
	}
}

#if defined(TCP_SACK)
/*
 * Modular (wrap-safe) subtraction in 32-bit TCP sequence space:
 * returns a - b.  The (long) cast is historical; callers treat the
 * result as an unsigned distance.
 */
u_long
tcp_seq_subtract(a, b)
	u_long a, b;
{
	return ((long)(a - b));
}
#endif


#ifdef TCP_SACK
/*
 * This function is called upon receipt of new valid data (while not in header
 * prediction mode), and it updates the ordered list of sacks.
 */
void
tcp_update_sack_list(tp)
	struct tcpcb *tp;
{
	/*
	 * First reported block MUST be the most recent one.  Subsequent
	 * blocks SHOULD be in the order in which they arrived at the
	 * receiver.  These two conditions make the implementation fully
	 * compliant with RFC 2018.
	 */
	int i, j = 0, count = 0, lastpos = -1;
	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];

	/* First clean up current list of sacks */
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		/* A zeroed block is the "empty slot" sentinel. */
		if (sack.start == 0 && sack.end == 0) {
			count++;	/* count = number of blocks to be discarded */
			continue;
		}
		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
			/* Fully delivered to the user -- drop the block. */
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			count++;
		} else {
			/* Still relevant -- keep a compacted copy in temp[]. */
			temp[j].start = tp->sackblks[i].start;
			temp[j++].end = tp->sackblks[i].end;
		}
	}
	tp->rcv_numsacks -= count;
	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
		tcp_clean_sackreport(tp);
		if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) {
			/* ==> need first sack block */
			tp->sackblks[0].start = tp->rcv_laststart;
			tp->sackblks[0].end = tp->rcv_lastend;
			tp->rcv_numsacks = 1;
		}
		return;
	}
	/* Otherwise, sack blocks are already present. */
	for (i = 0; i < tp->rcv_numsacks; i++)
		tp->sackblks[i] = temp[i]; /* first copy back sack list */
	if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend))
		return; /* sack list remains unchanged */
	/*
	 * From here, segment just received should be (part of) the 1st sack.
	 * Go through list, possibly coalescing sack block entries.
	 */
	firstsack.start = tp->rcv_laststart;
	firstsack.end = tp->rcv_lastend;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (SEQ_LT(sack.end, firstsack.start) ||
		    SEQ_GT(sack.start, firstsack.end))
			continue; /* no overlap */
		if (sack.start == firstsack.start && sack.end == firstsack.end){
			/*
			 * identical block; delete it here since we will
			 * move it to the front of the list.
			 */
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			lastpos = i;	/* last posn with a zero entry */
			continue;
		}
		/* Overlapping block: absorb it into firstsack and zero it. */
		if (SEQ_LEQ(sack.start, firstsack.start))
			firstsack.start = sack.start; /* merge blocks */
		if (SEQ_GEQ(sack.end, firstsack.end))
			firstsack.end = sack.end;     /* merge blocks */
		tp->sackblks[i].start = tp->sackblks[i].end = 0;
		lastpos = i;	/* last posn with a zero entry */
	}
	if (lastpos != -1) { /* at least one merge */
		/* Compact survivors into temp[1..]; slot 0 is reserved. */
		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
			sack = tp->sackblks[i];
			if (sack.start == 0 && sack.end == 0)
				continue;
			temp[j++] = sack;
		}
		tp->rcv_numsacks = j; /* including first blk (added later) */
		for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
			tp->sackblks[i] = temp[i];
	} else {	/* no merges -- shift sacks by 1 */
		if (tp->rcv_numsacks < MAX_SACK_BLKS)
			tp->rcv_numsacks++;
		for (i = tp->rcv_numsacks-1; i > 0; i--)
			tp->sackblks[i] = tp->sackblks[i-1];
	}
	/* Most recent block goes first, per RFC 2018. */
	tp->sackblks[0] = firstsack;
	return;
}

/*
 * Process the TCP SACK option.  Returns 1 if tcp_dooptions() should continue,
 * and 0 otherwise, if the option was fine.  tp->snd_holes is an ordered list
 * of holes (oldest to newest, in terms of the sequence space).
 */
int
tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
{
	int tmp_olen;
	u_char *tmp_cp;
	struct sackhole *cur, *p, *temp;

	if (tp->sack_disable)
		return (1);

	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
		return (1);
	/* Skip the kind/length bytes; tmp_cp walks the SACK block pairs. */
	tmp_cp = cp + 2;
	tmp_olen = optlen - 2;
	if (tp->snd_numholes < 0)
		tp->snd_numholes = 0;
	if (tp->t_maxseg == 0)
		panic("tcp_sack_option"); /* Should never happen */
	while (tmp_olen > 0) {
		struct sackblk sack;

		/* bcopy: the option bytes are not aligned for tcp_seq. */
		bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
		NTOHL(sack.start);
		bcopy(tmp_cp + sizeof(tcp_seq),
		    (char *) &(sack.end), sizeof(tcp_seq));
		NTOHL(sack.end);
		tmp_olen -= TCPOLEN_SACK;
		tmp_cp += TCPOLEN_SACK;
		if (SEQ_LEQ(sack.end, sack.start))
			continue; /* bad SACK fields */
		if (SEQ_LEQ(sack.end, tp->snd_una))
			continue; /* old block */
#if defined(TCP_SACK) && defined(TCP_FACK)
		/* Updates snd_fack.  */
		if (SEQ_GT(sack.end, tp->snd_fack))
			tp->snd_fack = sack.end;
#endif /* TCP_FACK */
		if (SEQ_GT(th->th_ack, tp->snd_una)) {
			/* Block below the cumulative ack carries no news. */
			if (SEQ_LT(sack.start, th->th_ack))
				continue;
		}
		if (SEQ_GT(sack.end, tp->snd_max))
			continue;
		if (tp->snd_holes == NULL) { /* first hole */
			tp->snd_holes = (struct sackhole *)
			    pool_get(&sackhl_pool, PR_NOWAIT);
			if (tp->snd_holes == NULL) {
				/* ENOBUFS, so ignore SACKed block for now*/
				continue;
			}
			/* The first hole spans snd_una..sack.start. */
			cur = tp->snd_holes;
			cur->start = th->th_ack;
			cur->end = sack.start;
			cur->rxmit = cur->start;
			cur->next = NULL;
			tp->snd_numholes = 1;
			tp->rcv_lastsack = sack.end;
			/*
			 * dups is at least one.  If more data has been
			 * SACKed, it can be greater than one.
			 */
			cur->dups = min(tcprexmtthresh,
			    ((sack.end - cur->end)/tp->t_maxseg));
			if (cur->dups < 1)
				cur->dups = 1;
			continue; /* with next sack block */
		}
		/* Go thru list of holes:  p = previous,  cur = current */
		p = cur = tp->snd_holes;
		while (cur) {
			if (SEQ_LEQ(sack.end, cur->start))
				/* SACKs data before the current hole */
				break; /* no use going through more holes */
			if (SEQ_GEQ(sack.start, cur->end)) {
				/* SACKs data beyond the current hole */
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LEQ(sack.start, cur->start)) {
				/* Data acks at least the beginning of hole */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(sack.end, cur->rxmit))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    cur->start);
				else
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    cur->start);
#endif /* TCP_FACK */
				if (SEQ_GEQ(sack.end, cur->end)) {
					/* Acks entire hole, so delete hole */
					if (p != cur) {
						p->next = cur->next;
						pool_put(&sackhl_pool, cur);
						cur = p->next;
					} else {
						/*
						 * cur is the list head; advance
						 * the head before freeing it.
						 */
						cur = cur->next;
						pool_put(&sackhl_pool, p);
						p = cur;
						tp->snd_holes = p;
					}
					tp->snd_numholes--;
					continue;
				}
				/* otherwise, move start of hole forward */
				cur->start = sack.end;
				cur->rxmit = max (cur->rxmit, cur->start);
				p = cur;
				cur = cur->next;
				continue;
			}
			/* move end of hole backward */
			if (SEQ_GEQ(sack.end, cur->end)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				cur->end = sack.start;
				cur->rxmit = min(cur->rxmit, cur->end);
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LT(cur->start, sack.start) &&
			    SEQ_GT(cur->end, sack.end)) {
				/*
				 * ACKs some data in middle of a hole; need to
				 * split current hole
				 */
				temp = (struct sackhole *)
				    pool_get(&sackhl_pool, PR_NOWAIT);
				if (temp == NULL)
					continue; /* ENOBUFS */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.end))
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    sack.start);
				else if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				/* temp becomes the upper half of the hole. */
				temp->next = cur->next;
				temp->start = sack.end;
				temp->end = cur->end;
				temp->dups = cur->dups;
				temp->rxmit = max(cur->rxmit, temp->start);
				/* cur shrinks to the lower half. */
				cur->end = sack.start;
				cur->rxmit = min(cur->rxmit, cur->end);
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				cur->next = temp;
				p = temp;
				cur = p->next;
				tp->snd_numholes++;
			}
		}
		/* At this point, p points to the last hole on the list */
		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
			/*
			 * Need to append new hole at end.
			 * Last hole is p (and it's not NULL).
			 */
			temp = (struct sackhole *)
			    pool_get(&sackhl_pool, PR_NOWAIT);
			if (temp == NULL)
				continue; /* ENOBUFS */
			temp->start = tp->rcv_lastsack;
			temp->end = sack.start;
			temp->dups = min(tcprexmtthresh,
			    ((sack.end - sack.start)/tp->t_maxseg));
			if (temp->dups < 1)
				temp->dups = 1;
			temp->rxmit = temp->start;
			temp->next = 0;
			p->next = temp;
			tp->rcv_lastsack = sack.end;
			tp->snd_numholes++;
		}
	}
#if defined(TCP_SACK) && defined(TCP_FACK)
	/*
	 * Update retran_data and snd_awnd.  Go through the list of
	 * holes.   Increment retran_data by (hole->rxmit - hole->start).
	 */
	tp->retran_data = 0;
	cur = tp->snd_holes;
	while (cur) {
		tp->retran_data += cur->rxmit - cur->start;
		cur = cur->next;
	}
	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
	    tp->retran_data;
#endif /* TCP_FACK */

	return (0);
}

/*
 * Delete stale (i.e, cumulatively ack'd) holes.  Hole is deleted only if
 * it is completely acked; otherwise, tcp_sack_option(), called from
 * tcp_dooptions(), will fix up the hole.
 */
void
tcp_del_sackholes(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	if (!tp->sack_disable && tp->t_state != TCPS_LISTEN) {
		/* max because this could be an older ack just arrived */
		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
			th->th_ack : tp->snd_una;
		struct sackhole *cur = tp->snd_holes;
		struct sackhole *prev;
		/*
		 * Holes are ordered oldest-first, so we can stop at the
		 * first one that is not fully covered by lastack.
		 */
		while (cur)
			if (SEQ_LEQ(cur->end, lastack)) {
				/* Hole entirely acked: free and advance. */
				prev = cur;
				cur = cur->next;
				pool_put(&sackhl_pool, prev);
				tp->snd_numholes--;
			} else if (SEQ_LT(cur->start, lastack)) {
				/* Partially acked: trim the front. */
				cur->start = lastack;
				if (SEQ_LT(cur->rxmit, cur->start))
					cur->rxmit = cur->start;
				break;
			} else
				break;
		/* cur is the first surviving hole (or NULL). */
		tp->snd_holes = cur;
	}
}

/*
 * Delete all receiver-side SACK information.
 */
void
tcp_clean_sackreport(tp)
	struct tcpcb *tp;
{
	int i;

	tp->rcv_numsacks = 0;
	for (i = 0; i < MAX_SACK_BLKS; i++)
		tp->sackblks[i].start = tp->sackblks[i].end=0;

}

/*
 * Checks for partial ack.  If partial ack arrives, turn off retransmission
 * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
 * If the ack advances at least to tp->snd_last, return 0.
 */
int
tcp_sack_partialack(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	if (SEQ_LT(th->th_ack, tp->snd_last)) {
		/* Turn off retx. timer (will start again next segment) */
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
#ifndef TCP_FACK
		/*
		 * Partial window deflation.  This statement relies on the
		 * fact that tp->snd_una has not been updated yet.  In FACK
		 * hold snd_cwnd constant during fast recovery.
		 */
		if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
			tp->snd_cwnd -= th->th_ack - tp->snd_una;
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->snd_cwnd = tp->t_maxseg;
#endif
		return (1);
	}
	return (0);
}
#endif /* TCP_SACK */

/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 */
void
tcp_pulloutofband(so, urgent, m, off)
	struct socket *so;
	u_int urgent;
	struct mbuf *m;
	int off;
{
	/* Offset of the urgent byte relative to the start of this mbuf chain. */
	int cnt = off + urgent - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			/* Urgent byte lives in this mbuf: save it ... */
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			/* ... then splice it out of the data stream. */
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == 0)
			break;
	}
	/* Caller guaranteed the byte is in the chain; anything else is a bug. */
	panic("tcp_pulloutofband");
}

/*
 * Collect new round-trip time estimate
 * and update averages and current timeout.
 * rtt is a tick count; the decrement below adjusts it to origin 0
 * (see the smoothing comment in the body).
 */
void
tcp_xmit_timer(tp, rtt)
	struct tcpcb *tp;
	short rtt;
{
	short delta;
	short rttmin;

	tcpstat.tcps_rttupdated++;
	--rtt;
	if (tp->t_srtt != 0) {
		/*
		 * srtt is stored as fixed point with 3 bits after the
		 * binary point (i.e., scaled by 8).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 */
		delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1;
		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 2 bits after the
		 * binary point (scaled by 4).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
	}
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	if (tp->t_rttmin > rtt + 2)
		rttmin = tp->t_rttmin;
	else
		rttmin = rtt + 2;
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing
 * interface without forcing IP to fragment; if bigger than
 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 * to utilize large mbufs.  If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * Also take into account the space needed for options that we
 * send regularly.  Make maxseg shorter by that amount to assure
 * that we can send maxseg amount of data even when the options
 * are present.  Store the upper limit of the length of options plus
 * data in maxopd.
 *
 * NOTE: offer == -1 indicates that the maxseg size changed due to
 * Path MTU discovery.
 */
int
tcp_mss(tp, offer)
	struct tcpcb *tp;
	int offer;
{
	struct rtentry *rt;
	struct ifnet *ifp;
	int mss, mssopt;
	int iphlen;
	struct inpcb *inp;

	inp = tp->t_inpcb;

	/* Conservative defaults, used whenever we bail out below. */
	mssopt = mss = tcp_mssdflt;

	rt = in_pcbrtentry(inp);

	if (rt == NULL)
		goto out;

	ifp = rt->rt_ifp;

	switch (tp->pf) {
#ifdef INET6
	case AF_INET6:
		iphlen = sizeof(struct ip6_hdr);
		break;
#endif
	case AF_INET:
		iphlen = sizeof(struct ip);
		break;
	default:
		/* the family does not support path MTU discovery */
		goto out;
	}

#ifdef RTV_MTU
	/*
	 * if there's an mtu associated with the route and we support
	 * path MTU discovery for the underlying protocol family, use it.
	 */
	if (rt->rt_rmx.rmx_mtu) {
		/*
		 * One may wish to lower MSS to take into account options,
		 * especially security-related options.
		 */
		mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
	} else
#endif /* RTV_MTU */
	if (!ifp)
		/*
		 * ifp may be null and rmx_mtu may be zero in certain
		 * v6 cases (e.g., if ND wasn't able to resolve the
		 * destination host.
		 */
		goto out;
	else if (ifp->if_flags & IFF_LOOPBACK)
		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	else if (tp->pf == AF_INET) {
		if (ip_mtudisc)
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
		else if (inp && in_localaddr(inp->inp_faddr))
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	}
#ifdef INET6
	else if (tp->pf == AF_INET6) {
		/*
		 * for IPv6, path MTU discovery is always turned on,
		 * or the node must use packet size <= 1280.
		 */
		mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr);
	}
#endif /* INET6 */

	/* Calculate the value that we offer in TCPOPT_MAXSEG */
	if (offer != -1) {
#ifndef INET6
		mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
#else
		if (tp->pf == AF_INET)
			mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
		else
			mssopt = IN6_LINKMTU(ifp) - iphlen -
			    sizeof(struct tcphdr);
#endif

		mssopt = max(tcp_mssdflt, mssopt);
	}

 out:
	/*
	 * The current mss, t_maxseg, is initialized to the default value.
	 * If we compute a smaller value, reduce the current mss.
	 * If we compute a larger value, return it for use in sending
	 * a max seg size option, but don't store it for use
	 * unless we received an offer at least that large from peer.
	 * However, do not accept offers under 64 bytes.
	 */
	if (offer > 0)
		tp->t_peermss = offer;
	if (tp->t_peermss)
		mss = min(mss, tp->t_peermss);
	mss = max(mss, 64);		/* sanity - at least max opt. space */

	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	/* Leave room for a timestamp option if both sides negotiated it. */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
		mss -= TCPOLEN_TSTAMP_APPA;

	if (offer == -1) {
		/* mss changed due to Path MTU discovery */
		if (mss < tp->t_maxseg) {
			/*
			 * Follow suggestion in RFC 2414 to reduce the
			 * congestion window by the ratio of the old
			 * segment size to the new segment size.
			 */
			tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
			    mss, mss);
		}
	} else
		tp->snd_cwnd = mss;

	tp->t_maxseg = mss;

	return (offer != -1 ? mssopt : mss);
}

/*
 * Set connection variables based on the effective MSS.
 * We are passed the TCPCB for the actual connection.  If we
 * are the server, we are called by the compressed state engine
 * when the 3-way handshake is complete.  If we are the client,
 * we are called when we receive the SYN,ACK from the server.
 *
 * NOTE: The t_maxseg value must be initialized in the TCPCB
 * before this routine is called!
 */
void
tcp_mss_update(tp)
	struct tcpcb *tp;
{
	int mss, rtt;
	u_long bufsize;
	struct rtentry *rt;
	struct socket *so;

	so = tp->t_inpcb->inp_socket;
	mss = tp->t_maxseg;

	rt = in_pcbrtentry(tp->t_inpcb);

	if (rt == NULL)
		return;

#ifdef RTV_MTU	/* if route characteristics exist ... */
	/*
	 * While we're here, check if there's an initial rtt
	 * or rttvar.  Convert from the route-table units
	 * to scaled multiples of the slow timeout timer.
	 */
	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
		/*
		 * XXX the lock bit for MTU indicates that the value
		 * is also a minimum value; this is subject to time.
		 */
		if (rt->rt_rmx.rmx_locks & RTV_RTT)
			TCPT_RANGESET(tp->t_rttmin,
			    rtt / (RTM_RTTUNIT / PR_SLOWHZ),
			    TCPTV_MIN, TCPTV_REXMTMAX);
		tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
		if (rt->rt_rmx.rmx_rttvar)
			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
		else
			/* default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
		TCPT_RANGESET((long) tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
		    tp->t_rttmin, TCPTV_REXMTMAX);
	}
#endif

	/*
	 * If there's a pipesize, change the socket buffer
	 * to that size.  Make the socket buffers an integral
	 * number of mss units; if the mss is larger than
	 * the socket buffer, decrease the mss.
	 */
#ifdef RTV_SPIPE
	if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
#endif
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss) {
		mss = bufsize;
		/* Update t_maxseg and t_maxopd */
		tcp_mss(tp, mss);
	} else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_snd, bufsize);
	}

#ifdef RTV_RPIPE
	if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
#endif
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_rcv, bufsize);
#ifdef RTV_RPIPE
		if (rt->rt_rmx.rmx_recvpipe > 0)
			tcp_rscale(tp, so->so_rcv.sb_hiwat);
#endif
	}

#ifdef RTV_SSTHRESH
	if (rt->rt_rmx.rmx_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * the slow start threshhold, but set the
		 * threshold to no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
	}
#endif /* RTV_MTU */
}
#endif /* TUBA_INCLUDE */

#if defined (TCP_SACK)
/*
 * Checks for partial ack.  If partial ack arrives, force the retransmission
 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
 * 1.  By setting snd_nxt to ti_ack, this forces retransmission timer to
 * be started again.  If the ack advances at least to tp->snd_last, return 0.
 */
int
tcp_newreno(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	if (SEQ_LT(th->th_ack, tp->snd_last)) {
		/*
		 * snd_una has not been updated and the socket send buffer
		 * not yet drained of the acked data, so we have to leave
		 * snd_una as it was to get the correct data offset in
		 * tcp_output().
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long  ocwnd = tp->snd_cwnd;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
		tp->snd_nxt = th->th_ack;
		/*
		 * Set snd_cwnd to one segment beyond acknowledged offset
		 * (tp->snd_una not yet updated when this function is called)
		 */
		tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
		(void) tcp_output(tp);
		/* Restore the window; the forced send used a shrunken one. */
		tp->snd_cwnd = ocwnd;
		if (SEQ_GT(onxt, tp->snd_nxt))
			tp->snd_nxt = onxt;
		/*
		 * Partial window deflation.  Relies on fact that tp->snd_una
		 * not updated yet.
		 */
		tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
		return 1;
	}
	return 0;
}
#endif /* TCP_SACK */