/*	$OpenBSD: tcp_input.c,v 1.57 2000/02/21 21:42:13 provos Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.5 (Berkeley) 4/10/94
 */

/*
%%% portions-copyright-nrl-95
Portions of this software are Copyright 1995-1998 by Randall Atkinson,
Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
Reserved. All rights under this copyright have been assigned to the US
Naval Research Laboratory (NRL). The NRL Copyright Notice and License
Agreement Version 1.1 (January 17, 1995) applies to these portions of the
software.
You should have received a copy of the license with this software. If you
didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>.
*/

#ifndef TUBA_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>
#include <dev/rndvar.h>
#include <machine/stdarg.h>
#include <sys/md5k.h>

#ifdef IPSEC
#include <netinet/ip_ipsp.h>
#endif /* IPSEC */

#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <sys/domain.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/tcpipv6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>

#ifndef CREATE_IPV6_MAPPED
#define CREATE_IPV6_MAPPED(a6, a4) \
do { \
	bzero(&(a6), sizeof(a6)); \
	(a6).s6_addr[10] = (a6).s6_addr[11] = 0xff; \
	*(u_int32_t *)&(a6).s6_addr[12] = (a4); \
} while (0)
#endif

struct	tcpiphdr tcp_saveti;
struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
struct	tcpiphdr tcp_saveti;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

extern u_long sb_max;

#endif /* TUBA_INCLUDE */
#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)
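/*
 * Illustrative note (editorial): the casts above make the comparison
 * modulo 2^32, so recently wrapped timestamps still compare correctly.
 * E.g. with a = 0x00000002 and b = 0xfffffffe, (int)(a - b) == 4 > 0,
 * so TSTMP_GEQ(a, b) holds even though a < b as unsigned values.
 */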

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) \
	 && !(tp->t_inpcb->inp_flags & INP_IPV6_MAPPED) \
	 && tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

#ifndef TUBA_INCLUDE

int
tcp_reass(tp, th, m, tlen)
	register struct tcpcb *tp;
	register struct tcphdr *th;
	struct mbuf *m;
	int *tlen;
{
	register struct ipqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after becoming established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	MALLOC(tiqe, struct ipqent *, sizeof (struct ipqent), M_IPQ, M_NOWAIT);
	if (tiqe == NULL) {
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		return (0);
	}
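	/*
	 * Illustrative note (editorial): the trimming below works in
	 * sequence space.  E.g. if a queued segment covers bytes 100-149
	 * (seq 100, len 50) and the new one arrives with seq 120 and
	 * tlen 40, then i = 100 + 50 - 120 = 30, so the first 30 bytes
	 * of the new segment are duplicates; it is trimmed to seq 150,
	 * tlen 10 before being linked into the queue.
	 */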
	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = tp->segq.lh_first; q != NULL;
	    p = q, q = q->ipqe_q.le_next)
		if (SEQ_GT(q->ipqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		register struct tcphdr *phdr = p->ipqe_tcp;
		register int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				FREE(tiqe, M_IPQ);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		register struct tcphdr *qhdr = q->ipqe_tcp;
		register int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->ipqe_m, i);
			break;
		}
		nq = q->ipqe_q.le_next;
		m_freem(q->ipqe_m);
		LIST_REMOVE(q, ipqe_q);
		FREE(q, M_IPQ);
	}

	/* Insert the new fragment queue entry into place. */
	tiqe->ipqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->ipqe_tcp = th;
	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
	} else {
		LIST_INSERT_AFTER(p, tiqe, ipqe_q);
	}
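	/*
	 * Illustrative note (editorial): the loop below hands any now
	 * contiguous prefix of the queue to the socket.  E.g. with
	 * rcv_nxt = 1000 and queued segments covering 1000-1099 and
	 * 1100-1199, both are appended to so_rcv and rcv_nxt advances
	 * to 1200, while a queued segment starting at 1300 stays queued.
	 */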
present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = tp->segq.lh_first;
	if (q == NULL || q->ipqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->ipqe_tcp->th_reseqlen;
		flags = q->ipqe_tcp->th_flags & TH_FIN;

		nq = q->ipqe_q.le_next;
		LIST_REMOVE(q, ipqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->ipqe_m);
		else
			sbappend(&so->so_rcv, q->ipqe_m);
		FREE(q, M_IPQ);
		q = nq;
	} while (q != NULL && q->ipqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	return (flags);
}

/*
 * First check for a port-specific bomb. We do not want to drop half-opens
 * for other ports if this is the only port being bombed.  We only check
 * the bottom 40 half open connections, to avoid wasting too much time.
 *
 * Or, otherwise it is more likely a generic syn bomb, so delete the oldest
 * half-open connection.
 */
void
tcpdropoldhalfopen(avoidtp, port)
	struct tcpcb *avoidtp;
	u_int16_t port;
{
	register struct inpcb *inp;
	register struct tcpcb *tp;
	int ncheck = 40;
	int s;

	s = splnet();
	inp = tcbtable.inpt_queue.cqh_first;
	if (inp)						/* XXX */
	for (; inp != (struct inpcb *)&tcbtable.inpt_queue && --ncheck;
	    inp = inp->inp_queue.cqe_prev) {
		if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
		    tp != avoidtp &&
		    tp->t_state == TCPS_SYN_RECEIVED &&
		    port == inp->inp_lport) {
			tcp_close(tp);
			goto done;
		}
	}

	inp = tcbtable.inpt_queue.cqh_first;
	if (inp)						/* XXX */
	for (; inp != (struct inpcb *)&tcbtable.inpt_queue;
	    inp = inp->inp_queue.cqe_prev) {
		if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
		    tp != avoidtp &&
		    tp->t_state == TCPS_SYN_RECEIVED) {
			tcp_close(tp);
			goto done;
		}
	}
done:
	splx(s);
}

#if defined(INET6) && !defined(TCP6)
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	struct mbuf *m = *mp;

#if defined(NFAITH) && 0 < NFAITH
	if (m->m_pkthdr.rcvif) {
		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
			/* XXX send icmp6 host/port unreach? */
			m_freem(m);
			return IPPROTO_DONE;
		}
	}
#endif

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	if (m->m_flags & M_ANYCAST6) {
		if (m->m_len >= sizeof(struct ip6_hdr)) {
			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		} else
			m_freem(m);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
#if __STDC__
tcp_input(struct mbuf *m, ...)
#else
tcp_input(m, va_alist)
	register struct mbuf *m;
#endif
{
	register struct tcpiphdr *ti;
	register struct inpcb *inp;
	caddr_t optp = NULL;
	int optlen = 0;
	int len, tlen, off;
	register struct tcpcb *tp = 0;
	register int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	int hdroptlen = 0;
	short ostate = 0;
	struct in_addr laddr;
	int dropsocket = 0;
	int iss = 0;
	u_long tiwin;
	u_int32_t ts_val, ts_ecr;
	int ts_present = 0;
	int iphlen;
	va_list ap;
	register struct tcphdr *th;
#ifdef IPSEC
	struct tdb *tdb = NULL;
#endif /* IPSEC */
#ifdef INET6
	struct in6_addr laddr6;
	unsigned short is_ipv6;		/* Type of incoming datagram. */
	struct ip6_hdr *ipv6 = NULL;
#endif /* INET6 */

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

#ifdef IPSEC
	/* Save the last SA which was used to process the mbuf */
	if ((m->m_flags & (M_CONF|M_AUTH)) && m->m_pkthdr.tdbi) {
		struct tdb_ident *tdbi = m->m_pkthdr.tdbi;
		/* XXX gettdb() should really be called at spltdb(). */
		/* XXX this is splsoftnet(), currently they are the same. */
		tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
		free(m->m_pkthdr.tdbi, M_TEMP);
		m->m_pkthdr.tdbi = NULL;
	}
#endif /* IPSEC */
#ifdef INET6
	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	is_ipv6 = mtod(m, struct ip *)->ip_v == 6;
#endif /* INET6 */

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
#ifndef INET6
	ti = mtod(m, struct tcpiphdr *);
#else /* INET6 */
	if (!is_ipv6)
#endif /* INET6 */
	if (iphlen > sizeof (struct ip)) {
#if 0	/*XXX*/
		ip_stripoptions(m, (struct mbuf *)0);
#else
		printf("extension headers are not allowed\n");
		m_freem(m);
		return;
#endif
	}
	if (m->m_len < iphlen + sizeof(struct tcphdr)) {
		if ((m = m_pullup2(m, iphlen + sizeof(struct tcphdr))) == 0) {
			tcpstat.tcps_rcvshort++;
			return;
		}
#ifndef INET6
		ti = mtod(m, struct tcpiphdr *);
#endif /* INET6 */
	}

	tlen = m->m_pkthdr.len - iphlen;

#ifdef INET6
	/*
	 * After that, do initial segment processing which is still very
	 * dependent on what IP version you're using.
	 */

	if (is_ipv6) {
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */

		/* strip off any options */
		if (iphlen > sizeof(struct ip6_hdr)) {
#if 0	/*XXX*/
			ipv6_stripoptions(m, iphlen);
#else
			printf("extension headers are not allowed\n");
			m_freem(m);
			return;
#endif
			iphlen = sizeof(struct ip6_hdr);
		}

		ti = NULL;
		ipv6 = mtod(m, struct ip6_hdr *);

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ipv6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ipv6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
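		/*
		 * Illustrative note (editorial): a v4-mapped address has
		 * the form ::ffff:a.b.c.d and exists only as an API
		 * convention for dual-stack sockets; it should never
		 * appear as a source or destination on the wire, so such
		 * segments are dropped before any pcb lookup is done.
		 */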
		if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		} /* endif in6_cksum */
	} else {
		ti = mtod(m, struct tcpiphdr *);
#endif /* INET6 */

	/*
	 * Checksum extended TCP header and data.
	 */
#ifndef INET6
	tlen = ((struct ip *)ti)->ip_len;
#endif /* INET6 */
	len = sizeof (struct ip) + tlen;
	bzero(ti->ti_x1, sizeof ti->ti_x1);
	ti->ti_len = (u_int16_t)tlen;
	HTONS(ti->ti_len);
	if ((ti->ti_sum = in_cksum(m, len)) != 0) {
		tcpstat.tcps_rcvbadsum++;
		goto drop;
	}
#ifdef INET6
	}
#endif /* INET6 */
#endif /* TUBA_INCLUDE */

	th = (struct tcphdr *)(mtod(m, caddr_t) + iphlen);

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof (struct tcphdr)) {
		if (m->m_len < iphlen + off) {
			if ((m = m_pullup2(m, iphlen + off)) == 0) {
				tcpstat.tcps_rcvshort++;
				return;
			}
#ifdef INET6
			if (is_ipv6)
				ipv6 = mtod(m, struct ip6_hdr *);
			else
#endif /* INET6 */
				ti = mtod(m, struct tcpiphdr *);
			th = (struct tcphdr *)(mtod(m, caddr_t) + iphlen);
		}
		optlen = off - sizeof (struct tcphdr);
		optp = mtod(m, caddr_t) + iphlen + sizeof(struct tcphdr);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		     (optlen > TCPOLEN_TSTAMP_APPA &&
		      optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		     *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		     (th->th_flags & TH_SYN) == 0) {
			ts_present = 1;
			ts_val = ntohl(*(u_int32_t *)(optp + 4));
			ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
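	/*
	 * Illustrative note (editorial): the RFC 1323 appendix A layout
	 * tested above is the 12-byte block NOP, NOP, TIMESTAMP, 10
	 * (0x01 0x01 0x08 0x0a as one 32-bit word) followed by the
	 * 4-byte timestamp value and the 4-byte echo reply, which is why
	 * the two values can be fetched at fixed offsets 4 and 8.
	 */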
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
findpcb:
#ifdef INET6
	if (is_ipv6) {
		inp = in6_pcbhashlookup(&tcbtable, &ipv6->ip6_src, th->th_sport,
		    &ipv6->ip6_dst, th->th_dport);
	} else
#endif /* INET6 */
	inp = in_pcbhashlookup(&tcbtable, ti->ti_src, ti->ti_sport,
	    ti->ti_dst, ti->ti_dport);
	if (inp == 0) {
		++tcpstat.tcps_pcbhashmiss;
#ifdef INET6
		if (is_ipv6)
			inp = in_pcblookup(&tcbtable, &ipv6->ip6_src,
			    th->th_sport, &ipv6->ip6_dst, th->th_dport,
			    INPLOOKUP_WILDCARD | INPLOOKUP_IPV6);
		else
#endif /* INET6 */
		inp = in_pcblookup(&tcbtable, &ti->ti_src, ti->ti_sport,
		    &ti->ti_dst, ti->ti_dport, INPLOOKUP_WILDCARD);
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset;
		}
	}

	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
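	/*
	 * Illustrative note (editorial): RFC 1323 window scaling never
	 * applies to segments carrying SYN.  E.g. with snd_scale = 2 a
	 * 16-bit th_win of 16384 unscales to tiwin = 65536.
	 */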

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
#ifdef INET6
			if (is_ipv6)
				tcp_saveti6 = *(mtod(m, struct tcpipv6hdr *));
			else
#endif /* INET6 */
			tcp_saveti = *ti;
		}
		if (so->so_options & SO_ACCEPTCONN) {
			struct socket *so1;

			so1 = sonewconn(so, 0);
			if (so1 == NULL) {
				tcpdropoldhalfopen(tp, th->th_dport);
				so1 = sonewconn(so, 0);
				if (so1 == NULL)
					goto drop;
			}
			so = so1;
			/*
			 * This is ugly, but ....
			 *
			 * Mark socket as temporary until we're
			 * committed to keeping it.  The code at
			 * ``drop'' and ``dropwithreset'' check the
			 * flag dropsocket to see if the temporary
			 * socket created here should be discarded.
			 * We mark the socket as discardable until
			 * we're committed to it below in TCPS_LISTEN.
			 */
			dropsocket++;
#ifdef IPSEC
			/*
			 * We need to copy the required security levels
			 * from the old pcb.
			 */
			{
				struct inpcb *newinp = (struct inpcb *)so->so_pcb;
				bcopy(inp->inp_seclevel, newinp->inp_seclevel,
				    sizeof(inp->inp_seclevel));
				newinp->inp_secrequire = inp->inp_secrequire;
			}
#endif /* IPSEC */
#ifdef INET6
			/*
			 * inp still has the OLD in_pcb stuff, set the
			 * v6-related flags on the new guy, too.  This is
			 * done particularly for the case where an AF_INET6
			 * socket is bound only to a port, and a v4 connection
			 * comes in on that port.
			 * we also copy the flowinfo from the original pcb
			 * to the new one.
			 */
			{
				int flags = inp->inp_flags;
				struct inpcb *oldinpcb = inp;

				inp = (struct inpcb *)so->so_pcb;
				inp->inp_flags |= (flags & (INP_IPV6 | INP_IPV6_UNDEC
				    | INP_IPV6_MAPPED));
				if ((inp->inp_flags & INP_IPV6) &&
				    !(inp->inp_flags & INP_IPV6_MAPPED)) {
					inp->inp_ipv6.ip6_hlim =
					    oldinpcb->inp_ipv6.ip6_hlim;
					inp->inp_ipv6.ip6_flow =
					    oldinpcb->inp_ipv6.ip6_flow;
				}
			}
#else /* INET6 */
			inp = (struct inpcb *)so->so_pcb;
#endif /* INET6 */
			inp->inp_lport = th->th_dport;
#ifdef INET6
			if (is_ipv6) {
				inp->inp_laddr6 = ipv6->ip6_dst;
				inp->inp_fflowinfo = htonl(0x0fffffff) &
				    ipv6->ip6_flow;

				/*inp->inp_options = ip6_srcroute();*/ /* soon. */
				/* still need to tweak outbound options
				   processing to include this mbuf in
				   the right place and put the correct
				   NextHdr values in the right places.
				   XXX  rja */
			} else {
				if (inp->inp_flags & INP_IPV6) {/* v4 to v6 socket */
					CREATE_IPV6_MAPPED(inp->inp_laddr6,
					    ti->ti_dst.s_addr);
				} else {
#endif /* INET6 */
					inp->inp_laddr = ti->ti_dst;
					inp->inp_options = ip_srcroute();
#ifdef INET6
				}
			}
#endif /* INET6 */
			in_pcbrehash(inp);
			tp = intotcpcb(inp);
			tp->t_state = TCPS_LISTEN;

			/* Compute proper scaling value from buffer space
			 */
			while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
			    TCP_MAXWIN << tp->request_r_scale < so->so_rcv.sb_hiwat)
				tp->request_r_scale++;
		}
	}
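	/*
	 * Illustrative note (editorial): the loop above picks the
	 * smallest shift whose scaled window covers the receive buffer.
	 * E.g. with sb_hiwat = 262144 and TCP_MAXWIN = 65535, shifts
	 * 0-2 still fall short (65535, 131070, 262140), so
	 * request_r_scale becomes 3.
	 */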

#ifdef IPSEC
	/* Check if this socket requires security for incoming packets */
	if ((inp->inp_seclevel[SL_AUTH] >= IPSEC_LEVEL_REQUIRE &&
	    !(m->m_flags & M_AUTH)) ||
	    (inp->inp_seclevel[SL_ESP_TRANS] >= IPSEC_LEVEL_REQUIRE &&
	    !(m->m_flags & M_CONF))) {
#ifdef notyet
#ifdef INET6
		if (is_ipv6)
			icmp6_error(m, ICMPV6_BLAH, ICMPV6_BLAH, 0);
		else
#endif /* INET6 */
			icmp_error(m, ICMP_BLAH, ICMP_BLAH, 0, 0);
#endif /* notyet */
		tcpstat.tcps_rcvnosec++;
		goto drop;
	}
	/* Use tdb_bind_out for this inp's outbound communication */
	if (tdb)
		tdb_add_inp(tdb, inp);
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_idle = 0;
	if (tp->t_state != TCPS_SYN_RECEIVED)
		tp->t_timer[TCPT_KEEP] = tcp_keepidle;

#ifdef TCP_SACK
	if (!tp->sack_disable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options if not in LISTEN state,
	 * else do it below (after getting remote address).
	 */
	if (optp && tp->t_state != TCPS_LISTEN)
		tcp_dooptions(tp, optp, optlen, th,
		    &ts_present, &ts_val, &ts_ecr);

#ifdef TCP_SACK
	if (!tp->sack_disable) {
		tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
		tp->rcv_lastend = th->th_seq + tlen;
	}
#endif /* TCP_SACK */
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (ts_present)
					tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
				else if (tp->t_rtt &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, tp->t_rtt);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);
				tp->snd_una = th->th_ack;
#if defined(TCP_SACK)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
				tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					tp->t_timer[TCPT_REXMT] = 0;
				else if (tp->t_timer[TCPT_PERSIST] == 0)
					tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;

				if (sb_notify(&so->so_snd))
					sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    tp->segq.lh_first == NULL &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (!tp->sack_disable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			m_adj(m, iphlen + off);
			sbappend(&so->so_rcv, m);
			sorwakeup(so);
			if (th->th_flags & TH_PUSH)
				tp->t_flags |= TF_ACKNOW;
			else
				tp->t_flags |= TF_DELACK;
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}
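	/*
	 * Illustrative note (editorial): taking the maximum keeps the
	 * window from shrinking below what was already advertised.
	 * E.g. if rcv_adv - rcv_nxt = 8192 but only 4096 bytes of
	 * buffer remain, rcv_wnd stays 8192 so previously offered
	 * sequence space is not retracted.
	 */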

	switch (tp->t_state) {

	/*
	 * If the state is LISTEN then ignore segment if it contains an RST.
	 * If the segment contains an ACK then it is bad and send a RST.
	 * If it does not contain a SYN then it is not interesting; drop it.
	 * If it is from this socket, drop it, it must be forged.
	 * Don't bother responding if the destination was a broadcast.
	 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
	 * tp->iss, and send a segment:
	 *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
	 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
	 * Fill in remote peer address fields if not previously specified.
	 * Enter SYN_RECEIVED state, and process any other fields of this
	 * segment in this state.
	 */
	case TCPS_LISTEN: {
		struct mbuf *am;
		register struct sockaddr_in *sin;
#ifdef INET6
		register struct sockaddr_in6 *sin6;
#endif /* INET6 */

		if (tiflags & TH_RST)
			goto drop;
		if (tiflags & TH_ACK)
			goto dropwithreset;
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (th->th_dport == th->th_sport) {
#ifdef INET6
			if (is_ipv6) {
				if (IN6_ARE_ADDR_EQUAL(&ipv6->ip6_src, &ipv6->ip6_dst))
					goto drop;
			} else {
#endif /* INET6 */
				if (ti->ti_dst.s_addr == ti->ti_src.s_addr)
					goto drop;
#ifdef INET6
			}
#endif /* INET6 */
		}

		/*
		 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
		 * in_broadcast() should never return true on a received
		 * packet with M_BCAST not set.
		 */
		if (m->m_flags & (M_BCAST|M_MCAST))
			goto drop;
#ifdef INET6
		if (is_ipv6) {
			/* XXX What about IPv6 Anycasting ?? :-(  rja */
			if (IN6_IS_ADDR_MULTICAST(&ipv6->ip6_dst))
				goto drop;
		} else
#endif /* INET6 */
		if (IN_MULTICAST(ti->ti_dst.s_addr))
			goto drop;
		am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
		if (am == NULL)
			goto drop;
#ifdef INET6
		if (is_ipv6) {
			/*
			 * This is probably the place to set the tp->pf value.
			 * (Don't forget to do it in the v4 code as well!)
			 *
			 * Also, remember to blank out things like flowlabel, or
			 * set flowlabel for accepted sockets in v6.
			 *
			 * FURTHERMORE, this is PROBABLY the place where the
			 * whole business of key munging is set up for passive
			 * connections.
			 */
			am->m_len = sizeof(struct sockaddr_in6);
			sin6 = mtod(am, struct sockaddr_in6 *);
			sin6->sin6_family = AF_INET6;
			sin6->sin6_len = sizeof(struct sockaddr_in6);
			sin6->sin6_addr = ipv6->ip6_src;
			sin6->sin6_port = th->th_sport;
			sin6->sin6_flowinfo = htonl(0x0fffffff) &
			    inp->inp_ipv6.ip6_flow;
			laddr6 = inp->inp_laddr6;
			if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6))
				inp->inp_laddr6 = ipv6->ip6_dst;
			/* This is a good optimization. */
			if (in6_pcbconnect(inp, am)) {
				inp->inp_laddr6 = laddr6;
				(void) m_free(am);
				goto drop;
			} /* endif in6_pcbconnect() */
			tp->pf = PF_INET6;
		} else {
			/*
			 * Letting v4 incoming datagrams reach valid
			 * PF_INET6 sockets causes some overhead here.
			 */
			if (inp->inp_flags & INP_IPV6) {
				if (!(inp->inp_flags & (INP_IPV6_UNDEC|INP_IPV6_MAPPED))) {
					(void) m_free(am);
					goto drop;
				}

				am->m_len = sizeof(struct sockaddr_in6);

				sin6 = mtod(am, struct sockaddr_in6 *);
				sin6->sin6_family = AF_INET6;
				sin6->sin6_len = sizeof(*sin6);
				CREATE_IPV6_MAPPED(sin6->sin6_addr, ti->ti_src.s_addr);
				sin6->sin6_port = th->th_sport;
				sin6->sin6_flowinfo = 0;

				laddr6 = inp->inp_laddr6;
				if (inp->inp_laddr.s_addr == INADDR_ANY)
					CREATE_IPV6_MAPPED(inp->inp_laddr6, ti->ti_dst.s_addr);

				/*
				 * The pcb initially has the v6 default hoplimit
				 * set.  We're sending v4 packets so we need to set
				 * the v4 ttl and tos.
				 */
				inp->inp_ip.ip_ttl = ip_defttl;
				inp->inp_ip.ip_tos = 0;

				if (in6_pcbconnect(inp, am)) {
					inp->inp_laddr6 = laddr6;
					(void) m_freem(am);
					goto drop;
				}
				tp->pf = PF_INET;
			} else {
#endif /* INET6 */
				am->m_len = sizeof (struct sockaddr_in);
				sin = mtod(am, struct sockaddr_in *);
				sin->sin_family = AF_INET;
				sin->sin_len = sizeof(*sin);
				sin->sin_addr = ti->ti_src;
				sin->sin_port = ti->ti_sport;
				bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
				laddr = inp->inp_laddr;
				if (inp->inp_laddr.s_addr == INADDR_ANY)
					inp->inp_laddr = ti->ti_dst;
				if (in_pcbconnect(inp, am)) {
					inp->inp_laddr = laddr;
					(void) m_free(am);
					goto drop;
				}
				(void) m_free(am);
				tp->pf = PF_INET;
#ifdef INET6
			} /* if (inp->inp_flags & INP_IPV6) */
		} /* if (is_ipv6) */
#endif /* INET6 */
		tp->t_template = tcp_template(tp);
		if (tp->t_template == 0) {
			tp = tcp_drop(tp, ENOBUFS);
			dropsocket = 0;		/* socket is already gone */
			goto drop;
		}
		if (optp)
			tcp_dooptions(tp, optp, optlen, th,
			    &ts_present, &ts_val, &ts_ecr);
#ifdef TCP_SACK
		/*
		 * If peer did not send a SACK_PERMITTED option (i.e., if
		 * tcp_dooptions() did not set TF_SACK_PERMIT), set
		 * sack_disable to 1 if it is currently 0.
		 */
		if (!tp->sack_disable)
			if ((tp->t_flags & TF_SACK_PERMIT) == 0)
				tp->sack_disable = 1;
#endif

		if (iss)
			tp->iss = iss;
		else
			tp->iss = tcp_iss;
#ifdef TCP_COMPAT_42
		tcp_iss += TCP_ISSINCR/2;
#else /* TCP_COMPAT_42 */
		tcp_iss += arc4random() % TCP_ISSINCR + 1;
#endif /* !TCP_COMPAT_42 */
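		/*
		 * Illustrative note (editorial): stepping tcp_iss by a
		 * random amount (rather than the fixed TCP_ISSINCR/2 of
		 * the 4.2BSD-compatible branch) makes initial sequence
		 * numbers harder to predict, raising the bar for blind
		 * spoofing and connection-hijacking attacks.
		 */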
		tp->irs = th->th_seq;
		tcp_sendseqinit(tp);
#if defined (TCP_SACK)
		tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
		tp->snd_fack = tp->snd_una;
		tp->retran_data = 0;
		tp->snd_awnd = 0;
#endif /* TCP_FACK */
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		tp->t_state = TCPS_SYN_RECEIVED;
		tp->t_timer[TCPT_KEEP] = tcptv_keep_init;
		dropsocket = 0;		/* committed to socket */
		tcpstat.tcps_accepts++;
		goto trimthenstep6;
	}

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		     SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		tp->t_timer[TCPT_REXMT] = 0;
		tp->irs = th->th_seq;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (!tp->sack_disable)
			if ((tp->t_flags & TF_SACK_PERMIT) == 0)
				tp->sack_disable = 1;
#endif
		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			(void) tcp_reass(tp, (struct tcphdr *)0,
			    (struct mbuf *)0, &tlen);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtt)
				tcp_xmit_timer(tp, tp->t_rtt);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

trimthenstep6:
		/*
		 * Advance ti->ti_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than ts_recent, drop it.
	 */
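	/*
	 * Illustrative note (editorial): TCP_PAWS_IDLE is 24 days
	 * expressed in PR_SLOWHZ (half-second) ticks, i.e.
	 * 24 * 24 * 60 * 60 * 2 = 4,147,200.  A peer's timestamp clock
	 * would have to sit idle that long before an old ts_recent
	 * could wrongly reject fresh segments, which is why an over-age
	 * ts_recent is simply invalidated below instead of trusted.
	 */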
	if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop >= tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = tlen;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}
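	/*
	 * Illustrative note (editorial): head trimming in action: with
	 * rcv_nxt = 2000, an arriving segment with seq = 1990 and
	 * tlen = 30 has todrop = 10, so hdroptlen grows by 10 and the
	 * segment is treated as seq 2000, tlen 20.  Had tlen been 10 or
	 * less the whole segment would be a duplicate, and only an ACK
	 * would be scheduled.
	 */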

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If a new connection request is received
			 * while in TIME_WAIT, drop the old connection
			 * and start over if the sequence numbers
			 * are above the previous ones.
			 */
			if (tiflags & TH_SYN &&
			    tp->t_state == TCPS_TIME_WAIT &&
			    SEQ_GT(th->th_seq, tp->rcv_nxt)) {
				iss = tp->snd_nxt + TCP_ISSINCR;
				tp = tcp_close(tp);
				goto findpcb;
			}
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp.
	 * Fix from Braden, see Stevens p. 870
	 */
	if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_now;
		tp->ts_recent = ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
#ifndef INET6
		if (ti->ti_seq != tp->last_ack_sent)
#else
		if (th->th_seq != tp->last_ack_sent)
#endif
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat.tcps_drops++;
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}
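	/*
	 * Illustrative note (editorial): requiring the RST's sequence
	 * number to equal last_ack_sent (rather than merely fall within
	 * the receive window) greatly narrows the odds of a blind
	 * attacker tearing down a connection with a forged reset, at
	 * the cost of occasionally ignoring a legitimate but
	 * re-sequenced RST.
	 */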

	/*
	 * If a SYN is in the window, then this is an
	 * error and we send an RST and drop the connection.
	 */
	if (tiflags & TH_SYN) {
		tp = tcp_drop(tp, ECONNRESET);
		goto dropwithreset;
	}

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat.tcps_connects++;
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
		}
		(void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0,
		    &tlen);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < ti->ti_ack <= tp->snd_max
	 * then advance tp->snd_una to ti->ti_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen)
				break;
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (tp->t_timer[TCPT_REXMT] == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
						2 / tp->t_maxseg;

#if defined(TCP_SACK)
					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->snd_cwnd += tp->t_maxseg;
						tp->t_dupacks = 0;
						(void) tcp_output(tp);
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#if defined(TCP_SACK)
					tp->snd_last = tp->snd_max;
#endif
#ifdef TCP_SACK
					if (!tp->sack_disable) {
						tp->t_timer[TCPT_REXMT] = 0;
						tp->t_rtt = 0;
						tcpstat.tcps_sndrexmitfast++;
#if defined(TCP_SACK) && defined(TCP_FACK)
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
						tp->t_dupacks = tcprexmtthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					tp->t_timer[TCPT_REXMT] = 0;
					tp->t_rtt = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
					tcpstat.tcps_sndrexmitfast++;
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (!tp->sack_disable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
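		/*
		 * Illustrative note (editorial): on the third duplicate
		 * ACK above, with snd_wnd = 32768, snd_cwnd = 16384 and
		 * t_maxseg = 1460, win = 16384 / 2 / 1460 = 5, so
		 * snd_ssthresh becomes 7300; cwnd is then collapsed to a
		 * single segment for the retransmit and re-inflated to
		 * ssthresh plus one maxseg per duplicate ACK received.
		 */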
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (!tp->sack_disable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						needoutput = 1;
#else
					tp->snd_cwnd += tp->t_maxseg;
					needoutput = 1;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack) + tp->t_maxseg;
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack) + tp->t_maxseg;
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (ts_present)
			tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
		else if (tp->t_rtt && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tp->t_rtt);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			tp->t_timer[TCPT_REXMT] = 0;
			needoutput = 1;
		} else if (tp->t_timer[TCPT_PERSIST] == 0)
			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		register u_int cw = tp->snd_cwnd;
		register u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (SEQ_GEQ(th->th_ack, tp->snd_last))
#endif
			tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
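		/*
		 * Illustrative note (editorial): with t_maxseg = 1460,
		 * a cwnd of 11680 above ssthresh grows by
		 * 1460 * 1460 / 11680 = 182 bytes per ACK (roughly one
		 * maxseg per round trip), while below ssthresh it grows
		 * by the full 1460 per ACK (doubling each round trip).
		 */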
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}
		if (sb_notify(&so->so_snd))
			sowwakeup(so);
		tp->snd_una = th->th_ack;
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack))
			tp->snd_fack = tp->snd_una;
#endif

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					tp->t_timer[TCPT_2MSL] = tcp_maxidle;
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) ||
	    (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		needoutput = 1;
	}
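	/*
	 * Illustrative note (editorial): snd_wl1/snd_wl2 remember the
	 * seq/ack of the segment that last updated the window, so only
	 * strictly newer segments (or a larger window on the same
	 * seq/ack) may change snd_wnd; this keeps a reordered old
	 * segment from reinstating a stale, smaller window.
	 */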

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
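		/*
		 * Illustrative note (editorial): e.g. th_seq = 1000,
		 * th_urp = 5, rcv_nxt = 1000 and 100 bytes already in
		 * so_rcv gives rcv_up = 1005 and so_oobmark =
		 * 100 + 5 - 1 = 104, the offset of the out-of-band byte
		 * in the receive stream.
		 */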
		/*
		 * Remove out of band data so doesn't get presented to user.
		 * This can happen independent of advancing the URG pointer,
		 * but if two URG's are pending at once, some out-of-band
		 * data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen
#ifdef SO_OOBINLINE
		     && (so->so_options & SO_OOBINLINE) == 0
#endif
		     )
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		if (th->th_seq == tp->rcv_nxt && tp->segq.lh_first == NULL &&
		    tp->t_state == TCPS_ESTABLISHED) {
			if (th->th_flags & TH_PUSH)
				tp->t_flags |= TF_ACKNOW;
			else
				tp->t_flags |= TF_DELACK;
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);
			m_adj(m, hdroptlen);
			sbappend(&so->so_rcv, m);
			sorwakeup(so);
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
#ifdef TCP_SACK
		if (!tp->sack_disable)
			tcp_update_sack_list(tp);
#endif

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know
	 * that the connection is closing.  Ignore a FIN received before
	 * the connection is fully established.
	 */
	if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
			socantrcvmore(so);
			tp->t_flags |= TF_ACKNOW;
			tp->rcv_nxt++;
		}
		switch (tp->t_state) {

		/*
		 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
		 */
		case TCPS_ESTABLISHED:
			tp->t_state = TCPS_CLOSE_WAIT;
			break;

		/*
		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
		 * enter the CLOSING state.
		 */
		case TCPS_FIN_WAIT_1:
			tp->t_state = TCPS_CLOSING;
			break;

		/*
		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
		 * starting the time-wait timer, turning off the other
		 * standard timers.
		 */
		case TCPS_FIN_WAIT_2:
			tp->t_state = TCPS_TIME_WAIT;
			tcp_canceltimers(tp);
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			soisdisconnected(so);
			break;

		/*
		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
		 */
		case TCPS_TIME_WAIT:
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			break;
		}
	}
	if (so->so_options & SO_DEBUG) {
#ifdef INET6
		if (tp->pf == PF_INET6)
			tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 0, tlen);
		else
#endif /* INET6 */
			tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 0, tlen);
	}

	/*
	 * Return any desired output.
	 */
	if (needoutput || (tp->t_flags & TF_ACKNOW)) {
		(void) tcp_output(tp);
	}
	return;

dropafterack:
	/*
	 * Generate an ACK dropping incoming segment if it occupies
	 * sequence space, where the ACK reflects our state.
	 */
	if (tiflags & TH_RST)
		goto drop;
	m_freem(m);
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	return;

dropwithreset:
	/*
	 * Generate a RST, dropping incoming segment.
	 * Make ACK acceptable to originator of segment.
	 * Don't bother to respond if destination was broadcast/multicast.
	 */
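	/*
	 * Illustrative note (editorial): in the no-ACK branch below,
	 * tlen is bumped by one when the offending segment carried a
	 * SYN, because the SYN itself occupies one sequence number;
	 * the RST's ACK field (th_seq + tlen) must cover it to be
	 * acceptable to the originator.
	 */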
2031	 */
2032	if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
2033	goto drop;
2034	#ifdef INET6
2035	if (is_ipv6) {
2036	/* For following calls to tcp_respond */
2037	ti = mtod(m, struct tcpiphdr *);
2038	if (IN6_IS_ADDR_MULTICAST(&ipv6->ip6_dst))
2039	goto drop;
2040	} else {
2041	#endif /* INET6 */
2042	if (IN_MULTICAST(ti->ti_dst.s_addr))
2043	goto drop;
2044	#ifdef INET6
2045	}
2046	#endif /* INET6 */
2047	if (tiflags & TH_ACK)
2048	tcp_respond(tp, (caddr_t) ti, m, (tcp_seq)0, th->th_ack, TH_RST);
2049	else {
2050	if (tiflags & TH_SYN)
2051	tlen++;
2052	tcp_respond(tp, (caddr_t) ti, m, th->th_seq+tlen, (tcp_seq)0,
2053	    TH_RST|TH_ACK);
2054	}
2055	/* destroy temporarily created socket */
2056	if (dropsocket)
2057	(void) soabort(so);
2058	return;
2059	
2060	drop:
2061	/*
2062	 * Drop space held by incoming segment and return.
2063	 */
2064	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
2065	#ifdef INET6
2066	if (tp->pf == PF_INET6)
2067	tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 0, tlen);
2068	else
2069	#endif /* INET6 */
2070	tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 0, tlen);
2071	}
2072	
2073	m_freem(m);
2074	/* destroy temporarily created socket */
2075	if (dropsocket)
2076	(void) soabort(so);
2077	return;
2078	#ifndef TUBA_INCLUDE
2079	}
2080	
2081	void
2082	tcp_dooptions(tp, cp, cnt, th, ts_present, ts_val, ts_ecr)
2083	struct tcpcb *tp;
2084	u_char *cp;
2085	int cnt;
2086	struct tcphdr *th;
2087	int *ts_present;
2088	u_int32_t *ts_val, *ts_ecr;
2089	{
2090	u_int16_t mss = 0;
2091	int opt, optlen;
2092	
2093	for (; cnt > 0; cnt -= optlen, cp += optlen) {
2094	opt = cp[0];
2095	if (opt == TCPOPT_EOL)
2096	break;
2097	if (opt == TCPOPT_NOP)
2098	optlen = 1;
2099	else {
2100	optlen = cnt < 2 ? 0 : cp[1];
2101	if (optlen < 2 || optlen > cnt)
2102	break; /* malformed or truncated option */
2103	}
2104	switch (opt) {
2105	
2106	default:
2107	continue;
2108	
2109	case TCPOPT_MAXSEG:
2110	if (optlen != TCPOLEN_MAXSEG)
2111	continue;
2112	if (!(th->th_flags & TH_SYN))
2113	continue;
2114	bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
2115	NTOHS(mss);
2116	break;
2117	
2118	case TCPOPT_WINDOW:
2119	if (optlen != TCPOLEN_WINDOW)
2120	continue;
2121	if (!(th->th_flags & TH_SYN))
2122	continue;
2123	tp->t_flags |= TF_RCVD_SCALE;
2124	tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
2125	break;
2126	
2127	case TCPOPT_TIMESTAMP:
2128	if (optlen != TCPOLEN_TIMESTAMP)
2129	continue;
2130	*ts_present = 1;
2131	bcopy((char *)cp + 2, (char *) ts_val, sizeof(*ts_val));
2132	NTOHL(*ts_val);
2133	bcopy((char *)cp + 6, (char *) ts_ecr, sizeof(*ts_ecr));
2134	NTOHL(*ts_ecr);
2135	
2136	/*
2137	 * A timestamp received in a SYN makes
2138	 * it ok to send timestamp requests and replies.
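 *
 * For reference, the RFC 1323 timestamp option parsed above
 * (TCPOLEN_TIMESTAMP == 10) is laid out as:
 *
 *	+--------+--------+----------------+----------------+
 *	| Kind=8 | Len=10 |    TS Value    | TS Echo Reply  |
 *	+--------+--------+----------------+----------------+
 *	  1 byte   1 byte      4 bytes          4 bytes
 *
 * which is why ts_val is copied from cp + 2 and ts_ecr from cp + 6.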
2139 */ 2140 if (th->th_flags & TH_SYN) { 2141 tp->t_flags |= TF_RCVD_TSTMP; 2142 tp->ts_recent = *ts_val; 2143 tp->ts_recent_age = tcp_now; 2144 } 2145 break; 2146 2147 #ifdef TCP_SACK 2148 case TCPOPT_SACK_PERMITTED: 2149 if (tp->sack_disable || optlen!=TCPOLEN_SACK_PERMITTED) 2150 continue; 2151 if (th->th_flags & TH_SYN) 2152 /* MUST only be set on SYN */ 2153 tp->t_flags |= TF_SACK_PERMIT; 2154 break; 2155 case TCPOPT_SACK: 2156 if (tcp_sack_option(tp, th, cp, optlen)) 2157 continue; 2158 break; 2159 #endif 2160 } 2161 } 2162 /* Update t_maxopd and t_maxseg after all options are processed */ 2163 if (th->th_flags & TH_SYN) 2164 (void) tcp_mss(tp, mss); /* sets t_maxseg */ 2165 } 2166 2167 #if defined(TCP_SACK) 2168 u_long 2169 tcp_seq_subtract(a, b) 2170 u_long a, b; 2171 { 2172 return ((long)(a - b)); 2173 } 2174 #endif 2175 2176 2177 #ifdef TCP_SACK 2178 /* 2179 * This function is called upon receipt of new valid data (while not in header 2180 * prediction mode), and it updates the ordered list of sacks. 2181 */ 2182 void 2183 tcp_update_sack_list(tp) 2184 struct tcpcb *tp; 2185 { 2186 /* 2187 * First reported block MUST be the most recent one. Subsequent 2188 * blocks SHOULD be in the order in which they arrived at the 2189 * receiver. These two conditions make the implementation fully 2190 * compliant with RFC 2018. 2191 */ 2192 int i, j = 0, count = 0, lastpos = -1; 2193 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2194 2195 /* First clean up current list of sacks */ 2196 for (i = 0; i < tp->rcv_numsacks; i++) { 2197 sack = tp->sackblks[i]; 2198 if (sack.start == 0 && sack.end == 0) { 2199 count++; /* count = number of blocks to be discarded */ 2200 continue; 2201 } 2202 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2203 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2204 count++; 2205 } else { 2206 temp[j].start = tp->sackblks[i].start; 2207 temp[j++].end = tp->sackblks[i].end; 2208 } 2209 } 2210 tp->rcv_numsacks -= count; 2211 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2212 tcp_clean_sackreport(tp); 2213 if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) { 2214 /* ==> need first sack block */ 2215 tp->sackblks[0].start = tp->rcv_laststart; 2216 tp->sackblks[0].end = tp->rcv_lastend; 2217 tp->rcv_numsacks = 1; 2218 } 2219 return; 2220 } 2221 /* Otherwise, sack blocks are already present. */ 2222 for (i = 0; i < tp->rcv_numsacks; i++) 2223 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2224 if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend)) 2225 return; /* sack list remains unchanged */ 2226 /* 2227 * From here, segment just received should be (part of) the 1st sack. 2228 * Go through list, possibly coalescing sack block entries. 2229 */ 2230 firstsack.start = tp->rcv_laststart; 2231 firstsack.end = tp->rcv_lastend; 2232 for (i = 0; i < tp->rcv_numsacks; i++) { 2233 sack = tp->sackblks[i]; 2234 if (SEQ_LT(sack.end, firstsack.start) || 2235 SEQ_GT(sack.start, firstsack.end)) 2236 continue; /* no overlap */ 2237 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2238 /* 2239 * identical block; delete it here since we will 2240 * move it to the front of the list. 
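 *
 * An illustrative example (numbers made up): with rcv_nxt = 1000 and
 * an existing block [2000,3000), a newly arrived segment filling
 * [3000,4000) adjoins that block, so the loop below coalesces the
 * two and [2000,4000) is reported first, as RFC 2018 requires for
 * the most recently changed block.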
2241	 */
2242	tp->sackblks[i].start = tp->sackblks[i].end = 0;
2243	lastpos = i; /* last posn with a zero entry */
2244	continue;
2245	}
2246	if (SEQ_LEQ(sack.start, firstsack.start))
2247	firstsack.start = sack.start; /* merge blocks */
2248	if (SEQ_GEQ(sack.end, firstsack.end))
2249	firstsack.end = sack.end; /* merge blocks */
2250	tp->sackblks[i].start = tp->sackblks[i].end = 0;
2251	lastpos = i; /* last posn with a zero entry */
2252	}
2253	if (lastpos != -1) { /* at least one merge */
2254	for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
2255	sack = tp->sackblks[i];
2256	if (sack.start == 0 && sack.end == 0)
2257	continue;
2258	temp[j++] = sack;
2259	}
2260	tp->rcv_numsacks = j; /* including first blk (added later) */
2261	for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
2262	tp->sackblks[i] = temp[i];
2263	} else { /* no merges -- shift sacks by 1 */
2264	if (tp->rcv_numsacks < MAX_SACK_BLKS)
2265	tp->rcv_numsacks++;
2266	for (i = tp->rcv_numsacks-1; i > 0; i--)
2267	tp->sackblks[i] = tp->sackblks[i-1];
2268	}
2269	tp->sackblks[0] = firstsack;
2270	return;
2271	}
2272	
2273	/*
2274	 * Process the TCP SACK option. Return 1 if the option was ignored (SACK
2275	 * disabled or option malformed); return 0 if it was processed. tp->snd_holes
2276	 * is an ordered list of holes (oldest to newest, in the sequence space).
2277	 */
2278	int
2279	tcp_sack_option(tp, th, cp, optlen)
2280	struct tcpcb *tp;
2281	struct tcphdr *th;
2282	u_char *cp;
2283	int optlen;
2284	{
2285	int tmp_olen;
2286	u_char *tmp_cp;
2287	struct sackhole *cur, *p, *temp;
2288	
2289	if (tp->sack_disable)
2290	return 1;
2291	
2292	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
2293	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2294	return 1;
2295	tmp_cp = cp + 2;
2296	tmp_olen = optlen - 2;
2297	if (tp->snd_numholes < 0)
2298	tp->snd_numholes = 0;
2299	if (tp->t_maxseg == 0)
2300	panic("tcp_sack_option"); /* Should never happen */
2301	while (tmp_olen > 0) {
2302	struct sackblk sack;
2303	
2304	bcopy((char *) tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
2305	NTOHL(sack.start);
2306	bcopy((char *) tmp_cp + sizeof(tcp_seq),
2307	    (char *) &(sack.end), sizeof(tcp_seq));
2308	NTOHL(sack.end);
2309	tmp_olen -= TCPOLEN_SACK;
2310	tmp_cp += TCPOLEN_SACK;
2311	if (SEQ_LEQ(sack.end, sack.start))
2312	continue; /* bad SACK fields */
2313	if (SEQ_LEQ(sack.end, tp->snd_una))
2314	continue; /* old block */
2315	#if defined(TCP_SACK) && defined(TCP_FACK)
2316	/* Updates snd_fack. */
2317	if (SEQ_GEQ(sack.end, tp->snd_fack))
2318	tp->snd_fack = sack.end;
2319	#endif /* TCP_FACK */
2320	if (SEQ_GT(th->th_ack, tp->snd_una)) {
2321	if (SEQ_LT(sack.start, th->th_ack))
2322	continue;
2323	} else {
2324	if (SEQ_LT(sack.start, tp->snd_una))
2325	continue;
2326	}
2327	if (SEQ_GT(sack.end, tp->snd_max))
2328	continue;
2329	if (tp->snd_holes == 0) { /* first hole */
2330	tp->snd_holes = (struct sackhole *)
2331	    malloc(sizeof(struct sackhole), M_PCB, M_NOWAIT);
2332	if (tp->snd_holes == NULL) {
2333	/* ENOBUFS, so ignore SACKed block for now */
2334	continue;
2335	}
2336	cur = tp->snd_holes;
2337	cur->start = th->th_ack;
2338	cur->end = sack.start;
2339	cur->rxmit = cur->start;
2340	cur->next = 0;
2341	tp->snd_numholes = 1;
2342	tp->rcv_lastsack = sack.end;
2343	/*
2344	 * dups is at least one. If more data has been
2345	 * SACKed, it can be greater than one.
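 *
 * Hypothetical numbers: with t_maxseg = 1460 and a first SACK block
 * covering two full segments beyond the hole,
 * (sack.end - cur->end) / t_maxseg = 2, so dups = min(3, 2) = 2;
 * a block shorter than one segment is floored to 1 below.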
2346 */ 2347 cur->dups = min(tcprexmtthresh, 2348 ((sack.end - cur->end)/tp->t_maxseg)); 2349 if (cur->dups < 1) 2350 cur->dups = 1; 2351 continue; /* with next sack block */ 2352 } 2353 /* Go thru list of holes: p = previous, cur = current */ 2354 p = cur = tp->snd_holes; 2355 while (cur) { 2356 if (SEQ_LEQ(sack.end, cur->start)) 2357 /* SACKs data before the current hole */ 2358 break; /* no use going through more holes */ 2359 if (SEQ_GEQ(sack.start, cur->end)) { 2360 /* SACKs data beyond the current hole */ 2361 cur->dups++; 2362 if ( ((sack.end - cur->end)/tp->t_maxseg) >= 2363 tcprexmtthresh) 2364 cur->dups = tcprexmtthresh; 2365 p = cur; 2366 cur = cur->next; 2367 continue; 2368 } 2369 if (SEQ_LEQ(sack.start, cur->start)) { 2370 /* Data acks at least the beginning of hole */ 2371 #if defined(TCP_SACK) && defined(TCP_FACK) 2372 if (SEQ_GT(sack.end, cur->rxmit)) 2373 tp->retran_data -= 2374 tcp_seq_subtract(cur->rxmit, 2375 cur->start); 2376 else 2377 tp->retran_data -= 2378 tcp_seq_subtract(sack.end, 2379 cur->start); 2380 #endif /* TCP_FACK */ 2381 if (SEQ_GEQ(sack.end,cur->end)){ 2382 /* Acks entire hole, so delete hole */ 2383 if (p != cur) { 2384 p->next = cur->next; 2385 free(cur, M_PCB); 2386 cur = p->next; 2387 } else { 2388 cur=cur->next; 2389 free(p, M_PCB); 2390 p = cur; 2391 tp->snd_holes = p; 2392 } 2393 tp->snd_numholes--; 2394 continue; 2395 } 2396 /* otherwise, move start of hole forward */ 2397 cur->start = sack.end; 2398 cur->rxmit = max (cur->rxmit, cur->start); 2399 p = cur; 2400 cur = cur->next; 2401 continue; 2402 } 2403 /* move end of hole backward */ 2404 if (SEQ_GEQ(sack.end, cur->end)) { 2405 #if defined(TCP_SACK) && defined(TCP_FACK) 2406 if (SEQ_GT(cur->rxmit, sack.start)) 2407 tp->retran_data -= 2408 tcp_seq_subtract(cur->rxmit, 2409 sack.start); 2410 #endif /* TCP_FACK */ 2411 cur->end = sack.start; 2412 cur->rxmit = min (cur->rxmit, cur->end); 2413 cur->dups++; 2414 if ( ((sack.end - cur->end)/tp->t_maxseg) >= 2415 tcprexmtthresh) 2416 cur->dups = tcprexmtthresh; 2417 p = cur; 2418 cur = cur->next; 2419 continue; 2420 } 2421 if (SEQ_LT(cur->start, sack.start) && 2422 SEQ_GT(cur->end, sack.end)) { 2423 /* 2424 * ACKs some data in middle of a hole; need to 2425 * split current hole 2426 */ 2427 temp = (struct sackhole *)malloc(sizeof(*temp), 2428 M_PCB,M_NOWAIT); 2429 if (temp == NULL) 2430 continue; /* ENOBUFS */ 2431 #if defined(TCP_SACK) && defined(TCP_FACK) 2432 if (SEQ_GT(cur->rxmit, sack.end)) 2433 tp->retran_data -= 2434 tcp_seq_subtract(sack.end, 2435 sack.start); 2436 else if (SEQ_GT(cur->rxmit, sack.start)) 2437 tp->retran_data -= 2438 tcp_seq_subtract(cur->rxmit, 2439 sack.start); 2440 #endif /* TCP_FACK */ 2441 temp->next = cur->next; 2442 temp->start = sack.end; 2443 temp->end = cur->end; 2444 temp->dups = cur->dups; 2445 temp->rxmit = max (cur->rxmit, temp->start); 2446 cur->end = sack.start; 2447 cur->rxmit = min (cur->rxmit, cur->end); 2448 cur->dups++; 2449 if ( ((sack.end - cur->end)/tp->t_maxseg) >= 2450 tcprexmtthresh) 2451 cur->dups = tcprexmtthresh; 2452 cur->next = temp; 2453 p = temp; 2454 cur = p->next; 2455 tp->snd_numholes++; 2456 } 2457 } 2458 /* At this point, p points to the last hole on the list */ 2459 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2460 /* 2461 * Need to append new hole at end. 2462 * Last hole is p (and it's not NULL). 
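 *
 * Illustrative numbers: if rcv_lastsack = 5000 and a block
 * [7000,8000) is reported, a hole [5000,7000) is appended with
 * rxmit at its start, and rcv_lastsack advances to 8000.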
2463	 */
2464	temp = (struct sackhole *) malloc(sizeof(*temp),
2465	    M_PCB, M_NOWAIT);
2466	if (temp == NULL)
2467	continue; /* ENOBUFS */
2468	temp->start = tp->rcv_lastsack;
2469	temp->end = sack.start;
2470	temp->dups = min(tcprexmtthresh,
2471	    ((sack.end - sack.start)/tp->t_maxseg));
2472	if (temp->dups < 1)
2473	temp->dups = 1;
2474	temp->rxmit = temp->start;
2475	temp->next = 0;
2476	p->next = temp;
2477	tp->rcv_lastsack = sack.end;
2478	tp->snd_numholes++;
2479	}
2480	}
2481	#if defined(TCP_SACK) && defined(TCP_FACK)
2482	/*
2483	 * Update retran_data and snd_awnd. Go through the list of
2484	 * holes. Increment retran_data by (hole->rxmit - hole->start).
2485	 */
2486	tp->retran_data = 0;
2487	cur = tp->snd_holes;
2488	while (cur) {
2489	tp->retran_data += cur->rxmit - cur->start;
2490	cur = cur->next;
2491	}
2492	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
2493	    tp->retran_data;
2494	#endif /* TCP_FACK */
2495	
2496	return 0;
2497	}
2498	
2499	/*
2500	 * Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if
2501	 * it is completely acked; otherwise, tcp_sack_option(), called from
2502	 * tcp_dooptions(), will fix up the hole.
2503	 */
2504	void
2505	tcp_del_sackholes(tp, th)
2506	struct tcpcb *tp;
2507	struct tcphdr *th;
2508	{
2509	if (!tp->sack_disable && tp->t_state != TCPS_LISTEN) {
2510	/* max because this could be an older ack just arrived */
2511	tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2512	    th->th_ack : tp->snd_una;
2513	struct sackhole *cur = tp->snd_holes;
2514	struct sackhole *prev = cur;
2515	while (cur)
2516	if (SEQ_LEQ(cur->end, lastack)) {
2517	cur = cur->next;
2518	free(prev, M_PCB);
2519	prev = cur;
2520	tp->snd_numholes--;
2521	} else if (SEQ_LT(cur->start, lastack)) {
2522	cur->start = lastack;
2523	break;
2524	} else
2525	break;
2526	tp->snd_holes = cur;
2527	}
2528	}
2529	
2530	/*
2531	 * Delete all receiver-side SACK information.
2532	 */
2533	void
2534	tcp_clean_sackreport(tp)
2535	struct tcpcb *tp;
2536	{
2537	int i;
2538	
2539	tp->rcv_numsacks = 0;
2540	for (i = 0; i < MAX_SACK_BLKS; i++)
2541	tp->sackblks[i].start = tp->sackblks[i].end = 0;
2542	
2543	}
2544	
2545	/*
2546	 * Checks for partial ack. If a partial ack arrives, turn off the
2547	 * retransmission timer, deflate the window, do not clear tp->t_dupacks,
2548	 * and return 1. If the ack advances at least to tp->snd_last, return 0.
2549	 */
2550	int
2551	tcp_sack_partialack(tp, th)
2552	struct tcpcb *tp;
2553	struct tcphdr *th;
2554	{
2555	if (SEQ_LT(th->th_ack, tp->snd_last)) {
2556	/* Turn off retx. timer (will start again next segment) */
2557	tp->t_timer[TCPT_REXMT] = 0;
2558	tp->t_rtt = 0;
2559	#ifndef TCP_FACK
2560	/*
2561	 * Partial window deflation. This statement relies on the
2562	 * fact that tp->snd_una has not been updated yet. Under
2563	 * FACK, snd_cwnd is instead held constant during fast recovery.
2564	 */
2565	tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
2566	#endif
2567	return 1;
2568	}
2569	return 0;
2570	}
2571	#endif /* TCP_SACK */
2572	
2573	/*
2574	 * Pull the out-of-band byte out of a segment so
2575	 * it doesn't appear in the user's data queue.
2576	 * It is still reflected in the segment length for
2577	 * sequencing purposes.
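 *
 * For example (illustrative only): with 40 bytes of headers to skip
 * (off = 40) and th_urp = 1, cnt works out to 40, so the first data
 * octet in the mbuf chain is saved in t_iobc and spliced out,
 * shrinking that mbuf's m_len by one.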
2578 */ 2579 void 2580 tcp_pulloutofband(so, urgent, m, off) 2581 struct socket *so; 2582 u_int urgent; 2583 register struct mbuf *m; 2584 int off; 2585 { 2586 int cnt = off + urgent - 1; 2587 2588 while (cnt >= 0) { 2589 if (m->m_len > cnt) { 2590 char *cp = mtod(m, caddr_t) + cnt; 2591 struct tcpcb *tp = sototcpcb(so); 2592 2593 tp->t_iobc = *cp; 2594 tp->t_oobflags |= TCPOOB_HAVEDATA; 2595 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2596 m->m_len--; 2597 return; 2598 } 2599 cnt -= m->m_len; 2600 m = m->m_next; 2601 if (m == 0) 2602 break; 2603 } 2604 panic("tcp_pulloutofband"); 2605 } 2606 2607 /* 2608 * Collect new round-trip time estimate 2609 * and update averages and current timeout. 2610 */ 2611 void 2612 tcp_xmit_timer(tp, rtt) 2613 register struct tcpcb *tp; 2614 short rtt; 2615 { 2616 register short delta; 2617 short rttmin; 2618 2619 tcpstat.tcps_rttupdated++; 2620 --rtt; 2621 if (tp->t_srtt != 0) { 2622 /* 2623 * srtt is stored as fixed point with 3 bits after the 2624 * binary point (i.e., scaled by 8). The following magic 2625 * is equivalent to the smoothing algorithm in rfc793 with 2626 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2627 * point). Adjust rtt to origin 0. 2628 */ 2629 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 2630 if ((tp->t_srtt += delta) <= 0) 2631 tp->t_srtt = 1; 2632 /* 2633 * We accumulate a smoothed rtt variance (actually, a 2634 * smoothed mean difference), then set the retransmit 2635 * timer to smoothed rtt + 4 times the smoothed variance. 2636 * rttvar is stored as fixed point with 2 bits after the 2637 * binary point (scaled by 4). The following is 2638 * equivalent to rfc793 smoothing with an alpha of .75 2639 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2640 * rfc793's wired-in beta. 2641 */ 2642 if (delta < 0) 2643 delta = -delta; 2644 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2645 if ((tp->t_rttvar += delta) <= 0) 2646 tp->t_rttvar = 1; 2647 } else { 2648 /* 2649 * No rtt measurement yet - use the unsmoothed rtt. 2650 * Set the variance to half the rtt (so our first 2651 * retransmit happens at 3*rtt). 2652 */ 2653 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 2654 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 2655 } 2656 tp->t_rtt = 0; 2657 tp->t_rxtshift = 0; 2658 2659 /* 2660 * the retransmit should happen at rtt + 4 * rttvar. 2661 * Because of the way we do the smoothing, srtt and rttvar 2662 * will each average +1/2 tick of bias. When we compute 2663 * the retransmit timer, we want 1/2 tick of rounding and 2664 * 1 extra tick because of +-1/2 tick uncertainty in the 2665 * firing of the timer. The bias will give us exactly the 2666 * 1.5 tick we need. But, because the bias is 2667 * statistical, we have to test that we don't drop below 2668 * the minimum feasible timer (which is 2 ticks). 2669 */ 2670 if (tp->t_rttmin > rtt + 2) 2671 rttmin = tp->t_rttmin; 2672 else 2673 rttmin = rtt + 2; 2674 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2675 2676 /* 2677 * We received an ack for a packet that wasn't retransmitted; 2678 * it is probably safe to discard any error indications we've 2679 * received recently. This isn't quite right, but close enough 2680 * for now (a route might have failed after we sent a segment, 2681 * and the return path might not be symmetrical). 2682 */ 2683 tp->t_softerror = 0; 2684 } 2685 2686 /* 2687 * Determine a reasonable value for maxseg size. 2688 * If the route is known, check route for mtu. 
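 * (For example, a route over Ethernet with a 1500-byte MTU yields
 * mss = 1500 - sizeof(struct tcpiphdr) = 1460; if RFC 1323
 * timestamps are in use, TCPOLEN_TSTAMP_APPA more octets are
 * deducted further below.)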
2689	 * If none, use an mss that can be handled on the outgoing
2690	 * interface without forcing IP to fragment; if bigger than
2691	 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2692	 * to utilize large mbufs. If no route is found, route has no mtu,
2693	 * or the destination isn't local, use a default, hopefully conservative
2694	 * size (usually 512 or the default IP max size, but no more than the mtu
2695	 * of the interface), as we can't discover anything about intervening
2696	 * gateways or networks. We also initialize the congestion/slow start
2697	 * window to be a single segment if the destination isn't local.
2698	 * While looking at the routing entry, we also initialize other path-dependent
2699	 * parameters from pre-set or cached values in the routing entry.
2700	 *
2701	 * Also take into account the space needed for options that we
2702	 * send regularly. Make maxseg shorter by that amount to assure
2703	 * that we can send maxseg amount of data even when the options
2704	 * are present. Store the upper limit of the length of options plus
2705	 * data in maxopd.
2706	 */
2707	int
2708	tcp_mss(tp, offer)
2709	register struct tcpcb *tp;
2710	u_int offer;
2711	{
2712	struct route *ro;
2713	register struct rtentry *rt;
2714	struct ifnet *ifp;
2715	register int rtt, mss;
2716	u_long bufsize;
2717	struct inpcb *inp;
2718	struct socket *so;
2719	
2720	inp = tp->t_inpcb;
2721	ro = &inp->inp_route;
2722	so = inp->inp_socket;
2723	
2724	if ((rt = ro->ro_rt) == (struct rtentry *)0) {
2725	/* No route yet, so try to acquire one */
2726	#ifdef INET6
2727	/*
2728	 * Get a new IPv6 route if an IPv6 destination, otherwise, get
2729	 * an IPv4 route (including those pesky IPv4-mapped addresses).
2730	 */
2731	bzero(ro, sizeof(struct route_in6));
2732	if (sotopf(so) == AF_INET6) {
2733	if (IN6_IS_ADDR_V4MAPPED(&inp->inp_faddr6)) {
2734	/* Get an IPv4 route. */
2735	ro->ro_dst.sa_family = AF_INET;
2736	ro->ro_dst.sa_len = sizeof(ro->ro_dst);
2737	((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
2738	    inp->inp_faddr;
2739	rtalloc(ro);
2740	} else {
2741	ro->ro_dst.sa_family = AF_INET6;
2742	ro->ro_dst.sa_len = sizeof(struct sockaddr_in6);
2743	((struct sockaddr_in6 *) &ro->ro_dst)->sin6_addr =
2744	    inp->inp_faddr6;
2745	rtalloc(ro);
2746	}
2747	} else
2748	#endif /* INET6 */
2749	if (inp->inp_faddr.s_addr != INADDR_ANY) {
2750	ro->ro_dst.sa_family = AF_INET;
2751	ro->ro_dst.sa_len = sizeof(ro->ro_dst);
2752	satosin(&ro->ro_dst)->sin_addr = inp->inp_faddr;
2753	rtalloc(ro);
2754	}
2755	if ((rt = ro->ro_rt) == (struct rtentry *)0) {
2756	tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
2757	return (tcp_mssdflt);
2758	}
2759	}
2760	ifp = rt->rt_ifp;
2761	
2762	#ifdef RTV_MTU /* if route characteristics exist ... */
2763	/*
2764	 * While we're here, check if there's an initial rtt
2765	 * or rttvar. Convert from the route-table units
2766	 * to scaled multiples of the slow timeout timer.
2767	 */
2768	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
2769	/*
2770	 * XXX the lock bit for RTT indicates that the value
2771	 * is also a minimum value; this is subject to time.
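 *
 * The conversions below assume the route metrics are kept in units
 * of RTM_RTTUNIT per second (microseconds in classic BSD); e.g. a
 * cached rmx_rtt of one second becomes
 * 1000000 / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE)) = 16,
 * i.e. two slow-timeout ticks in srtt's 3-bit fixed point.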
2772	 */
2773	if (rt->rt_rmx.rmx_locks & RTV_RTT)
2774	TCPT_RANGESET(tp->t_rttmin,
2775	    rtt / (RTM_RTTUNIT / PR_SLOWHZ),
2776	    TCPTV_MIN, TCPTV_REXMTMAX);
2777	tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
2778	if (rt->rt_rmx.rmx_rttvar)
2779	tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
2780	    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
2781	else
2782	/* default variation is +- 1 rtt */
2783	tp->t_rttvar =
2784	    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
2785	TCPT_RANGESET((long) tp->t_rxtcur,
2786	    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
2787	    tp->t_rttmin, TCPTV_REXMTMAX);
2788	}
2789	/*
2790	 * if there's an mtu associated with the route, use it
2791	 */
2792	if (rt->rt_rmx.rmx_mtu)
2793	#ifdef INET6
2794	{
2795	/*
2796	 * One may wish to lower MSS to take into account options,
2797	 * especially security-related options.
2798	 */
2799	if (tp->pf == AF_INET6)
2800	mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpipv6hdr);
2801	else
2802	#endif /* INET6 */
2803	mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr);
2804	#ifdef INET6
2805	}
2806	#endif /* INET6 */
2807	else
2808	#endif /* RTV_MTU */
2809	{
2810	/*
2811	 * ifp may be null and rmx_mtu may be zero in certain
2812	 * v6 cases (e.g., if ND wasn't able to resolve the
2813	 * destination host).
2814	 */
2815	mss = ifp ? ifp->if_mtu - sizeof(struct tcpiphdr) : 0;
2816	#ifdef INET6
2817	if (tp->pf == AF_INET)
2818	#endif /* INET6 */
2819	if (!in_localaddr(inp->inp_faddr))
2820	mss = min(mss, tcp_mssdflt);
2821	}
2822	/*
2823	 * The current mss, t_maxseg, is initialized to the default value.
2824	 * If we compute a smaller value, reduce the current mss.
2825	 * If we compute a larger value, return it for use in sending
2826	 * a max seg size option, but don't store it for use
2827	 * unless we received an offer at least that large from peer.
2828	 * However, do not accept offers under 64 bytes.
2829	 */
2830	if (offer)
2831	mss = min(mss, offer);
2832	mss = max(mss, 64); /* sanity - at least max opt. space */
2833	/*
2834	 * maxopd stores the maximum length of data AND options
2835	 * in a segment; maxseg is the amount of data in a normal
2836	 * segment. We need to store this value (maxopd) apart
2837	 * from maxseg, because now every segment carries options
2838	 * and thus we normally have somewhat less data in segments.
2839	 */
2840	tp->t_maxopd = mss;
2841	
2842	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2843	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
2844	mss -= TCPOLEN_TSTAMP_APPA;
2845	
2846	#if (MCLBYTES & (MCLBYTES - 1)) == 0
2847	if (mss > MCLBYTES)
2848	mss &= ~(MCLBYTES-1);
2849	#else
2850	if (mss > MCLBYTES)
2851	mss = mss / MCLBYTES * MCLBYTES;
2852	#endif
2853	/*
2854	 * If there's a pipesize, change the socket buffer
2855	 * to that size. Make the socket buffers an integral
2856	 * number of mss units; if the mss is larger than
2857	 * the socket buffer, decrease the mss.
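 *
 * For instance, with mss = 1460 and a 16384-byte buffer, the buffer
 * is grown to roundup(16384, 1460) = 17520 octets (capped at sb_max);
 * a send buffer smaller than one mss instead lowers the mss itself.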
2858	 */
2859	#ifdef RTV_SPIPE
2860	if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
2861	#endif
2862	bufsize = so->so_snd.sb_hiwat;
2863	if (bufsize < mss)
2864	mss = bufsize;
2865	else {
2866	bufsize = roundup(bufsize, mss);
2867	if (bufsize > sb_max)
2868	bufsize = sb_max;
2869	(void)sbreserve(&so->so_snd, bufsize);
2870	}
2871	tp->t_maxseg = mss;
2872	
2873	#ifdef RTV_RPIPE
2874	if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
2875	#endif
2876	bufsize = so->so_rcv.sb_hiwat;
2877	if (bufsize > mss) {
2878	bufsize = roundup(bufsize, mss);
2879	if (bufsize > sb_max)
2880	bufsize = sb_max;
2881	(void)sbreserve(&so->so_rcv, bufsize);
2882	}
2883	tp->snd_cwnd = mss;
2884	
2885	#ifdef RTV_SSTHRESH
2886	if (rt->rt_rmx.rmx_ssthresh) {
2887	/*
2888	 * There's some sort of gateway or interface
2889	 * buffer limit on the path. Use this to set
2890	 * the slow start threshold, but set the
2891	 * threshold to no less than 2*mss.
2892	 */
2893	tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
2894	}
2895	#endif /* RTV_SSTHRESH */
2896	return (mss);
2897	}
2898	#endif /* TUBA_INCLUDE */
2899	
2900	#if defined (TCP_SACK)
2901	/*
2902	 * Checks for partial ack. If a partial ack arrives, force the retransmission
2903	 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
2904	 * 1. By setting snd_nxt to th_ack, this forces the retransmission timer to
2905	 * be started again. If the ack advances at least to tp->snd_last, return 0.
2906	 */
2907	int
2908	tcp_newreno(tp, th)
2909	struct tcpcb *tp;
2910	struct tcphdr *th;
2911	{
2912	if (SEQ_LT(th->th_ack, tp->snd_last)) {
2913	/*
2914	 * snd_una has not been updated and the socket send buffer
2915	 * not yet drained of the acked data, so we have to leave
2916	 * snd_una as it was to get the correct data offset in
2917	 * tcp_output().
2918	 */
2919	tcp_seq onxt = tp->snd_nxt;
2920	u_long ocwnd = tp->snd_cwnd;
2921	tp->t_timer[TCPT_REXMT] = 0;
2922	tp->t_rtt = 0;
2923	tp->snd_nxt = th->th_ack;
2924	/*
2925	 * Set snd_cwnd to one segment beyond acknowledged offset
2926	 * (tp->snd_una not yet updated when this function is called)
2927	 */
2928	tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
2929	(void) tcp_output(tp);
2930	tp->snd_cwnd = ocwnd;
2931	if (SEQ_GT(onxt, tp->snd_nxt))
2932	tp->snd_nxt = onxt;
2933	/*
2934	 * Partial window deflation. Relies on fact that tp->snd_una
2935	 * not updated yet.
2936	 */
2937	tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
2938	return 1;
2939	}
2940	return 0;
2941	}
2942	#endif /* TCP_SACK */
2943	
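/*
 * A worked example of the partial-ack handling above (purely
 * illustrative numbers): with snd_una = 1000, t_maxseg = 1460 and a
 * partial ack th_ack = 3920, snd_cwnd is temporarily set to
 * 1460 + (3920 - 1000) = 4380 so that tcp_output() sends exactly one
 * segment starting at 3920; the old cwnd is then restored and
 * deflated by (3920 - 1000) - 1460 = 1460 octets.
 */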