/*	$OpenBSD: tcp_input.c,v 1.196 2006/03/12 18:42:40 markus Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>

#include <dev/rndvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

struct tcpiphdr tcp_saveti;

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

extern u_long sb_max;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
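
/*
 * A sketch of why the modulo comparisons above work: timestamps and
 * sequence numbers live in a 32-bit circular space, so (a)-(b) is
 * evaluated as a signed 32-bit value.  E.g. with a = 0x00000005 and
 * b = 0xfffffffa, (int)((a)-(b)) == 11, so TSTMP_GEQ(a, b) holds even
 * though a < b as plain unsigned values.  TCP_PAWS_IDLE above is
 * 24 days expressed in PR_SLOWHZ ticks.
 */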

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define	TCP_SETUP_ACK(tp, tiflags) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(tp, th, m, tlen)
	struct tcpcb *tp;
	struct tcphdr *th;
	struct mbuf *m;
	int *tlen;
{
	struct tcpqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after becoming established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}
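
	/*
	 * Worked example of the trimming above (numbers assumed): if
	 * the new segment covers sequence numbers 100-299 and a queued
	 * segment q starts at 250 with 100 bytes, then i = 300 - 250 =
	 * 50 and q is trimmed to cover 300-349.  Had q started at 150
	 * with 100 bytes, i = 150 >= its length, so q would be dequeued
	 * and freed whole.
	 */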

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	return (flags);
}

#ifdef INET6
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	struct mbuf *m = *mp;

#if defined(NFAITH) && 0 < NFAITH
	if (m->m_pkthdr.rcvif) {
		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
			/* XXX send icmp6 host/port unreach? */
			m_freem(m);
			return IPPROTO_DONE;
		}
	}
#endif

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	if (m->m_flags & M_ANYCAST6) {
		if (m->m_len >= sizeof(struct ip6_hdr)) {
			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		} else
			m_freem(m);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = 0;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	int hdroptlen = 0;
	short ostate = 0;
	int iss = 0;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error, s;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 * See below for AF specific multicast.
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}
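
	/*
	 * The switch above is safe because the version field occupies
	 * the top four bits of the first header byte in both IPv4 and
	 * IPv6, so reading ip_v through a struct ip pointer yields the
	 * right answer for either family.
	 */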

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		if (IN_MULTICAST(ip->ip_dst.s_addr) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
			goto drop;
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		/*
		 * Checksum extended TCP header and data.
		 */
		if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
			if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
				tcpstat.tcps_inhwcsum++;
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
			if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
		} else {
			m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
			tcpstat.tcps_inhwcsum++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Checksum extended TCP header and data.
		 */
		if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
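		/*
		 * The fast-path test below assumes the RFC 1323 appendix A
		 * layout: NOP, NOP, TIMESTAMP, length 10, then the two
		 * 32-bit timestamps, 12 bytes in all (TCPOLEN_TSTAMP_APPA);
		 * its first four bytes are matched against
		 * TCPOPT_TSTAMP_HDR with a single 32-bit load.
		 */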
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
findpcb:
	switch (af) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, th->th_sport,
		    &ip6->ip6_dst, th->th_dport);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport);
		break;
	}
	if (inp == 0) {
		int inpl_flags = 0;
#if NPF > 0
		struct pf_mtag *t;

		if ((t = pf_find_mtag(m)) != NULL &&
		    t->flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_flags = INPLOOKUP_WILDCARD;
#endif
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_flags);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_flags);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}

	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;
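
	/*
	 * Per RFC 1323 the window field in a segment carrying SYN is
	 * never scaled, which is why the TH_SYN check below bypasses
	 * snd_scale.
	 */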
	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				if (tiflags & TH_RST) {
					syn_cache_reset(&src.sa, &dst.sa, th);
				} else if ((tiflags & (TH_ACK|TH_SYN)) ==
				    (TH_ACK|TH_SYN)) {
					/*
					 * Received a SYN,ACK.  This should
					 * never happen while we are in
					 * LISTEN.  Send an RST.
					 */
					goto badsyn;
				} else if (tiflags & TH_ACK) {
					so = syn_cache_get(&src.sa, &dst.sa,
					    th, iphlen, tlen, so, m);
					if (so == NULL) {
						/*
						 * We don't have a SYN for
						 * this ACK; send an RST.
						 */
						goto badsyn;
					} else if (so ==
					    (struct socket *)(-1)) {
						/*
						 * We were unable to create
						 * the connection.  If the
						 * 3-way handshake was
						 * completed, an RST has
						 * been sent to the peer.
						 * Since the mbuf might be
						 * in use for the reply,
						 * do not free it.
						 */
						m = NULL;
					} else {
						/*
						 * We have created a
						 * full-blown connection.
						 */
						tp = NULL;
						inp = (struct inpcb *)so->so_pcb;
						tp = intotcpcb(inp);
						if (tp == NULL)
							goto badsyn;	/*XXX*/

						/*
						 * Compute proper scaling
						 * value from buffer space
						 */
						tcp_rscale(tp, so->so_rcv.sb_hiwat);
						goto after_listen;
					}
				} else {
					/*
					 * None of RST, SYN or ACK was set.
					 * This is an invalid packet for a
					 * TCB in LISTEN state.  Send a RST.
					 */
					goto badsyn;
				}
			} else {
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise because it
				 * is much better to send the peer a RST, and
				 * the RST will be the final packet of the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says
				 * this:
				 *   1. use of deprecated addr with existing
				 *      communication is okay - "SHOULD continue
				 *      to be used"
				 *   2. use of it with new communication:
				 *     (2a) "SHOULD NOT be used if alternate
				 *          address with sufficient scope is
				 *          available"
				 *     (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;

					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
						tp = NULL;
						goto dropwithreset;
					}
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen <= so->so_qlimit &&
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti))
					m = NULL;
			}
			goto drop;
		}
	}

after_listen:
#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	s = splnet();
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp);
	if (error) {
		splx(s);
		goto drop;
	}

	/* Latch SA */
	if (inp->inp_tdb_in != tdb) {
		if (tdb) {
			tdb_add_inp(tdb, inp, 1);
			if (inp->inp_ipo == NULL) {
				inp->inp_ipo = ipsec_add_policy(inp, af,
				    IPSP_DIRECTION_OUT);
				if (inp->inp_ipo == NULL) {
					splx(s);
					goto drop;
				}
			}
			if (inp->inp_ipo->ipo_dstid == NULL &&
			    tdb->tdb_srcid != NULL) {
				inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
				tdb->tdb_srcid->ref_count++;
			}
			if (inp->inp_ipsec_remotecred == NULL &&
			    tdb->tdb_remote_cred != NULL) {
				inp->inp_ipsec_remotecred =
				    tdb->tdb_remote_cred;
				tdb->tdb_remote_cred->ref_count++;
			}
			if (inp->inp_ipsec_remoteauth == NULL &&
			    tdb->tdb_remote_auth != NULL) {
				inp->inp_ipsec_remoteauth =
				    tdb->tdb_remote_auth;
				tdb->tdb_remote_auth->ref_count++;
			}
		} else { /* Just reset */
			TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
			    inp_tdb_in_next);
			inp->inp_tdb_in = NULL;
		}
	}
	splx(s);
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti))
			goto drop;
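
	/*
	 * A note on the check below: our outgoing timestamps carry
	 * tp->ts_modulate, a per-connection offset chosen at connection
	 * setup, so the peer's echo must have it subtracted before it
	 * can be compared with tcp_now.  An echo implying a negative
	 * RTT or one larger than TCP_RTT_MAX is treated as garbage and
	 * ignored (ts_ecr cleared).
	 */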
	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				if (sb_notify(&so->so_snd))
					sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			sorwakeup(so);
			TCP_SETUP_ACK(tp, tiflags);
			if (tp->t_flags & TF_ACKNOW)
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ACK|TH_ECE) ||
			    (tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ECE|TH_CWR)) {
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_reass_lock(tp);
			(void) tcp_reass(tp, (struct tcphdr *)0,
			    (struct mbuf *)0, &tlen);
			tcp_reass_unlock(tp);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;
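		/*
		 * Note that reaching SYN_RECEIVED from SYN_SENT here
		 * (a SYN was seen but our own SYN is not yet
		 * acknowledged) is the simultaneous-open case of
		 * RFC 793.
		 */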
1257 */ 1258 th->th_seq++; 1259 if (tlen > tp->rcv_wnd) { 1260 todrop = tlen - tp->rcv_wnd; 1261 m_adj(m, -todrop); 1262 tlen = tp->rcv_wnd; 1263 tiflags &= ~TH_FIN; 1264 tcpstat.tcps_rcvpackafterwin++; 1265 tcpstat.tcps_rcvbyteafterwin += todrop; 1266 } 1267 tp->snd_wl1 = th->th_seq - 1; 1268 tp->rcv_up = th->th_seq; 1269 goto step6; 1270 } 1271 1272 /* 1273 * States other than LISTEN or SYN_SENT. 1274 * First check timestamp, if present. 1275 * Then check that at least some bytes of segment are within 1276 * receive window. If segment begins before rcv_nxt, 1277 * drop leading data (and SYN); if nothing left, just ack. 1278 * 1279 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1280 * and it's less than opti.ts_recent, drop it. 1281 */ 1282 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1283 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1284 1285 /* Check to see if ts_recent is over 24 days old. */ 1286 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1287 /* 1288 * Invalidate ts_recent. If this segment updates 1289 * ts_recent, the age will be reset later and ts_recent 1290 * will get a valid value. If it does not, setting 1291 * ts_recent to zero will at least satisfy the 1292 * requirement that zero be placed in the timestamp 1293 * echo reply when ts_recent isn't valid. The 1294 * age isn't reset until we get a valid ts_recent 1295 * because we don't want out-of-order segments to be 1296 * dropped when ts_recent is old. 1297 */ 1298 tp->ts_recent = 0; 1299 } else { 1300 tcpstat.tcps_rcvduppack++; 1301 tcpstat.tcps_rcvdupbyte += tlen; 1302 tcpstat.tcps_pawsdrop++; 1303 goto dropafterack; 1304 } 1305 } 1306 1307 todrop = tp->rcv_nxt - th->th_seq; 1308 if (todrop > 0) { 1309 if (tiflags & TH_SYN) { 1310 tiflags &= ~TH_SYN; 1311 th->th_seq++; 1312 if (th->th_urp > 1) 1313 th->th_urp--; 1314 else 1315 tiflags &= ~TH_URG; 1316 todrop--; 1317 } 1318 if (todrop > tlen || 1319 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1320 /* 1321 * Any valid FIN must be to the left of the 1322 * window. At this point, FIN must be a 1323 * duplicate or out-of-sequence, so drop it. 1324 */ 1325 tiflags &= ~TH_FIN; 1326 /* 1327 * Send ACK to resynchronize, and drop any data, 1328 * but keep on processing for RST or ACK. 1329 */ 1330 tp->t_flags |= TF_ACKNOW; 1331 tcpstat.tcps_rcvdupbyte += todrop = tlen; 1332 tcpstat.tcps_rcvduppack++; 1333 } else { 1334 tcpstat.tcps_rcvpartduppack++; 1335 tcpstat.tcps_rcvpartdupbyte += todrop; 1336 } 1337 hdroptlen += todrop; /* drop from head afterwards */ 1338 th->th_seq += todrop; 1339 tlen -= todrop; 1340 if (th->th_urp > todrop) 1341 th->th_urp -= todrop; 1342 else { 1343 tiflags &= ~TH_URG; 1344 th->th_urp = 0; 1345 } 1346 } 1347 1348 /* 1349 * If new data are received on a connection after the 1350 * user processes are gone, then RST the other end. 1351 */ 1352 if ((so->so_state & SS_NOFDREF) && 1353 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1354 tp = tcp_close(tp); 1355 tcpstat.tcps_rcvafterclose++; 1356 goto dropwithreset; 1357 } 1358 1359 /* 1360 * If segment ends after window, drop trailing data 1361 * (and PUSH and FIN); if nothing left, just ACK. 
1362 */ 1363 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1364 if (todrop > 0) { 1365 tcpstat.tcps_rcvpackafterwin++; 1366 if (todrop >= tlen) { 1367 tcpstat.tcps_rcvbyteafterwin += tlen; 1368 /* 1369 * If a new connection request is received 1370 * while in TIME_WAIT, drop the old connection 1371 * and start over if the sequence numbers 1372 * are above the previous ones. 1373 */ 1374 if (tiflags & TH_SYN && 1375 tp->t_state == TCPS_TIME_WAIT && 1376 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1377 iss = tp->snd_nxt + TCP_ISSINCR; 1378 tp = tcp_close(tp); 1379 goto findpcb; 1380 } 1381 /* 1382 * If window is closed can only take segments at 1383 * window edge, and have to drop data and PUSH from 1384 * incoming segments. Continue processing, but 1385 * remember to ack. Otherwise, drop segment 1386 * and ack. 1387 */ 1388 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1389 tp->t_flags |= TF_ACKNOW; 1390 tcpstat.tcps_rcvwinprobe++; 1391 } else 1392 goto dropafterack; 1393 } else 1394 tcpstat.tcps_rcvbyteafterwin += todrop; 1395 m_adj(m, -todrop); 1396 tlen -= todrop; 1397 tiflags &= ~(TH_PUSH|TH_FIN); 1398 } 1399 1400 /* 1401 * If last ACK falls within this segment's sequence numbers, 1402 * record its timestamp if it's more recent. 1403 * Cf fix from Braden, see Stevens p. 870 1404 */ 1405 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1406 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1407 if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1408 ((tiflags & (TH_SYN|TH_FIN)) != 0))) 1409 tp->ts_recent = opti.ts_val; 1410 else 1411 tp->ts_recent = 0; 1412 tp->ts_recent_age = tcp_now; 1413 } 1414 1415 /* 1416 * If the RST bit is set examine the state: 1417 * SYN_RECEIVED STATE: 1418 * If passive open, return to LISTEN state. 1419 * If active open, inform user that connection was refused. 1420 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1421 * Inform user that connection was reset, and close tcb. 1422 * CLOSING, LAST_ACK, TIME_WAIT STATES 1423 * Close the tcb. 1424 */ 1425 if (tiflags & TH_RST) { 1426 if (th->th_seq != tp->last_ack_sent && 1427 th->th_seq != tp->rcv_nxt) 1428 goto drop; 1429 1430 switch (tp->t_state) { 1431 case TCPS_SYN_RECEIVED: 1432 #ifdef TCP_ECN 1433 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1434 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1435 goto drop; 1436 #endif 1437 so->so_error = ECONNREFUSED; 1438 goto close; 1439 1440 case TCPS_ESTABLISHED: 1441 case TCPS_FIN_WAIT_1: 1442 case TCPS_FIN_WAIT_2: 1443 case TCPS_CLOSE_WAIT: 1444 so->so_error = ECONNRESET; 1445 close: 1446 tp->t_state = TCPS_CLOSED; 1447 tcpstat.tcps_drops++; 1448 tp = tcp_close(tp); 1449 goto drop; 1450 case TCPS_CLOSING: 1451 case TCPS_LAST_ACK: 1452 case TCPS_TIME_WAIT: 1453 tp = tcp_close(tp); 1454 goto drop; 1455 } 1456 } 1457 1458 /* 1459 * If a SYN is in the window, then this is an 1460 * error and we ACK and drop the packet. 1461 */ 1462 if (tiflags & TH_SYN) 1463 goto dropafterack_ratelim; 1464 1465 /* 1466 * If the ACK bit is off we drop the segment and return. 1467 */ 1468 if ((tiflags & TH_ACK) == 0) { 1469 if (tp->t_flags & TF_ACKNOW) 1470 goto dropafterack; 1471 else 1472 goto drop; 1473 } 1474 1475 /* 1476 * Ack processing. 1477 */ 1478 switch (tp->t_state) { 1479 1480 /* 1481 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1482 * ESTABLISHED state and continue processing. 1483 * The ACK was checked above. 
1484 */ 1485 case TCPS_SYN_RECEIVED: 1486 tcpstat.tcps_connects++; 1487 soisconnected(so); 1488 tp->t_state = TCPS_ESTABLISHED; 1489 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1490 /* Do window scaling? */ 1491 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1492 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1493 tp->snd_scale = tp->requested_s_scale; 1494 tp->rcv_scale = tp->request_r_scale; 1495 } 1496 tcp_reass_lock(tp); 1497 (void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0, 1498 &tlen); 1499 tcp_reass_unlock(tp); 1500 tp->snd_wl1 = th->th_seq - 1; 1501 /* fall into ... */ 1502 1503 /* 1504 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1505 * ACKs. If the ack is in the range 1506 * tp->snd_una < th->th_ack <= tp->snd_max 1507 * then advance tp->snd_una to th->th_ack and drop 1508 * data from the retransmission queue. If this ACK reflects 1509 * more up to date window information we update our window information. 1510 */ 1511 case TCPS_ESTABLISHED: 1512 case TCPS_FIN_WAIT_1: 1513 case TCPS_FIN_WAIT_2: 1514 case TCPS_CLOSE_WAIT: 1515 case TCPS_CLOSING: 1516 case TCPS_LAST_ACK: 1517 case TCPS_TIME_WAIT: 1518 #ifdef TCP_ECN 1519 /* 1520 * if we receive ECE and are not already in recovery phase, 1521 * reduce cwnd by half but don't slow-start. 1522 * advance snd_last to snd_max not to reduce cwnd again 1523 * until all outstanding packets are acked. 1524 */ 1525 if (tcp_do_ecn && (tiflags & TH_ECE)) { 1526 if ((tp->t_flags & TF_ECN_PERMIT) && 1527 SEQ_GEQ(tp->snd_una, tp->snd_last)) { 1528 u_int win; 1529 1530 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg; 1531 if (win > 1) { 1532 tp->snd_ssthresh = win / 2 * tp->t_maxseg; 1533 tp->snd_cwnd = tp->snd_ssthresh; 1534 tp->snd_last = tp->snd_max; 1535 tp->t_flags |= TF_SEND_CWR; 1536 tcpstat.tcps_cwr_ecn++; 1537 } 1538 } 1539 tcpstat.tcps_ecn_rcvece++; 1540 } 1541 /* 1542 * if we receive CWR, we know that the peer has reduced 1543 * its congestion window. stop sending ecn-echo. 1544 */ 1545 if ((tiflags & TH_CWR)) { 1546 tp->t_flags &= ~TF_RCVD_CE; 1547 tcpstat.tcps_ecn_rcvcwr++; 1548 } 1549 #endif /* TCP_ECN */ 1550 1551 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1552 /* 1553 * Duplicate/old ACK processing. 1554 * Increments t_dupacks: 1555 * Pure duplicate (same seq/ack/window, no data) 1556 * Doesn't affect t_dupacks: 1557 * Data packets. 1558 * Normal window updates (window opens) 1559 * Resets t_dupacks: 1560 * New data ACKed. 1561 * Window shrinks 1562 * Old ACK 1563 */ 1564 if (tlen) { 1565 /* Drop very old ACKs unless th_seq matches */ 1566 if (th->th_seq != tp->rcv_nxt && 1567 SEQ_LT(th->th_ack, 1568 tp->snd_una - tp->max_sndwnd)) { 1569 tcpstat.tcps_rcvacktooold++; 1570 goto drop; 1571 } 1572 break; 1573 } 1574 /* 1575 * If we get an old ACK, there is probably packet 1576 * reordering going on. Be conservative and reset 1577 * t_dupacks so that we are less agressive in 1578 * doing a fast retransmit. 1579 */ 1580 if (th->th_ack != tp->snd_una) { 1581 tp->t_dupacks = 0; 1582 break; 1583 } 1584 if (tiwin == tp->snd_wnd) { 1585 tcpstat.tcps_rcvdupack++; 1586 /* 1587 * If we have outstanding data (other than 1588 * a window probe), this is a completely 1589 * duplicate ack (ie, window info didn't 1590 * change), the ack is the biggest we've 1591 * seen and we've seen exactly our rexmt 1592 * threshold of them, assume a packet 1593 * has been dropped and retransmit it. 1594 * Kludge snd_nxt & the congestion 1595 * window so we send only this one 1596 * packet. 
1597 * 1598 * We know we're losing at the current 1599 * window size so do congestion avoidance 1600 * (set ssthresh to half the current window 1601 * and pull our congestion window back to 1602 * the new ssthresh). 1603 * 1604 * Dup acks mean that packets have left the 1605 * network (they're now cached at the receiver) 1606 * so bump cwnd by the amount in the receiver 1607 * to keep a constant cwnd packets in the 1608 * network. 1609 */ 1610 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) 1611 tp->t_dupacks = 0; 1612 #if defined(TCP_SACK) && defined(TCP_FACK) 1613 /* 1614 * In FACK, can enter fast rec. if the receiver 1615 * reports a reass. queue longer than 3 segs. 1616 */ 1617 else if (++tp->t_dupacks == tcprexmtthresh || 1618 ((SEQ_GT(tp->snd_fack, tcprexmtthresh * 1619 tp->t_maxseg + tp->snd_una)) && 1620 SEQ_GT(tp->snd_una, tp->snd_last))) { 1621 #else 1622 else if (++tp->t_dupacks == tcprexmtthresh) { 1623 #endif /* TCP_FACK */ 1624 tcp_seq onxt = tp->snd_nxt; 1625 u_long win = 1626 ulmin(tp->snd_wnd, tp->snd_cwnd) / 1627 2 / tp->t_maxseg; 1628 1629 #if defined(TCP_SACK) || defined(TCP_ECN) 1630 if (SEQ_LT(th->th_ack, tp->snd_last)){ 1631 /* 1632 * False fast retx after 1633 * timeout. Do not cut window. 1634 */ 1635 tp->t_dupacks = 0; 1636 goto drop; 1637 } 1638 #endif 1639 if (win < 2) 1640 win = 2; 1641 tp->snd_ssthresh = win * tp->t_maxseg; 1642 #if defined(TCP_SACK) 1643 tp->snd_last = tp->snd_max; 1644 #endif 1645 #ifdef TCP_SACK 1646 if (tp->sack_enable) { 1647 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1648 tp->t_rtttime = 0; 1649 #ifdef TCP_ECN 1650 tp->t_flags |= TF_SEND_CWR; 1651 #endif 1652 #if 1 /* TCP_ECN */ 1653 tcpstat.tcps_cwr_frecovery++; 1654 #endif 1655 tcpstat.tcps_sack_recovery_episode++; 1656 #if defined(TCP_SACK) && defined(TCP_FACK) 1657 tp->t_dupacks = tcprexmtthresh; 1658 (void) tcp_output(tp); 1659 /* 1660 * During FR, snd_cwnd is held 1661 * constant for FACK. 1662 */ 1663 tp->snd_cwnd = tp->snd_ssthresh; 1664 #else 1665 /* 1666 * tcp_output() will send 1667 * oldest SACK-eligible rtx. 1668 */ 1669 (void) tcp_output(tp); 1670 tp->snd_cwnd = tp->snd_ssthresh+ 1671 tp->t_maxseg * tp->t_dupacks; 1672 #endif /* TCP_FACK */ 1673 goto drop; 1674 } 1675 #endif /* TCP_SACK */ 1676 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1677 tp->t_rtttime = 0; 1678 tp->snd_nxt = th->th_ack; 1679 tp->snd_cwnd = tp->t_maxseg; 1680 #ifdef TCP_ECN 1681 tp->t_flags |= TF_SEND_CWR; 1682 #endif 1683 #if 1 /* TCP_ECN */ 1684 tcpstat.tcps_cwr_frecovery++; 1685 #endif 1686 tcpstat.tcps_sndrexmitfast++; 1687 (void) tcp_output(tp); 1688 1689 tp->snd_cwnd = tp->snd_ssthresh + 1690 tp->t_maxseg * tp->t_dupacks; 1691 if (SEQ_GT(onxt, tp->snd_nxt)) 1692 tp->snd_nxt = onxt; 1693 goto drop; 1694 } else if (tp->t_dupacks > tcprexmtthresh) { 1695 #if defined(TCP_SACK) && defined(TCP_FACK) 1696 /* 1697 * while (awnd < cwnd) 1698 * sendsomething(); 1699 */ 1700 if (tp->sack_enable) { 1701 if (tp->snd_awnd < tp->snd_cwnd) 1702 tcp_output(tp); 1703 goto drop; 1704 } 1705 #endif /* TCP_FACK */ 1706 tp->snd_cwnd += tp->t_maxseg; 1707 (void) tcp_output(tp); 1708 goto drop; 1709 } 1710 } else if (tiwin < tp->snd_wnd) { 1711 /* 1712 * The window was retracted! Previous dup 1713 * ACKs may have been due to packets arriving 1714 * after the shrunken window, not a missing 1715 * packet, so play it safe and reset t_dupacks 1716 */ 1717 tp->t_dupacks = 0; 1718 } 1719 break; 1720 } 1721 /* 1722 * If the congestion window was inflated to account 1723 * for the other side's cached packets, retract it. 
1724 */ 1725 #if defined(TCP_SACK) 1726 if (tp->sack_enable) { 1727 if (tp->t_dupacks >= tcprexmtthresh) { 1728 /* Check for a partial ACK */ 1729 if (tcp_sack_partialack(tp, th)) { 1730 #if defined(TCP_SACK) && defined(TCP_FACK) 1731 /* Force call to tcp_output */ 1732 if (tp->snd_awnd < tp->snd_cwnd) 1733 needoutput = 1; 1734 #else 1735 tp->snd_cwnd += tp->t_maxseg; 1736 needoutput = 1; 1737 #endif /* TCP_FACK */ 1738 } else { 1739 /* Out of fast recovery */ 1740 tp->snd_cwnd = tp->snd_ssthresh; 1741 if (tcp_seq_subtract(tp->snd_max, 1742 th->th_ack) < tp->snd_ssthresh) 1743 tp->snd_cwnd = 1744 tcp_seq_subtract(tp->snd_max, 1745 th->th_ack); 1746 tp->t_dupacks = 0; 1747 #if defined(TCP_SACK) && defined(TCP_FACK) 1748 if (SEQ_GT(th->th_ack, tp->snd_fack)) 1749 tp->snd_fack = th->th_ack; 1750 #endif /* TCP_FACK */ 1751 } 1752 } 1753 } else { 1754 if (tp->t_dupacks >= tcprexmtthresh && 1755 !tcp_newreno(tp, th)) { 1756 /* Out of fast recovery */ 1757 tp->snd_cwnd = tp->snd_ssthresh; 1758 if (tcp_seq_subtract(tp->snd_max, th->th_ack) < 1759 tp->snd_ssthresh) 1760 tp->snd_cwnd = 1761 tcp_seq_subtract(tp->snd_max, 1762 th->th_ack); 1763 tp->t_dupacks = 0; 1764 } 1765 } 1766 if (tp->t_dupacks < tcprexmtthresh) 1767 tp->t_dupacks = 0; 1768 #else /* else no TCP_SACK */ 1769 if (tp->t_dupacks >= tcprexmtthresh && 1770 tp->snd_cwnd > tp->snd_ssthresh) 1771 tp->snd_cwnd = tp->snd_ssthresh; 1772 tp->t_dupacks = 0; 1773 #endif 1774 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1775 tcpstat.tcps_rcvacktoomuch++; 1776 goto dropafterack_ratelim; 1777 } 1778 acked = th->th_ack - tp->snd_una; 1779 tcpstat.tcps_rcvackpack++; 1780 tcpstat.tcps_rcvackbyte += acked; 1781 1782 /* 1783 * If we have a timestamp reply, update smoothed 1784 * round trip time. If no timestamp is present but 1785 * transmit timer is running and timed sequence 1786 * number was acked, update smoothed round trip time. 1787 * Since we now have an rtt measurement, cancel the 1788 * timer backoff (cf., Phil Karn's retransmit alg.). 1789 * Recompute the initial retransmit timer. 1790 */ 1791 if (opti.ts_present && opti.ts_ecr) 1792 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr); 1793 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1794 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1795 1796 /* 1797 * If all outstanding data is acked, stop retransmit 1798 * timer and remember to restart (more output or persist). 1799 * If there is more data to be acked, restart retransmit 1800 * timer, using current (possibly backed-off) value. 1801 */ 1802 if (th->th_ack == tp->snd_max) { 1803 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1804 needoutput = 1; 1805 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1806 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1807 /* 1808 * When new data is acked, open the congestion window. 1809 * If the window gives us less than ssthresh packets 1810 * in flight, open exponentially (maxseg per packet). 1811 * Otherwise open linearly: maxseg per window 1812 * (maxseg^2 / cwnd per packet). 
1813 */ 1814 { 1815 u_int cw = tp->snd_cwnd; 1816 u_int incr = tp->t_maxseg; 1817 1818 if (cw > tp->snd_ssthresh) 1819 incr = incr * incr / cw; 1820 #if defined (TCP_SACK) 1821 if (tp->t_dupacks < tcprexmtthresh) 1822 #endif 1823 tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale); 1824 } 1825 ND6_HINT(tp); 1826 if (acked > so->so_snd.sb_cc) { 1827 tp->snd_wnd -= so->so_snd.sb_cc; 1828 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1829 ourfinisacked = 1; 1830 } else { 1831 sbdrop(&so->so_snd, acked); 1832 tp->snd_wnd -= acked; 1833 ourfinisacked = 0; 1834 } 1835 if (sb_notify(&so->so_snd)) 1836 sowwakeup(so); 1837 1838 /* 1839 * If we had a pending ICMP message that referred to data 1840 * that have just been acknowledged, disregard the recorded 1841 * ICMP message. 1842 */ 1843 if ((tp->t_flags & TF_PMTUD_PEND) && 1844 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 1845 tp->t_flags &= ~TF_PMTUD_PEND; 1846 1847 /* 1848 * Keep track of the largest chunk of data acknowledged 1849 * since last PMTU update 1850 */ 1851 if (tp->t_pmtud_mss_acked < acked) 1852 tp->t_pmtud_mss_acked = acked; 1853 1854 tp->snd_una = th->th_ack; 1855 #ifdef TCP_ECN 1856 /* sync snd_last with snd_una */ 1857 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1858 tp->snd_last = tp->snd_una; 1859 #endif 1860 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1861 tp->snd_nxt = tp->snd_una; 1862 #if defined (TCP_SACK) && defined (TCP_FACK) 1863 if (SEQ_GT(tp->snd_una, tp->snd_fack)) { 1864 tp->snd_fack = tp->snd_una; 1865 /* Update snd_awnd for partial ACK 1866 * without any SACK blocks. 1867 */ 1868 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, 1869 tp->snd_fack) + tp->retran_data; 1870 } 1871 #endif 1872 1873 switch (tp->t_state) { 1874 1875 /* 1876 * In FIN_WAIT_1 STATE in addition to the processing 1877 * for the ESTABLISHED state if our FIN is now acknowledged 1878 * then enter FIN_WAIT_2. 1879 */ 1880 case TCPS_FIN_WAIT_1: 1881 if (ourfinisacked) { 1882 /* 1883 * If we can't receive any more 1884 * data, then closing user can proceed. 1885 * Starting the timer is contrary to the 1886 * specification, but if we don't get a FIN 1887 * we'll hang forever. 1888 */ 1889 if (so->so_state & SS_CANTRCVMORE) { 1890 soisdisconnected(so); 1891 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 1892 } 1893 tp->t_state = TCPS_FIN_WAIT_2; 1894 } 1895 break; 1896 1897 /* 1898 * In CLOSING STATE in addition to the processing for 1899 * the ESTABLISHED state if the ACK acknowledges our FIN 1900 * then enter the TIME-WAIT state, otherwise ignore 1901 * the segment. 1902 */ 1903 case TCPS_CLOSING: 1904 if (ourfinisacked) { 1905 tp->t_state = TCPS_TIME_WAIT; 1906 tcp_canceltimers(tp); 1907 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1908 soisdisconnected(so); 1909 } 1910 break; 1911 1912 /* 1913 * In LAST_ACK, we may still be waiting for data to drain 1914 * and/or to be acked, as well as for the ack of our FIN. 1915 * If our FIN is now acknowledged, delete the TCB, 1916 * enter the closed state and return. 1917 */ 1918 case TCPS_LAST_ACK: 1919 if (ourfinisacked) { 1920 tp = tcp_close(tp); 1921 goto drop; 1922 } 1923 break; 1924 1925 /* 1926 * In TIME_WAIT state the only thing that should arrive 1927 * is a retransmission of the remote FIN. Acknowledge 1928 * it and restart the finack timer. 1929 */ 1930 case TCPS_TIME_WAIT: 1931 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1932 goto dropafterack; 1933 } 1934 } 1935 1936 step6: 1937 /* 1938 * Update window information. 1939 * Don't look at window if no ACK: TAC's send garbage on first SYN. 
1940 */ 1941 if ((tiflags & TH_ACK) && 1942 (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && 1943 (SEQ_LT(tp->snd_wl2, th->th_ack) || 1944 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1945 /* keep track of pure window updates */ 1946 if (tlen == 0 && 1947 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1948 tcpstat.tcps_rcvwinupd++; 1949 tp->snd_wnd = tiwin; 1950 tp->snd_wl1 = th->th_seq; 1951 tp->snd_wl2 = th->th_ack; 1952 if (tp->snd_wnd > tp->max_sndwnd) 1953 tp->max_sndwnd = tp->snd_wnd; 1954 needoutput = 1; 1955 } 1956 1957 /* 1958 * Process segments with URG. 1959 */ 1960 if ((tiflags & TH_URG) && th->th_urp && 1961 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1962 /* 1963 * This is a kludge, but if we receive and accept 1964 * random urgent pointers, we'll crash in 1965 * soreceive. It's hard to imagine someone 1966 * actually wanting to send this much urgent data. 1967 */ 1968 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1969 th->th_urp = 0; /* XXX */ 1970 tiflags &= ~TH_URG; /* XXX */ 1971 goto dodata; /* XXX */ 1972 } 1973 /* 1974 * If this segment advances the known urgent pointer, 1975 * then mark the data stream. This should not happen 1976 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1977 * a FIN has been received from the remote side. 1978 * In these states we ignore the URG. 1979 * 1980 * According to RFC961 (Assigned Protocols), 1981 * the urgent pointer points to the last octet 1982 * of urgent data. We continue, however, 1983 * to consider it to indicate the first octet 1984 * of data past the urgent section as the original 1985 * spec states (in one of two places). 1986 */ 1987 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1988 tp->rcv_up = th->th_seq + th->th_urp; 1989 so->so_oobmark = so->so_rcv.sb_cc + 1990 (tp->rcv_up - tp->rcv_nxt) - 1; 1991 if (so->so_oobmark == 0) 1992 so->so_state |= SS_RCVATMARK; 1993 sohasoutofband(so); 1994 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1995 } 1996 /* 1997 * Remove out of band data so doesn't get presented to user. 1998 * This can happen independent of advancing the URG pointer, 1999 * but if two URG's are pending at once, some out-of-band 2000 * data may creep in... ick. 2001 */ 2002 if (th->th_urp <= (u_int16_t) tlen 2003 #ifdef SO_OOBINLINE 2004 && (so->so_options & SO_OOBINLINE) == 0 2005 #endif 2006 ) 2007 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2008 } else 2009 /* 2010 * If no out of band data is expected, 2011 * pull receive urgent pointer along 2012 * with the receive window. 2013 */ 2014 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2015 tp->rcv_up = tp->rcv_nxt; 2016 dodata: /* XXX */ 2017 2018 /* 2019 * Process the segment text, merging it into the TCP sequencing queue, 2020 * and arranging for acknowledgment of receipt if necessary. 2021 * This process logically involves adjusting tp->rcv_wnd as data 2022 * is presented to the user (this happens in tcp_usrreq.c, 2023 * case PRU_RCVD). If a FIN has already been received on this 2024 * connection then we just ignore the text. 
2025 */ 2026 if ((tlen || (tiflags & TH_FIN)) && 2027 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2028 #ifdef TCP_SACK 2029 tcp_seq laststart = th->th_seq; 2030 tcp_seq lastend = th->th_seq + tlen; 2031 #endif 2032 tcp_reass_lock(tp); 2033 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2034 tp->t_state == TCPS_ESTABLISHED) { 2035 tcp_reass_unlock(tp); 2036 TCP_SETUP_ACK(tp, tiflags); 2037 tp->rcv_nxt += tlen; 2038 tiflags = th->th_flags & TH_FIN; 2039 tcpstat.tcps_rcvpack++; 2040 tcpstat.tcps_rcvbyte += tlen; 2041 ND6_HINT(tp); 2042 if (so->so_state & SS_CANTRCVMORE) 2043 m_freem(m); 2044 else { 2045 m_adj(m, hdroptlen); 2046 sbappendstream(&so->so_rcv, m); 2047 } 2048 sorwakeup(so); 2049 } else { 2050 m_adj(m, hdroptlen); 2051 tiflags = tcp_reass(tp, th, m, &tlen); 2052 tcp_reass_unlock(tp); 2053 tp->t_flags |= TF_ACKNOW; 2054 } 2055 #ifdef TCP_SACK 2056 if (tp->sack_enable) 2057 tcp_update_sack_list(tp, laststart, lastend); 2058 #endif 2059 2060 /* 2061 * variable len never referenced again in modern BSD, 2062 * so why bother computing it ?? 2063 */ 2064 #if 0 2065 /* 2066 * Note the amount of data that peer has sent into 2067 * our window, in order to estimate the sender's 2068 * buffer size. 2069 */ 2070 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2071 #endif /* 0 */ 2072 } else { 2073 m_freem(m); 2074 tiflags &= ~TH_FIN; 2075 } 2076 2077 /* 2078 * If FIN is received ACK the FIN and let the user know 2079 * that the connection is closing. Ignore a FIN received before 2080 * the connection is fully established. 2081 */ 2082 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2083 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2084 socantrcvmore(so); 2085 tp->t_flags |= TF_ACKNOW; 2086 tp->rcv_nxt++; 2087 } 2088 switch (tp->t_state) { 2089 2090 /* 2091 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2092 */ 2093 case TCPS_ESTABLISHED: 2094 tp->t_state = TCPS_CLOSE_WAIT; 2095 break; 2096 2097 /* 2098 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2099 * enter the CLOSING state. 2100 */ 2101 case TCPS_FIN_WAIT_1: 2102 tp->t_state = TCPS_CLOSING; 2103 break; 2104 2105 /* 2106 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2107 * starting the time-wait timer, turning off the other 2108 * standard timers. 2109 */ 2110 case TCPS_FIN_WAIT_2: 2111 tp->t_state = TCPS_TIME_WAIT; 2112 tcp_canceltimers(tp); 2113 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2114 soisdisconnected(so); 2115 break; 2116 2117 /* 2118 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2119 */ 2120 case TCPS_TIME_WAIT: 2121 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2122 break; 2123 } 2124 } 2125 if (so->so_options & SO_DEBUG) { 2126 switch (tp->pf) { 2127 #ifdef INET6 2128 case PF_INET6: 2129 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2130 0, tlen); 2131 break; 2132 #endif /* INET6 */ 2133 case PF_INET: 2134 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2135 0, tlen); 2136 break; 2137 } 2138 } 2139 2140 /* 2141 * Return any desired output. 2142 */ 2143 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 2144 (void) tcp_output(tp); 2145 } 2146 return; 2147 2148 badsyn: 2149 /* 2150 * Received a bad SYN. Increment counters and dropwithreset. 2151 */ 2152 tcpstat.tcps_badsyn++; 2153 tp = NULL; 2154 goto dropwithreset; 2155 2156 dropafterack_ratelim: 2157 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2158 tcp_ackdrop_ppslim) == 0) { 2159 /* XXX stat */ 2160 goto drop; 2161 } 2162 /* ...fall into dropafterack... 
*/ 2163 2164 dropafterack: 2165 /* 2166 * Generate an ACK dropping incoming segment if it occupies 2167 * sequence space, where the ACK reflects our state. 2168 */ 2169 if (tiflags & TH_RST) 2170 goto drop; 2171 m_freem(m); 2172 tp->t_flags |= TF_ACKNOW; 2173 (void) tcp_output(tp); 2174 return; 2175 2176 dropwithreset_ratelim: 2177 /* 2178 * We may want to rate-limit RSTs in certain situations, 2179 * particularly if we are sending an RST in response to 2180 * an attempt to connect to or otherwise communicate with 2181 * a port for which we have no socket. 2182 */ 2183 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2184 tcp_rst_ppslim) == 0) { 2185 /* XXX stat */ 2186 goto drop; 2187 } 2188 /* ...fall into dropwithreset... */ 2189 2190 dropwithreset: 2191 /* 2192 * Generate a RST, dropping incoming segment. 2193 * Make ACK acceptable to originator of segment. 2194 * Don't bother to respond to RST. 2195 */ 2196 if (tiflags & TH_RST) 2197 goto drop; 2198 if (tiflags & TH_ACK) { 2199 tcp_respond(tp, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack, 2200 TH_RST); 2201 } else { 2202 if (tiflags & TH_SYN) 2203 tlen++; 2204 tcp_respond(tp, mtod(m, caddr_t), m, th->th_seq + tlen, 2205 (tcp_seq)0, TH_RST|TH_ACK); 2206 } 2207 return; 2208 2209 drop: 2210 /* 2211 * Drop space held by incoming segment and return. 2212 */ 2213 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2214 switch (tp->pf) { 2215 #ifdef INET6 2216 case PF_INET6: 2217 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2218 0, tlen); 2219 break; 2220 #endif /* INET6 */ 2221 case PF_INET: 2222 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2223 0, tlen); 2224 break; 2225 } 2226 } 2227 2228 m_freem(m); 2229 return; 2230 } 2231 2232 int 2233 tcp_dooptions(tp, cp, cnt, th, m, iphlen, oi) 2234 struct tcpcb *tp; 2235 u_char *cp; 2236 int cnt; 2237 struct tcphdr *th; 2238 struct mbuf *m; 2239 int iphlen; 2240 struct tcp_opt_info *oi; 2241 { 2242 u_int16_t mss = 0; 2243 int opt, optlen; 2244 #ifdef TCP_SIGNATURE 2245 caddr_t sigp = NULL; 2246 struct tdb *tdb = NULL; 2247 #endif /* TCP_SIGNATURE */ 2248 2249 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2250 opt = cp[0]; 2251 if (opt == TCPOPT_EOL) 2252 break; 2253 if (opt == TCPOPT_NOP) 2254 optlen = 1; 2255 else { 2256 if (cnt < 2) 2257 break; 2258 optlen = cp[1]; 2259 if (optlen < 2 || optlen > cnt) 2260 break; 2261 } 2262 switch (opt) { 2263 2264 default: 2265 continue; 2266 2267 case TCPOPT_MAXSEG: 2268 if (optlen != TCPOLEN_MAXSEG) 2269 continue; 2270 if (!(th->th_flags & TH_SYN)) 2271 continue; 2272 if (TCPS_HAVERCVDSYN(tp->t_state)) 2273 continue; 2274 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 2275 NTOHS(mss); 2276 oi->maxseg = mss; 2277 break; 2278 2279 case TCPOPT_WINDOW: 2280 if (optlen != TCPOLEN_WINDOW) 2281 continue; 2282 if (!(th->th_flags & TH_SYN)) 2283 continue; 2284 if (TCPS_HAVERCVDSYN(tp->t_state)) 2285 continue; 2286 tp->t_flags |= TF_RCVD_SCALE; 2287 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2288 break; 2289 2290 case TCPOPT_TIMESTAMP: 2291 if (optlen != TCPOLEN_TIMESTAMP) 2292 continue; 2293 oi->ts_present = 1; 2294 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2295 NTOHL(oi->ts_val); 2296 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2297 NTOHL(oi->ts_ecr); 2298 2299 if (!(th->th_flags & TH_SYN)) 2300 continue; 2301 if (TCPS_HAVERCVDSYN(tp->t_state)) 2302 continue; 2303 /* 2304 * A timestamp received in a SYN makes 2305 * it ok to send timestamp requests and replies. 
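 * (Once TF_RCVD_TSTMP is set below, ts_recent/ts_recent_age feed
 * the RFC 1323 PAWS test applied to later segments on this
 * connection.)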
2306 */ 2307 tp->t_flags |= TF_RCVD_TSTMP; 2308 tp->ts_recent = oi->ts_val; 2309 tp->ts_recent_age = tcp_now; 2310 break; 2311 2312 #ifdef TCP_SACK 2313 case TCPOPT_SACK_PERMITTED: 2314 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2315 continue; 2316 if (!(th->th_flags & TH_SYN)) 2317 continue; 2318 if (TCPS_HAVERCVDSYN(tp->t_state)) 2319 continue; 2320 /* MUST only be set on SYN */ 2321 tp->t_flags |= TF_SACK_PERMIT; 2322 break; 2323 case TCPOPT_SACK: 2324 tcp_sack_option(tp, th, cp, optlen); 2325 break; 2326 #endif 2327 #ifdef TCP_SIGNATURE 2328 case TCPOPT_SIGNATURE: 2329 if (optlen != TCPOLEN_SIGNATURE) 2330 continue; 2331 2332 if (sigp && bcmp(sigp, cp + 2, 16)) 2333 return (-1); 2334 2335 sigp = cp + 2; 2336 break; 2337 #endif /* TCP_SIGNATURE */ 2338 } 2339 } 2340 2341 #ifdef TCP_SIGNATURE 2342 if (tp->t_flags & TF_SIGNATURE) { 2343 union sockaddr_union src, dst; 2344 2345 memset(&src, 0, sizeof(union sockaddr_union)); 2346 memset(&dst, 0, sizeof(union sockaddr_union)); 2347 2348 switch (tp->pf) { 2349 case 0: 2350 #ifdef INET 2351 case AF_INET: 2352 src.sa.sa_len = sizeof(struct sockaddr_in); 2353 src.sa.sa_family = AF_INET; 2354 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2355 dst.sa.sa_len = sizeof(struct sockaddr_in); 2356 dst.sa.sa_family = AF_INET; 2357 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2358 break; 2359 #endif 2360 #ifdef INET6 2361 case AF_INET6: 2362 src.sa.sa_len = sizeof(struct sockaddr_in6); 2363 src.sa.sa_family = AF_INET6; 2364 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2365 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2366 dst.sa.sa_family = AF_INET6; 2367 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2368 break; 2369 #endif /* INET6 */ 2370 } 2371 2372 tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP); 2373 2374 /* 2375 * We don't have an SA for this peer, so we turn off 2376 * TF_SIGNATURE on the listen socket 2377 */ 2378 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2379 tp->t_flags &= ~TF_SIGNATURE; 2380 2381 } 2382 2383 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2384 tcpstat.tcps_rcvbadsig++; 2385 return (-1); 2386 } 2387 2388 if (sigp) { 2389 char sig[16]; 2390 2391 if (tdb == NULL) { 2392 tcpstat.tcps_rcvbadsig++; 2393 return (-1); 2394 } 2395 2396 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2397 return (-1); 2398 2399 if (bcmp(sig, sigp, 16)) { 2400 tcpstat.tcps_rcvbadsig++; 2401 return (-1); 2402 } 2403 2404 tcpstat.tcps_rcvgoodsig++; 2405 } 2406 #endif /* TCP_SIGNATURE */ 2407 2408 return (0); 2409 } 2410 2411 #if defined(TCP_SACK) 2412 u_long 2413 tcp_seq_subtract(a, b) 2414 u_long a, b; 2415 { 2416 return ((long)(a - b)); 2417 } 2418 #endif 2419 2420 2421 #ifdef TCP_SACK 2422 /* 2423 * This function is called upon receipt of new valid data (while not in header 2424 * prediction mode), and it updates the ordered list of sacks. 2425 */ 2426 void 2427 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2428 tcp_seq rcv_lastend) 2429 { 2430 /* 2431 * First reported block MUST be the most recent one. Subsequent 2432 * blocks SHOULD be in the order in which they arrived at the 2433 * receiver. These two conditions make the implementation fully 2434 * compliant with RFC 2018. 
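 * A worked example with illustrative numbers: with rcv_nxt = 100
 * and blocks [200,300) and [400,500) already recorded, a segment
 * covering [300,400) arrives; the new first block coalesces with
 * both neighbours and the option reports the single block [200,500).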
2435 */ 2436 int i, j = 0, count = 0, lastpos = -1; 2437 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2438 2439 /* First clean up current list of sacks */ 2440 for (i = 0; i < tp->rcv_numsacks; i++) { 2441 sack = tp->sackblks[i]; 2442 if (sack.start == 0 && sack.end == 0) { 2443 count++; /* count = number of blocks to be discarded */ 2444 continue; 2445 } 2446 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2447 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2448 count++; 2449 } else { 2450 temp[j].start = tp->sackblks[i].start; 2451 temp[j++].end = tp->sackblks[i].end; 2452 } 2453 } 2454 tp->rcv_numsacks -= count; 2455 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2456 tcp_clean_sackreport(tp); 2457 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2458 /* ==> need first sack block */ 2459 tp->sackblks[0].start = rcv_laststart; 2460 tp->sackblks[0].end = rcv_lastend; 2461 tp->rcv_numsacks = 1; 2462 } 2463 return; 2464 } 2465 /* Otherwise, sack blocks are already present. */ 2466 for (i = 0; i < tp->rcv_numsacks; i++) 2467 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2468 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2469 return; /* sack list remains unchanged */ 2470 /* 2471 * From here, segment just received should be (part of) the 1st sack. 2472 * Go through list, possibly coalescing sack block entries. 2473 */ 2474 firstsack.start = rcv_laststart; 2475 firstsack.end = rcv_lastend; 2476 for (i = 0; i < tp->rcv_numsacks; i++) { 2477 sack = tp->sackblks[i]; 2478 if (SEQ_LT(sack.end, firstsack.start) || 2479 SEQ_GT(sack.start, firstsack.end)) 2480 continue; /* no overlap */ 2481 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2482 /* 2483 * identical block; delete it here since we will 2484 * move it to the front of the list. 2485 */ 2486 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2487 lastpos = i; /* last posn with a zero entry */ 2488 continue; 2489 } 2490 if (SEQ_LEQ(sack.start, firstsack.start)) 2491 firstsack.start = sack.start; /* merge blocks */ 2492 if (SEQ_GEQ(sack.end, firstsack.end)) 2493 firstsack.end = sack.end; /* merge blocks */ 2494 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2495 lastpos = i; /* last posn with a zero entry */ 2496 } 2497 if (lastpos != -1) { /* at least one merge */ 2498 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2499 sack = tp->sackblks[i]; 2500 if (sack.start == 0 && sack.end == 0) 2501 continue; 2502 temp[j++] = sack; 2503 } 2504 tp->rcv_numsacks = j; /* including first blk (added later) */ 2505 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2506 tp->sackblks[i] = temp[i]; 2507 } else { /* no merges -- shift sacks by 1 */ 2508 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2509 tp->rcv_numsacks++; 2510 for (i = tp->rcv_numsacks-1; i > 0; i--) 2511 tp->sackblks[i] = tp->sackblks[i-1]; 2512 } 2513 tp->sackblks[0] = firstsack; 2514 return; 2515 } 2516 2517 /* 2518 * Process the TCP SACK option. tp->snd_holes is an ordered list 2519 * of holes (oldest to newest, in terms of the sequence space). 2520 */ 2521 void 2522 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2523 { 2524 int tmp_olen; 2525 u_char *tmp_cp; 2526 struct sackhole *cur, *p, *temp; 2527 2528 if (!tp->sack_enable) 2529 return; 2530 /* SACK without ACK doesn't make sense. */ 2531 if ((th->th_flags & TH_ACK) == 0) 2532 return; 2533 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
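 * (An ACK below snd_una refers to data that is already
 * acknowledged; one above snd_max would acknowledge data we never
 * sent, so in either case the SACK option is ignored.)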
*/
2534 if (SEQ_LT(th->th_ack, tp->snd_una) ||
2535 SEQ_GT(th->th_ack, tp->snd_max))
2536 return;
2537 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
2538 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2539 return;
2541 tmp_cp = cp + 2;
2542 tmp_olen = optlen - 2;
2543 tcpstat.tcps_sack_rcv_opts++;
2544 if (tp->snd_numholes < 0)
2545 tp->snd_numholes = 0;
2546 if (tp->t_maxseg == 0)
2547 panic("tcp_sack_option"); /* Should never happen */
2548 while (tmp_olen > 0) {
2549 struct sackblk sack;
2550 
2551 bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
2552 NTOHL(sack.start);
2553 bcopy(tmp_cp + sizeof(tcp_seq),
2554 (char *) &(sack.end), sizeof(tcp_seq));
2555 NTOHL(sack.end);
2556 tmp_olen -= TCPOLEN_SACK;
2557 tmp_cp += TCPOLEN_SACK;
2558 if (SEQ_LEQ(sack.end, sack.start))
2559 continue; /* bad SACK fields */
2560 if (SEQ_LEQ(sack.end, tp->snd_una))
2561 continue; /* old block */
2562 #if defined(TCP_SACK) && defined(TCP_FACK)
2563 /* Updates snd_fack. */
2564 if (SEQ_GT(sack.end, tp->snd_fack))
2565 tp->snd_fack = sack.end;
2566 #endif /* TCP_FACK */
2567 if (SEQ_GT(th->th_ack, tp->snd_una)) {
2568 if (SEQ_LT(sack.start, th->th_ack))
2569 continue;
2570 }
2571 if (SEQ_GT(sack.end, tp->snd_max))
2572 continue;
2573 if (tp->snd_holes == NULL) { /* first hole */
2574 tp->snd_holes = (struct sackhole *)
2575 pool_get(&sackhl_pool, PR_NOWAIT);
2576 if (tp->snd_holes == NULL) {
2577 /* ENOBUFS, so ignore SACKed block for now */
2578 goto done;
2579 }
2580 cur = tp->snd_holes;
2581 cur->start = th->th_ack;
2582 cur->end = sack.start;
2583 cur->rxmit = cur->start;
2584 cur->next = NULL;
2585 tp->snd_numholes = 1;
2586 tp->rcv_lastsack = sack.end;
2587 /*
2588 * dups is at least one. If more data has been
2589 * SACKed, it can be greater than one.
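 * For example, with t_maxseg = 1460 (illustrative) and a SACK
 * block spanning 4380 bytes, dups = min(tcprexmtthresh, 3) = 3,
 * so the new hole is immediately considered lost and eligible
 * for retransmission.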
2590 */ 2591 cur->dups = min(tcprexmtthresh, 2592 ((sack.end - cur->end)/tp->t_maxseg)); 2593 if (cur->dups < 1) 2594 cur->dups = 1; 2595 continue; /* with next sack block */ 2596 } 2597 /* Go thru list of holes: p = previous, cur = current */ 2598 p = cur = tp->snd_holes; 2599 while (cur) { 2600 if (SEQ_LEQ(sack.end, cur->start)) 2601 /* SACKs data before the current hole */ 2602 break; /* no use going through more holes */ 2603 if (SEQ_GEQ(sack.start, cur->end)) { 2604 /* SACKs data beyond the current hole */ 2605 cur->dups++; 2606 if (((sack.end - cur->end)/tp->t_maxseg) >= 2607 tcprexmtthresh) 2608 cur->dups = tcprexmtthresh; 2609 p = cur; 2610 cur = cur->next; 2611 continue; 2612 } 2613 if (SEQ_LEQ(sack.start, cur->start)) { 2614 /* Data acks at least the beginning of hole */ 2615 #if defined(TCP_SACK) && defined(TCP_FACK) 2616 if (SEQ_GT(sack.end, cur->rxmit)) 2617 tp->retran_data -= 2618 tcp_seq_subtract(cur->rxmit, 2619 cur->start); 2620 else 2621 tp->retran_data -= 2622 tcp_seq_subtract(sack.end, 2623 cur->start); 2624 #endif /* TCP_FACK */ 2625 if (SEQ_GEQ(sack.end, cur->end)) { 2626 /* Acks entire hole, so delete hole */ 2627 if (p != cur) { 2628 p->next = cur->next; 2629 pool_put(&sackhl_pool, cur); 2630 cur = p->next; 2631 } else { 2632 cur = cur->next; 2633 pool_put(&sackhl_pool, p); 2634 p = cur; 2635 tp->snd_holes = p; 2636 } 2637 tp->snd_numholes--; 2638 continue; 2639 } 2640 /* otherwise, move start of hole forward */ 2641 cur->start = sack.end; 2642 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2643 p = cur; 2644 cur = cur->next; 2645 continue; 2646 } 2647 /* move end of hole backward */ 2648 if (SEQ_GEQ(sack.end, cur->end)) { 2649 #if defined(TCP_SACK) && defined(TCP_FACK) 2650 if (SEQ_GT(cur->rxmit, sack.start)) 2651 tp->retran_data -= 2652 tcp_seq_subtract(cur->rxmit, 2653 sack.start); 2654 #endif /* TCP_FACK */ 2655 cur->end = sack.start; 2656 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2657 cur->dups++; 2658 if (((sack.end - cur->end)/tp->t_maxseg) >= 2659 tcprexmtthresh) 2660 cur->dups = tcprexmtthresh; 2661 p = cur; 2662 cur = cur->next; 2663 continue; 2664 } 2665 if (SEQ_LT(cur->start, sack.start) && 2666 SEQ_GT(cur->end, sack.end)) { 2667 /* 2668 * ACKs some data in middle of a hole; need to 2669 * split current hole 2670 */ 2671 temp = (struct sackhole *) 2672 pool_get(&sackhl_pool, PR_NOWAIT); 2673 if (temp == NULL) 2674 goto done; /* ENOBUFS */ 2675 #if defined(TCP_SACK) && defined(TCP_FACK) 2676 if (SEQ_GT(cur->rxmit, sack.end)) 2677 tp->retran_data -= 2678 tcp_seq_subtract(sack.end, 2679 sack.start); 2680 else if (SEQ_GT(cur->rxmit, sack.start)) 2681 tp->retran_data -= 2682 tcp_seq_subtract(cur->rxmit, 2683 sack.start); 2684 #endif /* TCP_FACK */ 2685 temp->next = cur->next; 2686 temp->start = sack.end; 2687 temp->end = cur->end; 2688 temp->dups = cur->dups; 2689 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2690 cur->end = sack.start; 2691 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2692 cur->dups++; 2693 if (((sack.end - cur->end)/tp->t_maxseg) >= 2694 tcprexmtthresh) 2695 cur->dups = tcprexmtthresh; 2696 cur->next = temp; 2697 p = temp; 2698 cur = p->next; 2699 tp->snd_numholes++; 2700 } 2701 } 2702 /* At this point, p points to the last hole on the list */ 2703 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2704 /* 2705 * Need to append new hole at end. 2706 * Last hole is p (and it's not NULL). 
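 * For example (illustrative numbers): with rcv_lastsack = 1000,
 * a new block [3000,4000) leaves the gap [1000,3000) as a fresh
 * hole appended after p, and rcv_lastsack advances to 4000.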
2707 */
2708 temp = (struct sackhole *)
2709 pool_get(&sackhl_pool, PR_NOWAIT);
2710 if (temp == NULL)
2711 goto done; /* ENOBUFS */
2712 temp->start = tp->rcv_lastsack;
2713 temp->end = sack.start;
2714 temp->dups = min(tcprexmtthresh,
2715 ((sack.end - sack.start)/tp->t_maxseg));
2716 if (temp->dups < 1)
2717 temp->dups = 1;
2718 temp->rxmit = temp->start;
2719 temp->next = 0;
2720 p->next = temp;
2721 tp->rcv_lastsack = sack.end;
2722 tp->snd_numholes++;
2723 }
2724 }
2725 done:
2726 #if defined(TCP_SACK) && defined(TCP_FACK)
2727 /*
2728 * Update retran_data and snd_awnd. Go through the list of
2729 * holes. Increment retran_data by (hole->rxmit - hole->start).
2730 */
2731 tp->retran_data = 0;
2732 cur = tp->snd_holes;
2733 while (cur) {
2734 tp->retran_data += cur->rxmit - cur->start;
2735 cur = cur->next;
2736 }
2737 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
2738 tp->retran_data;
2739 #endif /* TCP_FACK */
2740 
2741 return;
2742 }
2743 
2744 /*
2745 * Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if
2746 * it is completely acked; otherwise, tcp_sack_option(), called from
2747 * tcp_dooptions(), will fix up the hole.
2748 */
2749 void
2750 tcp_del_sackholes(tp, th)
2751 struct tcpcb *tp;
2752 struct tcphdr *th;
2753 {
2754 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2755 /* max because this could be an older ack just arrived */
2756 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2757 th->th_ack : tp->snd_una;
2758 struct sackhole *cur = tp->snd_holes;
2759 struct sackhole *prev;
2760 while (cur)
2761 if (SEQ_LEQ(cur->end, lastack)) {
2762 prev = cur;
2763 cur = cur->next;
2764 pool_put(&sackhl_pool, prev);
2765 tp->snd_numholes--;
2766 } else if (SEQ_LT(cur->start, lastack)) {
2767 cur->start = lastack;
2768 if (SEQ_LT(cur->rxmit, cur->start))
2769 cur->rxmit = cur->start;
2770 break;
2771 } else
2772 break;
2773 tp->snd_holes = cur;
2774 }
2775 }
2776 
2777 /*
2778 * Delete all receiver-side SACK information.
2779 */
2780 void
2781 tcp_clean_sackreport(tp)
2782 struct tcpcb *tp;
2783 {
2784 int i;
2785 
2786 tp->rcv_numsacks = 0;
2787 for (i = 0; i < MAX_SACK_BLKS; i++)
2788 tp->sackblks[i].start = tp->sackblks[i].end = 0;
2789 
2790 }
2791 
2792 /*
2793 * Checks for a partial ack. If a partial ack arrives, turn off the
2794 * retransmission timer, deflate the window, do not clear tp->t_dupacks,
2795 * and return 1. If the ack advances at least to tp->snd_last, return 0.
2796 */
2797 int
2798 tcp_sack_partialack(tp, th)
2799 struct tcpcb *tp;
2800 struct tcphdr *th;
2801 {
2802 if (SEQ_LT(th->th_ack, tp->snd_last)) {
2803 /* Turn off retx. timer (will start again next segment) */
2804 TCP_TIMER_DISARM(tp, TCPT_REXMT);
2805 tp->t_rtttime = 0;
2806 #ifndef TCP_FACK
2807 /*
2808 * Partial window deflation. This statement relies on the
2809 * fact that tp->snd_una has not been updated yet. In FACK
2810 * hold snd_cwnd constant during fast recovery.
2811 */
2812 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
2813 tp->snd_cwnd -= th->th_ack - tp->snd_una;
2814 tp->snd_cwnd += tp->t_maxseg;
2815 } else
2816 tp->snd_cwnd = tp->t_maxseg;
2817 #endif
2818 return (1);
2819 }
2820 return (0);
2821 }
2822 #endif /* TCP_SACK */
2823 
2824 /*
2825 * Pull the out-of-band byte out of a segment so
2826 * it doesn't appear in the user's data queue.
2827 * It is still reflected in the segment length for
2828 * sequencing purposes.
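 * For example, with off = 20 (illustrative) and urgent = 3, the
 * byte at chain offset off + urgent - 1 = 22 is saved in t_iobc
 * below and then spliced out of the mbuf data.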
2829 */ 2830 void 2831 tcp_pulloutofband(so, urgent, m, off) 2832 struct socket *so; 2833 u_int urgent; 2834 struct mbuf *m; 2835 int off; 2836 { 2837 int cnt = off + urgent - 1; 2838 2839 while (cnt >= 0) { 2840 if (m->m_len > cnt) { 2841 char *cp = mtod(m, caddr_t) + cnt; 2842 struct tcpcb *tp = sototcpcb(so); 2843 2844 tp->t_iobc = *cp; 2845 tp->t_oobflags |= TCPOOB_HAVEDATA; 2846 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2847 m->m_len--; 2848 return; 2849 } 2850 cnt -= m->m_len; 2851 m = m->m_next; 2852 if (m == 0) 2853 break; 2854 } 2855 panic("tcp_pulloutofband"); 2856 } 2857 2858 /* 2859 * Collect new round-trip time estimate 2860 * and update averages and current timeout. 2861 */ 2862 void 2863 tcp_xmit_timer(tp, rtt) 2864 struct tcpcb *tp; 2865 short rtt; 2866 { 2867 short delta; 2868 short rttmin; 2869 2870 if (rtt < 0) 2871 rtt = 0; 2872 else if (rtt > TCP_RTT_MAX) 2873 rtt = TCP_RTT_MAX; 2874 2875 tcpstat.tcps_rttupdated++; 2876 if (tp->t_srtt != 0) { 2877 /* 2878 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2879 * after the binary point (scaled by 4), whereas 2880 * srtt is stored as fixed point with 5 bits after the 2881 * binary point (i.e., scaled by 32). The following magic 2882 * is equivalent to the smoothing algorithm in rfc793 with 2883 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2884 * point). 2885 */ 2886 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2887 (tp->t_srtt >> TCP_RTT_SHIFT); 2888 if ((tp->t_srtt += delta) <= 0) 2889 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2890 /* 2891 * We accumulate a smoothed rtt variance (actually, a 2892 * smoothed mean difference), then set the retransmit 2893 * timer to smoothed rtt + 4 times the smoothed variance. 2894 * rttvar is stored as fixed point with 4 bits after the 2895 * binary point (scaled by 16). The following is 2896 * equivalent to rfc793 smoothing with an alpha of .75 2897 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2898 * rfc793's wired-in beta. 2899 */ 2900 if (delta < 0) 2901 delta = -delta; 2902 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2903 if ((tp->t_rttvar += delta) <= 0) 2904 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2905 } else { 2906 /* 2907 * No rtt measurement yet - use the unsmoothed rtt. 2908 * Set the variance to half the rtt (so our first 2909 * retransmit happens at 3*rtt). 2910 */ 2911 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2912 tp->t_rttvar = (rtt + 1) << 2913 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2914 } 2915 tp->t_rtttime = 0; 2916 tp->t_rxtshift = 0; 2917 2918 /* 2919 * the retransmit should happen at rtt + 4 * rttvar. 2920 * Because of the way we do the smoothing, srtt and rttvar 2921 * will each average +1/2 tick of bias. When we compute 2922 * the retransmit timer, we want 1/2 tick of rounding and 2923 * 1 extra tick because of +-1/2 tick uncertainty in the 2924 * firing of the timer. The bias will give us exactly the 2925 * 1.5 tick we need. But, because the bias is 2926 * statistical, we have to test that we don't drop below 2927 * the minimum feasible timer (which is 2 ticks). 2928 */ 2929 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2930 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2931 2932 /* 2933 * We received an ack for a packet that wasn't retransmitted; 2934 * it is probably safe to discard any error indications we've 2935 * received recently. 
This isn't quite right, but close enough
2936 * for now (a route might have failed after we sent a segment,
2937 * and the return path might not be symmetrical).
2938 */
2939 tp->t_softerror = 0;
2940 }
2941 
2942 /*
2943 * Determine a reasonable value for maxseg size.
2944 * If the route is known, check route for mtu.
2945 * If none, use an mss that can be handled on the outgoing
2946 * interface without forcing IP to fragment; if bigger than
2947 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2948 * to utilize large mbufs. If no route is found, route has no mtu,
2949 * or the destination isn't local, use a default, hopefully conservative
2950 * size (usually 512 or the default IP max size, but no more than the mtu
2951 * of the interface), as we can't discover anything about intervening
2952 * gateways or networks. We also initialize the congestion/slow start
2953 * window to be a single segment if the destination isn't local.
2954 * While looking at the routing entry, we also initialize other path-dependent
2955 * parameters from pre-set or cached values in the routing entry.
2956 *
2957 * Also take into account the space needed for options that we
2958 * send regularly. Make maxseg shorter by that amount to assure
2959 * that we can send maxseg amount of data even when the options
2960 * are present. Store the upper limit of the length of options plus
2961 * data in maxopd.
2962 *
2963 * NOTE: offer == -1 indicates that the maxseg size changed due to
2964 * Path MTU discovery.
2965 */
2966 int
2967 tcp_mss(tp, offer)
2968 struct tcpcb *tp;
2969 int offer;
2970 {
2971 struct rtentry *rt;
2972 struct ifnet *ifp;
2973 int mss, mssopt;
2974 int iphlen;
2975 struct inpcb *inp;
2976 
2977 inp = tp->t_inpcb;
2978 
2979 mssopt = mss = tcp_mssdflt;
2980 
2981 rt = in_pcbrtentry(inp);
2982 
2983 if (rt == NULL)
2984 goto out;
2985 
2986 ifp = rt->rt_ifp;
2987 
2988 switch (tp->pf) {
2989 #ifdef INET6
2990 case AF_INET6:
2991 iphlen = sizeof(struct ip6_hdr);
2992 break;
2993 #endif
2994 case AF_INET:
2995 iphlen = sizeof(struct ip);
2996 break;
2997 default:
2998 /* the family does not support path MTU discovery */
2999 goto out;
3000 }
3001 
3002 #ifdef RTV_MTU
3003 /*
3004 * If there's an mtu associated with the route and we support
3005 * path MTU discovery for the underlying protocol family, use it.
3006 */
3007 if (rt->rt_rmx.rmx_mtu) {
3008 /*
3009 * One may wish to lower MSS to take into account options,
3010 * especially security-related options.
3011 */
3012 if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
3013 /*
3014 * RFC2460 section 5, last paragraph: if path MTU is
3015 * smaller than 1280, use 1280 as packet size and
3016 * attach fragment header.
3017 */
3018 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
3019 sizeof(struct tcphdr);
3020 } else
3021 mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
3022 } else
3023 #endif /* RTV_MTU */
3024 if (!ifp)
3025 /*
3026 * ifp may be null and rmx_mtu may be zero in certain
3027 * v6 cases (e.g., if ND wasn't able to resolve the
3028 * destination host).
3029 */ 3030 goto out; 3031 else if (ifp->if_flags & IFF_LOOPBACK) 3032 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3033 else if (tp->pf == AF_INET) { 3034 if (ip_mtudisc) 3035 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3036 else if (inp && in_localaddr(inp->inp_faddr)) 3037 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3038 } 3039 #ifdef INET6 3040 else if (tp->pf == AF_INET6) { 3041 /* 3042 * for IPv6, path MTU discovery is always turned on, 3043 * or the node must use packet size <= 1280. 3044 */ 3045 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr); 3046 } 3047 #endif /* INET6 */ 3048 3049 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3050 if (offer != -1) { 3051 #ifndef INET6 3052 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3053 #else 3054 if (tp->pf == AF_INET6) 3055 mssopt = IN6_LINKMTU(ifp) - iphlen - 3056 sizeof(struct tcphdr); 3057 else 3058 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3059 #endif 3060 3061 mssopt = max(tcp_mssdflt, mssopt); 3062 } 3063 3064 out: 3065 /* 3066 * The current mss, t_maxseg, is initialized to the default value. 3067 * If we compute a smaller value, reduce the current mss. 3068 * If we compute a larger value, return it for use in sending 3069 * a max seg size option, but don't store it for use 3070 * unless we received an offer at least that large from peer. 3071 * 3072 * However, do not accept offers lower than the minimum of 3073 * the interface MTU and 216. 3074 */ 3075 if (offer > 0) 3076 tp->t_peermss = offer; 3077 if (tp->t_peermss) 3078 mss = min(mss, max(tp->t_peermss, 216)); 3079 3080 /* sanity - at least max opt. space */ 3081 mss = max(mss, 64); 3082 3083 /* 3084 * maxopd stores the maximum length of data AND options 3085 * in a segment; maxseg is the amount of data in a normal 3086 * segment. We need to store this value (maxopd) apart 3087 * from maxseg, because now every segment carries options 3088 * and thus we normally have somewhat less data in segments. 3089 */ 3090 tp->t_maxopd = mss; 3091 3092 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3093 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3094 mss -= TCPOLEN_TSTAMP_APPA; 3095 #ifdef TCP_SIGNATURE 3096 if (tp->t_flags & TF_SIGNATURE) 3097 mss -= TCPOLEN_SIGLEN; 3098 #endif 3099 3100 if (offer == -1) { 3101 /* mss changed due to Path MTU discovery */ 3102 tp->t_flags &= ~TF_PMTUD_PEND; 3103 tp->t_pmtud_mtu_sent = 0; 3104 tp->t_pmtud_mss_acked = 0; 3105 if (mss < tp->t_maxseg) { 3106 /* 3107 * Follow suggestion in RFC 2414 to reduce the 3108 * congestion window by the ratio of the old 3109 * segment size to the new segment size. 3110 */ 3111 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3112 mss, mss); 3113 } 3114 } else if (tcp_do_rfc3390) { 3115 /* increase initial window */ 3116 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3117 } else 3118 tp->snd_cwnd = mss; 3119 3120 tp->t_maxseg = mss; 3121 3122 return (offer != -1 ? 
mssopt : mss); 3123 } 3124 3125 u_int 3126 tcp_hdrsz(struct tcpcb *tp) 3127 { 3128 u_int hlen; 3129 3130 switch (tp->pf) { 3131 #ifdef INET6 3132 case AF_INET6: 3133 hlen = sizeof(struct ip6_hdr); 3134 break; 3135 #endif 3136 case AF_INET: 3137 hlen = sizeof(struct ip); 3138 break; 3139 default: 3140 hlen = 0; 3141 break; 3142 } 3143 hlen += sizeof(struct tcphdr); 3144 3145 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3146 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3147 hlen += TCPOLEN_TSTAMP_APPA; 3148 #ifdef TCP_SIGNATURE 3149 if (tp->t_flags & TF_SIGNATURE) 3150 hlen += TCPOLEN_SIGLEN; 3151 #endif 3152 return (hlen); 3153 } 3154 3155 /* 3156 * Set connection variables based on the effective MSS. 3157 * We are passed the TCPCB for the actual connection. If we 3158 * are the server, we are called by the compressed state engine 3159 * when the 3-way handshake is complete. If we are the client, 3160 * we are called when we receive the SYN,ACK from the server. 3161 * 3162 * NOTE: The t_maxseg value must be initialized in the TCPCB 3163 * before this routine is called! 3164 */ 3165 void 3166 tcp_mss_update(tp) 3167 struct tcpcb *tp; 3168 { 3169 int mss; 3170 u_long bufsize; 3171 struct rtentry *rt; 3172 struct socket *so; 3173 3174 so = tp->t_inpcb->inp_socket; 3175 mss = tp->t_maxseg; 3176 3177 rt = in_pcbrtentry(tp->t_inpcb); 3178 3179 if (rt == NULL) 3180 return; 3181 3182 bufsize = so->so_snd.sb_hiwat; 3183 if (bufsize < mss) { 3184 mss = bufsize; 3185 /* Update t_maxseg and t_maxopd */ 3186 tcp_mss(tp, mss); 3187 } else { 3188 bufsize = roundup(bufsize, mss); 3189 if (bufsize > sb_max) 3190 bufsize = sb_max; 3191 (void)sbreserve(&so->so_snd, bufsize); 3192 } 3193 3194 bufsize = so->so_rcv.sb_hiwat; 3195 if (bufsize > mss) { 3196 bufsize = roundup(bufsize, mss); 3197 if (bufsize > sb_max) 3198 bufsize = sb_max; 3199 (void)sbreserve(&so->so_rcv, bufsize); 3200 } 3201 3202 } 3203 3204 #if defined (TCP_SACK) 3205 /* 3206 * Checks for partial ack. If partial ack arrives, force the retransmission 3207 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 3208 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to 3209 * be started again. If the ack advances at least to tp->snd_last, return 0. 3210 */ 3211 int 3212 tcp_newreno(tp, th) 3213 struct tcpcb *tp; 3214 struct tcphdr *th; 3215 { 3216 if (SEQ_LT(th->th_ack, tp->snd_last)) { 3217 /* 3218 * snd_una has not been updated and the socket send buffer 3219 * not yet drained of the acked data, so we have to leave 3220 * snd_una as it was to get the correct data offset in 3221 * tcp_output(). 3222 */ 3223 tcp_seq onxt = tp->snd_nxt; 3224 u_long ocwnd = tp->snd_cwnd; 3225 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3226 tp->t_rtttime = 0; 3227 tp->snd_nxt = th->th_ack; 3228 /* 3229 * Set snd_cwnd to one segment beyond acknowledged offset 3230 * (tp->snd_una not yet updated when this function is called) 3231 */ 3232 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3233 (void) tcp_output(tp); 3234 tp->snd_cwnd = ocwnd; 3235 if (SEQ_GT(onxt, tp->snd_nxt)) 3236 tp->snd_nxt = onxt; 3237 /* 3238 * Partial window deflation. Relies on fact that tp->snd_una 3239 * not updated yet. 
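 * For example (illustrative numbers): if this ACK newly covers
 * 3000 bytes and t_maxseg is 1460, snd_cwnd shrinks by
 * 3000 - 1460 = 1540 bytes, leaving roughly one segment of fresh
 * send permission for the forced retransmission above.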
3240 */ 3241 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); 3242 return 1; 3243 } 3244 return 0; 3245 } 3246 #endif /* TCP_SACK */ 3247 3248 static int 3249 tcp_mss_adv(struct ifnet *ifp, int af) 3250 { 3251 int mss = 0; 3252 int iphlen; 3253 3254 switch (af) { 3255 case AF_INET: 3256 if (ifp != NULL) 3257 mss = ifp->if_mtu; 3258 iphlen = sizeof(struct ip); 3259 break; 3260 #ifdef INET6 3261 case AF_INET6: 3262 if (ifp != NULL) 3263 mss = IN6_LINKMTU(ifp); 3264 iphlen = sizeof(struct ip6_hdr); 3265 break; 3266 #endif 3267 } 3268 mss = mss - iphlen - sizeof(struct tcphdr); 3269 return (max(mss, tcp_mssdflt)); 3270 } 3271 3272 /* 3273 * TCP compressed state engine. Currently used to hold compressed 3274 * state for SYN_RECEIVED. 3275 */ 3276 3277 u_long syn_cache_count; 3278 u_int32_t syn_hash1, syn_hash2; 3279 3280 #define SYN_HASH(sa, sp, dp) \ 3281 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3282 ((u_int32_t)(sp)))^syn_hash2))) 3283 #ifndef INET6 3284 #define SYN_HASHALL(hash, src, dst) \ 3285 do { \ 3286 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3287 ((struct sockaddr_in *)(src))->sin_port, \ 3288 ((struct sockaddr_in *)(dst))->sin_port); \ 3289 } while (/*CONSTCOND*/ 0) 3290 #else 3291 #define SYN_HASH6(sa, sp, dp) \ 3292 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3293 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3294 & 0x7fffffff) 3295 3296 #define SYN_HASHALL(hash, src, dst) \ 3297 do { \ 3298 switch ((src)->sa_family) { \ 3299 case AF_INET: \ 3300 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3301 ((struct sockaddr_in *)(src))->sin_port, \ 3302 ((struct sockaddr_in *)(dst))->sin_port); \ 3303 break; \ 3304 case AF_INET6: \ 3305 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 3306 ((struct sockaddr_in6 *)(src))->sin6_port, \ 3307 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 3308 break; \ 3309 default: \ 3310 hash = 0; \ 3311 } \ 3312 } while (/*CONSTCOND*/0) 3313 #endif /* INET6 */ 3314 3315 #define SYN_CACHE_RM(sc) \ 3316 do { \ 3317 (sc)->sc_flags |= SCF_DEAD; \ 3318 TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \ 3319 (sc), sc_bucketq); \ 3320 (sc)->sc_tp = NULL; \ 3321 LIST_REMOVE((sc), sc_tpq); \ 3322 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 3323 timeout_del(&(sc)->sc_timer); \ 3324 syn_cache_count--; \ 3325 } while (/*CONSTCOND*/0) 3326 3327 #define SYN_CACHE_PUT(sc) \ 3328 do { \ 3329 if ((sc)->sc_ipopts) \ 3330 (void) m_free((sc)->sc_ipopts); \ 3331 if ((sc)->sc_route4.ro_rt != NULL) \ 3332 RTFREE((sc)->sc_route4.ro_rt); \ 3333 timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc)); \ 3334 timeout_add(&(sc)->sc_timer, 0); \ 3335 } while (/*CONSTCOND*/0) 3336 3337 struct pool syn_cache_pool; 3338 3339 /* 3340 * We don't estimate RTT with SYNs, so each packet starts with the default 3341 * RTT and each timer step has a fixed timeout value. 3342 */ 3343 #define SYN_CACHE_TIMER_ARM(sc) \ 3344 do { \ 3345 TCPT_RANGESET((sc)->sc_rxtcur, \ 3346 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3347 TCPTV_REXMTMAX); \ 3348 if (!timeout_initialized(&(sc)->sc_timer)) \ 3349 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3350 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3351 } while (/*CONSTCOND*/0) 3352 3353 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3354 3355 void 3356 syn_cache_init() 3357 { 3358 int i; 3359 3360 /* Initialize the hash buckets. 
*/ 3361 for (i = 0; i < tcp_syn_cache_size; i++) 3362 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3363 3364 /* Initialize the syn cache pool. */ 3365 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3366 "synpl", NULL); 3367 } 3368 3369 void 3370 syn_cache_insert(sc, tp) 3371 struct syn_cache *sc; 3372 struct tcpcb *tp; 3373 { 3374 struct syn_cache_head *scp; 3375 struct syn_cache *sc2; 3376 int s; 3377 3378 /* 3379 * If there are no entries in the hash table, reinitialize 3380 * the hash secrets. 3381 */ 3382 if (syn_cache_count == 0) { 3383 syn_hash1 = arc4random(); 3384 syn_hash2 = arc4random(); 3385 } 3386 3387 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3388 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3389 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3390 3391 /* 3392 * Make sure that we don't overflow the per-bucket 3393 * limit or the total cache size limit. 3394 */ 3395 s = splsoftnet(); 3396 if (scp->sch_length >= tcp_syn_bucket_limit) { 3397 tcpstat.tcps_sc_bucketoverflow++; 3398 /* 3399 * The bucket is full. Toss the oldest element in the 3400 * bucket. This will be the first entry in the bucket. 3401 */ 3402 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3403 #ifdef DIAGNOSTIC 3404 /* 3405 * This should never happen; we should always find an 3406 * entry in our bucket. 3407 */ 3408 if (sc2 == NULL) 3409 panic("syn_cache_insert: bucketoverflow: impossible"); 3410 #endif 3411 SYN_CACHE_RM(sc2); 3412 SYN_CACHE_PUT(sc2); 3413 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3414 struct syn_cache_head *scp2, *sce; 3415 3416 tcpstat.tcps_sc_overflowed++; 3417 /* 3418 * The cache is full. Toss the oldest entry in the 3419 * first non-empty bucket we can find. 3420 * 3421 * XXX We would really like to toss the oldest 3422 * entry in the cache, but we hope that this 3423 * condition doesn't happen very often. 3424 */ 3425 scp2 = scp; 3426 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3427 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3428 for (++scp2; scp2 != scp; scp2++) { 3429 if (scp2 >= sce) 3430 scp2 = &tcp_syn_cache[0]; 3431 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3432 break; 3433 } 3434 #ifdef DIAGNOSTIC 3435 /* 3436 * This should never happen; we should always find a 3437 * non-empty bucket. 3438 */ 3439 if (scp2 == scp) 3440 panic("syn_cache_insert: cacheoverflow: " 3441 "impossible"); 3442 #endif 3443 } 3444 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3445 SYN_CACHE_RM(sc2); 3446 SYN_CACHE_PUT(sc2); 3447 } 3448 3449 /* 3450 * Initialize the entry's timer. 3451 */ 3452 sc->sc_rxttot = 0; 3453 sc->sc_rxtshift = 0; 3454 SYN_CACHE_TIMER_ARM(sc); 3455 3456 /* Link it from tcpcb entry */ 3457 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3458 3459 /* Put it into the bucket. */ 3460 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3461 scp->sch_length++; 3462 syn_cache_count++; 3463 3464 tcpstat.tcps_sc_added++; 3465 splx(s); 3466 } 3467 3468 /* 3469 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3470 * If we have retransmitted an entry the maximum number of times, expire 3471 * that entry. 3472 */ 3473 void 3474 syn_cache_timer(void *arg) 3475 { 3476 struct syn_cache *sc = arg; 3477 int s; 3478 3479 s = splsoftnet(); 3480 if (sc->sc_flags & SCF_DEAD) { 3481 splx(s); 3482 return; 3483 } 3484 3485 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3486 /* Drop it -- too many retransmissions. */ 3487 goto dropit; 3488 } 3489 3490 /* 3491 * Compute the total amount of time this entry has 3492 * been on a queue. 
If this entry has been on longer
3493 * than the keep alive timer would allow, expire it.
3494 */
3495 sc->sc_rxttot += sc->sc_rxtcur;
3496 if (sc->sc_rxttot >= tcptv_keep_init)
3497 goto dropit;
3498 
3499 tcpstat.tcps_sc_retransmitted++;
3500 (void) syn_cache_respond(sc, NULL);
3501 
3502 /* Advance the timer back-off. */
3503 sc->sc_rxtshift++;
3504 SYN_CACHE_TIMER_ARM(sc);
3505 
3506 splx(s);
3507 return;
3508 
3509 dropit:
3510 tcpstat.tcps_sc_timed_out++;
3511 SYN_CACHE_RM(sc);
3512 SYN_CACHE_PUT(sc);
3513 splx(s);
3514 }
3515 
3516 void
3517 syn_cache_reaper(void *arg)
3518 {
3519 struct syn_cache *sc = arg;
3520 int s;
3521 
3522 s = splsoftnet();
3523 pool_put(&syn_cache_pool, (sc));
3524 splx(s);
3525 return;
3526 }
3527 
3528 /*
3529 * Remove the syn cache entries created by the specified tcb entry,
3530 * because it makes no sense to keep them
3531 * (if there's no tcb entry, a syn cache entry will never be used).
3532 */
3533 void
3534 syn_cache_cleanup(tp)
3535 struct tcpcb *tp;
3536 {
3537 struct syn_cache *sc, *nsc;
3538 int s;
3539 
3540 s = splsoftnet();
3541 
3542 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3543 nsc = LIST_NEXT(sc, sc_tpq);
3544 
3545 #ifdef DIAGNOSTIC
3546 if (sc->sc_tp != tp)
3547 panic("invalid sc_tp in syn_cache_cleanup");
3548 #endif
3549 SYN_CACHE_RM(sc);
3550 SYN_CACHE_PUT(sc);
3551 }
3552 /* just for safety */
3553 LIST_INIT(&tp->t_sc);
3554 
3555 splx(s);
3556 }
3557 
3558 /*
3559 * Find an entry in the syn cache.
3560 */
3561 struct syn_cache *
3562 syn_cache_lookup(src, dst, headp)
3563 struct sockaddr *src;
3564 struct sockaddr *dst;
3565 struct syn_cache_head **headp;
3566 {
3567 struct syn_cache *sc;
3568 struct syn_cache_head *scp;
3569 u_int32_t hash;
3570 int s;
3571 
3572 SYN_HASHALL(hash, src, dst);
3573 
3574 scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3575 *headp = scp;
3576 s = splsoftnet();
3577 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3578 sc = TAILQ_NEXT(sc, sc_bucketq)) {
3579 if (sc->sc_hash != hash)
3580 continue;
3581 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3582 !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
3583 splx(s);
3584 return (sc);
3585 }
3586 }
3587 splx(s);
3588 return (NULL);
3589 }
3590 
3591 /*
3592 * This function gets called when we receive an ACK for a
3593 * socket in the LISTEN state. We look up the connection
3594 * in the syn cache, and if it's there, we pull it out of
3595 * the cache and turn it into a full-blown connection in
3596 * the SYN-RECEIVED state.
3597 *
3598 * The return values may not be immediately obvious, and their effects
3599 * can be subtle, so here they are:
3600 *
3601 * NULL SYN was not found in cache; caller should drop the
3602 * packet and send an RST.
3603 *
3604 * -1 We were unable to create the new connection, and are
3605 * aborting it. An ACK,RST is being sent to the peer
3606 * (unless we got screwy sequence numbers; see below),
3607 * because the 3-way handshake has been completed. Caller
3608 * should not free the mbuf, since we may be using it. If
3609 * we are not, we will free it.
3610 *
3611 * Otherwise, the return value is a pointer to the new socket
3612 * associated with the connection.
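 * A hypothetical caller would therefore look roughly like:
 *
 *	so = syn_cache_get(src, dst, th, hlen, tlen, so, m);
 *	if (so == NULL)
 *		drop the packet and send an RST;
 *	else if (so == (struct socket *)(-1))
 *		do nothing (in particular, do not free m);
 *	else
 *		continue input processing on the new socket;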
3613 */ 3614 struct socket * 3615 syn_cache_get(src, dst, th, hlen, tlen, so, m) 3616 struct sockaddr *src; 3617 struct sockaddr *dst; 3618 struct tcphdr *th; 3619 unsigned int hlen, tlen; 3620 struct socket *so; 3621 struct mbuf *m; 3622 { 3623 struct syn_cache *sc; 3624 struct syn_cache_head *scp; 3625 struct inpcb *inp = NULL; 3626 struct tcpcb *tp = 0; 3627 struct mbuf *am; 3628 int s; 3629 struct socket *oso; 3630 3631 s = splsoftnet(); 3632 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3633 splx(s); 3634 return (NULL); 3635 } 3636 3637 /* 3638 * Verify the sequence and ack numbers. Try getting the correct 3639 * response again. 3640 */ 3641 if ((th->th_ack != sc->sc_iss + 1) || 3642 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3643 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3644 (void) syn_cache_respond(sc, m); 3645 splx(s); 3646 return ((struct socket *)(-1)); 3647 } 3648 3649 /* Remove this cache entry */ 3650 SYN_CACHE_RM(sc); 3651 splx(s); 3652 3653 /* 3654 * Ok, create the full blown connection, and set things up 3655 * as they would have been set up if we had created the 3656 * connection when the SYN arrived. If we can't create 3657 * the connection, abort it. 3658 */ 3659 oso = so; 3660 so = sonewconn(so, SS_ISCONNECTED); 3661 if (so == NULL) 3662 goto resetandabort; 3663 3664 inp = sotoinpcb(oso); 3665 #ifdef IPSEC 3666 /* 3667 * We need to copy the required security levels 3668 * from the old pcb. Ditto for any other 3669 * IPsec-related information. 3670 */ 3671 { 3672 struct inpcb *newinp = (struct inpcb *)so->so_pcb; 3673 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3674 sizeof(inp->inp_seclevel)); 3675 newinp->inp_secrequire = inp->inp_secrequire; 3676 if (inp->inp_ipo != NULL) { 3677 newinp->inp_ipo = inp->inp_ipo; 3678 inp->inp_ipo->ipo_ref_count++; 3679 } 3680 if (inp->inp_ipsec_remotecred != NULL) { 3681 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 3682 inp->inp_ipsec_remotecred->ref_count++; 3683 } 3684 if (inp->inp_ipsec_remoteauth != NULL) { 3685 newinp->inp_ipsec_remoteauth 3686 = inp->inp_ipsec_remoteauth; 3687 inp->inp_ipsec_remoteauth->ref_count++; 3688 } 3689 } 3690 #endif /* IPSEC */ 3691 #ifdef INET6 3692 /* 3693 * inp still has the OLD in_pcb stuff, set the 3694 * v6-related flags on the new guy, too. 3695 */ 3696 { 3697 int flags = inp->inp_flags; 3698 struct inpcb *oldinpcb = inp; 3699 3700 inp = (struct inpcb *)so->so_pcb; 3701 inp->inp_flags |= (flags & INP_IPV6); 3702 if ((inp->inp_flags & INP_IPV6) != 0) { 3703 inp->inp_ipv6.ip6_hlim = 3704 oldinpcb->inp_ipv6.ip6_hlim; 3705 } 3706 } 3707 #else /* INET6 */ 3708 inp = (struct inpcb *)so->so_pcb; 3709 #endif /* INET6 */ 3710 3711 inp->inp_lport = th->th_dport; 3712 switch (src->sa_family) { 3713 #ifdef INET6 3714 case AF_INET6: 3715 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr; 3716 break; 3717 #endif /* INET6 */ 3718 case AF_INET: 3719 3720 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 3721 inp->inp_options = ip_srcroute(); 3722 if (inp->inp_options == NULL) { 3723 inp->inp_options = sc->sc_ipopts; 3724 sc->sc_ipopts = NULL; 3725 } 3726 break; 3727 } 3728 in_pcbrehash(inp); 3729 3730 /* 3731 * Give the new socket our cached route reference. 
3732 */
3733 if (src->sa_family == AF_INET)
3734 inp->inp_route = sc->sc_route4; /* struct assignment */
3735 #ifdef INET6
3736 else
3737 inp->inp_route6 = sc->sc_route6;
3738 #endif
3739 sc->sc_route4.ro_rt = NULL;
3740 
3741 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
3742 if (am == NULL)
3743 goto resetandabort;
3744 am->m_len = src->sa_len;
3745 bcopy(src, mtod(am, caddr_t), src->sa_len);
3746 
3747 switch (src->sa_family) {
3748 case AF_INET:
3749 /* drop IPv4 packet to AF_INET6 socket */
3750 if (inp->inp_flags & INP_IPV6) {
3751 (void) m_free(am);
3752 goto resetandabort;
3753 }
3754 if (in_pcbconnect(inp, am)) {
3755 (void) m_free(am);
3756 goto resetandabort;
3757 }
3758 break;
3759 #ifdef INET6
3760 case AF_INET6:
3761 if (in6_pcbconnect(inp, am)) {
3762 (void) m_free(am);
3763 goto resetandabort;
3764 }
3765 break;
3766 #endif
3767 }
3768 (void) m_free(am);
3769 
3770 tp = intotcpcb(inp);
3771 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
3772 if (sc->sc_request_r_scale != 15) {
3773 tp->requested_s_scale = sc->sc_requested_s_scale;
3774 tp->request_r_scale = sc->sc_request_r_scale;
3775 tp->snd_scale = sc->sc_requested_s_scale;
3776 tp->rcv_scale = sc->sc_request_r_scale;
3777 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3778 }
3779 if (sc->sc_flags & SCF_TIMESTAMP)
3780 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3781 
3782 tp->t_template = tcp_template(tp);
3783 if (tp->t_template == 0) {
3784 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
3785 so = NULL;
3786 m_freem(m);
3787 goto abort;
3788 }
3789 #ifdef TCP_SACK
3790 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
3791 #endif
3792 
3793 tp->ts_modulate = sc->sc_modulate;
3794 tp->iss = sc->sc_iss;
3795 tp->irs = sc->sc_irs;
3796 tcp_sendseqinit(tp);
3797 #if defined (TCP_SACK) || defined(TCP_ECN)
3798 tp->snd_last = tp->snd_una;
3799 #endif /* TCP_SACK */
3800 #if defined(TCP_SACK) && defined(TCP_FACK)
3801 tp->snd_fack = tp->snd_una;
3802 tp->retran_data = 0;
3803 tp->snd_awnd = 0;
3804 #endif /* TCP_FACK */
3805 #ifdef TCP_ECN
3806 if (sc->sc_flags & SCF_ECN_PERMIT) {
3807 tp->t_flags |= TF_ECN_PERMIT;
3808 tcpstat.tcps_ecn_accepts++;
3809 }
3810 #endif
3811 #ifdef TCP_SACK
3812 if (sc->sc_flags & SCF_SACK_PERMIT)
3813 tp->t_flags |= TF_SACK_PERMIT;
3814 #endif
3815 #ifdef TCP_SIGNATURE
3816 if (sc->sc_flags & SCF_SIGNATURE)
3817 tp->t_flags |= TF_SIGNATURE;
3818 #endif
3819 tcp_rcvseqinit(tp);
3820 tp->t_state = TCPS_SYN_RECEIVED;
3821 tp->t_rcvtime = tcp_now;
3822 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
3823 tcpstat.tcps_accepts++;
3824 
3825 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
3826 if (sc->sc_peermaxseg)
3827 tcp_mss_update(tp);
3828 /* Reset initial window to 1 segment for retransmit */
3829 if (sc->sc_rxtshift > 0)
3830 tp->snd_cwnd = tp->t_maxseg;
3831 tp->snd_wl1 = sc->sc_irs;
3832 tp->rcv_up = sc->sc_irs + 1;
3833 
3834 /*
3835 * This is what would have happened in tcp_output() when
3836 * the SYN,ACK was sent.
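 * In other words, the SYN,ACK consumed sequence number iss, so the
 * send state below resumes at iss + 1 with the retransmit timer
 * armed as if tcp_output() had sent the segment itself.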
3837 */ 3838 tp->snd_up = tp->snd_una; 3839 tp->snd_max = tp->snd_nxt = tp->iss+1; 3840 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3841 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3842 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3843 tp->last_ack_sent = tp->rcv_nxt; 3844 3845 tcpstat.tcps_sc_completed++; 3846 SYN_CACHE_PUT(sc); 3847 return (so); 3848 3849 resetandabort: 3850 tcp_respond(NULL, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack, TH_RST); 3851 abort: 3852 if (so != NULL) 3853 (void) soabort(so); 3854 SYN_CACHE_PUT(sc); 3855 tcpstat.tcps_sc_aborted++; 3856 return ((struct socket *)(-1)); 3857 } 3858 3859 /* 3860 * This function is called when we get a RST for a 3861 * non-existent connection, so that we can see if the 3862 * connection is in the syn cache. If it is, zap it. 3863 */ 3864 3865 void 3866 syn_cache_reset(src, dst, th) 3867 struct sockaddr *src; 3868 struct sockaddr *dst; 3869 struct tcphdr *th; 3870 { 3871 struct syn_cache *sc; 3872 struct syn_cache_head *scp; 3873 int s = splsoftnet(); 3874 3875 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3876 splx(s); 3877 return; 3878 } 3879 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3880 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3881 splx(s); 3882 return; 3883 } 3884 SYN_CACHE_RM(sc); 3885 splx(s); 3886 tcpstat.tcps_sc_reset++; 3887 SYN_CACHE_PUT(sc); 3888 } 3889 3890 void 3891 syn_cache_unreach(src, dst, th) 3892 struct sockaddr *src; 3893 struct sockaddr *dst; 3894 struct tcphdr *th; 3895 { 3896 struct syn_cache *sc; 3897 struct syn_cache_head *scp; 3898 int s; 3899 3900 s = splsoftnet(); 3901 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3902 splx(s); 3903 return; 3904 } 3905 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3906 if (ntohl (th->th_seq) != sc->sc_iss) { 3907 splx(s); 3908 return; 3909 } 3910 3911 /* 3912 * If we've retransmitted 3 times and this is our second error, 3913 * we remove the entry. Otherwise, we allow it to continue on. 3914 * This prevents us from incorrectly nuking an entry during a 3915 * spurious network outage. 3916 * 3917 * See tcp_notify(). 3918 */ 3919 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3920 sc->sc_flags |= SCF_UNREACH; 3921 splx(s); 3922 return; 3923 } 3924 3925 SYN_CACHE_RM(sc); 3926 splx(s); 3927 tcpstat.tcps_sc_unreach++; 3928 SYN_CACHE_PUT(sc); 3929 } 3930 3931 /* 3932 * Given a LISTEN socket and an inbound SYN request, add 3933 * this to the syn cache, and send back a segment: 3934 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3935 * to the source. 3936 * 3937 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3938 * Doing so would require that we hold onto the data and deliver it 3939 * to the application. However, if we are the target of a SYN-flood 3940 * DoS attack, an attacker could send data which would eventually 3941 * consume all available buffer space if it were ACKed. By not ACKing 3942 * the data, we avoid this DoS scenario. 3943 */ 3944 3945 int 3946 syn_cache_add(src, dst, th, iphlen, so, m, optp, optlen, oi) 3947 struct sockaddr *src; 3948 struct sockaddr *dst; 3949 struct tcphdr *th; 3950 unsigned int iphlen; 3951 struct socket *so; 3952 struct mbuf *m; 3953 u_char *optp; 3954 int optlen; 3955 struct tcp_opt_info *oi; 3956 { 3957 struct tcpcb tb, *tp; 3958 long win; 3959 struct syn_cache *sc; 3960 struct syn_cache_head *scp; 3961 struct mbuf *ipopts; 3962 3963 tp = sototcpcb(so); 3964 3965 /* 3966 * RFC1122 4.2.3.10, p. 
104: discard bcast/mcast SYN 3967 * 3968 * Note this check is performed in tcp_input() very early on. 3969 */ 3970 3971 /* 3972 * Initialize some local state. 3973 */ 3974 win = sbspace(&so->so_rcv); 3975 if (win > TCP_MAXWIN) 3976 win = TCP_MAXWIN; 3977 3978 #ifdef TCP_SIGNATURE 3979 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3980 #else 3981 if (optp) { 3982 #endif 3983 tb.pf = tp->pf; 3984 #ifdef TCP_SACK 3985 tb.sack_enable = tp->sack_enable; 3986 #endif 3987 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3988 #ifdef TCP_SIGNATURE 3989 if (tp->t_flags & TF_SIGNATURE) 3990 tb.t_flags |= TF_SIGNATURE; 3991 #endif 3992 tb.t_state = TCPS_LISTEN; 3993 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi)) 3994 return (0); 3995 } else 3996 tb.t_flags = 0; 3997 3998 switch (src->sa_family) { 3999 #ifdef INET 4000 case AF_INET: 4001 /* 4002 * Remember the IP options, if any. 4003 */ 4004 ipopts = ip_srcroute(); 4005 break; 4006 #endif 4007 default: 4008 ipopts = NULL; 4009 } 4010 4011 /* 4012 * See if we already have an entry for this connection. 4013 * If we do, resend the SYN,ACK. We do not count this 4014 * as a retransmission (XXX though maybe we should). 4015 */ 4016 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 4017 tcpstat.tcps_sc_dupesyn++; 4018 if (ipopts) { 4019 /* 4020 * If we were remembering a previous source route, 4021 * forget it and use the new one we've been given. 4022 */ 4023 if (sc->sc_ipopts) 4024 (void) m_free(sc->sc_ipopts); 4025 sc->sc_ipopts = ipopts; 4026 } 4027 sc->sc_timestamp = tb.ts_recent; 4028 if (syn_cache_respond(sc, m) == 0) { 4029 tcpstat.tcps_sndacks++; 4030 tcpstat.tcps_sndtotal++; 4031 } 4032 return (1); 4033 } 4034 4035 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 4036 if (sc == NULL) { 4037 if (ipopts) 4038 (void) m_free(ipopts); 4039 return (0); 4040 } 4041 4042 /* 4043 * Fill in the cache, and put the necessary IP and TCP 4044 * options into the reply. 4045 */ 4046 bzero(sc, sizeof(struct syn_cache)); 4047 bzero(&sc->sc_timer, sizeof(sc->sc_timer)); 4048 bcopy(src, &sc->sc_src, src->sa_len); 4049 bcopy(dst, &sc->sc_dst, dst->sa_len); 4050 sc->sc_flags = 0; 4051 sc->sc_ipopts = ipopts; 4052 sc->sc_irs = th->th_seq; 4053 4054 #ifdef TCP_COMPAT_42 4055 tcp_iss += TCP_ISSINCR/2; 4056 sc->sc_iss = tcp_iss; 4057 #else 4058 sc->sc_iss = tcp_rndiss_next(); 4059 #endif 4060 sc->sc_peermaxseg = oi->maxseg; 4061 sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ? 4062 m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family); 4063 sc->sc_win = win; 4064 sc->sc_timestamp = tb.ts_recent; 4065 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 4066 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) 4067 sc->sc_flags |= SCF_TIMESTAMP; 4068 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 4069 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 4070 sc->sc_requested_s_scale = tb.requested_s_scale; 4071 sc->sc_request_r_scale = 0; 4072 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 4073 TCP_MAXWIN << sc->sc_request_r_scale < 4074 so->so_rcv.sb_hiwat) 4075 sc->sc_request_r_scale++; 4076 } else { 4077 sc->sc_requested_s_scale = 15; 4078 sc->sc_request_r_scale = 15; 4079 } 4080 #ifdef TCP_ECN 4081 /* 4082 * if both ECE and CWR flag bits are set, peer is ECN capable. 4083 */ 4084 if (tcp_do_ecn && 4085 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) 4086 sc->sc_flags |= SCF_ECN_PERMIT; 4087 #endif 4088 #ifdef TCP_SACK 4089 /* 4090 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option 4091 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 
#ifdef TCP_ECN
	/*
	 * If both ECE and CWR flag bits are set, the peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		SYN_CACHE_PUT(sc);
		tcpstat.tcps_sc_dropped++;
	}
	return (1);
}
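
/*
 * Editorial sketch, not part of the original file: the option-length
 * arithmetic at the top of syn_cache_respond() below keeps every
 * option padded to a 32-bit boundary so that th_off remains a whole
 * number of words.  Written out as a stand-alone helper (the function
 * name and the "want_wscale" argument are hypothetical; the real code
 * infers window scaling from sc_request_r_scale != 15):
 */
#if 0
static int
synack_optlen(u_int flags, int want_wscale)
{
	int optlen = 4;				/* MSS option, always sent */

	if (want_wscale)
		optlen += 4;			/* NOP + window scale */
	if (flags & SCF_SACK_PERMIT)
		optlen += 4;			/* padded SACK-permitted */
	if (flags & SCF_TIMESTAMP)
		optlen += TCPOLEN_TSTAMP_APPA;	/* 12: NOP,NOP,timestamps */
	if (flags & SCF_SIGNATURE)
		optlen += TCPOLEN_SIGLEN;	/* padded MD5 signature */
	return (optlen);
}
#endif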
int
syn_cache_respond(sc, m)
	struct syn_cache *sc;
	struct mbuf *m;
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		/* unreachable; unsupported families returned above */
		th = NULL;
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		sc->sc_modulate = arc4random();
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:				/* default to PF_INET */
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/* Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}
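
	/*
	 * Editor's note on the IPv4 checksum above: the IP header was
	 * zeroed by the memset() after the mbuf was set up, and ip_len
	 * briefly holds the TCP length (tlen - hlen), so in_cksum()
	 * over all tlen bytes yields the TCP checksum including the
	 * pseudo-header (src, dst, protocol, TCP length).  The real
	 * ip_len is written below.
	 */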
	/*
	 * Fill in some straggling IP bits.  Note that ip_len, like the
	 * checksum fields above, is set in network byte order.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = ip_defttl;
		/* XXX tos? */
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0; it is legal and requires no state
		 * management */
		break;
#endif
	}

	/* Use the IPsec policy from the listening socket, on the SYN-ACK. */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    (struct ip_moptions *)NULL, inp);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    (struct ip6_moptions *)0, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
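
/*
 * Editor's note: syn_cache_respond() returns 0 once the SYN-ACK has
 * been handed to ip_output()/ip6_output(), or an errno otherwise;
 * callers such as syn_cache_add() above drop the cache entry and
 * bump tcps_sc_dropped on failure.
 */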