/*	$OpenBSD: tcp_input.c,v 1.388 2023/05/30 19:32:57 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	TCP_TIME(24 * 24 * 60 * 60)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif
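
/*
 * Illustrative timeline for the snd_last bookkeeping described above
 * (the reductions themselves happen in the ECE and duplicate-ACK
 * handling in tcp_input() below): on a reduction the code in effect
 * performs
 *	tp->snd_cwnd = tp->snd_ssthresh;
 *	tp->snd_last = tp->snd_max;
 * and further reductions are then suppressed until snd_una catches up,
 * i.e. until SEQ_GEQ(tp->snd_una, tp->snd_last) holds again.
 */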

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_TIMER_ARM(tp, TCPT_DELACK, tcp_delack_msecs); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *, uint32_t);
void	 syn_cache_timer(void *);
void	 syn_cache_reaper(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *, uint32_t);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *, uint32_t);
struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
		struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
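	/*
	 * Worked example of the check below: if the preceding queue
	 * entry covers sequence numbers [100, 150) (th_seq 100,
	 * th_reseqlen 50) and the new segment starts at 120, then
	 * i = 100 + 50 - 120 = 30, so the first 30 duplicate bytes
	 * are trimmed with m_adj(); if i >= *tlen the entire segment
	 * is already queued and is dropped.
	 */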
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);
	tp->t_rcvoopack++;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_rcv.sb_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}
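
/*
 * Note on the TF_BLOCKOUTPUT dance around sorwakeup() above: the flag
 * is held across socket wakeups so that code run from the wakeup cannot
 * immediately re-enter TCP output processing; output is instead deferred
 * until after the flag has been cleared.  The same pattern recurs around
 * the wakeup and soisconnected/soisdisconnected calls in tcp_input()
 * below.
 */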

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *otp = NULL, *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate;
	caddr_t saveti;
	tcp_seq iss, *reuse = NULL;
	uint32_t now;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;
	now = tcp_now();

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate an unbound/unconnected pcb,
		 * an unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);
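
	/*
	 * Note that only th_seq, th_ack, th_win and th_urp are converted
	 * above; th_sport and th_dport intentionally stay in network byte
	 * order, which is what the pcb lookup routines below expect.
	 */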

	if (th->th_dport == 0) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
	}
#ifdef IPSEC
	if (ipsec_in_use) {
		struct m_tag *mtag;
		struct tdb *tdb = NULL;
		int error;

		/* Find most recent IPsec tag */
		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
		if (mtag != NULL) {
			struct tdb_ident *tdbi;

			tdbi = (struct tdb_ident *)(mtag + 1);
			tdb = gettdb(tdbi->rdomain, tdbi->spi,
			    &tdbi->dst, tdbi->proto);
		}
		error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN,
		    tdb, inp, NULL, NULL);
		tdb_unref(tdb);
		if (error) {
			tcpstat_inc(tcps_rcvnosec);
			goto drop;
		}
	}
#endif /* IPSEC */

	if (inp == NULL) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
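
	/*
	 * Example: with snd_scale 7, a raw th_win of 512 advertises
	 * 512 << 7 = 65536 bytes.  A SYN may not use the scale factor,
	 * since window scaling is only negotiated during the handshake
	 * (RFC 1323), hence the TH_SYN check above.
	 */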

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		}

		if (so->so_options & SO_DEBUG) {
			otp = tp;
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				saveti = (caddr_t) &tcp_saveti6;
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				saveti = (caddr_t) &tcp_saveti;
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m, now);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					in_pcbunref(inp);
					inp = in_pcbref(sotoinpcb(so));
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for peer to send a RST, and
				 * RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says like
				 * this:
				 *   1. use of deprecated addr with existing
				 *	communication is okay - "SHOULD continue
				 *	to be used"
				 *   2. use of it with new communication:
				 *	(2a) "SHOULD NOT be used if alternate
				 *	     address with sufficient scope is
				 *	     available"
				 *	(2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse, now)
				    == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th);	/* Delete stale SACK holes */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid, now))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
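
	/*
	 * Once TF_RCVD_CE is set, the ECN-echo flag is carried on our
	 * subsequent segments until the peer acknowledges the congestion
	 * signal with CWR; receiving TH_CWR clears TF_RCVD_CE again in
	 * the ACK processing further below.
	 */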
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				tp->t_rcvacktime = now;
				ND6_HINT(tp);
				sbdrop(so, &so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/* Pull snd_wl2 up to prevent seq wrap. */
				tp->snd_wl2 = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			/* Pull snd_wl1 and rcv_up up to prevent seq wrap. */
			tp->snd_wl1 = th->th_seq;
			/* Packet has most recent segment, no urgent exists. */
			tp->rcv_up = tp->rcv_nxt;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
				    now - tp->rfbuf_ts > (tp->t_srtt >>
				    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT))) {
					tcp_update_rcvspace(tp);
					/* Start over with next RTT. */
					tp->rfbuf_cnt = 0;
					tp->rfbuf_ts = 0;
				} else
					tp->rfbuf_cnt += tlen;
				m_adj(m, iphlen + off);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			in_pcbunref(inp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(so, &so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}
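
	/*
	 * The imax() above keeps rcv_wnd from shrinking below what was
	 * already advertised (rcv_adv - rcv_nxt): once sequence space
	 * has been offered to the peer, the window may not be retracted.
	 */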

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 *	Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the
	 * timestamp or the sequence numbers are above the previous
	 * ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEG_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			in_pcbunref(inp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}
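
	/*
	 * PAWS example: if tp->ts_recent is 5000 and an old duplicate
	 * arrives carrying ts_val 4900, TSTMP_LT(4900, 5000) is true and
	 * the segment is dropped (after being ACKed if it held data),
	 * unless ts_recent itself is more than 24 days (TCP_PAWS_IDLE)
	 * old and therefore no longer trustworthy.
	 */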

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}
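
	/*
	 * Note that the RST handling above only honors a reset whose
	 * th_seq matches last_ack_sent, rcv_nxt or rcv_nxt + 1; this
	 * narrows the sequence window a blind attacker must hit to
	 * tear down the connection with a forged RST.
	 */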

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
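		/*
		 * NewReno arithmetic for the fast-recovery code above,
		 * assuming t_maxseg 1460 and snd_wnd = snd_cwnd = 64KB:
		 * on the 3rd dup ack, win = 65536 / 2 / 1460 = 22, so
		 * ssthresh becomes 22 * 1460 = 32120 and cwnd is set to
		 * ssthresh + 3 * 1460 after the retransmit; every further
		 * dup ack inflates cwnd by another maxseg.
		 */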
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);
		tp->t_rcvacktime = now;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = max(incr * incr / cw, 1);
		if (tp->t_dupacks < tcprexmtthresh)
			tp->snd_cwnd = ulmin(cw + incr,
			    TCP_MAXWIN << tp->snd_scale);
		}
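		/*
		 * Example for the window opening above: in congestion
		 * avoidance (cw > ssthresh) with t_maxseg 1460 and
		 * snd_cwnd 29200, incr = 1460 * 1460 / 29200 = 73 bytes
		 * per ACK, i.e. roughly one maxseg per round trip across
		 * the 20 segments of the window.
		 */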
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			if (tp->snd_wnd > so->so_snd.sb_cc)
				tp->snd_wnd -= so->so_snd.sb_cc;
			else
				tp->snd_wnd = 0;
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(so, &so->so_snd, acked);
			if (tp->snd_wnd > acked)
				tp->snd_wnd -= acked;
			else
				tp->snd_wnd = 0;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}
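
	/*
	 * snd_wl1/snd_wl2 remember the seq/ack of the segment last used
	 * to update snd_wnd; the three-way comparison above accepts the
	 * new window only from segments at least as recent as that one,
	 * preventing stale segments from retracting the send window.
	 */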

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_rcv.sb_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so doesn't get presented to user.
		 * This can happen independent of advancing the URG pointer,
		 * but if two URG's are pending at once, some out-of-band
		 * data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}
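
	/*
	 * Note how the in-order case above bypasses tcp_reass() entirely
	 * and may delay the ACK, while anything out of order goes through
	 * the reassembly queue and forces TF_ACKNOW, so the duplicate ACKs
	 * needed by the peer's fast retransmit are sent without delay.
	 */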
Ignore a FIN received before 1984 * the connection is fully established. 1985 */ 1986 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1987 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1988 tp->t_flags |= TF_BLOCKOUTPUT; 1989 socantrcvmore(so); 1990 tp->t_flags &= ~TF_BLOCKOUTPUT; 1991 tp->t_flags |= TF_ACKNOW; 1992 tp->rcv_nxt++; 1993 } 1994 switch (tp->t_state) { 1995 1996 /* 1997 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1998 */ 1999 case TCPS_ESTABLISHED: 2000 tp->t_state = TCPS_CLOSE_WAIT; 2001 break; 2002 2003 /* 2004 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2005 * enter the CLOSING state. 2006 */ 2007 case TCPS_FIN_WAIT_1: 2008 tp->t_state = TCPS_CLOSING; 2009 break; 2010 2011 /* 2012 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2013 * starting the time-wait timer, turning off the other 2014 * standard timers. 2015 */ 2016 case TCPS_FIN_WAIT_2: 2017 tp->t_state = TCPS_TIME_WAIT; 2018 tcp_canceltimers(tp); 2019 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2020 tp->t_flags |= TF_BLOCKOUTPUT; 2021 soisdisconnected(so); 2022 tp->t_flags &= ~TF_BLOCKOUTPUT; 2023 break; 2024 2025 /* 2026 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2027 */ 2028 case TCPS_TIME_WAIT: 2029 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2030 break; 2031 } 2032 } 2033 if (otp) 2034 tcp_trace(TA_INPUT, ostate, tp, otp, saveti, 0, tlen); 2035 2036 /* 2037 * Return any desired output. 2038 */ 2039 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2040 (void) tcp_output(tp); 2041 in_pcbunref(inp); 2042 return IPPROTO_DONE; 2043 2044 badsyn: 2045 /* 2046 * Received a bad SYN. Increment counters and dropwithreset. 2047 */ 2048 tcpstat_inc(tcps_badsyn); 2049 tp = NULL; 2050 goto dropwithreset; 2051 2052 dropafterack_ratelim: 2053 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2054 tcp_ackdrop_ppslim) == 0) { 2055 /* XXX stat */ 2056 goto drop; 2057 } 2058 /* ...fall into dropafterack... */ 2059 2060 dropafterack: 2061 /* 2062 * Generate an ACK dropping incoming segment if it occupies 2063 * sequence space, where the ACK reflects our state. 2064 */ 2065 if (tiflags & TH_RST) 2066 goto drop; 2067 m_freem(m); 2068 tp->t_flags |= TF_ACKNOW; 2069 (void) tcp_output(tp); 2070 in_pcbunref(inp); 2071 return IPPROTO_DONE; 2072 2073 dropwithreset_ratelim: 2074 /* 2075 * We may want to rate-limit RSTs in certain situations, 2076 * particularly if we are sending an RST in response to 2077 * an attempt to connect to or otherwise communicate with 2078 * a port for which we have no socket. 2079 */ 2080 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2081 tcp_rst_ppslim) == 0) { 2082 /* XXX stat */ 2083 goto drop; 2084 } 2085 /* ...fall into dropwithreset... */ 2086 2087 dropwithreset: 2088 /* 2089 * Generate a RST, dropping incoming segment. 2090 * Make ACK acceptable to originator of segment. 2091 * Don't bother to respond to RST. 2092 */ 2093 if (tiflags & TH_RST) 2094 goto drop; 2095 if (tiflags & TH_ACK) { 2096 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2097 TH_RST, m->m_pkthdr.ph_rtableid, now); 2098 } else { 2099 if (tiflags & TH_SYN) 2100 tlen++; 2101 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2102 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now); 2103 } 2104 m_freem(m); 2105 in_pcbunref(inp); 2106 return IPPROTO_DONE; 2107 2108 drop: 2109 /* 2110 * Drop space held by incoming segment and return. 
2111 */ 2112 if (otp) 2113 tcp_trace(TA_DROP, ostate, tp, otp, saveti, 0, tlen); 2114 2115 m_freem(m); 2116 in_pcbunref(inp); 2117 return IPPROTO_DONE; 2118 } 2119 2120 int 2121 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2122 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2123 u_int rtableid, uint32_t now) 2124 { 2125 u_int16_t mss = 0; 2126 int opt, optlen; 2127 #ifdef TCP_SIGNATURE 2128 caddr_t sigp = NULL; 2129 struct tdb *tdb = NULL; 2130 #endif /* TCP_SIGNATURE */ 2131 2132 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2133 opt = cp[0]; 2134 if (opt == TCPOPT_EOL) 2135 break; 2136 if (opt == TCPOPT_NOP) 2137 optlen = 1; 2138 else { 2139 if (cnt < 2) 2140 break; 2141 optlen = cp[1]; 2142 if (optlen < 2 || optlen > cnt) 2143 break; 2144 } 2145 switch (opt) { 2146 2147 default: 2148 continue; 2149 2150 case TCPOPT_MAXSEG: 2151 if (optlen != TCPOLEN_MAXSEG) 2152 continue; 2153 if (!(th->th_flags & TH_SYN)) 2154 continue; 2155 if (TCPS_HAVERCVDSYN(tp->t_state)) 2156 continue; 2157 memcpy(&mss, cp + 2, sizeof(mss)); 2158 mss = ntohs(mss); 2159 oi->maxseg = mss; 2160 break; 2161 2162 case TCPOPT_WINDOW: 2163 if (optlen != TCPOLEN_WINDOW) 2164 continue; 2165 if (!(th->th_flags & TH_SYN)) 2166 continue; 2167 if (TCPS_HAVERCVDSYN(tp->t_state)) 2168 continue; 2169 tp->t_flags |= TF_RCVD_SCALE; 2170 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2171 break; 2172 2173 case TCPOPT_TIMESTAMP: 2174 if (optlen != TCPOLEN_TIMESTAMP) 2175 continue; 2176 oi->ts_present = 1; 2177 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2178 oi->ts_val = ntohl(oi->ts_val); 2179 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2180 oi->ts_ecr = ntohl(oi->ts_ecr); 2181 2182 if (!(th->th_flags & TH_SYN)) 2183 continue; 2184 if (TCPS_HAVERCVDSYN(tp->t_state)) 2185 continue; 2186 /* 2187 * A timestamp received in a SYN makes 2188 * it ok to send timestamp requests and replies. 
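			 * ts_recent is seeded here from the SYN and
			 * ts_recent_age records when it was sampled; later
			 * segments keep both fresh via the PAWS processing
			 * in tcp_input().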
2189 */ 2190 tp->t_flags |= TF_RCVD_TSTMP; 2191 tp->ts_recent = oi->ts_val; 2192 tp->ts_recent_age = now; 2193 break; 2194 2195 case TCPOPT_SACK_PERMITTED: 2196 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2197 continue; 2198 if (!(th->th_flags & TH_SYN)) 2199 continue; 2200 if (TCPS_HAVERCVDSYN(tp->t_state)) 2201 continue; 2202 /* MUST only be set on SYN */ 2203 tp->t_flags |= TF_SACK_PERMIT; 2204 break; 2205 case TCPOPT_SACK: 2206 tcp_sack_option(tp, th, cp, optlen); 2207 break; 2208 #ifdef TCP_SIGNATURE 2209 case TCPOPT_SIGNATURE: 2210 if (optlen != TCPOLEN_SIGNATURE) 2211 continue; 2212 2213 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2214 goto bad; 2215 2216 sigp = cp + 2; 2217 break; 2218 #endif /* TCP_SIGNATURE */ 2219 } 2220 } 2221 2222 #ifdef TCP_SIGNATURE 2223 if (tp->t_flags & TF_SIGNATURE) { 2224 union sockaddr_union src, dst; 2225 2226 memset(&src, 0, sizeof(union sockaddr_union)); 2227 memset(&dst, 0, sizeof(union sockaddr_union)); 2228 2229 switch (tp->pf) { 2230 case 0: 2231 case AF_INET: 2232 src.sa.sa_len = sizeof(struct sockaddr_in); 2233 src.sa.sa_family = AF_INET; 2234 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2235 dst.sa.sa_len = sizeof(struct sockaddr_in); 2236 dst.sa.sa_family = AF_INET; 2237 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2238 break; 2239 #ifdef INET6 2240 case AF_INET6: 2241 src.sa.sa_len = sizeof(struct sockaddr_in6); 2242 src.sa.sa_family = AF_INET6; 2243 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2244 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2245 dst.sa.sa_family = AF_INET6; 2246 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2247 break; 2248 #endif /* INET6 */ 2249 } 2250 2251 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2252 0, &src, &dst, IPPROTO_TCP); 2253 2254 /* 2255 * We don't have an SA for this peer, so we turn off 2256 * TF_SIGNATURE on the listen socket 2257 */ 2258 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2259 tp->t_flags &= ~TF_SIGNATURE; 2260 2261 } 2262 2263 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2264 tcpstat_inc(tcps_rcvbadsig); 2265 goto bad; 2266 } 2267 2268 if (sigp) { 2269 char sig[16]; 2270 2271 if (tdb == NULL) { 2272 tcpstat_inc(tcps_rcvbadsig); 2273 goto bad; 2274 } 2275 2276 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2277 goto bad; 2278 2279 if (timingsafe_bcmp(sig, sigp, 16)) { 2280 tcpstat_inc(tcps_rcvbadsig); 2281 goto bad; 2282 } 2283 2284 tcpstat_inc(tcps_rcvgoodsig); 2285 } 2286 2287 tdb_unref(tdb); 2288 #endif /* TCP_SIGNATURE */ 2289 2290 return (0); 2291 2292 #ifdef TCP_SIGNATURE 2293 bad: 2294 tdb_unref(tdb); 2295 #endif /* TCP_SIGNATURE */ 2296 return (-1); 2297 } 2298 2299 u_long 2300 tcp_seq_subtract(u_long a, u_long b) 2301 { 2302 return ((long)(a - b)); 2303 } 2304 2305 /* 2306 * This function is called upon receipt of new valid data (while not in header 2307 * prediction mode), and it updates the ordered list of sacks. 2308 */ 2309 void 2310 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2311 tcp_seq rcv_lastend) 2312 { 2313 /* 2314 * First reported block MUST be the most recent one. Subsequent 2315 * blocks SHOULD be in the order in which they arrived at the 2316 * receiver. These two conditions make the implementation fully 2317 * compliant with RFC 2018. 
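 * Illustrative example: with blocks 105-110 and 115-120 already queued
 * and a new segment filling 110-115, the merge loop below coalesces all
 * three into a single first block 105-120.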
2318 */ 2319 int i, j = 0, count = 0, lastpos = -1; 2320 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2321 2322 /* First clean up current list of sacks */ 2323 for (i = 0; i < tp->rcv_numsacks; i++) { 2324 sack = tp->sackblks[i]; 2325 if (sack.start == 0 && sack.end == 0) { 2326 count++; /* count = number of blocks to be discarded */ 2327 continue; 2328 } 2329 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2330 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2331 count++; 2332 } else { 2333 temp[j].start = tp->sackblks[i].start; 2334 temp[j++].end = tp->sackblks[i].end; 2335 } 2336 } 2337 tp->rcv_numsacks -= count; 2338 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2339 tcp_clean_sackreport(tp); 2340 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2341 /* ==> need first sack block */ 2342 tp->sackblks[0].start = rcv_laststart; 2343 tp->sackblks[0].end = rcv_lastend; 2344 tp->rcv_numsacks = 1; 2345 } 2346 return; 2347 } 2348 /* Otherwise, sack blocks are already present. */ 2349 for (i = 0; i < tp->rcv_numsacks; i++) 2350 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2351 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2352 return; /* sack list remains unchanged */ 2353 /* 2354 * From here, segment just received should be (part of) the 1st sack. 2355 * Go through list, possibly coalescing sack block entries. 2356 */ 2357 firstsack.start = rcv_laststart; 2358 firstsack.end = rcv_lastend; 2359 for (i = 0; i < tp->rcv_numsacks; i++) { 2360 sack = tp->sackblks[i]; 2361 if (SEQ_LT(sack.end, firstsack.start) || 2362 SEQ_GT(sack.start, firstsack.end)) 2363 continue; /* no overlap */ 2364 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2365 /* 2366 * identical block; delete it here since we will 2367 * move it to the front of the list. 2368 */ 2369 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2370 lastpos = i; /* last posn with a zero entry */ 2371 continue; 2372 } 2373 if (SEQ_LEQ(sack.start, firstsack.start)) 2374 firstsack.start = sack.start; /* merge blocks */ 2375 if (SEQ_GEQ(sack.end, firstsack.end)) 2376 firstsack.end = sack.end; /* merge blocks */ 2377 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2378 lastpos = i; /* last posn with a zero entry */ 2379 } 2380 if (lastpos != -1) { /* at least one merge */ 2381 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2382 sack = tp->sackblks[i]; 2383 if (sack.start == 0 && sack.end == 0) 2384 continue; 2385 temp[j++] = sack; 2386 } 2387 tp->rcv_numsacks = j; /* including first blk (added later) */ 2388 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2389 tp->sackblks[i] = temp[i]; 2390 } else { /* no merges -- shift sacks by 1 */ 2391 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2392 tp->rcv_numsacks++; 2393 for (i = tp->rcv_numsacks-1; i > 0; i--) 2394 tp->sackblks[i] = tp->sackblks[i-1]; 2395 } 2396 tp->sackblks[0] = firstsack; 2397 return; 2398 } 2399 2400 /* 2401 * Process the TCP SACK option. tp->snd_holes is an ordered list 2402 * of holes (oldest to newest, in terms of the sequence space). 2403 */ 2404 void 2405 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2406 { 2407 int tmp_olen; 2408 u_char *tmp_cp; 2409 struct sackhole *cur, *p, *temp; 2410 2411 if (!tp->sack_enable) 2412 return; 2413 /* SACK without ACK doesn't make sense. */ 2414 if ((th->th_flags & TH_ACK) == 0) 2415 return; 2416 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
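	 * (Individual blocks are validated further in the loop below:
	 * blocks ending at or below snd_una and blocks reaching beyond
	 * snd_max are simply skipped.)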
*/ 2417 if (SEQ_LT(th->th_ack, tp->snd_una) || 2418 SEQ_GT(th->th_ack, tp->snd_max)) 2419 return; 2420 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2421 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2422 return; 2423 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2424 tmp_cp = cp + 2; 2425 tmp_olen = optlen - 2; 2426 tcpstat_inc(tcps_sack_rcv_opts); 2427 if (tp->snd_numholes < 0) 2428 tp->snd_numholes = 0; 2429 if (tp->t_maxseg == 0) 2430 panic("tcp_sack_option"); /* Should never happen */ 2431 while (tmp_olen > 0) { 2432 struct sackblk sack; 2433 2434 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2435 sack.start = ntohl(sack.start); 2436 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2437 sack.end = ntohl(sack.end); 2438 tmp_olen -= TCPOLEN_SACK; 2439 tmp_cp += TCPOLEN_SACK; 2440 if (SEQ_LEQ(sack.end, sack.start)) 2441 continue; /* bad SACK fields */ 2442 if (SEQ_LEQ(sack.end, tp->snd_una)) 2443 continue; /* old block */ 2444 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2445 if (SEQ_LT(sack.start, th->th_ack)) 2446 continue; 2447 } 2448 if (SEQ_GT(sack.end, tp->snd_max)) 2449 continue; 2450 if (tp->snd_holes == NULL) { /* first hole */ 2451 tp->snd_holes = (struct sackhole *) 2452 pool_get(&sackhl_pool, PR_NOWAIT); 2453 if (tp->snd_holes == NULL) { 2454 /* ENOBUFS, so ignore SACKed block for now */ 2455 goto dropped; 2456 } 2457 cur = tp->snd_holes; 2458 cur->start = th->th_ack; 2459 cur->end = sack.start; 2460 cur->rxmit = cur->start; 2461 cur->next = NULL; 2462 tp->snd_numholes = 1; 2463 tp->rcv_lastsack = sack.end; 2464 /* 2465 * dups is at least one. If more data has been 2466 * SACKed, it can be greater than one. 2467 */ 2468 cur->dups = min(tcprexmtthresh, 2469 ((sack.end - cur->end)/tp->t_maxseg)); 2470 if (cur->dups < 1) 2471 cur->dups = 1; 2472 continue; /* with next sack block */ 2473 } 2474 /* Go thru list of holes: p = previous, cur = current */ 2475 p = cur = tp->snd_holes; 2476 while (cur) { 2477 if (SEQ_LEQ(sack.end, cur->start)) 2478 /* SACKs data before the current hole */ 2479 break; /* no use going through more holes */ 2480 if (SEQ_GEQ(sack.start, cur->end)) { 2481 /* SACKs data beyond the current hole */ 2482 cur->dups++; 2483 if (((sack.end - cur->end)/tp->t_maxseg) >= 2484 tcprexmtthresh) 2485 cur->dups = tcprexmtthresh; 2486 p = cur; 2487 cur = cur->next; 2488 continue; 2489 } 2490 if (SEQ_LEQ(sack.start, cur->start)) { 2491 /* Data acks at least the beginning of hole */ 2492 if (SEQ_GEQ(sack.end, cur->end)) { 2493 /* Acks entire hole, so delete hole */ 2494 if (p != cur) { 2495 p->next = cur->next; 2496 pool_put(&sackhl_pool, cur); 2497 cur = p->next; 2498 } else { 2499 cur = cur->next; 2500 pool_put(&sackhl_pool, p); 2501 p = cur; 2502 tp->snd_holes = p; 2503 } 2504 tp->snd_numholes--; 2505 continue; 2506 } 2507 /* otherwise, move start of hole forward */ 2508 cur->start = sack.end; 2509 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2510 p = cur; 2511 cur = cur->next; 2512 continue; 2513 } 2514 /* move end of hole backward */ 2515 if (SEQ_GEQ(sack.end, cur->end)) { 2516 cur->end = sack.start; 2517 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2518 cur->dups++; 2519 if (((sack.end - cur->end)/tp->t_maxseg) >= 2520 tcprexmtthresh) 2521 cur->dups = tcprexmtthresh; 2522 p = cur; 2523 cur = cur->next; 2524 continue; 2525 } 2526 if (SEQ_LT(cur->start, sack.start) && 2527 SEQ_GT(cur->end, sack.end)) { 2528 /* 2529 * ACKs some data in middle of a hole; need to 2530 * split current hole 2531 */ 2532 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2533 
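					/* splitting here would exceed the
					   per-connection hole limit */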
					goto dropped;
2534 				temp = (struct sackhole *)
2535 				    pool_get(&sackhl_pool, PR_NOWAIT);
2536 				if (temp == NULL)
2537 					goto dropped;	/* ENOBUFS */
2538 				temp->next = cur->next;
2539 				temp->start = sack.end;
2540 				temp->end = cur->end;
2541 				temp->dups = cur->dups;
2542 				temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
2543 				cur->end = sack.start;
2544 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
2545 				cur->dups++;
2546 				if (((sack.end - cur->end)/tp->t_maxseg) >=
2547 				    tcprexmtthresh)
2548 					cur->dups = tcprexmtthresh;
2549 				cur->next = temp;
2550 				p = temp;
2551 				cur = p->next;
2552 				tp->snd_numholes++;
2553 			}
2554 		}
2555 		/* At this point, p points to the last hole on the list */
2556 		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
2557 			/*
2558 			 * Need to append new hole at end.
2559 			 * Last hole is p (and it's not NULL).
2560 			 */
2561 			if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT)
2562 				goto dropped;
2563 			temp = (struct sackhole *)
2564 			    pool_get(&sackhl_pool, PR_NOWAIT);
2565 			if (temp == NULL)
2566 				goto dropped;	/* ENOBUFS */
2567 			temp->start = tp->rcv_lastsack;
2568 			temp->end = sack.start;
2569 			temp->dups = min(tcprexmtthresh,
2570 			    ((sack.end - sack.start)/tp->t_maxseg));
2571 			if (temp->dups < 1)
2572 				temp->dups = 1;
2573 			temp->rxmit = temp->start;
2574 			temp->next = 0;
2575 			p->next = temp;
2576 			tp->rcv_lastsack = sack.end;
2577 			tp->snd_numholes++;
2578 		}
2579 	}
2580 	return;
2581 dropped:
2582 	tcpstat_inc(tcps_sack_drop_opts);
2583 }
2584 
2585 /*
2586  * Delete stale (i.e., cumulatively ack'd) holes.  A hole is deleted only if
2587  * it is completely acked; otherwise, tcp_sack_option(), called from
2588  * tcp_dooptions(), will fix up the hole.
2589  */
2590 void
2591 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
2592 {
2593 	if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2594 		/* max because this could be an older ack just arrived */
2595 		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2596 		    th->th_ack : tp->snd_una;
2597 		struct sackhole *cur = tp->snd_holes;
2598 		struct sackhole *prev;
2599 		while (cur)
2600 			if (SEQ_LEQ(cur->end, lastack)) {
2601 				prev = cur;
2602 				cur = cur->next;
2603 				pool_put(&sackhl_pool, prev);
2604 				tp->snd_numholes--;
2605 			} else if (SEQ_LT(cur->start, lastack)) {
2606 				cur->start = lastack;
2607 				if (SEQ_LT(cur->rxmit, cur->start))
2608 					cur->rxmit = cur->start;
2609 				break;
2610 			} else
2611 				break;
2612 		tp->snd_holes = cur;
2613 	}
2614 }
2615 
2616 /*
2617  * Delete all receiver-side SACK information.
2618  */
2619 void
2620 tcp_clean_sackreport(struct tcpcb *tp)
2621 {
2622 	int i;
2623 
2624 	tp->rcv_numsacks = 0;
2625 	for (i = 0; i < MAX_SACK_BLKS; i++)
2626 		tp->sackblks[i].start = tp->sackblks[i].end = 0;
2627 
2628 }
2629 
2630 /*
2631  * Partial ack handling within a sack recovery episode.  When a partial ack
2632  * arrives, turn off the retransmission timer, deflate the window, do not clear
2633  * tp->t_dupacks.
2634  */
2635 void
2636 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
2637 {
2638 	/* Turn off retx. timer (will start again next segment) */
2639 	TCP_TIMER_DISARM(tp, TCPT_REXMT);
2640 	tp->t_rtttime = 0;
2641 	/*
2642 	 * Partial window deflation.  This statement relies on the
2643 	 * fact that tp->snd_una has not been updated yet.
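	 * Illustrative effect: snd_cwnd shrinks by the amount this ack
	 * newly covers and is then bumped back by t_maxseg (twice, as
	 * written below), so a little fresh window opens to send roughly
	 * one new segment for the partial ack.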
2644 */ 2645 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2646 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2647 tp->snd_cwnd += tp->t_maxseg; 2648 } else 2649 tp->snd_cwnd = tp->t_maxseg; 2650 tp->snd_cwnd += tp->t_maxseg; 2651 tp->t_flags |= TF_NEEDOUTPUT; 2652 } 2653 2654 /* 2655 * Pull out of band byte out of a segment so 2656 * it doesn't appear in the user's data queue. 2657 * It is still reflected in the segment length for 2658 * sequencing purposes. 2659 */ 2660 void 2661 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2662 { 2663 int cnt = off + urgent - 1; 2664 2665 while (cnt >= 0) { 2666 if (m->m_len > cnt) { 2667 char *cp = mtod(m, caddr_t) + cnt; 2668 struct tcpcb *tp = sototcpcb(so); 2669 2670 tp->t_iobc = *cp; 2671 tp->t_oobflags |= TCPOOB_HAVEDATA; 2672 memmove(cp, cp + 1, m->m_len - cnt - 1); 2673 m->m_len--; 2674 return; 2675 } 2676 cnt -= m->m_len; 2677 m = m->m_next; 2678 if (m == NULL) 2679 break; 2680 } 2681 panic("tcp_pulloutofband"); 2682 } 2683 2684 /* 2685 * Collect new round-trip time estimate 2686 * and update averages and current timeout. 2687 */ 2688 void 2689 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2690 { 2691 int delta, rttmin; 2692 2693 if (rtt < 0) 2694 rtt = 0; 2695 else if (rtt > TCP_RTT_MAX) 2696 rtt = TCP_RTT_MAX; 2697 2698 tcpstat_inc(tcps_rttupdated); 2699 if (tp->t_srtt != 0) { 2700 /* 2701 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2702 * after the binary point (scaled by 4), whereas 2703 * srtt is stored as fixed point with 5 bits after the 2704 * binary point (i.e., scaled by 32). The following magic 2705 * is equivalent to the smoothing algorithm in rfc793 with 2706 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2707 * point). 2708 */ 2709 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2710 (tp->t_srtt >> TCP_RTT_SHIFT); 2711 if ((tp->t_srtt += delta) <= 0) 2712 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2713 /* 2714 * We accumulate a smoothed rtt variance (actually, a 2715 * smoothed mean difference), then set the retransmit 2716 * timer to smoothed rtt + 4 times the smoothed variance. 2717 * rttvar is stored as fixed point with 4 bits after the 2718 * binary point (scaled by 16). The following is 2719 * equivalent to rfc793 smoothing with an alpha of .75 2720 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2721 * rfc793's wired-in beta. 2722 */ 2723 if (delta < 0) 2724 delta = -delta; 2725 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2726 if ((tp->t_rttvar += delta) <= 0) 2727 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2728 } else { 2729 /* 2730 * No rtt measurement yet - use the unsmoothed rtt. 2731 * Set the variance to half the rtt (so our first 2732 * retransmit happens at 3*rtt). 2733 */ 2734 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2735 tp->t_rttvar = (rtt + 1) << 2736 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2737 } 2738 tp->t_rtttime = 0; 2739 tp->t_rxtshift = 0; 2740 2741 /* 2742 * the retransmit should happen at rtt + 4 * rttvar. 2743 * Because of the way we do the smoothing, srtt and rttvar 2744 * will each average +1/2 tick of bias. When we compute 2745 * the retransmit timer, we want 1/2 tick of rounding and 2746 * 1 extra tick because of +-1/2 tick uncertainty in the 2747 * firing of the timer. The bias will give us exactly the 2748 * 1.5 tick we need. But, because the bias is 2749 * statistical, we have to test that we don't drop below 2750 * the minimum feasible timer (which is 2 ticks). 
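	 * Illustrative example: with srtt converged near 100ms and rttvar
	 * near 10ms, TCP_REXMTVAL() works out to roughly srtt + 4 * rttvar
	 * = 140ms, which the TCPT_RANGESET() below then clamps between
	 * rttmin and TCPTV_REXMTMAX.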
2751 */ 2752 rttmin = min(max(tp->t_rttmin, rtt + 2 * (TCP_TIME(1) / hz)), 2753 TCPTV_REXMTMAX); 2754 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2755 2756 /* 2757 * We received an ack for a packet that wasn't retransmitted; 2758 * it is probably safe to discard any error indications we've 2759 * received recently. This isn't quite right, but close enough 2760 * for now (a route might have failed after we sent a segment, 2761 * and the return path might not be symmetrical). 2762 */ 2763 tp->t_softerror = 0; 2764 } 2765 2766 /* 2767 * Determine a reasonable value for maxseg size. 2768 * If the route is known, check route for mtu. 2769 * If none, use an mss that can be handled on the outgoing 2770 * interface without forcing IP to fragment; if bigger than 2771 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2772 * to utilize large mbufs. If no route is found, route has no mtu, 2773 * or the destination isn't local, use a default, hopefully conservative 2774 * size (usually 512 or the default IP max size, but no more than the mtu 2775 * of the interface), as we can't discover anything about intervening 2776 * gateways or networks. We also initialize the congestion/slow start 2777 * window to be a single segment if the destination isn't local. 2778 * While looking at the routing entry, we also initialize other path-dependent 2779 * parameters from pre-set or cached values in the routing entry. 2780 * 2781 * Also take into account the space needed for options that we 2782 * send regularly. Make maxseg shorter by that amount to assure 2783 * that we can send maxseg amount of data even when the options 2784 * are present. Store the upper limit of the length of options plus 2785 * data in maxopd. 2786 * 2787 * NOTE: offer == -1 indicates that the maxseg size changed due to 2788 * Path MTU discovery. 2789 */ 2790 int 2791 tcp_mss(struct tcpcb *tp, int offer) 2792 { 2793 struct rtentry *rt; 2794 struct ifnet *ifp = NULL; 2795 int mss, mssopt; 2796 int iphlen; 2797 struct inpcb *inp; 2798 2799 inp = tp->t_inpcb; 2800 2801 mssopt = mss = tcp_mssdflt; 2802 2803 rt = in_pcbrtentry(inp); 2804 2805 if (rt == NULL) 2806 goto out; 2807 2808 ifp = if_get(rt->rt_ifidx); 2809 if (ifp == NULL) 2810 goto out; 2811 2812 switch (tp->pf) { 2813 #ifdef INET6 2814 case AF_INET6: 2815 iphlen = sizeof(struct ip6_hdr); 2816 break; 2817 #endif 2818 case AF_INET: 2819 iphlen = sizeof(struct ip); 2820 break; 2821 default: 2822 /* the family does not support path MTU discovery */ 2823 goto out; 2824 } 2825 2826 /* 2827 * if there's an mtu associated with the route and we support 2828 * path MTU discovery for the underlying protocol family, use it. 2829 */ 2830 if (rt->rt_mtu) { 2831 /* 2832 * One may wish to lower MSS to take into account options, 2833 * especially security-related options. 2834 */ 2835 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2836 /* 2837 * RFC2460 section 5, last paragraph: if path MTU is 2838 * smaller than 1280, use 1280 as packet size and 2839 * attach fragment header. 
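			 * With IPV6_MMTU being 1280 this works out to
			 * 1280 - 40 (IPv6 header) - 8 (fragment header)
			 * - 20 (TCP header) = 1212 bytes of MSS.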
2840 */ 2841 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2842 sizeof(struct tcphdr); 2843 } else { 2844 mss = rt->rt_mtu - iphlen - 2845 sizeof(struct tcphdr); 2846 } 2847 } else if (ifp->if_flags & IFF_LOOPBACK) { 2848 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2849 } else if (tp->pf == AF_INET) { 2850 if (ip_mtudisc) 2851 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2852 } 2853 #ifdef INET6 2854 else if (tp->pf == AF_INET6) { 2855 /* 2856 * for IPv6, path MTU discovery is always turned on, 2857 * or the node must use packet size <= 1280. 2858 */ 2859 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2860 } 2861 #endif /* INET6 */ 2862 2863 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2864 if (offer != -1) { 2865 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2866 mssopt = max(tcp_mssdflt, mssopt); 2867 } 2868 out: 2869 if_put(ifp); 2870 /* 2871 * The current mss, t_maxseg, is initialized to the default value. 2872 * If we compute a smaller value, reduce the current mss. 2873 * If we compute a larger value, return it for use in sending 2874 * a max seg size option, but don't store it for use 2875 * unless we received an offer at least that large from peer. 2876 * 2877 * However, do not accept offers lower than the minimum of 2878 * the interface MTU and 216. 2879 */ 2880 if (offer > 0) 2881 tp->t_peermss = offer; 2882 if (tp->t_peermss) 2883 mss = min(mss, max(tp->t_peermss, 216)); 2884 2885 /* sanity - at least max opt. space */ 2886 mss = max(mss, 64); 2887 2888 /* 2889 * maxopd stores the maximum length of data AND options 2890 * in a segment; maxseg is the amount of data in a normal 2891 * segment. We need to store this value (maxopd) apart 2892 * from maxseg, because now every segment carries options 2893 * and thus we normally have somewhat less data in segments. 2894 */ 2895 tp->t_maxopd = mss; 2896 2897 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2898 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2899 mss -= TCPOLEN_TSTAMP_APPA; 2900 #ifdef TCP_SIGNATURE 2901 if (tp->t_flags & TF_SIGNATURE) 2902 mss -= TCPOLEN_SIGLEN; 2903 #endif 2904 2905 if (offer == -1) { 2906 /* mss changed due to Path MTU discovery */ 2907 tp->t_flags &= ~TF_PMTUD_PEND; 2908 tp->t_pmtud_mtu_sent = 0; 2909 tp->t_pmtud_mss_acked = 0; 2910 if (mss < tp->t_maxseg) { 2911 /* 2912 * Follow suggestion in RFC 2414 to reduce the 2913 * congestion window by the ratio of the old 2914 * segment size to the new segment size. 2915 */ 2916 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2917 mss, mss); 2918 } 2919 } else if (tcp_do_rfc3390 == 2) { 2920 /* increase initial window */ 2921 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2922 } else if (tcp_do_rfc3390) { 2923 /* increase initial window */ 2924 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2925 } else 2926 tp->snd_cwnd = mss; 2927 2928 tp->t_maxseg = mss; 2929 2930 return (offer != -1 ? 
mssopt : mss);
2931 }
2932 
2933 u_int
2934 tcp_hdrsz(struct tcpcb *tp)
2935 {
2936 	u_int hlen;
2937 
2938 	switch (tp->pf) {
2939 #ifdef INET6
2940 	case AF_INET6:
2941 		hlen = sizeof(struct ip6_hdr);
2942 		break;
2943 #endif
2944 	case AF_INET:
2945 		hlen = sizeof(struct ip);
2946 		break;
2947 	default:
2948 		hlen = 0;
2949 		break;
2950 	}
2951 	hlen += sizeof(struct tcphdr);
2952 
2953 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2954 	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
2955 		hlen += TCPOLEN_TSTAMP_APPA;
2956 #ifdef TCP_SIGNATURE
2957 	if (tp->t_flags & TF_SIGNATURE)
2958 		hlen += TCPOLEN_SIGLEN;
2959 #endif
2960 	return (hlen);
2961 }
2962 
2963 /*
2964  * Set connection variables based on the effective MSS.
2965  * We are passed the TCPCB for the actual connection.  If we
2966  * are the server, we are called by the compressed state engine
2967  * when the 3-way handshake is complete.  If we are the client,
2968  * we are called when we receive the SYN,ACK from the server.
2969  *
2970  * NOTE: The t_maxseg value must be initialized in the TCPCB
2971  * before this routine is called!
2972  */
2973 void
2974 tcp_mss_update(struct tcpcb *tp)
2975 {
2976 	int mss;
2977 	u_long bufsize;
2978 	struct rtentry *rt;
2979 	struct socket *so;
2980 
2981 	so = tp->t_inpcb->inp_socket;
2982 	mss = tp->t_maxseg;
2983 
2984 	rt = in_pcbrtentry(tp->t_inpcb);
2985 
2986 	if (rt == NULL)
2987 		return;
2988 
2989 	bufsize = so->so_snd.sb_hiwat;
2990 	if (bufsize < mss) {
2991 		mss = bufsize;
2992 		/* Update t_maxseg and t_maxopd */
2993 		tcp_mss(tp, mss);
2994 	} else {
2995 		bufsize = roundup(bufsize, mss);
2996 		if (bufsize > sb_max)
2997 			bufsize = sb_max;
2998 		(void)sbreserve(so, &so->so_snd, bufsize);
2999 	}
3000 
3001 	bufsize = so->so_rcv.sb_hiwat;
3002 	if (bufsize > mss) {
3003 		bufsize = roundup(bufsize, mss);
3004 		if (bufsize > sb_max)
3005 			bufsize = sb_max;
3006 		(void)sbreserve(so, &so->so_rcv, bufsize);
3007 	}
3008 
3009 }
3010 
3011 /*
3012  * When a partial ack arrives, force the retransmission of the
3013  * next unacknowledged segment.  Do not clear tp->t_dupacks.
3014  * By setting snd_nxt to th_ack, this forces the retransmission timer
3015  * to be started again.
3016  */
3017 void
3018 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th)
3019 {
3020 	/*
3021 	 * snd_una has not been updated and the socket send buffer
3022 	 * not yet drained of the acked data, so we have to leave
3023 	 * snd_una as it was to get the correct data offset in
3024 	 * tcp_output().
3025 	 */
3026 	tcp_seq onxt = tp->snd_nxt;
3027 	u_long ocwnd = tp->snd_cwnd;
3028 
3029 	TCP_TIMER_DISARM(tp, TCPT_REXMT);
3030 	tp->t_rtttime = 0;
3031 	tp->snd_nxt = th->th_ack;
3032 	/*
3033 	 * Set snd_cwnd to one segment beyond acknowledged offset
3034 	 * (tp->snd_una not yet updated when this function is called)
3035 	 */
3036 	tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3037 	(void)tcp_output(tp);
3038 	tp->snd_cwnd = ocwnd;
3039 	if (SEQ_GT(onxt, tp->snd_nxt))
3040 		tp->snd_nxt = onxt;
3041 	/*
3042 	 * Partial window deflation.  Relies on the fact that tp->snd_una
3043 	 * is not updated yet.
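	 * Unlike the SACK variant above, the window here may deflate all
	 * the way to zero before the single t_maxseg is added back.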
3044 */ 3045 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3046 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3047 else 3048 tp->snd_cwnd = 0; 3049 tp->snd_cwnd += tp->t_maxseg; 3050 } 3051 3052 int 3053 tcp_mss_adv(struct mbuf *m, int af) 3054 { 3055 int mss = 0; 3056 int iphlen; 3057 struct ifnet *ifp = NULL; 3058 3059 if (m && (m->m_flags & M_PKTHDR)) 3060 ifp = if_get(m->m_pkthdr.ph_ifidx); 3061 3062 switch (af) { 3063 case AF_INET: 3064 if (ifp != NULL) 3065 mss = ifp->if_mtu; 3066 iphlen = sizeof(struct ip); 3067 break; 3068 #ifdef INET6 3069 case AF_INET6: 3070 if (ifp != NULL) 3071 mss = ifp->if_mtu; 3072 iphlen = sizeof(struct ip6_hdr); 3073 break; 3074 #endif 3075 default: 3076 unhandled_af(af); 3077 } 3078 if_put(ifp); 3079 mss = mss - iphlen - sizeof(struct tcphdr); 3080 return (max(mss, tcp_mssdflt)); 3081 } 3082 3083 /* 3084 * TCP compressed state engine. Currently used to hold compressed 3085 * state for SYN_RECEIVED. 3086 */ 3087 3088 /* syn hash parameters */ 3089 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; 3090 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3091 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3092 int tcp_syn_use_limit = 100000; 3093 3094 struct syn_cache_set tcp_syn_cache[2]; 3095 int tcp_syn_cache_active; 3096 3097 #define SYN_HASH(sa, sp, dp, rand) \ 3098 (((sa)->s_addr ^ (rand)[0]) * \ 3099 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3100 #ifndef INET6 3101 #define SYN_HASHALL(hash, src, dst, rand) \ 3102 do { \ 3103 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3104 satosin(src)->sin_port, \ 3105 satosin(dst)->sin_port, (rand)); \ 3106 } while (/*CONSTCOND*/ 0) 3107 #else 3108 #define SYN_HASH6(sa, sp, dp, rand) \ 3109 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3110 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3111 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3112 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3113 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3114 3115 #define SYN_HASHALL(hash, src, dst, rand) \ 3116 do { \ 3117 switch ((src)->sa_family) { \ 3118 case AF_INET: \ 3119 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3120 satosin(src)->sin_port, \ 3121 satosin(dst)->sin_port, (rand)); \ 3122 break; \ 3123 case AF_INET6: \ 3124 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3125 satosin6(src)->sin6_port, \ 3126 satosin6(dst)->sin6_port, (rand)); \ 3127 break; \ 3128 default: \ 3129 hash = 0; \ 3130 } \ 3131 } while (/*CONSTCOND*/0) 3132 #endif /* INET6 */ 3133 3134 void 3135 syn_cache_rm(struct syn_cache *sc) 3136 { 3137 sc->sc_flags |= SCF_DEAD; 3138 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3139 sc->sc_tp = NULL; 3140 LIST_REMOVE(sc, sc_tpq); 3141 sc->sc_buckethead->sch_length--; 3142 timeout_del(&sc->sc_timer); 3143 sc->sc_set->scs_count--; 3144 } 3145 3146 void 3147 syn_cache_put(struct syn_cache *sc) 3148 { 3149 m_free(sc->sc_ipopts); 3150 if (sc->sc_route4.ro_rt != NULL) { 3151 rtfree(sc->sc_route4.ro_rt); 3152 sc->sc_route4.ro_rt = NULL; 3153 } 3154 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3155 timeout_add(&sc->sc_timer, 0); 3156 } 3157 3158 struct pool syn_cache_pool; 3159 3160 /* 3161 * We don't estimate RTT with SYNs, so each packet starts with the default 3162 * RTT and each timer step has a fixed timeout value. 
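 * Each retransmit bumps sc_rxtshift, so the timeout below steps through
 * tcp_backoff[] from the default RTT, clamped between TCPTV_MIN and
 * TCPTV_REXMTMAX, until the keep-init deadline in syn_cache_timer()
 * expires the entry.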
3163 */ 3164 #define SYN_CACHE_TIMER_ARM(sc) \ 3165 do { \ 3166 TCPT_RANGESET((sc)->sc_rxtcur, \ 3167 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3168 TCPTV_REXMTMAX); \ 3169 if (!timeout_initialized(&(sc)->sc_timer)) \ 3170 timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3171 timeout_add_msec(&(sc)->sc_timer, (sc)->sc_rxtcur); \ 3172 } while (/*CONSTCOND*/0) 3173 3174 void 3175 syn_cache_init(void) 3176 { 3177 int i; 3178 3179 /* Initialize the hash buckets. */ 3180 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3181 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3182 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3183 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3184 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3185 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3186 for (i = 0; i < tcp_syn_hash_size; i++) { 3187 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3188 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3189 } 3190 3191 /* Initialize the syn cache pool. */ 3192 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3193 0, "syncache", NULL); 3194 } 3195 3196 void 3197 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3198 { 3199 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active]; 3200 struct syn_cache_head *scp; 3201 struct syn_cache *sc2; 3202 int i; 3203 3204 NET_ASSERT_LOCKED(); 3205 3206 /* 3207 * If there are no entries in the hash table, reinitialize 3208 * the hash secrets. To avoid useless cache swaps and 3209 * reinitialization, use it until the limit is reached. 3210 * An empty cache is also the opportunity to resize the hash. 3211 */ 3212 if (set->scs_count == 0 && set->scs_use <= 0) { 3213 set->scs_use = tcp_syn_use_limit; 3214 if (set->scs_size != tcp_syn_hash_size) { 3215 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3216 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3217 if (scp == NULL) { 3218 /* Try again next time. */ 3219 set->scs_use = 0; 3220 } else { 3221 free(set->scs_buckethead, M_SYNCACHE, 3222 set->scs_size * 3223 sizeof(struct syn_cache_head)); 3224 set->scs_buckethead = scp; 3225 set->scs_size = tcp_syn_hash_size; 3226 for (i = 0; i < tcp_syn_hash_size; i++) 3227 TAILQ_INIT(&scp[i].sch_bucket); 3228 } 3229 } 3230 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3231 tcpstat_inc(tcps_sc_seedrandom); 3232 } 3233 3234 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3235 set->scs_random); 3236 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3237 sc->sc_buckethead = scp; 3238 3239 /* 3240 * Make sure that we don't overflow the per-bucket 3241 * limit or the total cache size limit. 3242 */ 3243 if (scp->sch_length >= tcp_syn_bucket_limit) { 3244 tcpstat_inc(tcps_sc_bucketoverflow); 3245 /* 3246 * Someone might attack our bucket hash function. Reseed 3247 * with random as soon as the passive syn cache gets empty. 3248 */ 3249 set->scs_use = 0; 3250 /* 3251 * The bucket is full. Toss the oldest element in the 3252 * bucket. This will be the first entry in the bucket. 3253 */ 3254 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3255 #ifdef DIAGNOSTIC 3256 /* 3257 * This should never happen; we should always find an 3258 * entry in our bucket. 
3259 */ 3260 if (sc2 == NULL) 3261 panic("%s: bucketoverflow: impossible", __func__); 3262 #endif 3263 syn_cache_rm(sc2); 3264 syn_cache_put(sc2); 3265 } else if (set->scs_count >= tcp_syn_cache_limit) { 3266 struct syn_cache_head *scp2, *sce; 3267 3268 tcpstat_inc(tcps_sc_overflowed); 3269 /* 3270 * The cache is full. Toss the oldest entry in the 3271 * first non-empty bucket we can find. 3272 * 3273 * XXX We would really like to toss the oldest 3274 * entry in the cache, but we hope that this 3275 * condition doesn't happen very often. 3276 */ 3277 scp2 = scp; 3278 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3279 sce = &set->scs_buckethead[set->scs_size]; 3280 for (++scp2; scp2 != scp; scp2++) { 3281 if (scp2 >= sce) 3282 scp2 = &set->scs_buckethead[0]; 3283 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3284 break; 3285 } 3286 #ifdef DIAGNOSTIC 3287 /* 3288 * This should never happen; we should always find a 3289 * non-empty bucket. 3290 */ 3291 if (scp2 == scp) 3292 panic("%s: cacheoverflow: impossible", 3293 __func__); 3294 #endif 3295 } 3296 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3297 syn_cache_rm(sc2); 3298 syn_cache_put(sc2); 3299 } 3300 3301 /* 3302 * Initialize the entry's timer. 3303 */ 3304 sc->sc_rxttot = 0; 3305 sc->sc_rxtshift = 0; 3306 SYN_CACHE_TIMER_ARM(sc); 3307 3308 /* Link it from tcpcb entry */ 3309 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3310 3311 /* Put it into the bucket. */ 3312 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3313 scp->sch_length++; 3314 sc->sc_set = set; 3315 set->scs_count++; 3316 set->scs_use--; 3317 3318 tcpstat_inc(tcps_sc_added); 3319 3320 /* 3321 * If the active cache has exceeded its use limit and 3322 * the passive syn cache is empty, exchange their roles. 3323 */ 3324 if (set->scs_use <= 0 && 3325 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3326 tcp_syn_cache_active = !tcp_syn_cache_active; 3327 } 3328 3329 /* 3330 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3331 * If we have retransmitted an entry the maximum number of times, expire 3332 * that entry. 3333 */ 3334 void 3335 syn_cache_timer(void *arg) 3336 { 3337 struct syn_cache *sc = arg; 3338 uint32_t now; 3339 3340 NET_LOCK(); 3341 if (sc->sc_flags & SCF_DEAD) 3342 goto out; 3343 3344 now = tcp_now(); 3345 3346 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3347 /* Drop it -- too many retransmissions. */ 3348 goto dropit; 3349 } 3350 3351 /* 3352 * Compute the total amount of time this entry has 3353 * been on a queue. If this entry has been on longer 3354 * than the keep alive timer would allow, expire it. 3355 */ 3356 sc->sc_rxttot += sc->sc_rxtcur; 3357 if (sc->sc_rxttot >= tcptv_keep_init) 3358 goto dropit; 3359 3360 tcpstat_inc(tcps_sc_retransmitted); 3361 (void) syn_cache_respond(sc, NULL, now); 3362 3363 /* Advance the timer back-off. 
 */
3364 	sc->sc_rxtshift++;
3365 	SYN_CACHE_TIMER_ARM(sc);
3366 
3367 out:
3368 	NET_UNLOCK();
3369 	return;
3370 
3371 dropit:
3372 	tcpstat_inc(tcps_sc_timed_out);
3373 	syn_cache_rm(sc);
3374 	syn_cache_put(sc);
3375 	NET_UNLOCK();
3376 }
3377 
3378 void
3379 syn_cache_reaper(void *arg)
3380 {
3381 	struct syn_cache *sc = arg;
3382 
3383 	pool_put(&syn_cache_pool, (sc));
3384 	return;
3385 }
3386 
3387 /*
3388  * Remove the syn cache entries created by the specified tcb entry,
3389  * since it makes no sense to keep them
3390  * (if there's no tcb entry, the syn cache entries will never be used).
3391  */
3392 void
3393 syn_cache_cleanup(struct tcpcb *tp)
3394 {
3395 	struct syn_cache *sc, *nsc;
3396 
3397 	NET_ASSERT_LOCKED();
3398 
3399 	LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
3400 #ifdef DIAGNOSTIC
3401 		if (sc->sc_tp != tp)
3402 			panic("invalid sc_tp in syn_cache_cleanup");
3403 #endif
3404 		syn_cache_rm(sc);
3405 		syn_cache_put(sc);
3406 	}
3407 	/* just for safety */
3408 	LIST_INIT(&tp->t_sc);
3409 }
3410 
3411 /*
3412  * Find an entry in the syn cache.
3413  */
3414 struct syn_cache *
3415 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3416     struct syn_cache_head **headp, u_int rtableid)
3417 {
3418 	struct syn_cache_set *sets[2];
3419 	struct syn_cache *sc;
3420 	struct syn_cache_head *scp;
3421 	u_int32_t hash;
3422 	int i;
3423 
3424 	NET_ASSERT_LOCKED();
3425 
3426 	/* Check the active cache first, the passive cache is likely empty. */
3427 	sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
3428 	sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
3429 	for (i = 0; i < 2; i++) {
3430 		if (sets[i]->scs_count == 0)
3431 			continue;
3432 		SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
3433 		scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
3434 		*headp = scp;
3435 		TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
3436 			if (sc->sc_hash != hash)
3437 				continue;
3438 			if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3439 			    !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3440 			    rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
3441 				return (sc);
3442 		}
3443 	}
3444 	return (NULL);
3445 }
3446 
3447 /*
3448  * This function gets called when we receive an ACK for a
3449  * socket in the LISTEN state.  We look up the connection
3450  * in the syn cache, and if it's there, we pull it out of
3451  * the cache and turn it into a full-blown connection in
3452  * the SYN-RECEIVED state.
3453  *
3454  * The return values may not be immediately obvious, and their effects
3455  * can be subtle, so here they are:
3456  *
3457  *	NULL	SYN was not found in cache; caller should drop the
3458  *		packet and send an RST.
3459  *
3460  *	-1	We were unable to create the new connection, and are
3461  *		aborting it.  An ACK,RST is being sent to the peer
3462  *		(unless we got screwy sequence numbers; see below),
3463  *		because the 3-way handshake has been completed.  Caller
3464  *		should not free the mbuf, since we may be using it.  If
3465  *		we are not, we will free it.
3466  *
3467  * Otherwise, the return value is a pointer to the new socket
3468  * associated with the connection.
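 *
 * A caller typically distinguishes the three cases along these
 * lines (illustrative sketch, not the exact tcp_input() code):
 *
 *	so = syn_cache_get(src, dst, th, hlen, tlen, so, m, now);
 *	if (so == NULL)
 *		goto dropwithreset;	-- not in cache: send RST
 *	else if (so == (struct socket *)-1)
 *		return IPPROTO_DONE;	-- aborted, mbuf consumed
 *	-- otherwise continue with the newly created connection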
3469 */ 3470 struct socket * 3471 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3472 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint32_t now) 3473 { 3474 struct syn_cache *sc; 3475 struct syn_cache_head *scp; 3476 struct inpcb *inp, *oldinp; 3477 struct tcpcb *tp = NULL; 3478 struct mbuf *am; 3479 struct socket *oso; 3480 3481 NET_ASSERT_LOCKED(); 3482 3483 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3484 if (sc == NULL) 3485 return (NULL); 3486 3487 /* 3488 * Verify the sequence and ack numbers. Try getting the correct 3489 * response again. 3490 */ 3491 if ((th->th_ack != sc->sc_iss + 1) || 3492 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3493 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3494 (void) syn_cache_respond(sc, m, now); 3495 return ((struct socket *)(-1)); 3496 } 3497 3498 /* Remove this cache entry */ 3499 syn_cache_rm(sc); 3500 3501 /* 3502 * Ok, create the full blown connection, and set things up 3503 * as they would have been set up if we had created the 3504 * connection when the SYN arrived. If we can't create 3505 * the connection, abort it. 3506 */ 3507 oso = so; 3508 so = sonewconn(so, SS_ISCONNECTED, M_DONTWAIT); 3509 if (so == NULL) 3510 goto resetandabort; 3511 3512 oldinp = sotoinpcb(oso); 3513 inp = sotoinpcb(so); 3514 3515 #ifdef IPSEC 3516 /* 3517 * We need to copy the required security levels 3518 * from the old pcb. Ditto for any other 3519 * IPsec-related information. 3520 */ 3521 memcpy(inp->inp_seclevel, oldinp->inp_seclevel, 3522 sizeof(oldinp->inp_seclevel)); 3523 #endif /* IPSEC */ 3524 #ifdef INET6 3525 /* 3526 * inp still has the OLD in_pcb stuff, set the 3527 * v6-related flags on the new guy, too. 3528 */ 3529 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3530 if (inp->inp_flags & INP_IPV6) { 3531 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3532 inp->inp_hops = oldinp->inp_hops; 3533 } else 3534 #endif /* INET6 */ 3535 { 3536 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3537 } 3538 3539 #if NPF > 0 3540 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3541 struct pf_divert *divert; 3542 3543 divert = pf_find_divert(m); 3544 KASSERT(divert != NULL); 3545 inp->inp_rtableid = divert->rdomain; 3546 } else 3547 #endif 3548 /* inherit rtable from listening socket */ 3549 inp->inp_rtableid = sc->sc_rtableid; 3550 3551 inp->inp_lport = th->th_dport; 3552 switch (src->sa_family) { 3553 #ifdef INET6 3554 case AF_INET6: 3555 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3556 break; 3557 #endif /* INET6 */ 3558 case AF_INET: 3559 inp->inp_laddr = satosin(dst)->sin_addr; 3560 inp->inp_options = ip_srcroute(m); 3561 if (inp->inp_options == NULL) { 3562 inp->inp_options = sc->sc_ipopts; 3563 sc->sc_ipopts = NULL; 3564 } 3565 break; 3566 } 3567 in_pcbrehash(inp); 3568 3569 /* 3570 * Give the new socket our cached route reference. 
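	 * (sc_route4 and sc_route6 appear to share storage in the syn
	 * cache entry, which is why clearing sc_route4.ro_rt below is
	 * enough for both address families.)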
3571 */ 3572 if (src->sa_family == AF_INET) 3573 inp->inp_route = sc->sc_route4; /* struct assignment */ 3574 #ifdef INET6 3575 else 3576 inp->inp_route6 = sc->sc_route6; 3577 #endif 3578 sc->sc_route4.ro_rt = NULL; 3579 3580 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3581 if (am == NULL) 3582 goto resetandabort; 3583 am->m_len = src->sa_len; 3584 memcpy(mtod(am, caddr_t), src, src->sa_len); 3585 if (in_pcbconnect(inp, am)) { 3586 (void) m_free(am); 3587 goto resetandabort; 3588 } 3589 (void) m_free(am); 3590 3591 tp = intotcpcb(inp); 3592 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3593 if (sc->sc_request_r_scale != 15) { 3594 tp->requested_s_scale = sc->sc_requested_s_scale; 3595 tp->request_r_scale = sc->sc_request_r_scale; 3596 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3597 } 3598 if (sc->sc_flags & SCF_TIMESTAMP) 3599 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3600 3601 tp->t_template = tcp_template(tp); 3602 if (tp->t_template == 0) { 3603 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3604 so = NULL; 3605 goto abort; 3606 } 3607 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3608 tp->ts_modulate = sc->sc_modulate; 3609 tp->ts_recent = sc->sc_timestamp; 3610 tp->iss = sc->sc_iss; 3611 tp->irs = sc->sc_irs; 3612 tcp_sendseqinit(tp); 3613 tp->snd_last = tp->snd_una; 3614 #ifdef TCP_ECN 3615 if (sc->sc_flags & SCF_ECN_PERMIT) { 3616 tp->t_flags |= TF_ECN_PERMIT; 3617 tcpstat_inc(tcps_ecn_accepts); 3618 } 3619 #endif 3620 if (sc->sc_flags & SCF_SACK_PERMIT) 3621 tp->t_flags |= TF_SACK_PERMIT; 3622 #ifdef TCP_SIGNATURE 3623 if (sc->sc_flags & SCF_SIGNATURE) 3624 tp->t_flags |= TF_SIGNATURE; 3625 #endif 3626 tcp_rcvseqinit(tp); 3627 tp->t_state = TCPS_SYN_RECEIVED; 3628 tp->t_rcvtime = now; 3629 tp->t_sndtime = now; 3630 tp->t_rcvacktime = now; 3631 tp->t_sndacktime = now; 3632 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3633 tcpstat_inc(tcps_accepts); 3634 3635 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3636 if (sc->sc_peermaxseg) 3637 tcp_mss_update(tp); 3638 /* Reset initial window to 1 segment for retransmit */ 3639 if (sc->sc_rxtshift > 0) 3640 tp->snd_cwnd = tp->t_maxseg; 3641 tp->snd_wl1 = sc->sc_irs; 3642 tp->rcv_up = sc->sc_irs + 1; 3643 3644 /* 3645 * This is what would have happened in tcp_output() when 3646 * the SYN,ACK was sent. 3647 */ 3648 tp->snd_up = tp->snd_una; 3649 tp->snd_max = tp->snd_nxt = tp->iss+1; 3650 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3651 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3652 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3653 tp->last_ack_sent = tp->rcv_nxt; 3654 3655 tcpstat_inc(tcps_sc_completed); 3656 syn_cache_put(sc); 3657 return (so); 3658 3659 resetandabort: 3660 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3661 m->m_pkthdr.ph_rtableid, now); 3662 abort: 3663 m_freem(m); 3664 if (so != NULL) 3665 soabort(so); 3666 syn_cache_put(sc); 3667 tcpstat_inc(tcps_sc_aborted); 3668 return ((struct socket *)(-1)); 3669 } 3670 3671 /* 3672 * This function is called when we get a RST for a 3673 * non-existent connection, so that we can see if the 3674 * connection is in the syn cache. If it is, zap it. 
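 * The RST's sequence number must fall on the SYN,ACK we sent
 * (sc_irs .. sc_irs + 1); anything outside that window is treated
 * as a blind reset attempt and ignored below.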
3675 */ 3676 3677 void 3678 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3679 u_int rtableid) 3680 { 3681 struct syn_cache *sc; 3682 struct syn_cache_head *scp; 3683 3684 NET_ASSERT_LOCKED(); 3685 3686 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3687 return; 3688 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3689 SEQ_GT(th->th_seq, sc->sc_irs + 1)) 3690 return; 3691 syn_cache_rm(sc); 3692 tcpstat_inc(tcps_sc_reset); 3693 syn_cache_put(sc); 3694 } 3695 3696 void 3697 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3698 u_int rtableid) 3699 { 3700 struct syn_cache *sc; 3701 struct syn_cache_head *scp; 3702 3703 NET_ASSERT_LOCKED(); 3704 3705 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3706 return; 3707 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3708 if (ntohl (th->th_seq) != sc->sc_iss) { 3709 return; 3710 } 3711 3712 /* 3713 * If we've retransmitted 3 times and this is our second error, 3714 * we remove the entry. Otherwise, we allow it to continue on. 3715 * This prevents us from incorrectly nuking an entry during a 3716 * spurious network outage. 3717 * 3718 * See tcp_notify(). 3719 */ 3720 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3721 sc->sc_flags |= SCF_UNREACH; 3722 return; 3723 } 3724 3725 syn_cache_rm(sc); 3726 tcpstat_inc(tcps_sc_unreach); 3727 syn_cache_put(sc); 3728 } 3729 3730 /* 3731 * Given a LISTEN socket and an inbound SYN request, add 3732 * this to the syn cache, and send back a segment: 3733 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3734 * to the source. 3735 * 3736 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3737 * Doing so would require that we hold onto the data and deliver it 3738 * to the application. However, if we are the target of a SYN-flood 3739 * DoS attack, an attacker could send data which would eventually 3740 * consume all available buffer space if it were ACKed. By not ACKing 3741 * the data, we avoid this DoS scenario. 3742 */ 3743 3744 int 3745 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3746 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3747 struct tcp_opt_info *oi, tcp_seq *issp, uint32_t now) 3748 { 3749 struct tcpcb tb, *tp; 3750 long win; 3751 struct syn_cache *sc; 3752 struct syn_cache_head *scp; 3753 struct mbuf *ipopts; 3754 3755 tp = sototcpcb(so); 3756 3757 /* 3758 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3759 * 3760 * Note this check is performed in tcp_input() very early on. 3761 */ 3762 3763 /* 3764 * Initialize some local state. 3765 */ 3766 win = sbspace(so, &so->so_rcv); 3767 if (win > TCP_MAXWIN) 3768 win = TCP_MAXWIN; 3769 3770 bzero(&tb, sizeof(tb)); 3771 #ifdef TCP_SIGNATURE 3772 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3773 #else 3774 if (optp) { 3775 #endif 3776 tb.pf = tp->pf; 3777 tb.sack_enable = tp->sack_enable; 3778 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3779 #ifdef TCP_SIGNATURE 3780 if (tp->t_flags & TF_SIGNATURE) 3781 tb.t_flags |= TF_SIGNATURE; 3782 #endif 3783 tb.t_state = TCPS_LISTEN; 3784 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3785 sotoinpcb(so)->inp_rtableid, now)) 3786 return (-1); 3787 } 3788 3789 switch (src->sa_family) { 3790 case AF_INET: 3791 /* 3792 * Remember the IP options, if any. 
3793 */ 3794 ipopts = ip_srcroute(m); 3795 break; 3796 default: 3797 ipopts = NULL; 3798 } 3799 3800 /* 3801 * See if we already have an entry for this connection. 3802 * If we do, resend the SYN,ACK. We do not count this 3803 * as a retransmission (XXX though maybe we should). 3804 */ 3805 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3806 if (sc != NULL) { 3807 tcpstat_inc(tcps_sc_dupesyn); 3808 if (ipopts) { 3809 /* 3810 * If we were remembering a previous source route, 3811 * forget it and use the new one we've been given. 3812 */ 3813 m_free(sc->sc_ipopts); 3814 sc->sc_ipopts = ipopts; 3815 } 3816 sc->sc_timestamp = tb.ts_recent; 3817 if (syn_cache_respond(sc, m, now) == 0) { 3818 tcpstat_inc(tcps_sndacks); 3819 tcpstat_inc(tcps_sndtotal); 3820 } 3821 return (0); 3822 } 3823 3824 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3825 if (sc == NULL) { 3826 m_free(ipopts); 3827 return (-1); 3828 } 3829 3830 /* 3831 * Fill in the cache, and put the necessary IP and TCP 3832 * options into the reply. 3833 */ 3834 memcpy(&sc->sc_src, src, src->sa_len); 3835 memcpy(&sc->sc_dst, dst, dst->sa_len); 3836 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3837 sc->sc_flags = 0; 3838 sc->sc_ipopts = ipopts; 3839 sc->sc_irs = th->th_seq; 3840 3841 sc->sc_iss = issp ? *issp : arc4random(); 3842 sc->sc_peermaxseg = oi->maxseg; 3843 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3844 sc->sc_win = win; 3845 sc->sc_timestamp = tb.ts_recent; 3846 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3847 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3848 sc->sc_flags |= SCF_TIMESTAMP; 3849 sc->sc_modulate = arc4random(); 3850 } 3851 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3852 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3853 sc->sc_requested_s_scale = tb.requested_s_scale; 3854 sc->sc_request_r_scale = 0; 3855 /* 3856 * Pick the smallest possible scaling factor that 3857 * will still allow us to scale up to sb_max. 3858 * 3859 * We do this because there are broken firewalls that 3860 * will corrupt the window scale option, leading to 3861 * the other endpoint believing that our advertised 3862 * window is unscaled. At scale factors larger than 3863 * 5 the unscaled window will drop below 1500 bytes, 3864 * leading to serious problems when traversing these 3865 * broken firewalls. 3866 * 3867 * With the default sbmax of 256K, a scale factor 3868 * of 3 will be chosen by this algorithm. Those who 3869 * choose a larger sbmax should watch out 3870 * for the compatibility problems mentioned above. 3871 * 3872 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3873 * or <SYN,ACK>) segment itself is never scaled. 3874 */ 3875 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3876 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) 3877 sc->sc_request_r_scale++; 3878 } else { 3879 sc->sc_requested_s_scale = 15; 3880 sc->sc_request_r_scale = 15; 3881 } 3882 #ifdef TCP_ECN 3883 /* 3884 * if both ECE and CWR flag bits are set, peer is ECN capable. 3885 */ 3886 if (tcp_do_ecn && 3887 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) 3888 sc->sc_flags |= SCF_ECN_PERMIT; 3889 #endif 3890 /* 3891 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option 3892 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 
3893 */ 3894 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) 3895 sc->sc_flags |= SCF_SACK_PERMIT; 3896 #ifdef TCP_SIGNATURE 3897 if (tb.t_flags & TF_SIGNATURE) 3898 sc->sc_flags |= SCF_SIGNATURE; 3899 #endif 3900 sc->sc_tp = tp; 3901 if (syn_cache_respond(sc, m, now) == 0) { 3902 syn_cache_insert(sc, tp); 3903 tcpstat_inc(tcps_sndacks); 3904 tcpstat_inc(tcps_sndtotal); 3905 } else { 3906 syn_cache_put(sc); 3907 tcpstat_inc(tcps_sc_dropped); 3908 } 3909 3910 return (0); 3911 } 3912 3913 int 3914 syn_cache_respond(struct syn_cache *sc, struct mbuf *m, uint32_t now) 3915 { 3916 u_int8_t *optp; 3917 int optlen, error; 3918 u_int16_t tlen; 3919 struct ip *ip = NULL; 3920 #ifdef INET6 3921 struct ip6_hdr *ip6 = NULL; 3922 #endif 3923 struct tcphdr *th; 3924 u_int hlen; 3925 struct inpcb *inp; 3926 3927 switch (sc->sc_src.sa.sa_family) { 3928 case AF_INET: 3929 hlen = sizeof(struct ip); 3930 break; 3931 #ifdef INET6 3932 case AF_INET6: 3933 hlen = sizeof(struct ip6_hdr); 3934 break; 3935 #endif 3936 default: 3937 m_freem(m); 3938 return (EAFNOSUPPORT); 3939 } 3940 3941 /* Compute the size of the TCP options. */ 3942 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3943 ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) + 3944 #ifdef TCP_SIGNATURE 3945 ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + 3946 #endif 3947 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3948 3949 tlen = hlen + sizeof(struct tcphdr) + optlen; 3950 3951 /* 3952 * Create the IP+TCP header from scratch. 3953 */ 3954 m_freem(m); 3955 #ifdef DIAGNOSTIC 3956 if (max_linkhdr + tlen > MCLBYTES) 3957 return (ENOBUFS); 3958 #endif 3959 MGETHDR(m, M_DONTWAIT, MT_DATA); 3960 if (m && max_linkhdr + tlen > MHLEN) { 3961 MCLGET(m, M_DONTWAIT); 3962 if ((m->m_flags & M_EXT) == 0) { 3963 m_freem(m); 3964 m = NULL; 3965 } 3966 } 3967 if (m == NULL) 3968 return (ENOBUFS); 3969 3970 /* Fixup the mbuf. */ 3971 m->m_data += max_linkhdr; 3972 m->m_len = m->m_pkthdr.len = tlen; 3973 m->m_pkthdr.ph_ifidx = 0; 3974 m->m_pkthdr.ph_rtableid = sc->sc_rtableid; 3975 memset(mtod(m, u_char *), 0, tlen); 3976 3977 switch (sc->sc_src.sa.sa_family) { 3978 case AF_INET: 3979 ip = mtod(m, struct ip *); 3980 ip->ip_dst = sc->sc_src.sin.sin_addr; 3981 ip->ip_src = sc->sc_dst.sin.sin_addr; 3982 ip->ip_p = IPPROTO_TCP; 3983 th = (struct tcphdr *)(ip + 1); 3984 th->th_dport = sc->sc_src.sin.sin_port; 3985 th->th_sport = sc->sc_dst.sin.sin_port; 3986 break; 3987 #ifdef INET6 3988 case AF_INET6: 3989 ip6 = mtod(m, struct ip6_hdr *); 3990 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3991 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3992 ip6->ip6_nxt = IPPROTO_TCP; 3993 th = (struct tcphdr *)(ip6 + 1); 3994 th->th_dport = sc->sc_src.sin6.sin6_port; 3995 th->th_sport = sc->sc_dst.sin6.sin6_port; 3996 break; 3997 #endif 3998 } 3999 4000 th->th_seq = htonl(sc->sc_iss); 4001 th->th_ack = htonl(sc->sc_irs + 1); 4002 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 4003 th->th_flags = TH_SYN|TH_ACK; 4004 #ifdef TCP_ECN 4005 /* Set ECE for SYN-ACK if peer supports ECN. */ 4006 if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT)) 4007 th->th_flags |= TH_ECE; 4008 #endif 4009 th->th_win = htons(sc->sc_win); 4010 /* th_sum already 0 */ 4011 /* th_urp already 0 */ 4012 4013 /* Tack on the TCP options. */ 4014 optp = (u_int8_t *)(th + 1); 4015 *optp++ = TCPOPT_MAXSEG; 4016 *optp++ = 4; 4017 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 4018 *optp++ = sc->sc_ourmaxseg & 0xff; 4019 4020 /* Include SACK_PERMIT_HDR option if peer has already done so. 
 */
4021 	if (sc->sc_flags & SCF_SACK_PERMIT) {
4022 		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
4023 		optp += 4;
4024 	}
4025 
4026 	if (sc->sc_request_r_scale != 15) {
4027 		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
4028 		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
4029 		    sc->sc_request_r_scale);
4030 		optp += 4;
4031 	}
4032 
4033 	if (sc->sc_flags & SCF_TIMESTAMP) {
4034 		u_int32_t *lp = (u_int32_t *)(optp);
4035 		/* Form timestamp option as shown in appendix A of RFC 1323. */
4036 		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
4037 		*lp++ = htonl(now + sc->sc_modulate);
4038 		*lp = htonl(sc->sc_timestamp);
4039 		optp += TCPOLEN_TSTAMP_APPA;
4040 	}
4041 
4042 #ifdef TCP_SIGNATURE
4043 	if (sc->sc_flags & SCF_SIGNATURE) {
4044 		union sockaddr_union src, dst;
4045 		struct tdb *tdb;
4046 
4047 		bzero(&src, sizeof(union sockaddr_union));
4048 		bzero(&dst, sizeof(union sockaddr_union));
4049 		src.sa.sa_len = sc->sc_src.sa.sa_len;
4050 		src.sa.sa_family = sc->sc_src.sa.sa_family;
4051 		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
4052 		dst.sa.sa_family = sc->sc_dst.sa.sa_family;
4053 
4054 		switch (sc->sc_src.sa.sa_family) {
4055 		case 0:	/*default to PF_INET*/
4056 		case AF_INET:
4057 			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
4058 			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
4059 			break;
4060 #ifdef INET6
4061 		case AF_INET6:
4062 			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
4063 			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
4064 			break;
4065 #endif /* INET6 */
4066 		}
4067 
4068 		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
4069 		    0, &src, &dst, IPPROTO_TCP);
4070 		if (tdb == NULL) {
4071 			m_freem(m);
4072 			return (EPERM);
4073 		}
4074 
4075 		/* Send signature option */
4076 		*(optp++) = TCPOPT_SIGNATURE;
4077 		*(optp++) = TCPOLEN_SIGNATURE;
4078 
4079 		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
4080 		    hlen, 0, optp) < 0) {
4081 			m_freem(m);
4082 			tdb_unref(tdb);
4083 			return (EINVAL);
4084 		}
4085 		tdb_unref(tdb);
4086 		optp += 16;
4087 
4088 		/* Pad options list to the next 32 bit boundary and
4089 		 * terminate it.
4090 		 */
4091 		*optp++ = TCPOPT_NOP;
4092 		*optp++ = TCPOPT_EOL;
4093 	}
4094 #endif /* TCP_SIGNATURE */
4095 
4096 	SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
4097 
4098 	/* use IPsec policy and ttl from listening socket, on SYN ACK */
4099 	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
4100 
4101 	/*
4102 	 * Fill in some straggling IP bits.  Note the stack expects
4103 	 * ip_len to be in network byte order here.
4104 	 */
4105 	switch (sc->sc_src.sa.sa_family) {
4106 	case AF_INET:
4107 		ip->ip_len = htons(tlen);
4108 		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
4109 		if (inp != NULL)
4110 			ip->ip_tos = inp->inp_ip.ip_tos;
4111 
4112 		error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
4113 		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
4114 		break;
4115 #ifdef INET6
4116 	case AF_INET6:
4117 		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
4118 		ip6->ip6_vfc |= IPV6_VERSION;
4119 		/* ip6_plen will be updated in ip6_output() */
4120 		ip6->ip6_hlim = in6_selecthlim(inp);
4121 		/* leave flowlabel = 0, it is legal and requires no state mgmt */
4122 
4123 		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
4124 		    NULL, NULL);
4125 		break;
4126 #endif
4127 	}
4128 	return (error);
4129 }
4130 