/*	$OpenBSD: tcp_input.c,v 1.407 2024/08/26 13:55:14 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

int tcp_mss_adv(struct mbuf *, int);
int tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#endif /* INET6 */

int tcprexmtthresh = 3;
int tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	TCP_TIME(24 * 24 * 60 * 60)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int32_t)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int32_t)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
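
/*
 * Illustrative note: these comparisons are modular, so ordering survives
 * 32-bit wraparound.  E.g. TSTMP_LT(0xfffffff0, 0x10) evaluates
 * (int32_t)(0xfffffff0 - 0x10) = (int32_t)0xffffffe0 = -32 < 0, so
 * 0xfffffff0 is treated as "before" 0x10 even though it is numerically
 * larger.  The SEQ_* macros in tcp_seq.h rely on the same arithmetic.
 */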

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && \
	    ISSET(tp->t_inpcb->inp_flags, INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC 3168.
 * Implementation note:
 *   snd_last is used to track a recovery phase.
 *   When cwnd is reduced, snd_last is set to snd_max.
 *   While snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_TIMER_ARM(tp, TCPT_DELACK, tcp_delack_msecs); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *, uint64_t);
void	 syn_cache_timer(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *, uint64_t);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *, uint64_t);
struct syn_cache *syn_cache_lookup(const struct sockaddr *,
		const struct sockaddr *, struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);
	tp->t_rcvoopack++;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}
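
/*
 * Worked example (illustrative): suppose the queue holds a segment with
 * th_seq 100 and th_reseqlen 50 (covering 100-149), and a new segment
 * arrives with th_seq 140 and *tlen 30.  The preceding-segment check
 * computes i = 100 + 50 - 140 = 10, so the first 10 bytes duplicate
 * queued data: m_adj() trims them, *tlen becomes 20 and th_seq 150.
 * Only the non-overlapping tail 150-169 is linked into the queue.
 */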

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_rcv.sb_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *otp = NULL, *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate;
	union {
		struct tcpiphdr tcpip;
#ifdef INET6
		struct tcpipv6hdr tcpip6;
#endif
		char caddr;
	} saveti;
	tcp_seq iss, *reuse = NULL;
	uint64_t now;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;
	now = tcp_now();

	/*
	 * RFC 1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about an unspecified IPv6 source address.
		 * As we use all-zero to indicate an unbound/unconnected pcb,
		 * an unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
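
	/*
	 * Illustrative note: th_off counts 32-bit words, so off = th_off << 2
	 * is the header length in bytes.  E.g. th_off = 8 gives a 32-byte
	 * header, i.e. 12 bytes of options after the fixed 20-byte header;
	 * valid values span 5 (no options) through 15 (40 option bytes).
	 */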
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	if (th->th_dport == 0) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup(&tcb6table, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcb6table, &ip6->ip6_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
	}
#ifdef IPSEC
	if (ipsec_in_use) {
		struct m_tag *mtag;
		struct tdb *tdb = NULL;
		int error;

		/* Find most recent IPsec tag */
		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
		if (mtag != NULL) {
			struct tdb_ident *tdbi;

			tdbi = (struct tdb_ident *)(mtag + 1);
			tdb = gettdb(tdbi->rdomain, tdbi->spi,
			    &tdbi->dst, tdbi->proto);
		}
		error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN,
		    tdb, inp ? &inp->inp_seclevel : NULL, NULL, NULL);
		tdb_unref(tdb);
		if (error) {
			tcpstat_inc(tcps_rcvnosec);
			goto drop;
		}
	}
#endif /* IPSEC */

	if (inp == NULL) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
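
	/*
	 * Illustrative note: window scaling (RFC 7323) never applies to
	 * segments carrying SYN, so those windows are taken verbatim.
	 * With snd_scale = 7, an advertised th_win of 8192 yields
	 * tiwin = 8192 << 7 = 1048576 bytes.
	 */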

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		}

		if (so->so_options & SO_DEBUG) {
			otp = tp;
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				saveti.tcpip6.ti6_i = *ip6;
				saveti.tcpip6.ti6_t = *th;
				break;
#endif
			case AF_INET:
				memcpy(&saveti.tcpip.ti_i, ip, sizeof(*ip));
				saveti.tcpip.ti_t = *th;
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m, now);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					in_pcbunref(inp);
					inp = in_pcbref(sotoinpcb(so));
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/
				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for peer to send a RST, and
				 * RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC 2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC 2462 5.5.4, it says like
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC 2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(
						    &ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(
							    tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr ==
						    ip->ip_src.s_addr) {
							tcpstat_inc(
							    tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse, now)
				    == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th);	/* Delete stale SACK holes */

	/*
	 * Process options.
	 */
	if (optp
#ifdef TCP_SIGNATURE
	    || (tp->t_flags & TF_SIGNATURE)
#endif
	    ) {
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid, now))
			goto drop;
	}

	if (opti.ts_present && opti.ts_ecr) {
		int32_t rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
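
	/*
	 * Illustrative note: IPTOS_ECN_MASK covers the two low-order TOS
	 * bits defined by RFC 3168: 00 Not-ECT, 01 ECT(1), 10 ECT(0),
	 * 11 CE.  Only the CE codepoint, set by a congested router on an
	 * ECT packet, forces ECN-echo here.
	 */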
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				tp->t_rcvacktime = now;
				ND6_HINT(tp);
				sbdrop(so, &so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/* Pull snd_wl2 up to prevent seq wrap. */
				tp->snd_wl2 = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			/* Pull snd_wl1 and rcv_up up to prevent seq wrap. */
			tp->snd_wl1 = th->th_seq;
			/* Packet has most recent segment, no urgent exists. */
			tp->rcv_up = tp->rcv_nxt;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
				    now - tp->rfbuf_ts > (tp->t_srtt >>
				    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT))) {
					tcp_update_rcvspace(tp);
					/* Start over with next RTT. */
					tp->rfbuf_cnt = 0;
					tp->rfbuf_ts = 0;
				} else
					tp->rfbuf_cnt += tlen;
				m_adj(m, iphlen + off);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			in_pcbunref(inp);
			return IPPROTO_DONE;
		}
	}
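
	/*
	 * Header prediction did not consume the segment; continue with the
	 * full input path below: compute the receive window, trim the
	 * segment to fit, apply PAWS, then do RST/SYN/ACK processing
	 * followed by data and FIN handling.
	 */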

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{
		int win;

		win = sbspace(so, &so->so_rcv);
		if (win < 0)
			win = 0;
		tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEG_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			in_pcbunref(inp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if (now - tp->ts_recent_age > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}
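
	/*
	 * Illustrative note: TCP_PAWS_IDLE is 24 days.  A peer's timestamp
	 * clock ticking once per millisecond wraps the 31-bit signed
	 * comparison window after about 2^31 ms, i.e. roughly 24.8 days,
	 * so an older ts_recent can no longer be compared reliably and is
	 * invalidated above instead of causing the segment to be dropped.
	 */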

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)) {
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh +
						    tp->t_maxseg * tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
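
		/*
		 * Worked example (illustrative): with t_maxseg = 1460 and
		 * min(snd_wnd, snd_cwnd) = 29200 (20 segments), the third
		 * duplicate ACK sets win = 29200 / 2 / 1460 = 10, so
		 * snd_ssthresh = 14600.  After the fast retransmit the
		 * window is inflated to ssthresh + 3 * 1460 = 18980 to
		 * account for the segments the receiver has cached.
		 */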
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);
		tp->t_rcvacktime = now;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
			u_int cw = tp->snd_cwnd;
			u_int incr = tp->t_maxseg;

			if (cw > tp->snd_ssthresh)
				incr = max(incr * incr / cw, 1);
			if (tp->t_dupacks < tcprexmtthresh)
				tp->snd_cwnd = ulmin(cw + incr,
				    TCP_MAXWIN << tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			if (tp->snd_wnd > so->so_snd.sb_cc)
				tp->snd_wnd -= so->so_snd.sb_cc;
			else
				tp->snd_wnd = 0;
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(so, &so->so_snd, acked);
			if (tp->snd_wnd > acked)
				tp->snd_wnd -= acked;
			else
				tp->snd_wnd = 0;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}
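
		/*
		 * Worked example (illustrative): in congestion avoidance
		 * with t_maxseg = 1460 and snd_cwnd = 14600, each ACK adds
		 * incr = 1460 * 1460 / 14600 = 146 bytes, so the ten ACKs
		 * of one window grow cwnd by about one full segment per RTT.
		 */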

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					TCP_TIMER_ARM(tp, TCPT_2MSL,
					    tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}
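
	/*
	 * Illustrative note: snd_wl1/snd_wl2 remember the seq/ack of the
	 * last window update.  The test above is RFC 793's "SND.WL1 <
	 * SEG.SEQ, or SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK", which
	 * accepts a window only from a segment at least as recent as the
	 * previous update, so reordered old segments cannot shrink it.
	 */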

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;		/* XXX */
			tiflags &= ~TH_URG;	/* XXX */
			goto dodata;		/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_rcv.sb_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented to
		 * the user.  This can happen independent of advancing the
		 * URG pointer, but if two URG's are pending at once, some
		 * out-of-band data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}
Ignore a FIN received before 1982 * the connection is fully established. 1983 */ 1984 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1985 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1986 tp->t_flags |= TF_BLOCKOUTPUT; 1987 socantrcvmore(so); 1988 tp->t_flags &= ~TF_BLOCKOUTPUT; 1989 tp->t_flags |= TF_ACKNOW; 1990 tp->rcv_nxt++; 1991 } 1992 switch (tp->t_state) { 1993 1994 /* 1995 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1996 */ 1997 case TCPS_ESTABLISHED: 1998 tp->t_state = TCPS_CLOSE_WAIT; 1999 break; 2000 2001 /* 2002 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2003 * enter the CLOSING state. 2004 */ 2005 case TCPS_FIN_WAIT_1: 2006 tp->t_state = TCPS_CLOSING; 2007 break; 2008 2009 /* 2010 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2011 * starting the time-wait timer, turning off the other 2012 * standard timers. 2013 */ 2014 case TCPS_FIN_WAIT_2: 2015 tp->t_state = TCPS_TIME_WAIT; 2016 tcp_canceltimers(tp); 2017 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2018 tp->t_flags |= TF_BLOCKOUTPUT; 2019 soisdisconnected(so); 2020 tp->t_flags &= ~TF_BLOCKOUTPUT; 2021 break; 2022 2023 /* 2024 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2025 */ 2026 case TCPS_TIME_WAIT: 2027 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2028 break; 2029 } 2030 } 2031 if (otp) 2032 tcp_trace(TA_INPUT, ostate, tp, otp, &saveti.caddr, 0, tlen); 2033 2034 /* 2035 * Return any desired output. 2036 */ 2037 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2038 (void) tcp_output(tp); 2039 in_pcbunref(inp); 2040 return IPPROTO_DONE; 2041 2042 badsyn: 2043 /* 2044 * Received a bad SYN. Increment counters and dropwithreset. 2045 */ 2046 tcpstat_inc(tcps_badsyn); 2047 tp = NULL; 2048 goto dropwithreset; 2049 2050 dropafterack_ratelim: 2051 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2052 tcp_ackdrop_ppslim) == 0) { 2053 /* XXX stat */ 2054 goto drop; 2055 } 2056 /* ...fall into dropafterack... */ 2057 2058 dropafterack: 2059 /* 2060 * Generate an ACK dropping incoming segment if it occupies 2061 * sequence space, where the ACK reflects our state. 2062 */ 2063 if (tiflags & TH_RST) 2064 goto drop; 2065 m_freem(m); 2066 tp->t_flags |= TF_ACKNOW; 2067 (void) tcp_output(tp); 2068 in_pcbunref(inp); 2069 return IPPROTO_DONE; 2070 2071 dropwithreset_ratelim: 2072 /* 2073 * We may want to rate-limit RSTs in certain situations, 2074 * particularly if we are sending an RST in response to 2075 * an attempt to connect to or otherwise communicate with 2076 * a port for which we have no socket. 2077 */ 2078 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2079 tcp_rst_ppslim) == 0) { 2080 /* XXX stat */ 2081 goto drop; 2082 } 2083 /* ...fall into dropwithreset... */ 2084 2085 dropwithreset: 2086 /* 2087 * Generate a RST, dropping incoming segment. 2088 * Make ACK acceptable to originator of segment. 2089 * Don't bother to respond to RST. 2090 */ 2091 if (tiflags & TH_RST) 2092 goto drop; 2093 if (tiflags & TH_ACK) { 2094 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2095 TH_RST, m->m_pkthdr.ph_rtableid, now); 2096 } else { 2097 if (tiflags & TH_SYN) 2098 tlen++; 2099 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2100 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now); 2101 } 2102 m_freem(m); 2103 in_pcbunref(inp); 2104 return IPPROTO_DONE; 2105 2106 drop: 2107 /* 2108 * Drop space held by incoming segment and return. 
2109 */ 2110 if (otp) 2111 tcp_trace(TA_DROP, ostate, tp, otp, &saveti.caddr, 0, tlen); 2112 2113 m_freem(m); 2114 in_pcbunref(inp); 2115 return IPPROTO_DONE; 2116 } 2117 2118 int 2119 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2120 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2121 u_int rtableid, uint64_t now) 2122 { 2123 u_int16_t mss = 0; 2124 int opt, optlen; 2125 #ifdef TCP_SIGNATURE 2126 caddr_t sigp = NULL; 2127 struct tdb *tdb = NULL; 2128 #endif 2129 2130 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2131 opt = cp[0]; 2132 if (opt == TCPOPT_EOL) 2133 break; 2134 if (opt == TCPOPT_NOP) 2135 optlen = 1; 2136 else { 2137 if (cnt < 2) 2138 break; 2139 optlen = cp[1]; 2140 if (optlen < 2 || optlen > cnt) 2141 break; 2142 } 2143 switch (opt) { 2144 2145 default: 2146 continue; 2147 2148 case TCPOPT_MAXSEG: 2149 if (optlen != TCPOLEN_MAXSEG) 2150 continue; 2151 if (!(th->th_flags & TH_SYN)) 2152 continue; 2153 if (TCPS_HAVERCVDSYN(tp->t_state)) 2154 continue; 2155 memcpy(&mss, cp + 2, sizeof(mss)); 2156 mss = ntohs(mss); 2157 oi->maxseg = mss; 2158 break; 2159 2160 case TCPOPT_WINDOW: 2161 if (optlen != TCPOLEN_WINDOW) 2162 continue; 2163 if (!(th->th_flags & TH_SYN)) 2164 continue; 2165 if (TCPS_HAVERCVDSYN(tp->t_state)) 2166 continue; 2167 tp->t_flags |= TF_RCVD_SCALE; 2168 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2169 break; 2170 2171 case TCPOPT_TIMESTAMP: 2172 if (optlen != TCPOLEN_TIMESTAMP) 2173 continue; 2174 oi->ts_present = 1; 2175 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2176 oi->ts_val = ntohl(oi->ts_val); 2177 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2178 oi->ts_ecr = ntohl(oi->ts_ecr); 2179 2180 if (!(th->th_flags & TH_SYN)) 2181 continue; 2182 if (TCPS_HAVERCVDSYN(tp->t_state)) 2183 continue; 2184 /* 2185 * A timestamp received in a SYN makes 2186 * it ok to send timestamp requests and replies. 
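			 * For reference, the option parsed here is laid
			 * out on the wire as one byte of kind (8), one
			 * byte of length (10), a 4-byte TSval and a
			 * 4-byte TSecr, which is why ts_val is copied
			 * from cp + 2 and ts_ecr from cp + 6 above.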
2187 */ 2188 tp->t_flags |= TF_RCVD_TSTMP; 2189 tp->ts_recent = oi->ts_val; 2190 tp->ts_recent_age = now; 2191 break; 2192 2193 case TCPOPT_SACK_PERMITTED: 2194 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2195 continue; 2196 if (!(th->th_flags & TH_SYN)) 2197 continue; 2198 if (TCPS_HAVERCVDSYN(tp->t_state)) 2199 continue; 2200 /* MUST only be set on SYN */ 2201 tp->t_flags |= TF_SACK_PERMIT; 2202 break; 2203 case TCPOPT_SACK: 2204 tcp_sack_option(tp, th, cp, optlen); 2205 break; 2206 #ifdef TCP_SIGNATURE 2207 case TCPOPT_SIGNATURE: 2208 if (optlen != TCPOLEN_SIGNATURE) 2209 continue; 2210 2211 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2212 goto bad; 2213 2214 sigp = cp + 2; 2215 break; 2216 #endif /* TCP_SIGNATURE */ 2217 } 2218 } 2219 2220 #ifdef TCP_SIGNATURE 2221 if (tp->t_flags & TF_SIGNATURE) { 2222 union sockaddr_union src, dst; 2223 2224 memset(&src, 0, sizeof(union sockaddr_union)); 2225 memset(&dst, 0, sizeof(union sockaddr_union)); 2226 2227 switch (tp->pf) { 2228 case 0: 2229 case AF_INET: 2230 src.sa.sa_len = sizeof(struct sockaddr_in); 2231 src.sa.sa_family = AF_INET; 2232 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2233 dst.sa.sa_len = sizeof(struct sockaddr_in); 2234 dst.sa.sa_family = AF_INET; 2235 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2236 break; 2237 #ifdef INET6 2238 case AF_INET6: 2239 src.sa.sa_len = sizeof(struct sockaddr_in6); 2240 src.sa.sa_family = AF_INET6; 2241 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2242 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2243 dst.sa.sa_family = AF_INET6; 2244 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2245 break; 2246 #endif /* INET6 */ 2247 } 2248 2249 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2250 0, &src, &dst, IPPROTO_TCP); 2251 2252 /* 2253 * We don't have an SA for this peer, so we turn off 2254 * TF_SIGNATURE on the listen socket 2255 */ 2256 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2257 tp->t_flags &= ~TF_SIGNATURE; 2258 2259 } 2260 2261 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2262 tcpstat_inc(tcps_rcvbadsig); 2263 goto bad; 2264 } 2265 2266 if (sigp) { 2267 char sig[16]; 2268 2269 if (tdb == NULL) { 2270 tcpstat_inc(tcps_rcvbadsig); 2271 goto bad; 2272 } 2273 2274 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2275 goto bad; 2276 2277 if (timingsafe_bcmp(sig, sigp, 16)) { 2278 tcpstat_inc(tcps_rcvbadsig); 2279 goto bad; 2280 } 2281 2282 tcpstat_inc(tcps_rcvgoodsig); 2283 } 2284 2285 tdb_unref(tdb); 2286 #endif /* TCP_SIGNATURE */ 2287 2288 return (0); 2289 2290 #ifdef TCP_SIGNATURE 2291 bad: 2292 tdb_unref(tdb); 2293 #endif 2294 return (-1); 2295 } 2296 2297 u_long 2298 tcp_seq_subtract(u_long a, u_long b) 2299 { 2300 return ((long)(a - b)); 2301 } 2302 2303 /* 2304 * This function is called upon receipt of new valid data (while not in header 2305 * prediction mode), and it updates the ordered list of sacks. 2306 */ 2307 void 2308 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2309 tcp_seq rcv_lastend) 2310 { 2311 /* 2312 * First reported block MUST be the most recent one. Subsequent 2313 * blocks SHOULD be in the order in which they arrived at the 2314 * receiver. These two conditions make the implementation fully 2315 * compliant with RFC 2018. 
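	 *
	 * Example, with assumed sequence numbers: given rcv_nxt = 100,
	 * after out-of-order segments 300-399 and then 500-599 arrive,
	 * the blocks are reported as [500,600) [300,400); if 310-349
	 * arrives next, the block containing it moves back to the
	 * front: [300,400) [500,600).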
2316 */ 2317 int i, j = 0, count = 0, lastpos = -1; 2318 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2319 2320 /* First clean up current list of sacks */ 2321 for (i = 0; i < tp->rcv_numsacks; i++) { 2322 sack = tp->sackblks[i]; 2323 if (sack.start == 0 && sack.end == 0) { 2324 count++; /* count = number of blocks to be discarded */ 2325 continue; 2326 } 2327 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2328 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2329 count++; 2330 } else { 2331 temp[j].start = tp->sackblks[i].start; 2332 temp[j++].end = tp->sackblks[i].end; 2333 } 2334 } 2335 tp->rcv_numsacks -= count; 2336 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2337 tcp_clean_sackreport(tp); 2338 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2339 /* ==> need first sack block */ 2340 tp->sackblks[0].start = rcv_laststart; 2341 tp->sackblks[0].end = rcv_lastend; 2342 tp->rcv_numsacks = 1; 2343 } 2344 return; 2345 } 2346 /* Otherwise, sack blocks are already present. */ 2347 for (i = 0; i < tp->rcv_numsacks; i++) 2348 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2349 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2350 return; /* sack list remains unchanged */ 2351 /* 2352 * From here, segment just received should be (part of) the 1st sack. 2353 * Go through list, possibly coalescing sack block entries. 2354 */ 2355 firstsack.start = rcv_laststart; 2356 firstsack.end = rcv_lastend; 2357 for (i = 0; i < tp->rcv_numsacks; i++) { 2358 sack = tp->sackblks[i]; 2359 if (SEQ_LT(sack.end, firstsack.start) || 2360 SEQ_GT(sack.start, firstsack.end)) 2361 continue; /* no overlap */ 2362 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2363 /* 2364 * identical block; delete it here since we will 2365 * move it to the front of the list. 2366 */ 2367 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2368 lastpos = i; /* last posn with a zero entry */ 2369 continue; 2370 } 2371 if (SEQ_LEQ(sack.start, firstsack.start)) 2372 firstsack.start = sack.start; /* merge blocks */ 2373 if (SEQ_GEQ(sack.end, firstsack.end)) 2374 firstsack.end = sack.end; /* merge blocks */ 2375 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2376 lastpos = i; /* last posn with a zero entry */ 2377 } 2378 if (lastpos != -1) { /* at least one merge */ 2379 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2380 sack = tp->sackblks[i]; 2381 if (sack.start == 0 && sack.end == 0) 2382 continue; 2383 temp[j++] = sack; 2384 } 2385 tp->rcv_numsacks = j; /* including first blk (added later) */ 2386 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2387 tp->sackblks[i] = temp[i]; 2388 } else { /* no merges -- shift sacks by 1 */ 2389 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2390 tp->rcv_numsacks++; 2391 for (i = tp->rcv_numsacks-1; i > 0; i--) 2392 tp->sackblks[i] = tp->sackblks[i-1]; 2393 } 2394 tp->sackblks[0] = firstsack; 2395 return; 2396 } 2397 2398 /* 2399 * Process the TCP SACK option. tp->snd_holes is an ordered list 2400 * of holes (oldest to newest, in terms of the sequence space). 2401 */ 2402 void 2403 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2404 { 2405 int tmp_olen; 2406 u_char *tmp_cp; 2407 struct sackhole *cur, *p, *temp; 2408 2409 if (!tp->sack_enable) 2410 return; 2411 /* SACK without ACK doesn't make sense. */ 2412 if ((th->th_flags & TH_ACK) == 0) 2413 return; 2414 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
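	 * The SEQ_* macros compare in modulo-2^32 space; e.g., with
	 * assumed values snd_una = 0xfffffff0 and th_ack = 0x00000010,
	 * (int32_t)(snd_una - th_ack) is negative, so SEQ_LT(snd_una,
	 * th_ack) holds across the wrap and the ack is accepted.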
*/ 2415 if (SEQ_LT(th->th_ack, tp->snd_una) || 2416 SEQ_GT(th->th_ack, tp->snd_max)) 2417 return; 2418 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2419 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2420 return; 2421 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2422 tmp_cp = cp + 2; 2423 tmp_olen = optlen - 2; 2424 tcpstat_inc(tcps_sack_rcv_opts); 2425 if (tp->snd_numholes < 0) 2426 tp->snd_numholes = 0; 2427 if (tp->t_maxseg == 0) 2428 panic("tcp_sack_option"); /* Should never happen */ 2429 while (tmp_olen > 0) { 2430 struct sackblk sack; 2431 2432 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2433 sack.start = ntohl(sack.start); 2434 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2435 sack.end = ntohl(sack.end); 2436 tmp_olen -= TCPOLEN_SACK; 2437 tmp_cp += TCPOLEN_SACK; 2438 if (SEQ_LEQ(sack.end, sack.start)) 2439 continue; /* bad SACK fields */ 2440 if (SEQ_LEQ(sack.end, tp->snd_una)) 2441 continue; /* old block */ 2442 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2443 if (SEQ_LT(sack.start, th->th_ack)) 2444 continue; 2445 } 2446 if (SEQ_GT(sack.end, tp->snd_max)) 2447 continue; 2448 if (tp->snd_holes == NULL) { /* first hole */ 2449 tp->snd_holes = (struct sackhole *) 2450 pool_get(&sackhl_pool, PR_NOWAIT); 2451 if (tp->snd_holes == NULL) { 2452 /* ENOBUFS, so ignore SACKed block for now */ 2453 goto dropped; 2454 } 2455 cur = tp->snd_holes; 2456 cur->start = th->th_ack; 2457 cur->end = sack.start; 2458 cur->rxmit = cur->start; 2459 cur->next = NULL; 2460 tp->snd_numholes = 1; 2461 tp->rcv_lastsack = sack.end; 2462 /* 2463 * dups is at least one. If more data has been 2464 * SACKed, it can be greater than one. 2465 */ 2466 cur->dups = min(tcprexmtthresh, 2467 ((sack.end - cur->end)/tp->t_maxseg)); 2468 if (cur->dups < 1) 2469 cur->dups = 1; 2470 continue; /* with next sack block */ 2471 } 2472 /* Go thru list of holes: p = previous, cur = current */ 2473 p = cur = tp->snd_holes; 2474 while (cur) { 2475 if (SEQ_LEQ(sack.end, cur->start)) 2476 /* SACKs data before the current hole */ 2477 break; /* no use going through more holes */ 2478 if (SEQ_GEQ(sack.start, cur->end)) { 2479 /* SACKs data beyond the current hole */ 2480 cur->dups++; 2481 if (((sack.end - cur->end)/tp->t_maxseg) >= 2482 tcprexmtthresh) 2483 cur->dups = tcprexmtthresh; 2484 p = cur; 2485 cur = cur->next; 2486 continue; 2487 } 2488 if (SEQ_LEQ(sack.start, cur->start)) { 2489 /* Data acks at least the beginning of hole */ 2490 if (SEQ_GEQ(sack.end, cur->end)) { 2491 /* Acks entire hole, so delete hole */ 2492 if (p != cur) { 2493 p->next = cur->next; 2494 pool_put(&sackhl_pool, cur); 2495 cur = p->next; 2496 } else { 2497 cur = cur->next; 2498 pool_put(&sackhl_pool, p); 2499 p = cur; 2500 tp->snd_holes = p; 2501 } 2502 tp->snd_numholes--; 2503 continue; 2504 } 2505 /* otherwise, move start of hole forward */ 2506 cur->start = sack.end; 2507 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2508 p = cur; 2509 cur = cur->next; 2510 continue; 2511 } 2512 /* move end of hole backward */ 2513 if (SEQ_GEQ(sack.end, cur->end)) { 2514 cur->end = sack.start; 2515 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2516 cur->dups++; 2517 if (((sack.end - cur->end)/tp->t_maxseg) >= 2518 tcprexmtthresh) 2519 cur->dups = tcprexmtthresh; 2520 p = cur; 2521 cur = cur->next; 2522 continue; 2523 } 2524 if (SEQ_LT(cur->start, sack.start) && 2525 SEQ_GT(cur->end, sack.end)) { 2526 /* 2527 * ACKs some data in middle of a hole; need to 2528 * split current hole 2529 */ 2530 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2531 
					goto dropped;
2532 				temp = (struct sackhole *)
2533 				    pool_get(&sackhl_pool, PR_NOWAIT);
2534 				if (temp == NULL)
2535 					goto dropped; /* ENOBUFS */
2536 				temp->next = cur->next;
2537 				temp->start = sack.end;
2538 				temp->end = cur->end;
2539 				temp->dups = cur->dups;
2540 				temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
2541 				cur->end = sack.start;
2542 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
2543 				cur->dups++;
2544 				if (((sack.end - cur->end)/tp->t_maxseg) >=
2545 				    tcprexmtthresh)
2546 					cur->dups = tcprexmtthresh;
2547 				cur->next = temp;
2548 				p = temp;
2549 				cur = p->next;
2550 				tp->snd_numholes++;
2551 			}
2552 		}
2553 		/* At this point, p points to the last hole on the list */
2554 		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
2555 			/*
2556 			 * Need to append new hole at end.
2557 			 * Last hole is p (and it's not NULL).
2558 			 */
2559 			if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT)
2560 				goto dropped;
2561 			temp = (struct sackhole *)
2562 			    pool_get(&sackhl_pool, PR_NOWAIT);
2563 			if (temp == NULL)
2564 				goto dropped; /* ENOBUFS */
2565 			temp->start = tp->rcv_lastsack;
2566 			temp->end = sack.start;
2567 			temp->dups = min(tcprexmtthresh,
2568 			    ((sack.end - sack.start)/tp->t_maxseg));
2569 			if (temp->dups < 1)
2570 				temp->dups = 1;
2571 			temp->rxmit = temp->start;
2572 			temp->next = 0;
2573 			p->next = temp;
2574 			tp->rcv_lastsack = sack.end;
2575 			tp->snd_numholes++;
2576 		}
2577 	}
2578 	return;
2579 dropped:
2580 	tcpstat_inc(tcps_sack_drop_opts);
2581 }
2582 
2583 /*
2584  * Delete stale (i.e., cumulatively ack'd) holes.  A hole is deleted only if
2585  * it is completely acked; otherwise, tcp_sack_option(), called from
2586  * tcp_dooptions(), will fix up the hole.
2587  */
2588 void
2589 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
2590 {
2591 	if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2592 		/* max, because this could be an older ack that just arrived */
2593 		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2594 		    th->th_ack : tp->snd_una;
2595 		struct sackhole *cur = tp->snd_holes;
2596 		struct sackhole *prev;
2597 		while (cur)
2598 			if (SEQ_LEQ(cur->end, lastack)) {
2599 				prev = cur;
2600 				cur = cur->next;
2601 				pool_put(&sackhl_pool, prev);
2602 				tp->snd_numholes--;
2603 			} else if (SEQ_LT(cur->start, lastack)) {
2604 				cur->start = lastack;
2605 				if (SEQ_LT(cur->rxmit, cur->start))
2606 					cur->rxmit = cur->start;
2607 				break;
2608 			} else
2609 				break;
2610 		tp->snd_holes = cur;
2611 	}
2612 }
2613 
2614 /*
2615  * Delete all receiver-side SACK information.
2616  */
2617 void
2618 tcp_clean_sackreport(struct tcpcb *tp)
2619 {
2620 	int i;
2621 
2622 	tp->rcv_numsacks = 0;
2623 	for (i = 0; i < MAX_SACK_BLKS; i++)
2624 		tp->sackblks[i].start = tp->sackblks[i].end = 0;
2625 
2626 }
2627 
2628 /*
2629  * Partial ack handling within a sack recovery episode.  When a partial ack
2630  * arrives, turn off the retransmission timer, deflate the window, and do
2631  * not clear tp->t_dupacks.
2632  */
2633 void
2634 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
2635 {
2636 	/* Turn off retx. timer (will start again next segment) */
2637 	TCP_TIMER_DISARM(tp, TCPT_REXMT);
2638 	tp->t_rtttime = 0;
2639 	/*
2640 	 * Partial window deflation.  This statement relies on the
2641 	 * fact that tp->snd_una has not been updated yet.
2642
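	 *
	 * Worked example, all values assumed: with snd_cwnd = 10000,
	 * t_maxseg = 1000, snd_una = 5000 and th_ack = 8000, the ack
	 * covers 3000 bytes, and the code below leaves snd_cwnd at
	 * 10000 - 3000 + 1000 + 1000 = 9000: deflated by the newly
	 * acked data, then re-inflated by two segments.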
2642 */ 2643 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2644 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2645 tp->snd_cwnd += tp->t_maxseg; 2646 } else 2647 tp->snd_cwnd = tp->t_maxseg; 2648 tp->snd_cwnd += tp->t_maxseg; 2649 tp->t_flags |= TF_NEEDOUTPUT; 2650 } 2651 2652 /* 2653 * Pull out of band byte out of a segment so 2654 * it doesn't appear in the user's data queue. 2655 * It is still reflected in the segment length for 2656 * sequencing purposes. 2657 */ 2658 void 2659 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2660 { 2661 int cnt = off + urgent - 1; 2662 2663 while (cnt >= 0) { 2664 if (m->m_len > cnt) { 2665 char *cp = mtod(m, caddr_t) + cnt; 2666 struct tcpcb *tp = sototcpcb(so); 2667 2668 tp->t_iobc = *cp; 2669 tp->t_oobflags |= TCPOOB_HAVEDATA; 2670 memmove(cp, cp + 1, m->m_len - cnt - 1); 2671 m->m_len--; 2672 return; 2673 } 2674 cnt -= m->m_len; 2675 m = m->m_next; 2676 if (m == NULL) 2677 break; 2678 } 2679 panic("tcp_pulloutofband"); 2680 } 2681 2682 /* 2683 * Collect new round-trip time estimate 2684 * and update averages and current timeout. 2685 */ 2686 void 2687 tcp_xmit_timer(struct tcpcb *tp, int32_t rtt) 2688 { 2689 int delta, rttmin; 2690 2691 if (rtt < 0) 2692 rtt = 0; 2693 else if (rtt > TCP_RTT_MAX) 2694 rtt = TCP_RTT_MAX; 2695 2696 tcpstat_inc(tcps_rttupdated); 2697 if (tp->t_srtt != 0) { 2698 /* 2699 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2700 * after the binary point (scaled by 4), whereas 2701 * srtt is stored as fixed point with 5 bits after the 2702 * binary point (i.e., scaled by 32). The following magic 2703 * is equivalent to the smoothing algorithm in rfc793 with 2704 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2705 * point). 2706 */ 2707 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2708 (tp->t_srtt >> TCP_RTT_SHIFT); 2709 if ((tp->t_srtt += delta) <= 0) 2710 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2711 /* 2712 * We accumulate a smoothed rtt variance (actually, a 2713 * smoothed mean difference), then set the retransmit 2714 * timer to smoothed rtt + 4 times the smoothed variance. 2715 * rttvar is stored as fixed point with 4 bits after the 2716 * binary point (scaled by 16). The following is 2717 * equivalent to rfc793 smoothing with an alpha of .75 2718 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2719 * rfc793's wired-in beta. 2720 */ 2721 if (delta < 0) 2722 delta = -delta; 2723 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2724 if ((tp->t_rttvar += delta) <= 0) 2725 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2726 } else { 2727 /* 2728 * No rtt measurement yet - use the unsmoothed rtt. 2729 * Set the variance to half the rtt (so our first 2730 * retransmit happens at 3*rtt). 2731 */ 2732 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2733 tp->t_rttvar = (rtt + 1) << 2734 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2735 } 2736 tp->t_rtttime = 0; 2737 tp->t_rxtshift = 0; 2738 2739 /* 2740 * the retransmit should happen at rtt + 4 * rttvar. 2741 * Because of the way we do the smoothing, srtt and rttvar 2742 * will each average +1/2 tick of bias. When we compute 2743 * the retransmit timer, we want 1/2 tick of rounding and 2744 * 1 extra tick because of +-1/2 tick uncertainty in the 2745 * firing of the timer. The bias will give us exactly the 2746 * 1.5 tick we need. But, because the bias is 2747 * statistical, we have to test that we don't drop below 2748 * the minimum feasible timer (which is 2 ticks). 
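	 *
	 * Worked example of the smoothing above, with an assumed
	 * sample (TCP_RTT_BASE_SHIFT is 2 and TCP_RTT_SHIFT is 3, per
	 * the fixed point layout described there): if t_srtt holds
	 * 3200 (i.e. 100, scaled by 32) and a sample of 200 arrives,
	 * delta = (200 << 2) - (3200 >> 3) = 800 - 400 = 400, so
	 * t_srtt becomes 3600, i.e. 112.5: the old estimate plus one
	 * eighth of the difference, as rfc793 prescribes.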
2749 */ 2750 rttmin = min(max(tp->t_rttmin, rtt + 2 * (TCP_TIME(1) / hz)), 2751 TCPTV_REXMTMAX); 2752 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2753 2754 /* 2755 * We received an ack for a packet that wasn't retransmitted; 2756 * it is probably safe to discard any error indications we've 2757 * received recently. This isn't quite right, but close enough 2758 * for now (a route might have failed after we sent a segment, 2759 * and the return path might not be symmetrical). 2760 */ 2761 tp->t_softerror = 0; 2762 } 2763 2764 /* 2765 * Determine a reasonable value for maxseg size. 2766 * If the route is known, check route for mtu. 2767 * If none, use an mss that can be handled on the outgoing 2768 * interface without forcing IP to fragment; if bigger than 2769 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2770 * to utilize large mbufs. If no route is found, route has no mtu, 2771 * or the destination isn't local, use a default, hopefully conservative 2772 * size (usually 512 or the default IP max size, but no more than the mtu 2773 * of the interface), as we can't discover anything about intervening 2774 * gateways or networks. We also initialize the congestion/slow start 2775 * window to be a single segment if the destination isn't local. 2776 * While looking at the routing entry, we also initialize other path-dependent 2777 * parameters from pre-set or cached values in the routing entry. 2778 * 2779 * Also take into account the space needed for options that we 2780 * send regularly. Make maxseg shorter by that amount to assure 2781 * that we can send maxseg amount of data even when the options 2782 * are present. Store the upper limit of the length of options plus 2783 * data in maxopd. 2784 * 2785 * NOTE: offer == -1 indicates that the maxseg size changed due to 2786 * Path MTU discovery. 2787 */ 2788 int 2789 tcp_mss(struct tcpcb *tp, int offer) 2790 { 2791 struct rtentry *rt; 2792 struct ifnet *ifp = NULL; 2793 int mss, mssopt; 2794 int iphlen; 2795 struct inpcb *inp; 2796 2797 inp = tp->t_inpcb; 2798 2799 mssopt = mss = tcp_mssdflt; 2800 2801 rt = in_pcbrtentry(inp); 2802 2803 if (rt == NULL) 2804 goto out; 2805 2806 ifp = if_get(rt->rt_ifidx); 2807 if (ifp == NULL) 2808 goto out; 2809 2810 switch (tp->pf) { 2811 #ifdef INET6 2812 case AF_INET6: 2813 iphlen = sizeof(struct ip6_hdr); 2814 break; 2815 #endif 2816 case AF_INET: 2817 iphlen = sizeof(struct ip); 2818 break; 2819 default: 2820 /* the family does not support path MTU discovery */ 2821 goto out; 2822 } 2823 2824 /* 2825 * if there's an mtu associated with the route and we support 2826 * path MTU discovery for the underlying protocol family, use it. 2827 */ 2828 if (rt->rt_mtu) { 2829 /* 2830 * One may wish to lower MSS to take into account options, 2831 * especially security-related options. 2832 */ 2833 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2834 /* 2835 * RFC2460 section 5, last paragraph: if path MTU is 2836 * smaller than 1280, use 1280 as packet size and 2837 * attach fragment header. 
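			 * That works out to 1280 - 40 (IPv6 header) -
			 * 8 (fragment header) - 20 (TCP header) = 1212
			 * bytes of payload per segment, which is what
			 * the computation below yields.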
2838 */ 2839 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2840 sizeof(struct tcphdr); 2841 } else { 2842 mss = rt->rt_mtu - iphlen - 2843 sizeof(struct tcphdr); 2844 } 2845 } else if (ifp->if_flags & IFF_LOOPBACK) { 2846 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2847 } else if (tp->pf == AF_INET) { 2848 if (ip_mtudisc) 2849 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2850 } 2851 #ifdef INET6 2852 else if (tp->pf == AF_INET6) { 2853 /* 2854 * for IPv6, path MTU discovery is always turned on, 2855 * or the node must use packet size <= 1280. 2856 */ 2857 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2858 } 2859 #endif /* INET6 */ 2860 2861 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2862 if (offer != -1) { 2863 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2864 mssopt = max(tcp_mssdflt, mssopt); 2865 } 2866 out: 2867 if_put(ifp); 2868 /* 2869 * The current mss, t_maxseg, is initialized to the default value. 2870 * If we compute a smaller value, reduce the current mss. 2871 * If we compute a larger value, return it for use in sending 2872 * a max seg size option, but don't store it for use 2873 * unless we received an offer at least that large from peer. 2874 * 2875 * However, do not accept offers lower than the minimum of 2876 * the interface MTU and 216. 2877 */ 2878 if (offer > 0) 2879 tp->t_peermss = offer; 2880 if (tp->t_peermss) 2881 mss = min(mss, max(tp->t_peermss, 216)); 2882 2883 /* sanity - at least max opt. space */ 2884 mss = max(mss, 64); 2885 2886 /* 2887 * maxopd stores the maximum length of data AND options 2888 * in a segment; maxseg is the amount of data in a normal 2889 * segment. We need to store this value (maxopd) apart 2890 * from maxseg, because now every segment carries options 2891 * and thus we normally have somewhat less data in segments. 2892 */ 2893 tp->t_maxopd = mss; 2894 2895 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2896 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2897 mss -= TCPOLEN_TSTAMP_APPA; 2898 #ifdef TCP_SIGNATURE 2899 if (tp->t_flags & TF_SIGNATURE) 2900 mss -= TCPOLEN_SIGLEN; 2901 #endif 2902 2903 if (offer == -1) { 2904 /* mss changed due to Path MTU discovery */ 2905 tp->t_flags &= ~TF_PMTUD_PEND; 2906 tp->t_pmtud_mtu_sent = 0; 2907 tp->t_pmtud_mss_acked = 0; 2908 if (mss < tp->t_maxseg) { 2909 /* 2910 * Follow suggestion in RFC 2414 to reduce the 2911 * congestion window by the ratio of the old 2912 * segment size to the new segment size. 2913 */ 2914 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2915 mss, mss); 2916 } 2917 } else if (tcp_do_rfc3390 == 2) { 2918 /* increase initial window */ 2919 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2920 } else if (tcp_do_rfc3390) { 2921 /* increase initial window */ 2922 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2923 } else 2924 tp->snd_cwnd = mss; 2925 2926 tp->t_maxseg = mss; 2927 2928 return (offer != -1 ? 
mssopt : mss); 2929 } 2930 2931 u_int 2932 tcp_hdrsz(struct tcpcb *tp) 2933 { 2934 u_int hlen; 2935 2936 switch (tp->pf) { 2937 #ifdef INET6 2938 case AF_INET6: 2939 hlen = sizeof(struct ip6_hdr); 2940 break; 2941 #endif 2942 case AF_INET: 2943 hlen = sizeof(struct ip); 2944 break; 2945 default: 2946 hlen = 0; 2947 break; 2948 } 2949 hlen += sizeof(struct tcphdr); 2950 2951 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2952 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2953 hlen += TCPOLEN_TSTAMP_APPA; 2954 #ifdef TCP_SIGNATURE 2955 if (tp->t_flags & TF_SIGNATURE) 2956 hlen += TCPOLEN_SIGLEN; 2957 #endif 2958 return (hlen); 2959 } 2960 2961 /* 2962 * Set connection variables based on the effective MSS. 2963 * We are passed the TCPCB for the actual connection. If we 2964 * are the server, we are called by the compressed state engine 2965 * when the 3-way handshake is complete. If we are the client, 2966 * we are called when we receive the SYN,ACK from the server. 2967 * 2968 * NOTE: The t_maxseg value must be initialized in the TCPCB 2969 * before this routine is called! 2970 */ 2971 void 2972 tcp_mss_update(struct tcpcb *tp) 2973 { 2974 int mss; 2975 u_long bufsize; 2976 struct rtentry *rt; 2977 struct socket *so; 2978 2979 so = tp->t_inpcb->inp_socket; 2980 mss = tp->t_maxseg; 2981 2982 rt = in_pcbrtentry(tp->t_inpcb); 2983 2984 if (rt == NULL) 2985 return; 2986 2987 bufsize = so->so_snd.sb_hiwat; 2988 if (bufsize < mss) { 2989 mss = bufsize; 2990 /* Update t_maxseg and t_maxopd */ 2991 tcp_mss(tp, mss); 2992 } else { 2993 bufsize = roundup(bufsize, mss); 2994 if (bufsize > sb_max) 2995 bufsize = sb_max; 2996 (void)sbreserve(so, &so->so_snd, bufsize); 2997 } 2998 2999 bufsize = so->so_rcv.sb_hiwat; 3000 if (bufsize > mss) { 3001 bufsize = roundup(bufsize, mss); 3002 if (bufsize > sb_max) 3003 bufsize = sb_max; 3004 (void)sbreserve(so, &so->so_rcv, bufsize); 3005 } 3006 3007 } 3008 3009 /* 3010 * When a partial ack arrives, force the retransmission of the 3011 * next unacknowledged segment. Do not clear tp->t_dupacks. 3012 * By setting snd_nxt to ti_ack, this forces retransmission timer 3013 * to be started again. 3014 */ 3015 void 3016 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 3017 { 3018 /* 3019 * snd_una has not been updated and the socket send buffer 3020 * not yet drained of the acked data, so we have to leave 3021 * snd_una as it was to get the correct data offset in 3022 * tcp_output(). 3023 */ 3024 tcp_seq onxt = tp->snd_nxt; 3025 u_long ocwnd = tp->snd_cwnd; 3026 3027 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3028 tp->t_rtttime = 0; 3029 tp->snd_nxt = th->th_ack; 3030 /* 3031 * Set snd_cwnd to one segment beyond acknowledged offset 3032 * (tp->snd_una not yet updated when this function is called) 3033 */ 3034 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3035 (void)tcp_output(tp); 3036 tp->snd_cwnd = ocwnd; 3037 if (SEQ_GT(onxt, tp->snd_nxt)) 3038 tp->snd_nxt = onxt; 3039 /* 3040 * Partial window deflation. Relies on fact that tp->snd_una 3041 * not updated yet. 
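	 *
	 * With the same assumed numbers as the sack variant above
	 * (snd_cwnd = 10000, t_maxseg = 1000, 3000 bytes newly acked),
	 * the deflation below leaves snd_cwnd at 10000 - 3000 + 1000 =
	 * 8000; unlike the sack variant, only one segment's worth is
	 * added back here.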
3042 */ 3043 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3044 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3045 else 3046 tp->snd_cwnd = 0; 3047 tp->snd_cwnd += tp->t_maxseg; 3048 } 3049 3050 int 3051 tcp_mss_adv(struct mbuf *m, int af) 3052 { 3053 int mss = 0; 3054 int iphlen; 3055 struct ifnet *ifp = NULL; 3056 3057 if (m && (m->m_flags & M_PKTHDR)) 3058 ifp = if_get(m->m_pkthdr.ph_ifidx); 3059 3060 switch (af) { 3061 case AF_INET: 3062 if (ifp != NULL) 3063 mss = ifp->if_mtu; 3064 iphlen = sizeof(struct ip); 3065 break; 3066 #ifdef INET6 3067 case AF_INET6: 3068 if (ifp != NULL) 3069 mss = ifp->if_mtu; 3070 iphlen = sizeof(struct ip6_hdr); 3071 break; 3072 #endif 3073 default: 3074 unhandled_af(af); 3075 } 3076 if_put(ifp); 3077 mss = mss - iphlen - sizeof(struct tcphdr); 3078 return (max(mss, tcp_mssdflt)); 3079 } 3080 3081 /* 3082 * TCP compressed state engine. Currently used to hold compressed 3083 * state for SYN_RECEIVED. 3084 */ 3085 3086 /* 3087 * Locks used to protect global data and struct members: 3088 * N net lock 3089 * S syn_cache_mtx tcp syn cache global mutex 3090 */ 3091 3092 /* syn hash parameters */ 3093 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; /* [N] size of hash table */ 3094 int tcp_syn_cache_limit = /* [N] global entry limit */ 3095 TCP_SYN_HASH_SIZE * TCP_SYN_BUCKET_SIZE; 3096 int tcp_syn_bucket_limit = /* [N] per bucket limit */ 3097 3 * TCP_SYN_BUCKET_SIZE; 3098 int tcp_syn_use_limit = 100000; /* [N] reseed after uses */ 3099 3100 struct pool syn_cache_pool; 3101 struct syn_cache_set tcp_syn_cache[2]; 3102 int tcp_syn_cache_active; 3103 struct mutex syn_cache_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); 3104 3105 #define SYN_HASH(sa, sp, dp, rand) \ 3106 (((sa)->s_addr ^ (rand)[0]) * \ 3107 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3108 #ifndef INET6 3109 #define SYN_HASHALL(hash, src, dst, rand) \ 3110 do { \ 3111 hash = SYN_HASH(&satosin_const(src)->sin_addr, \ 3112 satosin_const(src)->sin_port, \ 3113 satosin_const(dst)->sin_port, (rand)); \ 3114 } while (/*CONSTCOND*/ 0) 3115 #else 3116 #define SYN_HASH6(sa, sp, dp, rand) \ 3117 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3118 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3119 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3120 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3121 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3122 3123 #define SYN_HASHALL(hash, src, dst, rand) \ 3124 do { \ 3125 switch ((src)->sa_family) { \ 3126 case AF_INET: \ 3127 hash = SYN_HASH(&satosin_const(src)->sin_addr, \ 3128 satosin_const(src)->sin_port, \ 3129 satosin_const(dst)->sin_port, (rand)); \ 3130 break; \ 3131 case AF_INET6: \ 3132 hash = SYN_HASH6(&satosin6_const(src)->sin6_addr, \ 3133 satosin6_const(src)->sin6_port, \ 3134 satosin6_const(dst)->sin6_port, (rand)); \ 3135 break; \ 3136 default: \ 3137 hash = 0; \ 3138 } \ 3139 } while (/*CONSTCOND*/0) 3140 #endif /* INET6 */ 3141 3142 void 3143 syn_cache_rm(struct syn_cache *sc) 3144 { 3145 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3146 3147 KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD)); 3148 SET(sc->sc_dynflags, SCF_DEAD); 3149 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3150 sc->sc_tp = NULL; 3151 LIST_REMOVE(sc, sc_tpq); 3152 refcnt_rele(&sc->sc_refcnt); 3153 sc->sc_buckethead->sch_length--; 3154 if (timeout_del(&sc->sc_timer)) 3155 refcnt_rele(&sc->sc_refcnt); 3156 sc->sc_set->scs_count--; 3157 } 3158 3159 void 3160 syn_cache_put(struct syn_cache *sc) 3161 { 3162 if (refcnt_rele(&sc->sc_refcnt) == 0) 3163 return; 3164 3165 /* Dealing with last reference, no lock 
needed. */ 3166 m_free(sc->sc_ipopts); 3167 rtfree(sc->sc_route.ro_rt); 3168 3169 pool_put(&syn_cache_pool, sc); 3170 } 3171 3172 void 3173 syn_cache_init(void) 3174 { 3175 int i; 3176 3177 /* Initialize the hash buckets. */ 3178 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3179 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3180 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3181 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3182 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3183 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3184 for (i = 0; i < tcp_syn_hash_size; i++) { 3185 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3186 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3187 } 3188 3189 /* Initialize the syn cache pool. */ 3190 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3191 0, "syncache", NULL); 3192 } 3193 3194 void 3195 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3196 { 3197 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active]; 3198 struct syn_cache_head *scp; 3199 struct syn_cache *sc2; 3200 int i; 3201 3202 NET_ASSERT_LOCKED(); 3203 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3204 3205 /* 3206 * If there are no entries in the hash table, reinitialize 3207 * the hash secrets. To avoid useless cache swaps and 3208 * reinitialization, use it until the limit is reached. 3209 * An empty cache is also the opportunity to resize the hash. 3210 */ 3211 if (set->scs_count == 0 && set->scs_use <= 0) { 3212 set->scs_use = tcp_syn_use_limit; 3213 if (set->scs_size != tcp_syn_hash_size) { 3214 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3215 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3216 if (scp == NULL) { 3217 /* Try again next time. */ 3218 set->scs_use = 0; 3219 } else { 3220 free(set->scs_buckethead, M_SYNCACHE, 3221 set->scs_size * 3222 sizeof(struct syn_cache_head)); 3223 set->scs_buckethead = scp; 3224 set->scs_size = tcp_syn_hash_size; 3225 for (i = 0; i < tcp_syn_hash_size; i++) 3226 TAILQ_INIT(&scp[i].sch_bucket); 3227 } 3228 } 3229 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3230 tcpstat_inc(tcps_sc_seedrandom); 3231 } 3232 3233 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3234 set->scs_random); 3235 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3236 sc->sc_buckethead = scp; 3237 3238 /* 3239 * Make sure that we don't overflow the per-bucket 3240 * limit or the total cache size limit. 3241 */ 3242 if (scp->sch_length >= tcp_syn_bucket_limit) { 3243 tcpstat_inc(tcps_sc_bucketoverflow); 3244 /* 3245 * Someone might attack our bucket hash function. Reseed 3246 * with random as soon as the passive syn cache gets empty. 3247 */ 3248 set->scs_use = 0; 3249 /* 3250 * The bucket is full. Toss the oldest element in the 3251 * bucket. This will be the first entry in the bucket. 3252 */ 3253 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3254 #ifdef DIAGNOSTIC 3255 /* 3256 * This should never happen; we should always find an 3257 * entry in our bucket. 3258 */ 3259 if (sc2 == NULL) 3260 panic("%s: bucketoverflow: impossible", __func__); 3261 #endif 3262 syn_cache_rm(sc2); 3263 syn_cache_put(sc2); 3264 } else if (set->scs_count >= tcp_syn_cache_limit) { 3265 struct syn_cache_head *scp2, *sce; 3266 3267 tcpstat_inc(tcps_sc_overflowed); 3268 /* 3269 * The cache is full. Toss the oldest entry in the 3270 * first non-empty bucket we can find. 
3271 * 3272 * XXX We would really like to toss the oldest 3273 * entry in the cache, but we hope that this 3274 * condition doesn't happen very often. 3275 */ 3276 scp2 = scp; 3277 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3278 sce = &set->scs_buckethead[set->scs_size]; 3279 for (++scp2; scp2 != scp; scp2++) { 3280 if (scp2 >= sce) 3281 scp2 = &set->scs_buckethead[0]; 3282 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3283 break; 3284 } 3285 #ifdef DIAGNOSTIC 3286 /* 3287 * This should never happen; we should always find a 3288 * non-empty bucket. 3289 */ 3290 if (scp2 == scp) 3291 panic("%s: cacheoverflow: impossible", 3292 __func__); 3293 #endif 3294 } 3295 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3296 syn_cache_rm(sc2); 3297 syn_cache_put(sc2); 3298 } 3299 3300 /* 3301 * Initialize the entry's timer. We don't estimate RTT 3302 * with SYNs, so each packet starts with the default RTT 3303 * and each timer step has a fixed timeout value. 3304 */ 3305 sc->sc_rxttot = 0; 3306 sc->sc_rxtshift = 0; 3307 TCPT_RANGESET(sc->sc_rxtcur, 3308 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 3309 TCPTV_REXMTMAX); 3310 if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) 3311 refcnt_take(&sc->sc_refcnt); 3312 3313 /* Link it from tcpcb entry */ 3314 refcnt_take(&sc->sc_refcnt); 3315 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3316 3317 /* Put it into the bucket. */ 3318 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3319 scp->sch_length++; 3320 sc->sc_set = set; 3321 set->scs_count++; 3322 set->scs_use--; 3323 3324 tcpstat_inc(tcps_sc_added); 3325 3326 /* 3327 * If the active cache has exceeded its use limit and 3328 * the passive syn cache is empty, exchange their roles. 3329 */ 3330 if (set->scs_use <= 0 && 3331 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3332 tcp_syn_cache_active = !tcp_syn_cache_active; 3333 } 3334 3335 /* 3336 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3337 * If we have retransmitted an entry the maximum number of times, expire 3338 * that entry. 3339 */ 3340 void 3341 syn_cache_timer(void *arg) 3342 { 3343 struct syn_cache *sc = arg; 3344 uint64_t now; 3345 int lastref; 3346 3347 mtx_enter(&syn_cache_mtx); 3348 if (ISSET(sc->sc_dynflags, SCF_DEAD)) 3349 goto freeit; 3350 3351 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3352 /* Drop it -- too many retransmissions. */ 3353 goto dropit; 3354 } 3355 3356 /* 3357 * Compute the total amount of time this entry has 3358 * been on a queue. If this entry has been on longer 3359 * than the keep alive timer would allow, expire it. 3360 */ 3361 sc->sc_rxttot += sc->sc_rxtcur; 3362 if (sc->sc_rxttot >= READ_ONCE(tcptv_keep_init)) 3363 goto dropit; 3364 3365 /* Advance the timer back-off. */ 3366 sc->sc_rxtshift++; 3367 TCPT_RANGESET(sc->sc_rxtcur, 3368 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 3369 TCPTV_REXMTMAX); 3370 if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) 3371 refcnt_take(&sc->sc_refcnt); 3372 mtx_leave(&syn_cache_mtx); 3373 3374 NET_LOCK(); 3375 now = tcp_now(); 3376 (void) syn_cache_respond(sc, NULL, now); 3377 tcpstat_inc(tcps_sc_retransmitted); 3378 NET_UNLOCK(); 3379 3380 syn_cache_put(sc); 3381 return; 3382 3383 dropit: 3384 tcpstat_inc(tcps_sc_timed_out); 3385 syn_cache_rm(sc); 3386 /* Decrement reference of the timer and free object after remove. 
	 */
3387 	lastref = refcnt_rele(&sc->sc_refcnt);
3388 	KASSERT(lastref == 0);
3389 	(void)lastref;
3390 freeit:
3391 	mtx_leave(&syn_cache_mtx);
3392 	syn_cache_put(sc);
3393 }
3394 
3395 /*
3396  * Remove the syn cache entries created by the specified tcb entry,
3397  * since it makes no sense to keep them around
3398  * (if there's no tcb entry, a syn cache entry will never be used).
3399  */
3400 void
3401 syn_cache_cleanup(struct tcpcb *tp)
3402 {
3403 	struct syn_cache *sc, *nsc;
3404 
3405 	NET_ASSERT_LOCKED();
3406 
3407 	mtx_enter(&syn_cache_mtx);
3408 	LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
3409 #ifdef DIAGNOSTIC
3410 		if (sc->sc_tp != tp)
3411 			panic("invalid sc_tp in syn_cache_cleanup");
3412 #endif
3413 		syn_cache_rm(sc);
3414 		syn_cache_put(sc);
3415 	}
3416 	mtx_leave(&syn_cache_mtx);
3417 
3418 	KASSERT(LIST_EMPTY(&tp->t_sc));
3419 }
3420 
3421 /*
3422  * Find an entry in the syn cache.
3423  */
3424 struct syn_cache *
3425 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
3426     struct syn_cache_head **headp, u_int rtableid)
3427 {
3428 	struct syn_cache_set *sets[2];
3429 	struct syn_cache *sc;
3430 	struct syn_cache_head *scp;
3431 	u_int32_t hash;
3432 	int i;
3433 
3434 	NET_ASSERT_LOCKED();
3435 	MUTEX_ASSERT_LOCKED(&syn_cache_mtx);
3436 
3437 	/* Check the active cache first, the passive cache is likely empty. */
3438 	sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
3439 	sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
3440 	for (i = 0; i < 2; i++) {
3441 		if (sets[i]->scs_count == 0)
3442 			continue;
3443 		SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
3444 		scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
3445 		*headp = scp;
3446 		TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
3447 			if (sc->sc_hash != hash)
3448 				continue;
3449 			if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3450 			    !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3451 			    rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
3452 				return (sc);
3453 		}
3454 	}
3455 	return (NULL);
3456 }
3457 
3458 /*
3459  * This function gets called when we receive an ACK for a
3460  * socket in the LISTEN state.  We look up the connection
3461  * in the syn cache, and if it's there, we pull it out of
3462  * the cache and turn it into a full-blown connection in
3463  * the SYN-RECEIVED state.
3464  *
3465  * The return values may not be immediately obvious, and their effects
3466  * can be subtle, so here they are:
3467  *
3468  *	NULL	SYN was not found in cache; caller should drop the
3469  *		packet and send an RST.
3470  *
3471  *	-1	We were unable to create the new connection, and are
3472  *		aborting it.  An ACK,RST is being sent to the peer
3473  *		(unless we got screwy sequence numbers; see below),
3474  *		because the 3-way handshake has been completed.  Caller
3475  *		should not free the mbuf, since we may be using it.  If
3476  *		we are not, we will free it.
3477  *
3478  *	Otherwise, the return value is a pointer to the new socket
3479  *	associated with the connection.
3480
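 *
 * A caller therefore dispatches on the result roughly as follows
 * (a sketch only; the real handling lives in tcp_input()):
 *
 *	so = syn_cache_get(src, dst, th, hlen, tlen, so, m, now);
 *	if (so == NULL) {
 *		...drop the packet and send an RST...
 *	} else if (so == (struct socket *)(-1)) {
 *		...done; the mbuf has been consumed or freed...
 *	} else {
 *		...continue input processing on the new socket...
 *	}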
3480 */ 3481 struct socket * 3482 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3483 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint64_t now) 3484 { 3485 struct syn_cache *sc; 3486 struct syn_cache_head *scp; 3487 struct inpcb *inp, *oldinp; 3488 struct tcpcb *tp = NULL; 3489 struct mbuf *am; 3490 struct socket *oso; 3491 u_int rtableid; 3492 3493 NET_ASSERT_LOCKED(); 3494 3495 mtx_enter(&syn_cache_mtx); 3496 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3497 if (sc == NULL) { 3498 mtx_leave(&syn_cache_mtx); 3499 return (NULL); 3500 } 3501 3502 /* 3503 * Verify the sequence and ack numbers. Try getting the correct 3504 * response again. 3505 */ 3506 if ((th->th_ack != sc->sc_iss + 1) || 3507 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3508 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3509 refcnt_take(&sc->sc_refcnt); 3510 mtx_leave(&syn_cache_mtx); 3511 (void) syn_cache_respond(sc, m, now); 3512 syn_cache_put(sc); 3513 return ((struct socket *)(-1)); 3514 } 3515 3516 /* Remove this cache entry */ 3517 syn_cache_rm(sc); 3518 mtx_leave(&syn_cache_mtx); 3519 3520 /* 3521 * Ok, create the full blown connection, and set things up 3522 * as they would have been set up if we had created the 3523 * connection when the SYN arrived. If we can't create 3524 * the connection, abort it. 3525 */ 3526 oso = so; 3527 so = sonewconn(so, SS_ISCONNECTED, M_DONTWAIT); 3528 if (so == NULL) 3529 goto resetandabort; 3530 3531 oldinp = sotoinpcb(oso); 3532 inp = sotoinpcb(so); 3533 3534 #ifdef IPSEC 3535 /* 3536 * We need to copy the required security levels 3537 * from the old pcb. Ditto for any other 3538 * IPsec-related information. 3539 */ 3540 inp->inp_seclevel = oldinp->inp_seclevel; 3541 #endif /* IPSEC */ 3542 #ifdef INET6 3543 if (ISSET(inp->inp_flags, INP_IPV6)) { 3544 KASSERT(ISSET(oldinp->inp_flags, INP_IPV6)); 3545 3546 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3547 inp->inp_hops = oldinp->inp_hops; 3548 } else 3549 #endif 3550 { 3551 KASSERT(!ISSET(oldinp->inp_flags, INP_IPV6)); 3552 3553 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3554 inp->inp_options = ip_srcroute(m); 3555 if (inp->inp_options == NULL) { 3556 inp->inp_options = sc->sc_ipopts; 3557 sc->sc_ipopts = NULL; 3558 } 3559 } 3560 3561 /* inherit rtable from listening socket */ 3562 rtableid = sc->sc_rtableid; 3563 #if NPF > 0 3564 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3565 struct pf_divert *divert; 3566 3567 divert = pf_find_divert(m); 3568 KASSERT(divert != NULL); 3569 rtableid = divert->rdomain; 3570 } 3571 #endif 3572 in_pcbset_laddr(inp, dst, rtableid); 3573 3574 /* 3575 * Give the new socket our cached route reference. 
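	 * The struct assignment below copies the whole cached route,
	 * including its rt reference; sc_route.ro_rt is then cleared
	 * so the reference is handed over rather than duplicated.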
3576 */ 3577 inp->inp_route = sc->sc_route; /* struct assignment */ 3578 sc->sc_route.ro_rt = NULL; 3579 3580 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3581 if (am == NULL) 3582 goto resetandabort; 3583 am->m_len = src->sa_len; 3584 memcpy(mtod(am, caddr_t), src, src->sa_len); 3585 if (in_pcbconnect(inp, am)) { 3586 (void) m_free(am); 3587 goto resetandabort; 3588 } 3589 (void) m_free(am); 3590 3591 tp = intotcpcb(inp); 3592 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3593 if (sc->sc_request_r_scale != 15) { 3594 tp->requested_s_scale = sc->sc_requested_s_scale; 3595 tp->request_r_scale = sc->sc_request_r_scale; 3596 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3597 } 3598 if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) 3599 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3600 3601 tp->t_template = tcp_template(tp); 3602 if (tp->t_template == 0) { 3603 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3604 so = NULL; 3605 goto abort; 3606 } 3607 tp->sack_enable = ISSET(sc->sc_fixflags, SCF_SACK_PERMIT); 3608 tp->ts_modulate = sc->sc_modulate; 3609 tp->ts_recent = sc->sc_timestamp; 3610 tp->iss = sc->sc_iss; 3611 tp->irs = sc->sc_irs; 3612 tcp_sendseqinit(tp); 3613 tp->snd_last = tp->snd_una; 3614 #ifdef TCP_ECN 3615 if (ISSET(sc->sc_fixflags, SCF_ECN_PERMIT)) { 3616 tp->t_flags |= TF_ECN_PERMIT; 3617 tcpstat_inc(tcps_ecn_accepts); 3618 } 3619 #endif 3620 if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) 3621 tp->t_flags |= TF_SACK_PERMIT; 3622 #ifdef TCP_SIGNATURE 3623 if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) 3624 tp->t_flags |= TF_SIGNATURE; 3625 #endif 3626 tcp_rcvseqinit(tp); 3627 tp->t_state = TCPS_SYN_RECEIVED; 3628 tp->t_rcvtime = now; 3629 tp->t_sndtime = now; 3630 tp->t_rcvacktime = now; 3631 tp->t_sndacktime = now; 3632 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3633 tcpstat_inc(tcps_accepts); 3634 3635 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3636 if (sc->sc_peermaxseg) 3637 tcp_mss_update(tp); 3638 /* Reset initial window to 1 segment for retransmit */ 3639 if (READ_ONCE(sc->sc_rxtshift) > 0) 3640 tp->snd_cwnd = tp->t_maxseg; 3641 tp->snd_wl1 = sc->sc_irs; 3642 tp->rcv_up = sc->sc_irs + 1; 3643 3644 /* 3645 * This is what would have happened in tcp_output() when 3646 * the SYN,ACK was sent. 3647 */ 3648 tp->snd_up = tp->snd_una; 3649 tp->snd_max = tp->snd_nxt = tp->iss+1; 3650 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3651 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3652 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3653 tp->last_ack_sent = tp->rcv_nxt; 3654 3655 tcpstat_inc(tcps_sc_completed); 3656 syn_cache_put(sc); 3657 return (so); 3658 3659 resetandabort: 3660 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3661 m->m_pkthdr.ph_rtableid, now); 3662 abort: 3663 m_freem(m); 3664 if (so != NULL) 3665 soabort(so); 3666 syn_cache_put(sc); 3667 tcpstat_inc(tcps_sc_aborted); 3668 return ((struct socket *)(-1)); 3669 } 3670 3671 /* 3672 * This function is called when we get a RST for a 3673 * non-existent connection, so that we can see if the 3674 * connection is in the syn cache. If it is, zap it. 
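 *
 * Only a RST whose sequence number lies in [sc_irs, sc_irs + 1] is
 * honoured here; anything outside that range is ignored as out of
 * window (and possibly spoofed).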
3675 */ 3676 3677 void 3678 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3679 u_int rtableid) 3680 { 3681 struct syn_cache *sc; 3682 struct syn_cache_head *scp; 3683 3684 NET_ASSERT_LOCKED(); 3685 3686 mtx_enter(&syn_cache_mtx); 3687 sc = syn_cache_lookup(src, dst, &scp, rtableid); 3688 if (sc == NULL) { 3689 mtx_leave(&syn_cache_mtx); 3690 return; 3691 } 3692 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3693 SEQ_GT(th->th_seq, sc->sc_irs + 1)) { 3694 mtx_leave(&syn_cache_mtx); 3695 return; 3696 } 3697 syn_cache_rm(sc); 3698 mtx_leave(&syn_cache_mtx); 3699 tcpstat_inc(tcps_sc_reset); 3700 syn_cache_put(sc); 3701 } 3702 3703 void 3704 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst, 3705 struct tcphdr *th, u_int rtableid) 3706 { 3707 struct syn_cache *sc; 3708 struct syn_cache_head *scp; 3709 3710 NET_ASSERT_LOCKED(); 3711 3712 mtx_enter(&syn_cache_mtx); 3713 sc = syn_cache_lookup(src, dst, &scp, rtableid); 3714 if (sc == NULL) { 3715 mtx_leave(&syn_cache_mtx); 3716 return; 3717 } 3718 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3719 if (ntohl (th->th_seq) != sc->sc_iss) { 3720 mtx_leave(&syn_cache_mtx); 3721 return; 3722 } 3723 3724 /* 3725 * If we've retransmitted 3 times and this is our second error, 3726 * we remove the entry. Otherwise, we allow it to continue on. 3727 * This prevents us from incorrectly nuking an entry during a 3728 * spurious network outage. 3729 * 3730 * See tcp_notify(). 3731 */ 3732 if (!ISSET(sc->sc_dynflags, SCF_UNREACH) || sc->sc_rxtshift < 3) { 3733 SET(sc->sc_dynflags, SCF_UNREACH); 3734 mtx_leave(&syn_cache_mtx); 3735 return; 3736 } 3737 3738 syn_cache_rm(sc); 3739 mtx_leave(&syn_cache_mtx); 3740 tcpstat_inc(tcps_sc_unreach); 3741 syn_cache_put(sc); 3742 } 3743 3744 /* 3745 * Given a LISTEN socket and an inbound SYN request, add 3746 * this to the syn cache, and send back a segment: 3747 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3748 * to the source. 3749 * 3750 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3751 * Doing so would require that we hold onto the data and deliver it 3752 * to the application. However, if we are the target of a SYN-flood 3753 * DoS attack, an attacker could send data which would eventually 3754 * consume all available buffer space if it were ACKed. By not ACKing 3755 * the data, we avoid this DoS scenario. 3756 */ 3757 3758 int 3759 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3760 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3761 struct tcp_opt_info *oi, tcp_seq *issp, uint64_t now) 3762 { 3763 struct tcpcb tb, *tp; 3764 long win; 3765 struct syn_cache *sc; 3766 struct syn_cache_head *scp; 3767 struct mbuf *ipopts; 3768 3769 NET_ASSERT_LOCKED(); 3770 3771 tp = sototcpcb(so); 3772 3773 /* 3774 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3775 * 3776 * Note this check is performed in tcp_input() very early on. 3777 */ 3778 3779 /* 3780 * Initialize some local state. 3781 */ 3782 win = sbspace(so, &so->so_rcv); 3783 if (win > TCP_MAXWIN) 3784 win = TCP_MAXWIN; 3785 3786 bzero(&tb, sizeof(tb)); 3787 if (optp 3788 #ifdef TCP_SIGNATURE 3789 || (tp->t_flags & TF_SIGNATURE) 3790 #endif 3791 ) { 3792 tb.pf = tp->pf; 3793 tb.sack_enable = tp->sack_enable; 3794 tb.t_flags = tcp_do_rfc1323 ? 
(TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3795 #ifdef TCP_SIGNATURE 3796 if (tp->t_flags & TF_SIGNATURE) 3797 tb.t_flags |= TF_SIGNATURE; 3798 #endif 3799 tb.t_state = TCPS_LISTEN; 3800 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3801 sotoinpcb(so)->inp_rtableid, now)) 3802 return (-1); 3803 } 3804 3805 switch (src->sa_family) { 3806 case AF_INET: 3807 /* 3808 * Remember the IP options, if any. 3809 */ 3810 ipopts = ip_srcroute(m); 3811 break; 3812 default: 3813 ipopts = NULL; 3814 } 3815 3816 /* 3817 * See if we already have an entry for this connection. 3818 * If we do, resend the SYN,ACK. We do not count this 3819 * as a retransmission (XXX though maybe we should). 3820 */ 3821 mtx_enter(&syn_cache_mtx); 3822 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3823 if (sc != NULL) { 3824 refcnt_take(&sc->sc_refcnt); 3825 mtx_leave(&syn_cache_mtx); 3826 tcpstat_inc(tcps_sc_dupesyn); 3827 if (ipopts) { 3828 /* 3829 * If we were remembering a previous source route, 3830 * forget it and use the new one we've been given. 3831 */ 3832 m_free(sc->sc_ipopts); 3833 sc->sc_ipopts = ipopts; 3834 } 3835 sc->sc_timestamp = tb.ts_recent; 3836 if (syn_cache_respond(sc, m, now) == 0) { 3837 tcpstat_inc(tcps_sndacks); 3838 tcpstat_inc(tcps_sndtotal); 3839 } 3840 syn_cache_put(sc); 3841 return (0); 3842 } 3843 mtx_leave(&syn_cache_mtx); 3844 3845 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3846 if (sc == NULL) { 3847 m_free(ipopts); 3848 return (-1); 3849 } 3850 refcnt_init_trace(&sc->sc_refcnt, DT_REFCNT_IDX_SYNCACHE); 3851 timeout_set_flags(&sc->sc_timer, syn_cache_timer, sc, 3852 KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE); 3853 3854 /* 3855 * Fill in the cache, and put the necessary IP and TCP 3856 * options into the reply. 3857 */ 3858 memcpy(&sc->sc_src, src, src->sa_len); 3859 memcpy(&sc->sc_dst, dst, dst->sa_len); 3860 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3861 sc->sc_ipopts = ipopts; 3862 sc->sc_irs = th->th_seq; 3863 3864 sc->sc_iss = issp ? *issp : arc4random(); 3865 sc->sc_peermaxseg = oi->maxseg; 3866 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3867 sc->sc_win = win; 3868 sc->sc_timestamp = tb.ts_recent; 3869 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3870 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3871 SET(sc->sc_fixflags, SCF_TIMESTAMP); 3872 sc->sc_modulate = arc4random(); 3873 } 3874 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3875 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3876 sc->sc_requested_s_scale = tb.requested_s_scale; 3877 sc->sc_request_r_scale = 0; 3878 /* 3879 * Pick the smallest possible scaling factor that 3880 * will still allow us to scale up to sb_max. 3881 * 3882 * We do this because there are broken firewalls that 3883 * will corrupt the window scale option, leading to 3884 * the other endpoint believing that our advertised 3885 * window is unscaled. At scale factors larger than 3886 * 5 the unscaled window will drop below 1500 bytes, 3887 * leading to serious problems when traversing these 3888 * broken firewalls. 3889 * 3890 * With the default sbmax of 256K, a scale factor 3891 * of 3 will be chosen by this algorithm. Those who 3892 * choose a larger sbmax should watch out 3893 * for the compatibility problems mentioned above. 3894 * 3895 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3896 * or <SYN,ACK>) segment itself is never scaled. 
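		 *
		 * With the loop below and the default sb_max of 256K,
		 * the iterations run 65535 < 262144, 131070 < 262144,
		 * 262140 < 262144, then 524280 >= 262144, so
		 * sc_request_r_scale stops at 3, the factor quoted
		 * above.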
3897 */ 3898 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3899 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) 3900 sc->sc_request_r_scale++; 3901 } else { 3902 sc->sc_requested_s_scale = 15; 3903 sc->sc_request_r_scale = 15; 3904 } 3905 #ifdef TCP_ECN 3906 /* 3907 * if both ECE and CWR flag bits are set, peer is ECN capable. 3908 */ 3909 if (tcp_do_ecn && 3910 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) 3911 SET(sc->sc_fixflags, SCF_ECN_PERMIT); 3912 #endif 3913 /* 3914 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option 3915 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 3916 */ 3917 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) 3918 SET(sc->sc_fixflags, SCF_SACK_PERMIT); 3919 #ifdef TCP_SIGNATURE 3920 if (tb.t_flags & TF_SIGNATURE) 3921 SET(sc->sc_fixflags, SCF_SIGNATURE); 3922 #endif 3923 sc->sc_tp = tp; 3924 if (syn_cache_respond(sc, m, now) == 0) { 3925 mtx_enter(&syn_cache_mtx); 3926 /* 3927 * XXXSMP Currently exclusive netlock prevents another insert 3928 * after our syn_cache_lookup() and before syn_cache_insert(). 3929 * Double insert should be handled and not rely on netlock. 3930 */ 3931 syn_cache_insert(sc, tp); 3932 mtx_leave(&syn_cache_mtx); 3933 tcpstat_inc(tcps_sndacks); 3934 tcpstat_inc(tcps_sndtotal); 3935 } else { 3936 syn_cache_put(sc); 3937 tcpstat_inc(tcps_sc_dropped); 3938 } 3939 3940 return (0); 3941 } 3942 3943 int 3944 syn_cache_respond(struct syn_cache *sc, struct mbuf *m, uint64_t now) 3945 { 3946 u_int8_t *optp; 3947 int optlen, error; 3948 u_int16_t tlen; 3949 struct ip *ip = NULL; 3950 #ifdef INET6 3951 struct ip6_hdr *ip6 = NULL; 3952 #endif 3953 struct tcphdr *th; 3954 u_int hlen; 3955 struct inpcb *inp; 3956 3957 NET_ASSERT_LOCKED(); 3958 3959 switch (sc->sc_src.sa.sa_family) { 3960 case AF_INET: 3961 hlen = sizeof(struct ip); 3962 break; 3963 #ifdef INET6 3964 case AF_INET6: 3965 hlen = sizeof(struct ip6_hdr); 3966 break; 3967 #endif 3968 default: 3969 m_freem(m); 3970 return (EAFNOSUPPORT); 3971 } 3972 3973 /* Compute the size of the TCP options. */ 3974 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3975 (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT) ? 4 : 0) + 3976 #ifdef TCP_SIGNATURE 3977 (ISSET(sc->sc_fixflags, SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + 3978 #endif 3979 (ISSET(sc->sc_fixflags, SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3980 3981 tlen = hlen + sizeof(struct tcphdr) + optlen; 3982 3983 /* 3984 * Create the IP+TCP header from scratch. 3985 */ 3986 m_freem(m); 3987 #ifdef DIAGNOSTIC 3988 if (max_linkhdr + tlen > MCLBYTES) 3989 return (ENOBUFS); 3990 #endif 3991 MGETHDR(m, M_DONTWAIT, MT_DATA); 3992 if (m && max_linkhdr + tlen > MHLEN) { 3993 MCLGET(m, M_DONTWAIT); 3994 if ((m->m_flags & M_EXT) == 0) { 3995 m_freem(m); 3996 m = NULL; 3997 } 3998 } 3999 if (m == NULL) 4000 return (ENOBUFS); 4001 4002 /* Fixup the mbuf. 
	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && ISSET(sc->sc_fixflags, SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(now + sc->sc_modulate);
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}
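/*
 * Illustrative sketch (not part of the original file, compiled out):
 * the 12-byte timestamp block built above is two NOPs followed by the
 * 10-byte option, per appendix A of RFC 1323; TSval carries the local
 * clock offset by the random sc_modulate, TSecr echoes the peer's
 * timestamp.  put_tstamp_opt() is a hypothetical standalone helper; it
 * uses memcpy instead of the aligned 32-bit stores the kernel can
 * afford here, and the macro values are assumed from <netinet/tcp.h>.
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define TCPOPT_NOP		1
#define TCPOPT_TIMESTAMP	8
#define TCPOLEN_TIMESTAMP	10
#define TCPOPT_TSTAMP_HDR \
	(TCPOPT_NOP<<24 | TCPOPT_NOP<<16 | \
	 TCPOPT_TIMESTAMP<<8 | TCPOLEN_TIMESTAMP)

static void
put_tstamp_opt(uint8_t *optp, uint32_t tsval, uint32_t tsecr)
{
	uint32_t w;

	w = htonl(TCPOPT_TSTAMP_HDR);
	memcpy(optp, &w, 4);
	w = htonl(tsval);		/* here: now + sc->sc_modulate */
	memcpy(optp + 4, &w, 4);
	w = htonl(tsecr);		/* here: sc->sc_timestamp */
	memcpy(optp + 8, &w, 4);
}

int
main(void)
{
	uint8_t opt[12];

	put_tstamp_opt(opt, 12345, 67890);
	/* prints "1 1 8 10": NOP, NOP, kind, length */
	printf("%u %u %u %u\n", opt[0], opt[1], opt[2], opt[3]);
	return (0);
}
#endif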
#ifdef TCP_SIGNATURE
	if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			tdb_unref(tdb);
			return (EINVAL);
		}
		tdb_unref(tdb);
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	mtx_enter(&syn_cache_mtx);
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
	mtx_leave(&syn_cache_mtx);

	/*
	 * Fill in some straggling IP bits.  Note that ip_len is set in
	 * network byte order, as ip_output() expects.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;

		error = ip_output(m, sc->sc_ipopts, &sc->sc_route,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL,
		    inp ? &inp->inp_seclevel : NULL, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		/* ip6_plen will be updated in ip6_output() */
		ip6->ip6_hlim = in6_selecthlim(inp);
		/* leave flowlabel = 0, it is legal and requires no state mgmt */

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route, 0,
		    NULL, inp ? &inp->inp_seclevel : NULL);
		break;
#endif
	}
	return (error);
}
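/*
 * Illustrative sketch (not part of the original file, compiled out):
 * the TCP-MD5 signature option written above (RFC 2385) is kind 19,
 * length 18, carrying a 16-byte MD5 digest; the NOP/EOL pair pads it
 * to 20 bytes, which is assumed to match TCPOLEN_SIGLEN reserved in
 * the optlen computation.  put_signature_opt() is a hypothetical
 * helper showing only the layout; the digest itself is computed over
 * the pseudo header, TCP header, and payload, as tcp_signature() does.
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define TCPOPT_EOL		0
#define TCPOPT_NOP		1
#define TCPOPT_SIGNATURE	19
#define TCPOLEN_SIGNATURE	18	/* kind + len + 16-byte digest */

static size_t
put_signature_opt(uint8_t *optp, const uint8_t digest[16])
{
	uint8_t *p = optp;

	*p++ = TCPOPT_SIGNATURE;
	*p++ = TCPOLEN_SIGNATURE;
	memcpy(p, digest, 16);
	p += 16;
	*p++ = TCPOPT_NOP;	/* pad to the next 32-bit boundary */
	*p++ = TCPOPT_EOL;	/* and terminate the option list */
	return (p - optp);
}

int
main(void)
{
	uint8_t opt[20], digest[16] = { 0 };

	printf("%zu\n", put_signature_opt(opt, digest));	/* 20 */
	return (0);
}
#endif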