/*	$OpenBSD: tcp_input.c,v 1.373 2021/12/01 12:51:09 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
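
/*
 * The modulo comparison, by example: with 32-bit timestamps,
 * TSTMP_GEQ(1, 0xfffffffe) is true because (int)(1 - 0xfffffffe) == 3,
 * i.e. a timestamp that has just wrapped still compares as "newer"
 * than one taken shortly before the wrap.
 */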

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_TIMER_ARM_MSEC(tp, TCPT_DELACK, tcp_delack_msecs); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *);
void	 syn_cache_timer(void *);
void	 syn_cache_reaper(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *);
struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
		struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;
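
	/*
	 * Overlap arithmetic below, by example: if the preceding entry
	 * covers [100, 150) (th_seq 100, th_reseqlen 50) and the new
	 * segment starts at 120, then i = 100 + 50 - 120 = 30 bytes of
	 * the new segment duplicate data already queued, so 30 bytes
	 * are trimmed from its front (or the whole segment is dropped
	 * when i >= *tlen).
	 */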

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}
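
/*
 * Note: both tcp_reass() and tcp_flush_queue() return TH_FIN once the
 * in-order data delivered to the socket includes a segment carrying
 * FIN, and 0 otherwise.  The caller folds that value back into its
 * tiflags so FIN processing happens only when the FIN's sequence
 * number has actually been reached.
 */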

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *otp = NULL, *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate;
	caddr_t saveti;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
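
	/*
	 * The fast check above matches the RFC 1323 appendix A layout:
	 * TCPOPT_TSTAMP_HDR is the 4-byte prefix NOP, NOP, kind 8
	 * (timestamp), length 10, so the 12-byte (TCPOLEN_TSTAMP_APPA)
	 * option block carries the 4-byte TSval at offset 4 and the
	 * 4-byte TSecr at offset 8.
	 */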

	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
	}
#ifdef IPSEC
	if (ipsec_in_use) {
		struct m_tag *mtag;
		struct tdb *tdb = NULL;
		int error;

		/* Find most recent IPsec tag */
		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
		if (mtag != NULL) {
			struct tdb_ident *tdbi;

			tdbi = (struct tdb_ident *)(mtag + 1);
			tdb = gettdb(tdbi->rdomain, tdbi->spi,
			    &tdbi->dst, tdbi->proto);
		}
		error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN,
		    tdb, inp, NULL, 0);
		tdb_unref(tdb);
		if (error) {
			tcpstat_inc(tcps_rcvnosec);
			goto drop;
		}
	}
#endif /* IPSEC */

	if (inp == NULL) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;
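
	/*
	 * Window scaling, by example: with snd_scale 7, an advertised
	 * th_win of 1024 yields tiwin = 1024 << 7 = 131072 bytes.  The
	 * shift is skipped on SYN segments because RFC 1323 defines the
	 * window field in a segment carrying SYN as never scaled.
	 */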

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		}

		if (so->so_options & SO_DEBUG) {
			otp = tp;
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				saveti = (caddr_t) &tcp_saveti6;
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				saveti = (caddr_t) &tcp_saveti;
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for peer to send a RST, and
				 * RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it reads like
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th);	/* Delete stale SACK holes */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
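
	/*
	 * The two low-order bits of the TOS/traffic class byte carry the
	 * RFC3168 ECN codepoint: 00 not-ECT, 01 ECT(1), 10 ECT(0), and
	 * 11 CE, which a router sets in place of dropping the packet.
	 */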

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				ND6_HINT(tp);
				sbdrop(so, &so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/* Pull snd_wl2 up to prevent seq wrap. */
				tp->snd_wl2 = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			/* Pull snd_wl1 and rcv_up up to prevent seq wrap. */
			tp->snd_wl1 = th->th_seq;
			/* Packet has most recent segment, no urgent exists. */
			tp->rcv_up = tp->rcv_nxt;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(so, &so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEG_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}
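
	/*
	 * PAWS by example: if ts_recent is 5000 and a segment arrives
	 * carrying ts_val 4990, TSTMP_LT(4990, 5000) is true and the
	 * segment is treated as an old duplicate.  The 24-day cutoff
	 * (TCP_PAWS_IDLE) guards against a stale ts_recent after the
	 * peer's timestamp clock has wrapped or been reset.
	 */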

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}
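
	/*
	 * The sequence check above only honors an RST whose th_seq is
	 * exactly last_ack_sent, rcv_nxt, or rcv_nxt + 1; anywhere else
	 * in the window it is silently dropped.  This narrows the target
	 * an off-path attacker must hit for a blind reset.
	 */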

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
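
		/*
		 * Fast-retransmit math above, by example: with maxseg 1460
		 * and min(snd_wnd, snd_cwnd) = 29200 (20 segments), the
		 * third dup ACK sets ssthresh to 10 segments; cwnd is then
		 * inflated by one maxseg per additional dup ACK until the
		 * hole is filled.
		 */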

		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = max(incr * incr / cw, 1);
		if (tp->t_dupacks < tcprexmtthresh)
			tp->snd_cwnd = ulmin(cw + incr,
			    TCP_MAXWIN << tp->snd_scale);
		}
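
		/*
		 * Congestion avoidance above, by example: with maxseg 1460
		 * and cwnd 14600 (10 segments), each ACK adds
		 * 1460 * 1460 / 14600 = 146 bytes, so a full window of
		 * ACKs grows cwnd by roughly one maxseg per RTT.
		 */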
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			if (tp->snd_wnd > so->so_snd.sb_cc)
				tp->snd_wnd -= so->so_snd.sb_cc;
			else
				tp->snd_wnd = 0;
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(so, &so->so_snd, acked);
			if (tp->snd_wnd > acked)
				tp->snd_wnd -= acked;
			else
				tp->snd_wnd = 0;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}
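
	/*
	 * snd_wl1 and snd_wl2 remember the seq and ack of the segment
	 * that last updated the send window, so the test above accepts
	 * a new window only from a segment at least as recent, keeping
	 * reordered old segments from shrinking snd_wnd.
	 */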

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;		/* XXX */
			tiflags &= ~TH_URG;	/* XXX */
			goto dodata;		/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so doesn't get presented to user.
		 * This can happen independent of advancing the URG pointer,
		 * but if two URG's are pending at once, some out-of-band
		 * data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}
Ignore a FIN received before 1974 * the connection is fully established. 1975 */ 1976 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1977 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1978 tp->t_flags |= TF_BLOCKOUTPUT; 1979 socantrcvmore(so); 1980 tp->t_flags &= ~TF_BLOCKOUTPUT; 1981 tp->t_flags |= TF_ACKNOW; 1982 tp->rcv_nxt++; 1983 } 1984 switch (tp->t_state) { 1985 1986 /* 1987 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1988 */ 1989 case TCPS_ESTABLISHED: 1990 tp->t_state = TCPS_CLOSE_WAIT; 1991 break; 1992 1993 /* 1994 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1995 * enter the CLOSING state. 1996 */ 1997 case TCPS_FIN_WAIT_1: 1998 tp->t_state = TCPS_CLOSING; 1999 break; 2000 2001 /* 2002 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2003 * starting the time-wait timer, turning off the other 2004 * standard timers. 2005 */ 2006 case TCPS_FIN_WAIT_2: 2007 tp->t_state = TCPS_TIME_WAIT; 2008 tcp_canceltimers(tp); 2009 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2010 tp->t_flags |= TF_BLOCKOUTPUT; 2011 soisdisconnected(so); 2012 tp->t_flags &= ~TF_BLOCKOUTPUT; 2013 break; 2014 2015 /* 2016 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2017 */ 2018 case TCPS_TIME_WAIT: 2019 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2020 break; 2021 } 2022 } 2023 if (otp) 2024 tcp_trace(TA_INPUT, ostate, tp, otp, saveti, 0, tlen); 2025 2026 /* 2027 * Return any desired output. 2028 */ 2029 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2030 (void) tcp_output(tp); 2031 return IPPROTO_DONE; 2032 2033 badsyn: 2034 /* 2035 * Received a bad SYN. Increment counters and dropwithreset. 2036 */ 2037 tcpstat_inc(tcps_badsyn); 2038 tp = NULL; 2039 goto dropwithreset; 2040 2041 dropafterack_ratelim: 2042 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2043 tcp_ackdrop_ppslim) == 0) { 2044 /* XXX stat */ 2045 goto drop; 2046 } 2047 /* ...fall into dropafterack... */ 2048 2049 dropafterack: 2050 /* 2051 * Generate an ACK dropping incoming segment if it occupies 2052 * sequence space, where the ACK reflects our state. 2053 */ 2054 if (tiflags & TH_RST) 2055 goto drop; 2056 m_freem(m); 2057 tp->t_flags |= TF_ACKNOW; 2058 (void) tcp_output(tp); 2059 return IPPROTO_DONE; 2060 2061 dropwithreset_ratelim: 2062 /* 2063 * We may want to rate-limit RSTs in certain situations, 2064 * particularly if we are sending an RST in response to 2065 * an attempt to connect to or otherwise communicate with 2066 * a port for which we have no socket. 2067 */ 2068 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2069 tcp_rst_ppslim) == 0) { 2070 /* XXX stat */ 2071 goto drop; 2072 } 2073 /* ...fall into dropwithreset... */ 2074 2075 dropwithreset: 2076 /* 2077 * Generate a RST, dropping incoming segment. 2078 * Make ACK acceptable to originator of segment. 2079 * Don't bother to respond to RST. 2080 */ 2081 if (tiflags & TH_RST) 2082 goto drop; 2083 if (tiflags & TH_ACK) { 2084 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2085 TH_RST, m->m_pkthdr.ph_rtableid); 2086 } else { 2087 if (tiflags & TH_SYN) 2088 tlen++; 2089 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2090 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2091 } 2092 m_freem(m); 2093 return IPPROTO_DONE; 2094 2095 drop: 2096 /* 2097 * Drop space held by incoming segment and return. 
2098 */ 2099 if (otp) 2100 tcp_trace(TA_DROP, ostate, tp, otp, saveti, 0, tlen); 2101 2102 m_freem(m); 2103 return IPPROTO_DONE; 2104 } 2105 2106 int 2107 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2108 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2109 u_int rtableid) 2110 { 2111 u_int16_t mss = 0; 2112 int opt, optlen; 2113 #ifdef TCP_SIGNATURE 2114 caddr_t sigp = NULL; 2115 struct tdb *tdb = NULL; 2116 #endif /* TCP_SIGNATURE */ 2117 2118 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2119 opt = cp[0]; 2120 if (opt == TCPOPT_EOL) 2121 break; 2122 if (opt == TCPOPT_NOP) 2123 optlen = 1; 2124 else { 2125 if (cnt < 2) 2126 break; 2127 optlen = cp[1]; 2128 if (optlen < 2 || optlen > cnt) 2129 break; 2130 } 2131 switch (opt) { 2132 2133 default: 2134 continue; 2135 2136 case TCPOPT_MAXSEG: 2137 if (optlen != TCPOLEN_MAXSEG) 2138 continue; 2139 if (!(th->th_flags & TH_SYN)) 2140 continue; 2141 if (TCPS_HAVERCVDSYN(tp->t_state)) 2142 continue; 2143 memcpy(&mss, cp + 2, sizeof(mss)); 2144 mss = ntohs(mss); 2145 oi->maxseg = mss; 2146 break; 2147 2148 case TCPOPT_WINDOW: 2149 if (optlen != TCPOLEN_WINDOW) 2150 continue; 2151 if (!(th->th_flags & TH_SYN)) 2152 continue; 2153 if (TCPS_HAVERCVDSYN(tp->t_state)) 2154 continue; 2155 tp->t_flags |= TF_RCVD_SCALE; 2156 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2157 break; 2158 2159 case TCPOPT_TIMESTAMP: 2160 if (optlen != TCPOLEN_TIMESTAMP) 2161 continue; 2162 oi->ts_present = 1; 2163 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2164 oi->ts_val = ntohl(oi->ts_val); 2165 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2166 oi->ts_ecr = ntohl(oi->ts_ecr); 2167 2168 if (!(th->th_flags & TH_SYN)) 2169 continue; 2170 if (TCPS_HAVERCVDSYN(tp->t_state)) 2171 continue; 2172 /* 2173 * A timestamp received in a SYN makes 2174 * it ok to send timestamp requests and replies. 
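 * (Sketch of the wrap-safe compare later applied to these values:
 * TSTMP_GEQ() subtracts in 32-bit space and tests the sign, so
 * TSTMP_GEQ(0x00000002, 0xfffffffe) is true because
 * (int)(0x00000002 - 0xfffffffe) == 4 > 0; the comparison thus
 * survives tcp_now wrapping around.)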
2175 */ 2176 tp->t_flags |= TF_RCVD_TSTMP; 2177 tp->ts_recent = oi->ts_val; 2178 tp->ts_recent_age = tcp_now; 2179 break; 2180 2181 case TCPOPT_SACK_PERMITTED: 2182 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2183 continue; 2184 if (!(th->th_flags & TH_SYN)) 2185 continue; 2186 if (TCPS_HAVERCVDSYN(tp->t_state)) 2187 continue; 2188 /* MUST only be set on SYN */ 2189 tp->t_flags |= TF_SACK_PERMIT; 2190 break; 2191 case TCPOPT_SACK: 2192 tcp_sack_option(tp, th, cp, optlen); 2193 break; 2194 #ifdef TCP_SIGNATURE 2195 case TCPOPT_SIGNATURE: 2196 if (optlen != TCPOLEN_SIGNATURE) 2197 continue; 2198 2199 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2200 goto bad; 2201 2202 sigp = cp + 2; 2203 break; 2204 #endif /* TCP_SIGNATURE */ 2205 } 2206 } 2207 2208 #ifdef TCP_SIGNATURE 2209 if (tp->t_flags & TF_SIGNATURE) { 2210 union sockaddr_union src, dst; 2211 2212 memset(&src, 0, sizeof(union sockaddr_union)); 2213 memset(&dst, 0, sizeof(union sockaddr_union)); 2214 2215 switch (tp->pf) { 2216 case 0: 2217 case AF_INET: 2218 src.sa.sa_len = sizeof(struct sockaddr_in); 2219 src.sa.sa_family = AF_INET; 2220 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2221 dst.sa.sa_len = sizeof(struct sockaddr_in); 2222 dst.sa.sa_family = AF_INET; 2223 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2224 break; 2225 #ifdef INET6 2226 case AF_INET6: 2227 src.sa.sa_len = sizeof(struct sockaddr_in6); 2228 src.sa.sa_family = AF_INET6; 2229 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2230 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2231 dst.sa.sa_family = AF_INET6; 2232 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2233 break; 2234 #endif /* INET6 */ 2235 } 2236 2237 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2238 0, &src, &dst, IPPROTO_TCP); 2239 2240 /* 2241 * We don't have an SA for this peer, so we turn off 2242 * TF_SIGNATURE on the listen socket 2243 */ 2244 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2245 tp->t_flags &= ~TF_SIGNATURE; 2246 2247 } 2248 2249 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2250 tcpstat_inc(tcps_rcvbadsig); 2251 goto bad; 2252 } 2253 2254 if (sigp) { 2255 char sig[16]; 2256 2257 if (tdb == NULL) { 2258 tcpstat_inc(tcps_rcvbadsig); 2259 goto bad; 2260 } 2261 2262 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2263 goto bad; 2264 2265 if (timingsafe_bcmp(sig, sigp, 16)) { 2266 tcpstat_inc(tcps_rcvbadsig); 2267 goto bad; 2268 } 2269 2270 tcpstat_inc(tcps_rcvgoodsig); 2271 } 2272 2273 tdb_unref(tdb); 2274 #endif /* TCP_SIGNATURE */ 2275 2276 return (0); 2277 2278 #ifdef TCP_SIGNATURE 2279 bad: 2280 tdb_unref(tdb); 2281 #endif /* TCP_SIGNATURE */ 2282 return (-1); 2283 } 2284 2285 u_long 2286 tcp_seq_subtract(u_long a, u_long b) 2287 { 2288 return ((long)(a - b)); 2289 } 2290 2291 /* 2292 * This function is called upon receipt of new valid data (while not in header 2293 * prediction mode), and it updates the ordered list of sacks. 2294 */ 2295 void 2296 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2297 tcp_seq rcv_lastend) 2298 { 2299 /* 2300 * First reported block MUST be the most recent one. Subsequent 2301 * blocks SHOULD be in the order in which they arrived at the 2302 * receiver. These two conditions make the implementation fully 2303 * compliant with RFC 2018. 
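 * Example (made-up sequence numbers): with rcv_nxt = 500, if
 * [1000,1500) arrives while [2000,3000) is already being reported,
 * the new block becomes sackblks[0] and the older block shifts
 * down, so the receiver reports [1000,1500) first, then
 * [2000,3000).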
2304 */ 2305 int i, j = 0, count = 0, lastpos = -1; 2306 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2307 2308 /* First clean up current list of sacks */ 2309 for (i = 0; i < tp->rcv_numsacks; i++) { 2310 sack = tp->sackblks[i]; 2311 if (sack.start == 0 && sack.end == 0) { 2312 count++; /* count = number of blocks to be discarded */ 2313 continue; 2314 } 2315 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2316 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2317 count++; 2318 } else { 2319 temp[j].start = tp->sackblks[i].start; 2320 temp[j++].end = tp->sackblks[i].end; 2321 } 2322 } 2323 tp->rcv_numsacks -= count; 2324 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2325 tcp_clean_sackreport(tp); 2326 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2327 /* ==> need first sack block */ 2328 tp->sackblks[0].start = rcv_laststart; 2329 tp->sackblks[0].end = rcv_lastend; 2330 tp->rcv_numsacks = 1; 2331 } 2332 return; 2333 } 2334 /* Otherwise, sack blocks are already present. */ 2335 for (i = 0; i < tp->rcv_numsacks; i++) 2336 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2337 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2338 return; /* sack list remains unchanged */ 2339 /* 2340 * From here, segment just received should be (part of) the 1st sack. 2341 * Go through list, possibly coalescing sack block entries. 2342 */ 2343 firstsack.start = rcv_laststart; 2344 firstsack.end = rcv_lastend; 2345 for (i = 0; i < tp->rcv_numsacks; i++) { 2346 sack = tp->sackblks[i]; 2347 if (SEQ_LT(sack.end, firstsack.start) || 2348 SEQ_GT(sack.start, firstsack.end)) 2349 continue; /* no overlap */ 2350 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2351 /* 2352 * identical block; delete it here since we will 2353 * move it to the front of the list. 2354 */ 2355 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2356 lastpos = i; /* last posn with a zero entry */ 2357 continue; 2358 } 2359 if (SEQ_LEQ(sack.start, firstsack.start)) 2360 firstsack.start = sack.start; /* merge blocks */ 2361 if (SEQ_GEQ(sack.end, firstsack.end)) 2362 firstsack.end = sack.end; /* merge blocks */ 2363 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2364 lastpos = i; /* last posn with a zero entry */ 2365 } 2366 if (lastpos != -1) { /* at least one merge */ 2367 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2368 sack = tp->sackblks[i]; 2369 if (sack.start == 0 && sack.end == 0) 2370 continue; 2371 temp[j++] = sack; 2372 } 2373 tp->rcv_numsacks = j; /* including first blk (added later) */ 2374 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2375 tp->sackblks[i] = temp[i]; 2376 } else { /* no merges -- shift sacks by 1 */ 2377 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2378 tp->rcv_numsacks++; 2379 for (i = tp->rcv_numsacks-1; i > 0; i--) 2380 tp->sackblks[i] = tp->sackblks[i-1]; 2381 } 2382 tp->sackblks[0] = firstsack; 2383 return; 2384 } 2385 2386 /* 2387 * Process the TCP SACK option. tp->snd_holes is an ordered list 2388 * of holes (oldest to newest, in terms of the sequence space). 2389 */ 2390 void 2391 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2392 { 2393 int tmp_olen; 2394 u_char *tmp_cp; 2395 struct sackhole *cur, *p, *temp; 2396 2397 if (!tp->sack_enable) 2398 return; 2399 /* SACK without ACK doesn't make sense. */ 2400 if ((th->th_flags & TH_ACK) == 0) 2401 return; 2402 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
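 * (SEQ_LT()/SEQ_GT() use the same sign-of-difference trick as the
 * timestamp macros, so this window test is wrap-safe; an ACK
 * outside [snd_una, snd_max] carries no usable SACK information.)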
*/ 2403 if (SEQ_LT(th->th_ack, tp->snd_una) || 2404 SEQ_GT(th->th_ack, tp->snd_max)) 2405 return; 2406 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2407 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2408 return; 2409 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2410 tmp_cp = cp + 2; 2411 tmp_olen = optlen - 2; 2412 tcpstat_inc(tcps_sack_rcv_opts); 2413 if (tp->snd_numholes < 0) 2414 tp->snd_numholes = 0; 2415 if (tp->t_maxseg == 0) 2416 panic("tcp_sack_option"); /* Should never happen */ 2417 while (tmp_olen > 0) { 2418 struct sackblk sack; 2419 2420 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2421 sack.start = ntohl(sack.start); 2422 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2423 sack.end = ntohl(sack.end); 2424 tmp_olen -= TCPOLEN_SACK; 2425 tmp_cp += TCPOLEN_SACK; 2426 if (SEQ_LEQ(sack.end, sack.start)) 2427 continue; /* bad SACK fields */ 2428 if (SEQ_LEQ(sack.end, tp->snd_una)) 2429 continue; /* old block */ 2430 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2431 if (SEQ_LT(sack.start, th->th_ack)) 2432 continue; 2433 } 2434 if (SEQ_GT(sack.end, tp->snd_max)) 2435 continue; 2436 if (tp->snd_holes == NULL) { /* first hole */ 2437 tp->snd_holes = (struct sackhole *) 2438 pool_get(&sackhl_pool, PR_NOWAIT); 2439 if (tp->snd_holes == NULL) { 2440 /* ENOBUFS, so ignore SACKed block for now */ 2441 goto dropped; 2442 } 2443 cur = tp->snd_holes; 2444 cur->start = th->th_ack; 2445 cur->end = sack.start; 2446 cur->rxmit = cur->start; 2447 cur->next = NULL; 2448 tp->snd_numholes = 1; 2449 tp->rcv_lastsack = sack.end; 2450 /* 2451 * dups is at least one. If more data has been 2452 * SACKed, it can be greater than one. 2453 */ 2454 cur->dups = min(tcprexmtthresh, 2455 ((sack.end - cur->end)/tp->t_maxseg)); 2456 if (cur->dups < 1) 2457 cur->dups = 1; 2458 continue; /* with next sack block */ 2459 } 2460 /* Go thru list of holes: p = previous, cur = current */ 2461 p = cur = tp->snd_holes; 2462 while (cur) { 2463 if (SEQ_LEQ(sack.end, cur->start)) 2464 /* SACKs data before the current hole */ 2465 break; /* no use going through more holes */ 2466 if (SEQ_GEQ(sack.start, cur->end)) { 2467 /* SACKs data beyond the current hole */ 2468 cur->dups++; 2469 if (((sack.end - cur->end)/tp->t_maxseg) >= 2470 tcprexmtthresh) 2471 cur->dups = tcprexmtthresh; 2472 p = cur; 2473 cur = cur->next; 2474 continue; 2475 } 2476 if (SEQ_LEQ(sack.start, cur->start)) { 2477 /* Data acks at least the beginning of hole */ 2478 if (SEQ_GEQ(sack.end, cur->end)) { 2479 /* Acks entire hole, so delete hole */ 2480 if (p != cur) { 2481 p->next = cur->next; 2482 pool_put(&sackhl_pool, cur); 2483 cur = p->next; 2484 } else { 2485 cur = cur->next; 2486 pool_put(&sackhl_pool, p); 2487 p = cur; 2488 tp->snd_holes = p; 2489 } 2490 tp->snd_numholes--; 2491 continue; 2492 } 2493 /* otherwise, move start of hole forward */ 2494 cur->start = sack.end; 2495 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2496 p = cur; 2497 cur = cur->next; 2498 continue; 2499 } 2500 /* move end of hole backward */ 2501 if (SEQ_GEQ(sack.end, cur->end)) { 2502 cur->end = sack.start; 2503 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2504 cur->dups++; 2505 if (((sack.end - cur->end)/tp->t_maxseg) >= 2506 tcprexmtthresh) 2507 cur->dups = tcprexmtthresh; 2508 p = cur; 2509 cur = cur->next; 2510 continue; 2511 } 2512 if (SEQ_LT(cur->start, sack.start) && 2513 SEQ_GT(cur->end, sack.end)) { 2514 /* 2515 * ACKs some data in middle of a hole; need to 2516 * split current hole 2517 */ 2518 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2519 
goto dropped;
2520 temp = (struct sackhole *)
2521 pool_get(&sackhl_pool, PR_NOWAIT);
2522 if (temp == NULL)
2523 goto dropped; /* ENOBUFS */
2524 temp->next = cur->next;
2525 temp->start = sack.end;
2526 temp->end = cur->end;
2527 temp->dups = cur->dups;
2528 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
2529 cur->end = sack.start;
2530 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
2531 cur->dups++;
2532 if (((sack.end - cur->end)/tp->t_maxseg) >=
2533 tcprexmtthresh)
2534 cur->dups = tcprexmtthresh;
2535 cur->next = temp;
2536 p = temp;
2537 cur = p->next;
2538 tp->snd_numholes++;
2539 }
2540 }
2541 /* At this point, p points to the last hole on the list */
2542 if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
2543 /*
2544 * Need to append new hole at end.
2545 * Last hole is p (and it's not NULL).
2546 */
2547 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT)
2548 goto dropped;
2549 temp = (struct sackhole *)
2550 pool_get(&sackhl_pool, PR_NOWAIT);
2551 if (temp == NULL)
2552 goto dropped; /* ENOBUFS */
2553 temp->start = tp->rcv_lastsack;
2554 temp->end = sack.start;
2555 temp->dups = min(tcprexmtthresh,
2556 ((sack.end - sack.start)/tp->t_maxseg));
2557 if (temp->dups < 1)
2558 temp->dups = 1;
2559 temp->rxmit = temp->start;
2560 temp->next = 0;
2561 p->next = temp;
2562 tp->rcv_lastsack = sack.end;
2563 tp->snd_numholes++;
2564 }
2565 }
2566 return;
2567 dropped:
2568 tcpstat_inc(tcps_sack_drop_opts);
2569 }
2570
2571 /*
2572 * Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if
2573 * it is completely acked; otherwise, tcp_sack_option(), called from
2574 * tcp_dooptions(), will fix up the hole.
2575 */
2576 void
2577 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
2578 {
2579 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2580 /* max because this could be an older ack just arrived */
2581 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2582 th->th_ack : tp->snd_una;
2583 struct sackhole *cur = tp->snd_holes;
2584 struct sackhole *prev;
2585 while (cur)
2586 if (SEQ_LEQ(cur->end, lastack)) {
2587 prev = cur;
2588 cur = cur->next;
2589 pool_put(&sackhl_pool, prev);
2590 tp->snd_numholes--;
2591 } else if (SEQ_LT(cur->start, lastack)) {
2592 cur->start = lastack;
2593 if (SEQ_LT(cur->rxmit, cur->start))
2594 cur->rxmit = cur->start;
2595 break;
2596 } else
2597 break;
2598 tp->snd_holes = cur;
2599 }
2600 }
2601
2602 /*
2603 * Delete all receiver-side SACK information.
2604 */
2605 void
2606 tcp_clean_sackreport(struct tcpcb *tp)
2607 {
2608 int i;
2609
2610 tp->rcv_numsacks = 0;
2611 for (i = 0; i < MAX_SACK_BLKS; i++)
2612 tp->sackblks[i].start = tp->sackblks[i].end = 0;
2613
2614 }
2615
2616 /*
2617 * Partial ack handling within a sack recovery episode. When a partial ack
2618 * arrives, turn off the retransmission timer and deflate the window, but
2619 * do not clear tp->t_dupacks.
2620 */
2621 void
2622 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
2623 {
2624 /* Turn off retx. timer (will start again next segment) */
2625 TCP_TIMER_DISARM(tp, TCPT_REXMT);
2626 tp->t_rtttime = 0;
2627 /*
2628 * Partial window deflation. This statement relies on the
2629 * fact that tp->snd_una has not been updated yet.
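 * In the common case the lines below work out to
 * snd_cwnd' = snd_cwnd - (th_ack - snd_una) + 2 * t_maxseg;
 * e.g. (made-up numbers) 14600 - 5840 + 2920 = 11680 with a
 * 1460-byte t_maxseg.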
2630 */ 2631 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2632 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2633 tp->snd_cwnd += tp->t_maxseg; 2634 } else 2635 tp->snd_cwnd = tp->t_maxseg; 2636 tp->snd_cwnd += tp->t_maxseg; 2637 tp->t_flags |= TF_NEEDOUTPUT; 2638 } 2639 2640 /* 2641 * Pull out of band byte out of a segment so 2642 * it doesn't appear in the user's data queue. 2643 * It is still reflected in the segment length for 2644 * sequencing purposes. 2645 */ 2646 void 2647 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2648 { 2649 int cnt = off + urgent - 1; 2650 2651 while (cnt >= 0) { 2652 if (m->m_len > cnt) { 2653 char *cp = mtod(m, caddr_t) + cnt; 2654 struct tcpcb *tp = sototcpcb(so); 2655 2656 tp->t_iobc = *cp; 2657 tp->t_oobflags |= TCPOOB_HAVEDATA; 2658 memmove(cp, cp + 1, m->m_len - cnt - 1); 2659 m->m_len--; 2660 return; 2661 } 2662 cnt -= m->m_len; 2663 m = m->m_next; 2664 if (m == NULL) 2665 break; 2666 } 2667 panic("tcp_pulloutofband"); 2668 } 2669 2670 /* 2671 * Collect new round-trip time estimate 2672 * and update averages and current timeout. 2673 */ 2674 void 2675 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2676 { 2677 short delta; 2678 short rttmin; 2679 2680 if (rtt < 0) 2681 rtt = 0; 2682 else if (rtt > TCP_RTT_MAX) 2683 rtt = TCP_RTT_MAX; 2684 2685 tcpstat_inc(tcps_rttupdated); 2686 if (tp->t_srtt != 0) { 2687 /* 2688 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2689 * after the binary point (scaled by 4), whereas 2690 * srtt is stored as fixed point with 5 bits after the 2691 * binary point (i.e., scaled by 32). The following magic 2692 * is equivalent to the smoothing algorithm in rfc793 with 2693 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2694 * point). 2695 */ 2696 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2697 (tp->t_srtt >> TCP_RTT_SHIFT); 2698 if ((tp->t_srtt += delta) <= 0) 2699 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2700 /* 2701 * We accumulate a smoothed rtt variance (actually, a 2702 * smoothed mean difference), then set the retransmit 2703 * timer to smoothed rtt + 4 times the smoothed variance. 2704 * rttvar is stored as fixed point with 4 bits after the 2705 * binary point (scaled by 16). The following is 2706 * equivalent to rfc793 smoothing with an alpha of .75 2707 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2708 * rfc793's wired-in beta. 2709 */ 2710 if (delta < 0) 2711 delta = -delta; 2712 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2713 if ((tp->t_rttvar += delta) <= 0) 2714 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2715 } else { 2716 /* 2717 * No rtt measurement yet - use the unsmoothed rtt. 2718 * Set the variance to half the rtt (so our first 2719 * retransmit happens at 3*rtt). 2720 */ 2721 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2722 tp->t_rttvar = (rtt + 1) << 2723 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2724 } 2725 tp->t_rtttime = 0; 2726 tp->t_rxtshift = 0; 2727 2728 /* 2729 * the retransmit should happen at rtt + 4 * rttvar. 2730 * Because of the way we do the smoothing, srtt and rttvar 2731 * will each average +1/2 tick of bias. When we compute 2732 * the retransmit timer, we want 1/2 tick of rounding and 2733 * 1 extra tick because of +-1/2 tick uncertainty in the 2734 * firing of the timer. The bias will give us exactly the 2735 * 1.5 tick we need. But, because the bias is 2736 * statistical, we have to test that we don't drop below 2737 * the minimum feasible timer (which is 2 ticks). 
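 * For the very first sample the settings above give, in unscaled
 * ticks, srtt = rtt + 1 and rttvar = (rtt + 1) / 2, so
 * srtt + 4 * rttvar = 3 * (rtt + 1); that is where the
 * "first retransmit at 3*rtt" behaviour comes from.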
2738 */ 2739 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2740 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2741 2742 /* 2743 * We received an ack for a packet that wasn't retransmitted; 2744 * it is probably safe to discard any error indications we've 2745 * received recently. This isn't quite right, but close enough 2746 * for now (a route might have failed after we sent a segment, 2747 * and the return path might not be symmetrical). 2748 */ 2749 tp->t_softerror = 0; 2750 } 2751 2752 /* 2753 * Determine a reasonable value for maxseg size. 2754 * If the route is known, check route for mtu. 2755 * If none, use an mss that can be handled on the outgoing 2756 * interface without forcing IP to fragment; if bigger than 2757 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2758 * to utilize large mbufs. If no route is found, route has no mtu, 2759 * or the destination isn't local, use a default, hopefully conservative 2760 * size (usually 512 or the default IP max size, but no more than the mtu 2761 * of the interface), as we can't discover anything about intervening 2762 * gateways or networks. We also initialize the congestion/slow start 2763 * window to be a single segment if the destination isn't local. 2764 * While looking at the routing entry, we also initialize other path-dependent 2765 * parameters from pre-set or cached values in the routing entry. 2766 * 2767 * Also take into account the space needed for options that we 2768 * send regularly. Make maxseg shorter by that amount to assure 2769 * that we can send maxseg amount of data even when the options 2770 * are present. Store the upper limit of the length of options plus 2771 * data in maxopd. 2772 * 2773 * NOTE: offer == -1 indicates that the maxseg size changed due to 2774 * Path MTU discovery. 2775 */ 2776 int 2777 tcp_mss(struct tcpcb *tp, int offer) 2778 { 2779 struct rtentry *rt; 2780 struct ifnet *ifp = NULL; 2781 int mss, mssopt; 2782 int iphlen; 2783 struct inpcb *inp; 2784 2785 inp = tp->t_inpcb; 2786 2787 mssopt = mss = tcp_mssdflt; 2788 2789 rt = in_pcbrtentry(inp); 2790 2791 if (rt == NULL) 2792 goto out; 2793 2794 ifp = if_get(rt->rt_ifidx); 2795 if (ifp == NULL) 2796 goto out; 2797 2798 switch (tp->pf) { 2799 #ifdef INET6 2800 case AF_INET6: 2801 iphlen = sizeof(struct ip6_hdr); 2802 break; 2803 #endif 2804 case AF_INET: 2805 iphlen = sizeof(struct ip); 2806 break; 2807 default: 2808 /* the family does not support path MTU discovery */ 2809 goto out; 2810 } 2811 2812 /* 2813 * if there's an mtu associated with the route and we support 2814 * path MTU discovery for the underlying protocol family, use it. 2815 */ 2816 if (rt->rt_mtu) { 2817 /* 2818 * One may wish to lower MSS to take into account options, 2819 * especially security-related options. 2820 */ 2821 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2822 /* 2823 * RFC2460 section 5, last paragraph: if path MTU is 2824 * smaller than 1280, use 1280 as packet size and 2825 * attach fragment header. 
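 * Worked example: IPV6_MMTU is 1280, so with the 40-byte
 * ip6_hdr, the 8-byte ip6_frag header and a 20-byte tcphdr the
 * computation below yields an mss of 1280 - 40 - 8 - 20 = 1212.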
2826 */ 2827 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2828 sizeof(struct tcphdr); 2829 } else { 2830 mss = rt->rt_mtu - iphlen - 2831 sizeof(struct tcphdr); 2832 } 2833 } else if (ifp->if_flags & IFF_LOOPBACK) { 2834 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2835 } else if (tp->pf == AF_INET) { 2836 if (ip_mtudisc) 2837 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2838 } 2839 #ifdef INET6 2840 else if (tp->pf == AF_INET6) { 2841 /* 2842 * for IPv6, path MTU discovery is always turned on, 2843 * or the node must use packet size <= 1280. 2844 */ 2845 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2846 } 2847 #endif /* INET6 */ 2848 2849 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2850 if (offer != -1) { 2851 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2852 mssopt = max(tcp_mssdflt, mssopt); 2853 } 2854 out: 2855 if_put(ifp); 2856 /* 2857 * The current mss, t_maxseg, is initialized to the default value. 2858 * If we compute a smaller value, reduce the current mss. 2859 * If we compute a larger value, return it for use in sending 2860 * a max seg size option, but don't store it for use 2861 * unless we received an offer at least that large from peer. 2862 * 2863 * However, do not accept offers lower than the minimum of 2864 * the interface MTU and 216. 2865 */ 2866 if (offer > 0) 2867 tp->t_peermss = offer; 2868 if (tp->t_peermss) 2869 mss = min(mss, max(tp->t_peermss, 216)); 2870 2871 /* sanity - at least max opt. space */ 2872 mss = max(mss, 64); 2873 2874 /* 2875 * maxopd stores the maximum length of data AND options 2876 * in a segment; maxseg is the amount of data in a normal 2877 * segment. We need to store this value (maxopd) apart 2878 * from maxseg, because now every segment carries options 2879 * and thus we normally have somewhat less data in segments. 2880 */ 2881 tp->t_maxopd = mss; 2882 2883 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2884 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2885 mss -= TCPOLEN_TSTAMP_APPA; 2886 #ifdef TCP_SIGNATURE 2887 if (tp->t_flags & TF_SIGNATURE) 2888 mss -= TCPOLEN_SIGLEN; 2889 #endif 2890 2891 if (offer == -1) { 2892 /* mss changed due to Path MTU discovery */ 2893 tp->t_flags &= ~TF_PMTUD_PEND; 2894 tp->t_pmtud_mtu_sent = 0; 2895 tp->t_pmtud_mss_acked = 0; 2896 if (mss < tp->t_maxseg) { 2897 /* 2898 * Follow suggestion in RFC 2414 to reduce the 2899 * congestion window by the ratio of the old 2900 * segment size to the new segment size. 2901 */ 2902 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2903 mss, mss); 2904 } 2905 } else if (tcp_do_rfc3390 == 2) { 2906 /* increase initial window */ 2907 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2908 } else if (tcp_do_rfc3390) { 2909 /* increase initial window */ 2910 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2911 } else 2912 tp->snd_cwnd = mss; 2913 2914 tp->t_maxseg = mss; 2915 2916 return (offer != -1 ? 
mssopt : mss); 2917 } 2918 2919 u_int 2920 tcp_hdrsz(struct tcpcb *tp) 2921 { 2922 u_int hlen; 2923 2924 switch (tp->pf) { 2925 #ifdef INET6 2926 case AF_INET6: 2927 hlen = sizeof(struct ip6_hdr); 2928 break; 2929 #endif 2930 case AF_INET: 2931 hlen = sizeof(struct ip); 2932 break; 2933 default: 2934 hlen = 0; 2935 break; 2936 } 2937 hlen += sizeof(struct tcphdr); 2938 2939 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2940 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2941 hlen += TCPOLEN_TSTAMP_APPA; 2942 #ifdef TCP_SIGNATURE 2943 if (tp->t_flags & TF_SIGNATURE) 2944 hlen += TCPOLEN_SIGLEN; 2945 #endif 2946 return (hlen); 2947 } 2948 2949 /* 2950 * Set connection variables based on the effective MSS. 2951 * We are passed the TCPCB for the actual connection. If we 2952 * are the server, we are called by the compressed state engine 2953 * when the 3-way handshake is complete. If we are the client, 2954 * we are called when we receive the SYN,ACK from the server. 2955 * 2956 * NOTE: The t_maxseg value must be initialized in the TCPCB 2957 * before this routine is called! 2958 */ 2959 void 2960 tcp_mss_update(struct tcpcb *tp) 2961 { 2962 int mss; 2963 u_long bufsize; 2964 struct rtentry *rt; 2965 struct socket *so; 2966 2967 so = tp->t_inpcb->inp_socket; 2968 mss = tp->t_maxseg; 2969 2970 rt = in_pcbrtentry(tp->t_inpcb); 2971 2972 if (rt == NULL) 2973 return; 2974 2975 bufsize = so->so_snd.sb_hiwat; 2976 if (bufsize < mss) { 2977 mss = bufsize; 2978 /* Update t_maxseg and t_maxopd */ 2979 tcp_mss(tp, mss); 2980 } else { 2981 bufsize = roundup(bufsize, mss); 2982 if (bufsize > sb_max) 2983 bufsize = sb_max; 2984 (void)sbreserve(so, &so->so_snd, bufsize); 2985 } 2986 2987 bufsize = so->so_rcv.sb_hiwat; 2988 if (bufsize > mss) { 2989 bufsize = roundup(bufsize, mss); 2990 if (bufsize > sb_max) 2991 bufsize = sb_max; 2992 (void)sbreserve(so, &so->so_rcv, bufsize); 2993 } 2994 2995 } 2996 2997 /* 2998 * When a partial ack arrives, force the retransmission of the 2999 * next unacknowledged segment. Do not clear tp->t_dupacks. 3000 * By setting snd_nxt to ti_ack, this forces retransmission timer 3001 * to be started again. 3002 */ 3003 void 3004 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 3005 { 3006 /* 3007 * snd_una has not been updated and the socket send buffer 3008 * not yet drained of the acked data, so we have to leave 3009 * snd_una as it was to get the correct data offset in 3010 * tcp_output(). 3011 */ 3012 tcp_seq onxt = tp->snd_nxt; 3013 u_long ocwnd = tp->snd_cwnd; 3014 3015 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3016 tp->t_rtttime = 0; 3017 tp->snd_nxt = th->th_ack; 3018 /* 3019 * Set snd_cwnd to one segment beyond acknowledged offset 3020 * (tp->snd_una not yet updated when this function is called) 3021 */ 3022 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3023 (void)tcp_output(tp); 3024 tp->snd_cwnd = ocwnd; 3025 if (SEQ_GT(onxt, tp->snd_nxt)) 3026 tp->snd_nxt = onxt; 3027 /* 3028 * Partial window deflation. Relies on fact that tp->snd_una 3029 * not updated yet. 
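 * The net effect below is
 * snd_cwnd' = snd_cwnd - (th_ack - snd_una) + t_maxseg, floored
 * at one segment; note this allows one t_maxseg less than the
 * SACK-based deflation in tcp_sack_partialack() above.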
3030 */ 3031 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3032 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3033 else 3034 tp->snd_cwnd = 0; 3035 tp->snd_cwnd += tp->t_maxseg; 3036 } 3037 3038 int 3039 tcp_mss_adv(struct mbuf *m, int af) 3040 { 3041 int mss = 0; 3042 int iphlen; 3043 struct ifnet *ifp = NULL; 3044 3045 if (m && (m->m_flags & M_PKTHDR)) 3046 ifp = if_get(m->m_pkthdr.ph_ifidx); 3047 3048 switch (af) { 3049 case AF_INET: 3050 if (ifp != NULL) 3051 mss = ifp->if_mtu; 3052 iphlen = sizeof(struct ip); 3053 break; 3054 #ifdef INET6 3055 case AF_INET6: 3056 if (ifp != NULL) 3057 mss = ifp->if_mtu; 3058 iphlen = sizeof(struct ip6_hdr); 3059 break; 3060 #endif 3061 default: 3062 unhandled_af(af); 3063 } 3064 if_put(ifp); 3065 mss = mss - iphlen - sizeof(struct tcphdr); 3066 return (max(mss, tcp_mssdflt)); 3067 } 3068 3069 /* 3070 * TCP compressed state engine. Currently used to hold compressed 3071 * state for SYN_RECEIVED. 3072 */ 3073 3074 /* syn hash parameters */ 3075 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; 3076 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3077 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3078 int tcp_syn_use_limit = 100000; 3079 3080 struct syn_cache_set tcp_syn_cache[2]; 3081 int tcp_syn_cache_active; 3082 3083 #define SYN_HASH(sa, sp, dp, rand) \ 3084 (((sa)->s_addr ^ (rand)[0]) * \ 3085 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3086 #ifndef INET6 3087 #define SYN_HASHALL(hash, src, dst, rand) \ 3088 do { \ 3089 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3090 satosin(src)->sin_port, \ 3091 satosin(dst)->sin_port, (rand)); \ 3092 } while (/*CONSTCOND*/ 0) 3093 #else 3094 #define SYN_HASH6(sa, sp, dp, rand) \ 3095 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3096 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3097 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3098 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3099 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3100 3101 #define SYN_HASHALL(hash, src, dst, rand) \ 3102 do { \ 3103 switch ((src)->sa_family) { \ 3104 case AF_INET: \ 3105 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3106 satosin(src)->sin_port, \ 3107 satosin(dst)->sin_port, (rand)); \ 3108 break; \ 3109 case AF_INET6: \ 3110 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3111 satosin6(src)->sin6_port, \ 3112 satosin6(dst)->sin6_port, (rand)); \ 3113 break; \ 3114 default: \ 3115 hash = 0; \ 3116 } \ 3117 } while (/*CONSTCOND*/0) 3118 #endif /* INET6 */ 3119 3120 void 3121 syn_cache_rm(struct syn_cache *sc) 3122 { 3123 sc->sc_flags |= SCF_DEAD; 3124 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3125 sc->sc_tp = NULL; 3126 LIST_REMOVE(sc, sc_tpq); 3127 sc->sc_buckethead->sch_length--; 3128 timeout_del(&sc->sc_timer); 3129 sc->sc_set->scs_count--; 3130 } 3131 3132 void 3133 syn_cache_put(struct syn_cache *sc) 3134 { 3135 m_free(sc->sc_ipopts); 3136 if (sc->sc_route4.ro_rt != NULL) { 3137 rtfree(sc->sc_route4.ro_rt); 3138 sc->sc_route4.ro_rt = NULL; 3139 } 3140 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3141 timeout_add(&sc->sc_timer, 0); 3142 } 3143 3144 struct pool syn_cache_pool; 3145 3146 /* 3147 * We don't estimate RTT with SYNs, so each packet starts with the default 3148 * RTT and each timer step has a fixed timeout value. 
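 * (Sketch: each step therefore waits roughly
 * TCPTV_SRTTDFLT * tcp_backoff[sc_rxtshift] slow-timer ticks,
 * clamped to [TCPTV_MIN, TCPTV_REXMTMAX] by the macro below, until
 * the total lifetime exceeds tcptv_keep_init and the entry is
 * dropped by syn_cache_timer().)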
3149 */ 3150 #define SYN_CACHE_TIMER_ARM(sc) \ 3151 do { \ 3152 TCPT_RANGESET((sc)->sc_rxtcur, \ 3153 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3154 TCPTV_REXMTMAX); \ 3155 if (!timeout_initialized(&(sc)->sc_timer)) \ 3156 timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3157 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3158 } while (/*CONSTCOND*/0) 3159 3160 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3161 3162 void 3163 syn_cache_init(void) 3164 { 3165 int i; 3166 3167 /* Initialize the hash buckets. */ 3168 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3169 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3170 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3171 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3172 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3173 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3174 for (i = 0; i < tcp_syn_hash_size; i++) { 3175 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3176 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3177 } 3178 3179 /* Initialize the syn cache pool. */ 3180 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3181 0, "syncache", NULL); 3182 } 3183 3184 void 3185 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3186 { 3187 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active]; 3188 struct syn_cache_head *scp; 3189 struct syn_cache *sc2; 3190 int i; 3191 3192 NET_ASSERT_LOCKED(); 3193 3194 /* 3195 * If there are no entries in the hash table, reinitialize 3196 * the hash secrets. To avoid useless cache swaps and 3197 * reinitialization, use it until the limit is reached. 3198 * An empty cache is also the opportunity to resize the hash. 3199 */ 3200 if (set->scs_count == 0 && set->scs_use <= 0) { 3201 set->scs_use = tcp_syn_use_limit; 3202 if (set->scs_size != tcp_syn_hash_size) { 3203 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3204 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3205 if (scp == NULL) { 3206 /* Try again next time. */ 3207 set->scs_use = 0; 3208 } else { 3209 free(set->scs_buckethead, M_SYNCACHE, 3210 set->scs_size * 3211 sizeof(struct syn_cache_head)); 3212 set->scs_buckethead = scp; 3213 set->scs_size = tcp_syn_hash_size; 3214 for (i = 0; i < tcp_syn_hash_size; i++) 3215 TAILQ_INIT(&scp[i].sch_bucket); 3216 } 3217 } 3218 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3219 tcpstat_inc(tcps_sc_seedrandom); 3220 } 3221 3222 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3223 set->scs_random); 3224 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3225 sc->sc_buckethead = scp; 3226 3227 /* 3228 * Make sure that we don't overflow the per-bucket 3229 * limit or the total cache size limit. 3230 */ 3231 if (scp->sch_length >= tcp_syn_bucket_limit) { 3232 tcpstat_inc(tcps_sc_bucketoverflow); 3233 /* 3234 * Someone might attack our bucket hash function. Reseed 3235 * with random as soon as the passive syn cache gets empty. 3236 */ 3237 set->scs_use = 0; 3238 /* 3239 * The bucket is full. Toss the oldest element in the 3240 * bucket. This will be the first entry in the bucket. 3241 */ 3242 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3243 #ifdef DIAGNOSTIC 3244 /* 3245 * This should never happen; we should always find an 3246 * entry in our bucket. 
3247 */ 3248 if (sc2 == NULL) 3249 panic("%s: bucketoverflow: impossible", __func__); 3250 #endif 3251 syn_cache_rm(sc2); 3252 syn_cache_put(sc2); 3253 } else if (set->scs_count >= tcp_syn_cache_limit) { 3254 struct syn_cache_head *scp2, *sce; 3255 3256 tcpstat_inc(tcps_sc_overflowed); 3257 /* 3258 * The cache is full. Toss the oldest entry in the 3259 * first non-empty bucket we can find. 3260 * 3261 * XXX We would really like to toss the oldest 3262 * entry in the cache, but we hope that this 3263 * condition doesn't happen very often. 3264 */ 3265 scp2 = scp; 3266 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3267 sce = &set->scs_buckethead[set->scs_size]; 3268 for (++scp2; scp2 != scp; scp2++) { 3269 if (scp2 >= sce) 3270 scp2 = &set->scs_buckethead[0]; 3271 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3272 break; 3273 } 3274 #ifdef DIAGNOSTIC 3275 /* 3276 * This should never happen; we should always find a 3277 * non-empty bucket. 3278 */ 3279 if (scp2 == scp) 3280 panic("%s: cacheoverflow: impossible", 3281 __func__); 3282 #endif 3283 } 3284 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3285 syn_cache_rm(sc2); 3286 syn_cache_put(sc2); 3287 } 3288 3289 /* 3290 * Initialize the entry's timer. 3291 */ 3292 sc->sc_rxttot = 0; 3293 sc->sc_rxtshift = 0; 3294 SYN_CACHE_TIMER_ARM(sc); 3295 3296 /* Link it from tcpcb entry */ 3297 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3298 3299 /* Put it into the bucket. */ 3300 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3301 scp->sch_length++; 3302 sc->sc_set = set; 3303 set->scs_count++; 3304 set->scs_use--; 3305 3306 tcpstat_inc(tcps_sc_added); 3307 3308 /* 3309 * If the active cache has exceeded its use limit and 3310 * the passive syn cache is empty, exchange their roles. 3311 */ 3312 if (set->scs_use <= 0 && 3313 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3314 tcp_syn_cache_active = !tcp_syn_cache_active; 3315 } 3316 3317 /* 3318 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3319 * If we have retransmitted an entry the maximum number of times, expire 3320 * that entry. 3321 */ 3322 void 3323 syn_cache_timer(void *arg) 3324 { 3325 struct syn_cache *sc = arg; 3326 3327 NET_LOCK(); 3328 if (sc->sc_flags & SCF_DEAD) 3329 goto out; 3330 3331 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3332 /* Drop it -- too many retransmissions. */ 3333 goto dropit; 3334 } 3335 3336 /* 3337 * Compute the total amount of time this entry has 3338 * been on a queue. If this entry has been on longer 3339 * than the keep alive timer would allow, expire it. 3340 */ 3341 sc->sc_rxttot += sc->sc_rxtcur; 3342 if (sc->sc_rxttot >= tcptv_keep_init) 3343 goto dropit; 3344 3345 tcpstat_inc(tcps_sc_retransmitted); 3346 (void) syn_cache_respond(sc, NULL); 3347 3348 /* Advance the timer back-off. 
*/
3349 sc->sc_rxtshift++;
3350 SYN_CACHE_TIMER_ARM(sc);
3351
3352 out:
3353 NET_UNLOCK();
3354 return;
3355
3356 dropit:
3357 tcpstat_inc(tcps_sc_timed_out);
3358 syn_cache_rm(sc);
3359 syn_cache_put(sc);
3360 NET_UNLOCK();
3361 }
3362
3363 void
3364 syn_cache_reaper(void *arg)
3365 {
3366 struct syn_cache *sc = arg;
3367
3368 pool_put(&syn_cache_pool, (sc));
3369 return;
3370 }
3371
3372 /*
3373 * Remove the syn cache entries created by the specified tcb entry,
3374 * because it does not make sense to keep them
3375 * (if there's no tcb entry, the syn cache entry will never be used).
3376 */
3377 void
3378 syn_cache_cleanup(struct tcpcb *tp)
3379 {
3380 struct syn_cache *sc, *nsc;
3381
3382 NET_ASSERT_LOCKED();
3383
3384 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
3385 #ifdef DIAGNOSTIC
3386 if (sc->sc_tp != tp)
3387 panic("invalid sc_tp in syn_cache_cleanup");
3388 #endif
3389 syn_cache_rm(sc);
3390 syn_cache_put(sc);
3391 }
3392 /* just for safety */
3393 LIST_INIT(&tp->t_sc);
3394 }
3395
3396 /*
3397 * Find an entry in the syn cache.
3398 */
3399 struct syn_cache *
3400 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3401 struct syn_cache_head **headp, u_int rtableid)
3402 {
3403 struct syn_cache_set *sets[2];
3404 struct syn_cache *sc;
3405 struct syn_cache_head *scp;
3406 u_int32_t hash;
3407 int i;
3408
3409 NET_ASSERT_LOCKED();
3410
3411 /* Check the active cache first, the passive cache is likely empty. */
3412 sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
3413 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
3414 for (i = 0; i < 2; i++) {
3415 if (sets[i]->scs_count == 0)
3416 continue;
3417 SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
3418 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
3419 *headp = scp;
3420 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
3421 if (sc->sc_hash != hash)
3422 continue;
3423 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3424 !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3425 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
3426 return (sc);
3427 }
3428 }
3429 return (NULL);
3430 }
3431
3432 /*
3433 * This function gets called when we receive an ACK for a
3434 * socket in the LISTEN state. We look up the connection
3435 * in the syn cache, and if it's there, we pull it out of
3436 * the cache and turn it into a full-blown connection in
3437 * the SYN-RECEIVED state.
3438 *
3439 * The return values may not be immediately obvious, and their effects
3440 * can be subtle, so here they are:
3441 *
3442 * NULL SYN was not found in cache; caller should drop the
3443 * packet and send an RST.
3444 *
3445 * -1 We were unable to create the new connection, and are
3446 * aborting it. An ACK,RST is being sent to the peer
3447 * (unless we got screwy sequence numbers; see below),
3448 * because the 3-way handshake has been completed. Caller
3449 * should not free the mbuf, since we may be using it. If
3450 * we are not, we will free it.
3451 *
3452 * Otherwise, the return value is a pointer to the new socket
3453 * associated with the connection.
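 * A caller therefore distinguishes three cases: NULL (drop and
 * send an RST), (struct socket *)(-1) (failure already handled
 * here, just return), and any other value (a usable socket for
 * the new connection).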
3454 */ 3455 struct socket * 3456 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3457 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m) 3458 { 3459 struct syn_cache *sc; 3460 struct syn_cache_head *scp; 3461 struct inpcb *inp, *oldinp; 3462 struct tcpcb *tp = NULL; 3463 struct mbuf *am; 3464 struct socket *oso; 3465 3466 NET_ASSERT_LOCKED(); 3467 3468 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3469 if (sc == NULL) 3470 return (NULL); 3471 3472 /* 3473 * Verify the sequence and ack numbers. Try getting the correct 3474 * response again. 3475 */ 3476 if ((th->th_ack != sc->sc_iss + 1) || 3477 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3478 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3479 (void) syn_cache_respond(sc, m); 3480 return ((struct socket *)(-1)); 3481 } 3482 3483 /* Remove this cache entry */ 3484 syn_cache_rm(sc); 3485 3486 /* 3487 * Ok, create the full blown connection, and set things up 3488 * as they would have been set up if we had created the 3489 * connection when the SYN arrived. If we can't create 3490 * the connection, abort it. 3491 */ 3492 oso = so; 3493 so = sonewconn(so, SS_ISCONNECTED); 3494 if (so == NULL) 3495 goto resetandabort; 3496 3497 oldinp = sotoinpcb(oso); 3498 inp = sotoinpcb(so); 3499 3500 #ifdef IPSEC 3501 /* 3502 * We need to copy the required security levels 3503 * from the old pcb. Ditto for any other 3504 * IPsec-related information. 3505 */ 3506 memcpy(inp->inp_seclevel, oldinp->inp_seclevel, 3507 sizeof(oldinp->inp_seclevel)); 3508 #endif /* IPSEC */ 3509 #ifdef INET6 3510 /* 3511 * inp still has the OLD in_pcb stuff, set the 3512 * v6-related flags on the new guy, too. 3513 */ 3514 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3515 if (inp->inp_flags & INP_IPV6) { 3516 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3517 inp->inp_hops = oldinp->inp_hops; 3518 } else 3519 #endif /* INET6 */ 3520 { 3521 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3522 } 3523 3524 #if NPF > 0 3525 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3526 struct pf_divert *divert; 3527 3528 divert = pf_find_divert(m); 3529 KASSERT(divert != NULL); 3530 inp->inp_rtableid = divert->rdomain; 3531 } else 3532 #endif 3533 /* inherit rtable from listening socket */ 3534 inp->inp_rtableid = sc->sc_rtableid; 3535 3536 inp->inp_lport = th->th_dport; 3537 switch (src->sa_family) { 3538 #ifdef INET6 3539 case AF_INET6: 3540 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3541 break; 3542 #endif /* INET6 */ 3543 case AF_INET: 3544 inp->inp_laddr = satosin(dst)->sin_addr; 3545 inp->inp_options = ip_srcroute(m); 3546 if (inp->inp_options == NULL) { 3547 inp->inp_options = sc->sc_ipopts; 3548 sc->sc_ipopts = NULL; 3549 } 3550 break; 3551 } 3552 in_pcbrehash(inp); 3553 3554 /* 3555 * Give the new socket our cached route reference. 
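 * The struct assignment below hands the cached rtentry reference
 * over to the inpcb; sc_route4.ro_rt is then cleared so that a
 * later syn_cache_put() cannot rtfree() a route we no longer own.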
3556 */ 3557 if (src->sa_family == AF_INET) 3558 inp->inp_route = sc->sc_route4; /* struct assignment */ 3559 #ifdef INET6 3560 else 3561 inp->inp_route6 = sc->sc_route6; 3562 #endif 3563 sc->sc_route4.ro_rt = NULL; 3564 3565 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3566 if (am == NULL) 3567 goto resetandabort; 3568 am->m_len = src->sa_len; 3569 memcpy(mtod(am, caddr_t), src, src->sa_len); 3570 if (in_pcbconnect(inp, am)) { 3571 (void) m_free(am); 3572 goto resetandabort; 3573 } 3574 (void) m_free(am); 3575 3576 tp = intotcpcb(inp); 3577 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3578 if (sc->sc_request_r_scale != 15) { 3579 tp->requested_s_scale = sc->sc_requested_s_scale; 3580 tp->request_r_scale = sc->sc_request_r_scale; 3581 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3582 } 3583 if (sc->sc_flags & SCF_TIMESTAMP) 3584 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3585 3586 tp->t_template = tcp_template(tp); 3587 if (tp->t_template == 0) { 3588 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3589 so = NULL; 3590 goto abort; 3591 } 3592 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3593 tp->ts_modulate = sc->sc_modulate; 3594 tp->ts_recent = sc->sc_timestamp; 3595 tp->iss = sc->sc_iss; 3596 tp->irs = sc->sc_irs; 3597 tcp_sendseqinit(tp); 3598 tp->snd_last = tp->snd_una; 3599 #ifdef TCP_ECN 3600 if (sc->sc_flags & SCF_ECN_PERMIT) { 3601 tp->t_flags |= TF_ECN_PERMIT; 3602 tcpstat_inc(tcps_ecn_accepts); 3603 } 3604 #endif 3605 if (sc->sc_flags & SCF_SACK_PERMIT) 3606 tp->t_flags |= TF_SACK_PERMIT; 3607 #ifdef TCP_SIGNATURE 3608 if (sc->sc_flags & SCF_SIGNATURE) 3609 tp->t_flags |= TF_SIGNATURE; 3610 #endif 3611 tcp_rcvseqinit(tp); 3612 tp->t_state = TCPS_SYN_RECEIVED; 3613 tp->t_rcvtime = tcp_now; 3614 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3615 tcpstat_inc(tcps_accepts); 3616 3617 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3618 if (sc->sc_peermaxseg) 3619 tcp_mss_update(tp); 3620 /* Reset initial window to 1 segment for retransmit */ 3621 if (sc->sc_rxtshift > 0) 3622 tp->snd_cwnd = tp->t_maxseg; 3623 tp->snd_wl1 = sc->sc_irs; 3624 tp->rcv_up = sc->sc_irs + 1; 3625 3626 /* 3627 * This is what would have happened in tcp_output() when 3628 * the SYN,ACK was sent. 3629 */ 3630 tp->snd_up = tp->snd_una; 3631 tp->snd_max = tp->snd_nxt = tp->iss+1; 3632 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3633 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3634 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3635 tp->last_ack_sent = tp->rcv_nxt; 3636 3637 tcpstat_inc(tcps_sc_completed); 3638 syn_cache_put(sc); 3639 return (so); 3640 3641 resetandabort: 3642 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3643 m->m_pkthdr.ph_rtableid); 3644 abort: 3645 m_freem(m); 3646 if (so != NULL) 3647 (void) soabort(so); 3648 syn_cache_put(sc); 3649 tcpstat_inc(tcps_sc_aborted); 3650 return ((struct socket *)(-1)); 3651 } 3652 3653 /* 3654 * This function is called when we get a RST for a 3655 * non-existent connection, so that we can see if the 3656 * connection is in the syn cache. If it is, zap it. 
3657 */ 3658 3659 void 3660 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3661 u_int rtableid) 3662 { 3663 struct syn_cache *sc; 3664 struct syn_cache_head *scp; 3665 3666 NET_ASSERT_LOCKED(); 3667 3668 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3669 return; 3670 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3671 SEQ_GT(th->th_seq, sc->sc_irs + 1)) 3672 return; 3673 syn_cache_rm(sc); 3674 tcpstat_inc(tcps_sc_reset); 3675 syn_cache_put(sc); 3676 } 3677 3678 void 3679 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3680 u_int rtableid) 3681 { 3682 struct syn_cache *sc; 3683 struct syn_cache_head *scp; 3684 3685 NET_ASSERT_LOCKED(); 3686 3687 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3688 return; 3689 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3690 if (ntohl (th->th_seq) != sc->sc_iss) { 3691 return; 3692 } 3693 3694 /* 3695 * If we've retransmitted 3 times and this is our second error, 3696 * we remove the entry. Otherwise, we allow it to continue on. 3697 * This prevents us from incorrectly nuking an entry during a 3698 * spurious network outage. 3699 * 3700 * See tcp_notify(). 3701 */ 3702 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3703 sc->sc_flags |= SCF_UNREACH; 3704 return; 3705 } 3706 3707 syn_cache_rm(sc); 3708 tcpstat_inc(tcps_sc_unreach); 3709 syn_cache_put(sc); 3710 } 3711 3712 /* 3713 * Given a LISTEN socket and an inbound SYN request, add 3714 * this to the syn cache, and send back a segment: 3715 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3716 * to the source. 3717 * 3718 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3719 * Doing so would require that we hold onto the data and deliver it 3720 * to the application. However, if we are the target of a SYN-flood 3721 * DoS attack, an attacker could send data which would eventually 3722 * consume all available buffer space if it were ACKed. By not ACKing 3723 * the data, we avoid this DoS scenario. 3724 */ 3725 3726 int 3727 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3728 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3729 struct tcp_opt_info *oi, tcp_seq *issp) 3730 { 3731 struct tcpcb tb, *tp; 3732 long win; 3733 struct syn_cache *sc; 3734 struct syn_cache_head *scp; 3735 struct mbuf *ipopts; 3736 3737 tp = sototcpcb(so); 3738 3739 /* 3740 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3741 * 3742 * Note this check is performed in tcp_input() very early on. 3743 */ 3744 3745 /* 3746 * Initialize some local state. 3747 */ 3748 win = sbspace(so, &so->so_rcv); 3749 if (win > TCP_MAXWIN) 3750 win = TCP_MAXWIN; 3751 3752 bzero(&tb, sizeof(tb)); 3753 #ifdef TCP_SIGNATURE 3754 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3755 #else 3756 if (optp) { 3757 #endif 3758 tb.pf = tp->pf; 3759 tb.sack_enable = tp->sack_enable; 3760 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3761 #ifdef TCP_SIGNATURE 3762 if (tp->t_flags & TF_SIGNATURE) 3763 tb.t_flags |= TF_SIGNATURE; 3764 #endif 3765 tb.t_state = TCPS_LISTEN; 3766 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3767 sotoinpcb(so)->inp_rtableid)) 3768 return (-1); 3769 } 3770 3771 switch (src->sa_family) { 3772 case AF_INET: 3773 /* 3774 * Remember the IP options, if any. 3775 */ 3776 ipopts = ip_srcroute(m); 3777 break; 3778 default: 3779 ipopts = NULL; 3780 } 3781 3782 /* 3783 * See if we already have an entry for this connection. 
3784 * If we do, resend the SYN,ACK. We do not count this 3785 * as a retransmission (XXX though maybe we should). 3786 */ 3787 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3788 if (sc != NULL) { 3789 tcpstat_inc(tcps_sc_dupesyn); 3790 if (ipopts) { 3791 /* 3792 * If we were remembering a previous source route, 3793 * forget it and use the new one we've been given. 3794 */ 3795 m_free(sc->sc_ipopts); 3796 sc->sc_ipopts = ipopts; 3797 } 3798 sc->sc_timestamp = tb.ts_recent; 3799 if (syn_cache_respond(sc, m) == 0) { 3800 tcpstat_inc(tcps_sndacks); 3801 tcpstat_inc(tcps_sndtotal); 3802 } 3803 return (0); 3804 } 3805 3806 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3807 if (sc == NULL) { 3808 m_free(ipopts); 3809 return (-1); 3810 } 3811 3812 /* 3813 * Fill in the cache, and put the necessary IP and TCP 3814 * options into the reply. 3815 */ 3816 memcpy(&sc->sc_src, src, src->sa_len); 3817 memcpy(&sc->sc_dst, dst, dst->sa_len); 3818 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3819 sc->sc_flags = 0; 3820 sc->sc_ipopts = ipopts; 3821 sc->sc_irs = th->th_seq; 3822 3823 sc->sc_iss = issp ? *issp : arc4random(); 3824 sc->sc_peermaxseg = oi->maxseg; 3825 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3826 sc->sc_win = win; 3827 sc->sc_timestamp = tb.ts_recent; 3828 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3829 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3830 sc->sc_flags |= SCF_TIMESTAMP; 3831 sc->sc_modulate = arc4random(); 3832 } 3833 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3834 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3835 sc->sc_requested_s_scale = tb.requested_s_scale; 3836 sc->sc_request_r_scale = 0; 3837 /* 3838 * Pick the smallest possible scaling factor that 3839 * will still allow us to scale up to sb_max. 3840 * 3841 * We do this because there are broken firewalls that 3842 * will corrupt the window scale option, leading to 3843 * the other endpoint believing that our advertised 3844 * window is unscaled. At scale factors larger than 3845 * 5 the unscaled window will drop below 1500 bytes, 3846 * leading to serious problems when traversing these 3847 * broken firewalls. 3848 * 3849 * With the default sbmax of 256K, a scale factor 3850 * of 3 will be chosen by this algorithm. Those who 3851 * choose a larger sbmax should watch out 3852 * for the compatibility problems mentioned above. 3853 * 3854 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3855 * or <SYN,ACK>) segment itself is never scaled. 3856 */ 3857 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3858 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) 3859 sc->sc_request_r_scale++; 3860 } else { 3861 sc->sc_requested_s_scale = 15; 3862 sc->sc_request_r_scale = 15; 3863 } 3864 #ifdef TCP_ECN 3865 /* 3866 * if both ECE and CWR flag bits are set, peer is ECN capable. 3867 */ 3868 if (tcp_do_ecn && 3869 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) 3870 sc->sc_flags |= SCF_ECN_PERMIT; 3871 #endif 3872 /* 3873 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option 3874 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 
3875 */ 3876 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) 3877 sc->sc_flags |= SCF_SACK_PERMIT; 3878 #ifdef TCP_SIGNATURE 3879 if (tb.t_flags & TF_SIGNATURE) 3880 sc->sc_flags |= SCF_SIGNATURE; 3881 #endif 3882 sc->sc_tp = tp; 3883 if (syn_cache_respond(sc, m) == 0) { 3884 syn_cache_insert(sc, tp); 3885 tcpstat_inc(tcps_sndacks); 3886 tcpstat_inc(tcps_sndtotal); 3887 } else { 3888 syn_cache_put(sc); 3889 tcpstat_inc(tcps_sc_dropped); 3890 } 3891 3892 return (0); 3893 } 3894 3895 int 3896 syn_cache_respond(struct syn_cache *sc, struct mbuf *m) 3897 { 3898 u_int8_t *optp; 3899 int optlen, error; 3900 u_int16_t tlen; 3901 struct ip *ip = NULL; 3902 #ifdef INET6 3903 struct ip6_hdr *ip6 = NULL; 3904 #endif 3905 struct tcphdr *th; 3906 u_int hlen; 3907 struct inpcb *inp; 3908 3909 switch (sc->sc_src.sa.sa_family) { 3910 case AF_INET: 3911 hlen = sizeof(struct ip); 3912 break; 3913 #ifdef INET6 3914 case AF_INET6: 3915 hlen = sizeof(struct ip6_hdr); 3916 break; 3917 #endif 3918 default: 3919 m_freem(m); 3920 return (EAFNOSUPPORT); 3921 } 3922 3923 /* Compute the size of the TCP options. */ 3924 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3925 ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) + 3926 #ifdef TCP_SIGNATURE 3927 ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + 3928 #endif 3929 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3930 3931 tlen = hlen + sizeof(struct tcphdr) + optlen; 3932 3933 /* 3934 * Create the IP+TCP header from scratch. 3935 */ 3936 m_freem(m); 3937 #ifdef DIAGNOSTIC 3938 if (max_linkhdr + tlen > MCLBYTES) 3939 return (ENOBUFS); 3940 #endif 3941 MGETHDR(m, M_DONTWAIT, MT_DATA); 3942 if (m && max_linkhdr + tlen > MHLEN) { 3943 MCLGET(m, M_DONTWAIT); 3944 if ((m->m_flags & M_EXT) == 0) { 3945 m_freem(m); 3946 m = NULL; 3947 } 3948 } 3949 if (m == NULL) 3950 return (ENOBUFS); 3951 3952 /* Fixup the mbuf. */ 3953 m->m_data += max_linkhdr; 3954 m->m_len = m->m_pkthdr.len = tlen; 3955 m->m_pkthdr.ph_ifidx = 0; 3956 m->m_pkthdr.ph_rtableid = sc->sc_rtableid; 3957 memset(mtod(m, u_char *), 0, tlen); 3958 3959 switch (sc->sc_src.sa.sa_family) { 3960 case AF_INET: 3961 ip = mtod(m, struct ip *); 3962 ip->ip_dst = sc->sc_src.sin.sin_addr; 3963 ip->ip_src = sc->sc_dst.sin.sin_addr; 3964 ip->ip_p = IPPROTO_TCP; 3965 th = (struct tcphdr *)(ip + 1); 3966 th->th_dport = sc->sc_src.sin.sin_port; 3967 th->th_sport = sc->sc_dst.sin.sin_port; 3968 break; 3969 #ifdef INET6 3970 case AF_INET6: 3971 ip6 = mtod(m, struct ip6_hdr *); 3972 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3973 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3974 ip6->ip6_nxt = IPPROTO_TCP; 3975 /* ip6_plen will be updated in ip6_output() */ 3976 th = (struct tcphdr *)(ip6 + 1); 3977 th->th_dport = sc->sc_src.sin6.sin6_port; 3978 th->th_sport = sc->sc_dst.sin6.sin6_port; 3979 break; 3980 #endif 3981 default: 3982 unhandled_af(sc->sc_src.sa.sa_family); 3983 } 3984 3985 th->th_seq = htonl(sc->sc_iss); 3986 th->th_ack = htonl(sc->sc_irs + 1); 3987 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 3988 th->th_flags = TH_SYN|TH_ACK; 3989 #ifdef TCP_ECN 3990 /* Set ECE for SYN-ACK if peer supports ECN. */ 3991 if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT)) 3992 th->th_flags |= TH_ECE; 3993 #endif 3994 th->th_win = htons(sc->sc_win); 3995 /* th_sum already 0 */ 3996 /* th_urp already 0 */ 3997 3998 /* Tack on the TCP options. 
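 * (Layout sketch: MSS always takes 4 octets; the SACK-permitted,
 * window-scale and RFC 1323 appendix-A timestamp options add 4, 4
 * and 12 octets respectively, matching the optlen computed above,
 * i.e. at most 24 octets here unless a signature is included.)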
	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		unhandled_af(sc->sc_src.sa.sa_family);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

	/* Include the SACK_PERMITTED option if the peer sent one. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			tdb_unref(tdb);
			return (EINVAL);
		}
		tdb_unref(tdb);
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}
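	/*
	 * Why the IPv4 case above temporarily stores the TCP length in
	 * ip_len: in_cksum() sums the whole packet, IP header included,
	 * and the header bytes are still zero from the memset() above,
	 * so summing ip_src, ip_dst, ip_p and an ip_len holding the TCP
	 * length is arithmetically the same as summing the RFC 793
	 * pseudo-header.  ip_len is then overwritten with the real
	 * total length just below.
	 */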
	/* use IPsec policy and ttl from the listening socket on the SYN,ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len in network byte order, hence the htons() below.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(inp);

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
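/*
 * A minimal sketch of where syn_cache_respond() fits in the syn cache
 * lifecycle (the callers named here live elsewhere in this file; the
 * sequencing below is a summary, not code from this section):
 *
 *	SYN arrives       -> syn_cache_add() builds an entry and calls
 *	                     syn_cache_respond() to emit the SYN,ACK
 *	SYN,ACK times out -> syn_cache_timer() calls syn_cache_respond()
 *	                     again to retransmit
 *	handshake ACK     -> syn_cache_get() turns the cached entry into
 *	                     a full socket and tcpcb
 */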