/*	$OpenBSD: tcp_input.c,v 1.361 2019/07/12 19:43:51 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
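 *   e.g., an ECE arriving while snd_una >= snd_last halves cwnd and
 *   sets snd_last = snd_max; further ECEs are ignored until snd_una
 *   catches up to snd_last (see the TH_ECE handling in tcp_input()).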
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_TIMER_ARM_MSEC(tp, TCPT_DELACK, tcp_delack_msecs); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *);
void	 syn_cache_timer(void *);
void	 syn_cache_reaper(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *);
struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
		struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
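	 *
	 * e.g., if p covers [100, 200) and the new segment starts at
	 * seq 150, i = 50 and the first 50 bytes of m are trimmed;
	 * if the new segment also ends at or before 200, it is a
	 * complete duplicate and is dropped.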
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *otp = NULL, *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate;
	caddr_t saveti;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about unspecified IPv6 addresses in the
		 * source.  As we use all-zero to indicate an unbound or
		 * unconnected pcb, an unspecified IPv6 source address
		 * can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Locate pcb for segment.
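	 *
	 * The lookup is two-pass: an exact four-tuple hash lookup
	 * first and, on a miss, a wildcard lookup for a listening
	 * socket bound to the destination (counted as pcbhashmiss).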
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == NULL) {
			tcpstat_inc(tcps_noport);
			goto dropwithreset_ratelim;
		}
	}
	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
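	/*
	 * e.g., th_win 1000 with snd_scale 7 advertises 128000 bytes.
	 * RFC 1323 forbids scaling the window field of SYN segments,
	 * hence the TH_SYN check below.
	 */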
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		}

		if (so->so_options & SO_DEBUG) {
			otp = tp;
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				saveti = (caddr_t) &tcp_saveti6;
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				saveti = (caddr_t) &tcp_saveti;
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated addresses are forbidden,
				 * we do not accept a SYN to a deprecated
				 * interface address, to prevent any new
				 * inbound connection from getting
				 * established.
				 * When we do not accept the SYN, we send a
				 * TCP RST with the deprecated source address
				 * (instead of dropping it).  This is a
				 * compromise: it is much better for the peer
				 * to receive an RST, as the RST will be the
				 * final packet of the exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says like
				 * this:
				 *   1. use of deprecated addr with existing
				 *      communication is okay - "SHOULD continue
				 *      to be used"
				 *   2. use of it with new communication:
				 *     (2a) "SHOULD NOT be used if alternate
				 *          address with sufficient scope is
				 *          available"
				 *     (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat_inc(tcps_rcvnosec);
		goto drop;
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */

	/*
	 * Process options.
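	 * With TCP_SIGNATURE, tcp_dooptions() must run even when the
	 * segment carries no options, so that a missing signature on
	 * a TF_SIGNATURE connection is still detected.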
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				ND6_HINT(tp);
				sbdrop(so, &so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
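				 * (if snd_last were left behind, snd_una
				 * could advance 2^31 past it and SEQ_GT()
				 * comparisons against it would invert)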
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(so, &so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
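		 * e.g., a SYN carrying 100 bytes into a 50-byte rcv_wnd:
		 * th_seq moves past the SYN, the trailing 50 bytes (and
		 * any FIN) are dropped, and tlen becomes 50.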
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
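			 * (an entirely duplicate segment, e.g. a
			 * retransmission that arrived after the data
			 * had already been delivered, lands here)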
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
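	 * (the ACK is rate-limited via dropafterack_ratelim, so a
	 * flood of in-window SYNs cannot be amplified into an ACK
	 * storm)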
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
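		 * During recovery cwnd was ssthresh + dupacks * maxseg;
		 * on a full ACK it collapses back to ssthresh, or to the
		 * amount still outstanding if that happens to be smaller.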
		 */
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
		if (tp->t_dupacks < tcprexmtthresh)
			tp->snd_cwnd = ulmin(cw + incr,
			    TCP_MAXWIN << tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(so, &so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
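		 * (if the sequence the ICMP referred to has been acked,
		 * that packet evidently got through and the message is
		 * stale)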
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
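		 * (th_urp is only 16 bits, so this can only trigger
		 * once the receive buffer already holds nearly sb_max
		 * bytes)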
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so doesn't get presented to user.
		 * This can happen independent of advancing the URG pointer,
		 * but if two URG's are pending at once, some out-of-band
		 * data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know
	 * that the connection is closing.  Ignore a FIN received before
Ignore a FIN received before 1959 * the connection is fully established. 1960 */ 1961 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1962 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1963 tp->t_flags |= TF_BLOCKOUTPUT; 1964 socantrcvmore(so); 1965 tp->t_flags &= ~TF_BLOCKOUTPUT; 1966 tp->t_flags |= TF_ACKNOW; 1967 tp->rcv_nxt++; 1968 } 1969 switch (tp->t_state) { 1970 1971 /* 1972 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1973 */ 1974 case TCPS_ESTABLISHED: 1975 tp->t_state = TCPS_CLOSE_WAIT; 1976 break; 1977 1978 /* 1979 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1980 * enter the CLOSING state. 1981 */ 1982 case TCPS_FIN_WAIT_1: 1983 tp->t_state = TCPS_CLOSING; 1984 break; 1985 1986 /* 1987 * In FIN_WAIT_2 state enter the TIME_WAIT state, 1988 * starting the time-wait timer, turning off the other 1989 * standard timers. 1990 */ 1991 case TCPS_FIN_WAIT_2: 1992 tp->t_state = TCPS_TIME_WAIT; 1993 tcp_canceltimers(tp); 1994 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1995 tp->t_flags |= TF_BLOCKOUTPUT; 1996 soisdisconnected(so); 1997 tp->t_flags &= ~TF_BLOCKOUTPUT; 1998 break; 1999 2000 /* 2001 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2002 */ 2003 case TCPS_TIME_WAIT: 2004 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2005 break; 2006 } 2007 } 2008 if (otp) 2009 tcp_trace(TA_INPUT, ostate, tp, otp, saveti, 0, tlen); 2010 2011 /* 2012 * Return any desired output. 2013 */ 2014 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2015 (void) tcp_output(tp); 2016 return IPPROTO_DONE; 2017 2018 badsyn: 2019 /* 2020 * Received a bad SYN. Increment counters and dropwithreset. 2021 */ 2022 tcpstat_inc(tcps_badsyn); 2023 tp = NULL; 2024 goto dropwithreset; 2025 2026 dropafterack_ratelim: 2027 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2028 tcp_ackdrop_ppslim) == 0) { 2029 /* XXX stat */ 2030 goto drop; 2031 } 2032 /* ...fall into dropafterack... */ 2033 2034 dropafterack: 2035 /* 2036 * Generate an ACK dropping incoming segment if it occupies 2037 * sequence space, where the ACK reflects our state. 2038 */ 2039 if (tiflags & TH_RST) 2040 goto drop; 2041 m_freem(m); 2042 tp->t_flags |= TF_ACKNOW; 2043 (void) tcp_output(tp); 2044 return IPPROTO_DONE; 2045 2046 dropwithreset_ratelim: 2047 /* 2048 * We may want to rate-limit RSTs in certain situations, 2049 * particularly if we are sending an RST in response to 2050 * an attempt to connect to or otherwise communicate with 2051 * a port for which we have no socket. 2052 */ 2053 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2054 tcp_rst_ppslim) == 0) { 2055 /* XXX stat */ 2056 goto drop; 2057 } 2058 /* ...fall into dropwithreset... */ 2059 2060 dropwithreset: 2061 /* 2062 * Generate a RST, dropping incoming segment. 2063 * Make ACK acceptable to originator of segment. 2064 * Don't bother to respond to RST. 2065 */ 2066 if (tiflags & TH_RST) 2067 goto drop; 2068 if (tiflags & TH_ACK) { 2069 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2070 TH_RST, m->m_pkthdr.ph_rtableid); 2071 } else { 2072 if (tiflags & TH_SYN) 2073 tlen++; 2074 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2075 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2076 } 2077 m_freem(m); 2078 return IPPROTO_DONE; 2079 2080 drop: 2081 /* 2082 * Drop space held by incoming segment and return. 
2083 */ 2084 if (otp) 2085 tcp_trace(TA_DROP, ostate, tp, otp, saveti, 0, tlen); 2086 2087 m_freem(m); 2088 return IPPROTO_DONE; 2089 } 2090 2091 int 2092 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2093 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2094 u_int rtableid) 2095 { 2096 u_int16_t mss = 0; 2097 int opt, optlen; 2098 #ifdef TCP_SIGNATURE 2099 caddr_t sigp = NULL; 2100 struct tdb *tdb = NULL; 2101 #endif /* TCP_SIGNATURE */ 2102 2103 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2104 opt = cp[0]; 2105 if (opt == TCPOPT_EOL) 2106 break; 2107 if (opt == TCPOPT_NOP) 2108 optlen = 1; 2109 else { 2110 if (cnt < 2) 2111 break; 2112 optlen = cp[1]; 2113 if (optlen < 2 || optlen > cnt) 2114 break; 2115 } 2116 switch (opt) { 2117 2118 default: 2119 continue; 2120 2121 case TCPOPT_MAXSEG: 2122 if (optlen != TCPOLEN_MAXSEG) 2123 continue; 2124 if (!(th->th_flags & TH_SYN)) 2125 continue; 2126 if (TCPS_HAVERCVDSYN(tp->t_state)) 2127 continue; 2128 memcpy(&mss, cp + 2, sizeof(mss)); 2129 mss = ntohs(mss); 2130 oi->maxseg = mss; 2131 break; 2132 2133 case TCPOPT_WINDOW: 2134 if (optlen != TCPOLEN_WINDOW) 2135 continue; 2136 if (!(th->th_flags & TH_SYN)) 2137 continue; 2138 if (TCPS_HAVERCVDSYN(tp->t_state)) 2139 continue; 2140 tp->t_flags |= TF_RCVD_SCALE; 2141 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2142 break; 2143 2144 case TCPOPT_TIMESTAMP: 2145 if (optlen != TCPOLEN_TIMESTAMP) 2146 continue; 2147 oi->ts_present = 1; 2148 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2149 oi->ts_val = ntohl(oi->ts_val); 2150 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2151 oi->ts_ecr = ntohl(oi->ts_ecr); 2152 2153 if (!(th->th_flags & TH_SYN)) 2154 continue; 2155 if (TCPS_HAVERCVDSYN(tp->t_state)) 2156 continue; 2157 /* 2158 * A timestamp received in a SYN makes 2159 * it ok to send timestamp requests and replies. 
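 * The ts_recent remembered here is what later feeds the PAWS test
 * on incoming segments; timestamps are compared modulo 2^32 via
 * TSTMP_LT/TSTMP_GEQ, so wraparound of tcp_now is harmless.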
2160 */ 2161 tp->t_flags |= TF_RCVD_TSTMP; 2162 tp->ts_recent = oi->ts_val; 2163 tp->ts_recent_age = tcp_now; 2164 break; 2165 2166 case TCPOPT_SACK_PERMITTED: 2167 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2168 continue; 2169 if (!(th->th_flags & TH_SYN)) 2170 continue; 2171 if (TCPS_HAVERCVDSYN(tp->t_state)) 2172 continue; 2173 /* MUST only be set on SYN */ 2174 tp->t_flags |= TF_SACK_PERMIT; 2175 break; 2176 case TCPOPT_SACK: 2177 tcp_sack_option(tp, th, cp, optlen); 2178 break; 2179 #ifdef TCP_SIGNATURE 2180 case TCPOPT_SIGNATURE: 2181 if (optlen != TCPOLEN_SIGNATURE) 2182 continue; 2183 2184 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2185 return (-1); 2186 2187 sigp = cp + 2; 2188 break; 2189 #endif /* TCP_SIGNATURE */ 2190 } 2191 } 2192 2193 #ifdef TCP_SIGNATURE 2194 if (tp->t_flags & TF_SIGNATURE) { 2195 union sockaddr_union src, dst; 2196 2197 memset(&src, 0, sizeof(union sockaddr_union)); 2198 memset(&dst, 0, sizeof(union sockaddr_union)); 2199 2200 switch (tp->pf) { 2201 case 0: 2202 case AF_INET: 2203 src.sa.sa_len = sizeof(struct sockaddr_in); 2204 src.sa.sa_family = AF_INET; 2205 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2206 dst.sa.sa_len = sizeof(struct sockaddr_in); 2207 dst.sa.sa_family = AF_INET; 2208 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2209 break; 2210 #ifdef INET6 2211 case AF_INET6: 2212 src.sa.sa_len = sizeof(struct sockaddr_in6); 2213 src.sa.sa_family = AF_INET6; 2214 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2215 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2216 dst.sa.sa_family = AF_INET6; 2217 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2218 break; 2219 #endif /* INET6 */ 2220 } 2221 2222 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2223 0, &src, &dst, IPPROTO_TCP); 2224 2225 /* 2226 * We don't have an SA for this peer, so we turn off 2227 * TF_SIGNATURE on the listen socket 2228 */ 2229 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2230 tp->t_flags &= ~TF_SIGNATURE; 2231 2232 } 2233 2234 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2235 tcpstat_inc(tcps_rcvbadsig); 2236 return (-1); 2237 } 2238 2239 if (sigp) { 2240 char sig[16]; 2241 2242 if (tdb == NULL) { 2243 tcpstat_inc(tcps_rcvbadsig); 2244 return (-1); 2245 } 2246 2247 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2248 return (-1); 2249 2250 if (timingsafe_bcmp(sig, sigp, 16)) { 2251 tcpstat_inc(tcps_rcvbadsig); 2252 return (-1); 2253 } 2254 2255 tcpstat_inc(tcps_rcvgoodsig); 2256 } 2257 #endif /* TCP_SIGNATURE */ 2258 2259 return (0); 2260 } 2261 2262 u_long 2263 tcp_seq_subtract(u_long a, u_long b) 2264 { 2265 return ((long)(a - b)); 2266 } 2267 2268 /* 2269 * This function is called upon receipt of new valid data (while not in header 2270 * prediction mode), and it updates the ordered list of sacks. 2271 */ 2272 void 2273 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2274 tcp_seq rcv_lastend) 2275 { 2276 /* 2277 * First reported block MUST be the most recent one. Subsequent 2278 * blocks SHOULD be in the order in which they arrived at the 2279 * receiver. These two conditions make the implementation fully 2280 * compliant with RFC 2018. 
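 * Worked example (sequence numbers illustrative): with rcv_nxt = 100,
 * receiving bytes 200-299 yields the single report [200,300). If
 * 400-499 arrives next, the report becomes [400,500) [200,300),
 * newest first. A segment filling 300-399 then overlaps both blocks
 * and everything coalesces into the single block [200,500).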
2281 */ 2282 int i, j = 0, count = 0, lastpos = -1; 2283 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2284 2285 /* First clean up current list of sacks */ 2286 for (i = 0; i < tp->rcv_numsacks; i++) { 2287 sack = tp->sackblks[i]; 2288 if (sack.start == 0 && sack.end == 0) { 2289 count++; /* count = number of blocks to be discarded */ 2290 continue; 2291 } 2292 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2293 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2294 count++; 2295 } else { 2296 temp[j].start = tp->sackblks[i].start; 2297 temp[j++].end = tp->sackblks[i].end; 2298 } 2299 } 2300 tp->rcv_numsacks -= count; 2301 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2302 tcp_clean_sackreport(tp); 2303 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2304 /* ==> need first sack block */ 2305 tp->sackblks[0].start = rcv_laststart; 2306 tp->sackblks[0].end = rcv_lastend; 2307 tp->rcv_numsacks = 1; 2308 } 2309 return; 2310 } 2311 /* Otherwise, sack blocks are already present. */ 2312 for (i = 0; i < tp->rcv_numsacks; i++) 2313 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2314 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2315 return; /* sack list remains unchanged */ 2316 /* 2317 * From here, segment just received should be (part of) the 1st sack. 2318 * Go through list, possibly coalescing sack block entries. 2319 */ 2320 firstsack.start = rcv_laststart; 2321 firstsack.end = rcv_lastend; 2322 for (i = 0; i < tp->rcv_numsacks; i++) { 2323 sack = tp->sackblks[i]; 2324 if (SEQ_LT(sack.end, firstsack.start) || 2325 SEQ_GT(sack.start, firstsack.end)) 2326 continue; /* no overlap */ 2327 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2328 /* 2329 * identical block; delete it here since we will 2330 * move it to the front of the list. 2331 */ 2332 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2333 lastpos = i; /* last posn with a zero entry */ 2334 continue; 2335 } 2336 if (SEQ_LEQ(sack.start, firstsack.start)) 2337 firstsack.start = sack.start; /* merge blocks */ 2338 if (SEQ_GEQ(sack.end, firstsack.end)) 2339 firstsack.end = sack.end; /* merge blocks */ 2340 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2341 lastpos = i; /* last posn with a zero entry */ 2342 } 2343 if (lastpos != -1) { /* at least one merge */ 2344 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2345 sack = tp->sackblks[i]; 2346 if (sack.start == 0 && sack.end == 0) 2347 continue; 2348 temp[j++] = sack; 2349 } 2350 tp->rcv_numsacks = j; /* including first blk (added later) */ 2351 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2352 tp->sackblks[i] = temp[i]; 2353 } else { /* no merges -- shift sacks by 1 */ 2354 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2355 tp->rcv_numsacks++; 2356 for (i = tp->rcv_numsacks-1; i > 0; i--) 2357 tp->sackblks[i] = tp->sackblks[i-1]; 2358 } 2359 tp->sackblks[0] = firstsack; 2360 return; 2361 } 2362 2363 /* 2364 * Process the TCP SACK option. tp->snd_holes is an ordered list 2365 * of holes (oldest to newest, in terms of the sequence space). 2366 */ 2367 void 2368 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2369 { 2370 int tmp_olen; 2371 u_char *tmp_cp; 2372 struct sackhole *cur, *p, *temp; 2373 2374 if (!tp->sack_enable) 2375 return; 2376 /* SACK without ACK doesn't make sense. */ 2377 if ((th->th_flags & TH_ACK) == 0) 2378 return; 2379 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
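 * (The SEQ_* macros compare modulo 2^32, so this window check also
 * holds across sequence number wraparound; e.g. SEQ_LT(0xfffffff0,
 * 0x10) is true.)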
*/ 2380 if (SEQ_LT(th->th_ack, tp->snd_una) || 2381 SEQ_GT(th->th_ack, tp->snd_max)) 2382 return; 2383 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2384 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2385 return; 2386 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2387 tmp_cp = cp + 2; 2388 tmp_olen = optlen - 2; 2389 tcpstat_inc(tcps_sack_rcv_opts); 2390 if (tp->snd_numholes < 0) 2391 tp->snd_numholes = 0; 2392 if (tp->t_maxseg == 0) 2393 panic("tcp_sack_option"); /* Should never happen */ 2394 while (tmp_olen > 0) { 2395 struct sackblk sack; 2396 2397 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2398 sack.start = ntohl(sack.start); 2399 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2400 sack.end = ntohl(sack.end); 2401 tmp_olen -= TCPOLEN_SACK; 2402 tmp_cp += TCPOLEN_SACK; 2403 if (SEQ_LEQ(sack.end, sack.start)) 2404 continue; /* bad SACK fields */ 2405 if (SEQ_LEQ(sack.end, tp->snd_una)) 2406 continue; /* old block */ 2407 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2408 if (SEQ_LT(sack.start, th->th_ack)) 2409 continue; 2410 } 2411 if (SEQ_GT(sack.end, tp->snd_max)) 2412 continue; 2413 if (tp->snd_holes == NULL) { /* first hole */ 2414 tp->snd_holes = (struct sackhole *) 2415 pool_get(&sackhl_pool, PR_NOWAIT); 2416 if (tp->snd_holes == NULL) { 2417 /* ENOBUFS, so ignore SACKed block for now */ 2418 goto dropped; 2419 } 2420 cur = tp->snd_holes; 2421 cur->start = th->th_ack; 2422 cur->end = sack.start; 2423 cur->rxmit = cur->start; 2424 cur->next = NULL; 2425 tp->snd_numholes = 1; 2426 tp->rcv_lastsack = sack.end; 2427 /* 2428 * dups is at least one. If more data has been 2429 * SACKed, it can be greater than one. 2430 */ 2431 cur->dups = min(tcprexmtthresh, 2432 ((sack.end - cur->end)/tp->t_maxseg)); 2433 if (cur->dups < 1) 2434 cur->dups = 1; 2435 continue; /* with next sack block */ 2436 } 2437 /* Go thru list of holes: p = previous, cur = current */ 2438 p = cur = tp->snd_holes; 2439 while (cur) { 2440 if (SEQ_LEQ(sack.end, cur->start)) 2441 /* SACKs data before the current hole */ 2442 break; /* no use going through more holes */ 2443 if (SEQ_GEQ(sack.start, cur->end)) { 2444 /* SACKs data beyond the current hole */ 2445 cur->dups++; 2446 if (((sack.end - cur->end)/tp->t_maxseg) >= 2447 tcprexmtthresh) 2448 cur->dups = tcprexmtthresh; 2449 p = cur; 2450 cur = cur->next; 2451 continue; 2452 } 2453 if (SEQ_LEQ(sack.start, cur->start)) { 2454 /* Data acks at least the beginning of hole */ 2455 if (SEQ_GEQ(sack.end, cur->end)) { 2456 /* Acks entire hole, so delete hole */ 2457 if (p != cur) { 2458 p->next = cur->next; 2459 pool_put(&sackhl_pool, cur); 2460 cur = p->next; 2461 } else { 2462 cur = cur->next; 2463 pool_put(&sackhl_pool, p); 2464 p = cur; 2465 tp->snd_holes = p; 2466 } 2467 tp->snd_numholes--; 2468 continue; 2469 } 2470 /* otherwise, move start of hole forward */ 2471 cur->start = sack.end; 2472 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2473 p = cur; 2474 cur = cur->next; 2475 continue; 2476 } 2477 /* move end of hole backward */ 2478 if (SEQ_GEQ(sack.end, cur->end)) { 2479 cur->end = sack.start; 2480 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2481 cur->dups++; 2482 if (((sack.end - cur->end)/tp->t_maxseg) >= 2483 tcprexmtthresh) 2484 cur->dups = tcprexmtthresh; 2485 p = cur; 2486 cur = cur->next; 2487 continue; 2488 } 2489 if (SEQ_LT(cur->start, sack.start) && 2490 SEQ_GT(cur->end, sack.end)) { 2491 /* 2492 * ACKs some data in middle of a hole; need to 2493 * split current hole 2494 */ 2495 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2496 
goto dropped; 2497 temp = (struct sackhole *) 2498 pool_get(&sackhl_pool, PR_NOWAIT); 2499 if (temp == NULL) 2500 goto dropped; /* ENOBUFS */ 2501 temp->next = cur->next; 2502 temp->start = sack.end; 2503 temp->end = cur->end; 2504 temp->dups = cur->dups; 2505 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2506 cur->end = sack.start; 2507 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2508 cur->dups++; 2509 if (((sack.end - cur->end)/tp->t_maxseg) >= 2510 tcprexmtthresh) 2511 cur->dups = tcprexmtthresh; 2512 cur->next = temp; 2513 p = temp; 2514 cur = p->next; 2515 tp->snd_numholes++; 2516 } 2517 } 2518 /* At this point, p points to the last hole on the list */ 2519 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2520 /* 2521 * Need to append new hole at end. 2522 * Last hole is p (and it's not NULL). 2523 */ 2524 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2525 goto dropped; 2526 temp = (struct sackhole *) 2527 pool_get(&sackhl_pool, PR_NOWAIT); 2528 if (temp == NULL) 2529 goto dropped; /* ENOBUFS */ 2530 temp->start = tp->rcv_lastsack; 2531 temp->end = sack.start; 2532 temp->dups = min(tcprexmtthresh, 2533 ((sack.end - sack.start)/tp->t_maxseg)); 2534 if (temp->dups < 1) 2535 temp->dups = 1; 2536 temp->rxmit = temp->start; 2537 temp->next = 0; 2538 p->next = temp; 2539 tp->rcv_lastsack = sack.end; 2540 tp->snd_numholes++; 2541 } 2542 } 2543 return; 2544 dropped: 2545 tcpstat_inc(tcps_sack_drop_opts); 2546 } 2547 2548 /* 2549 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if 2550 * it is completely acked; otherwise, tcp_sack_option(), called from 2551 * tcp_dooptions(), will fix up the hole. 2552 */ 2553 void 2554 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2555 { 2556 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2557 /* max because this could be an older ack just arrived */ 2558 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2559 th->th_ack : tp->snd_una; 2560 struct sackhole *cur = tp->snd_holes; 2561 struct sackhole *prev; 2562 while (cur) 2563 if (SEQ_LEQ(cur->end, lastack)) { 2564 prev = cur; 2565 cur = cur->next; 2566 pool_put(&sackhl_pool, prev); 2567 tp->snd_numholes--; 2568 } else if (SEQ_LT(cur->start, lastack)) { 2569 cur->start = lastack; 2570 if (SEQ_LT(cur->rxmit, cur->start)) 2571 cur->rxmit = cur->start; 2572 break; 2573 } else 2574 break; 2575 tp->snd_holes = cur; 2576 } 2577 } 2578 2579 /* 2580 * Delete all receiver-side SACK information. 2581 */ 2582 void 2583 tcp_clean_sackreport(struct tcpcb *tp) 2584 { 2585 int i; 2586 2587 tp->rcv_numsacks = 0; 2588 for (i = 0; i < MAX_SACK_BLKS; i++) 2589 tp->sackblks[i].start = tp->sackblks[i].end=0; 2590 2591 } 2592 2593 /* 2594 * Partial ack handling within a sack recovery episode. When a partial ack 2595 * arrives, turn off retransmission timer, deflate the window, do not clear 2596 * tp->t_dupacks. 2597 */ 2598 void 2599 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2600 { 2601 /* Turn off retx. timer (will start again next segment) */ 2602 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2603 tp->t_rtttime = 0; 2604 /* 2605 * Partial window deflation. This statement relies on the 2606 * fact that tp->snd_una has not been updated yet. 
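 * For example (segment-sized units, illustrative): with snd_cwnd at
 * ten segments and a partial ack covering three of them, the code
 * below leaves 10 - 3 + 2 = 9 segments -- the deflated window plus
 * the two t_maxseg additions that keep one new transmission possible.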
2607 */ 2608 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2609 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2610 tp->snd_cwnd += tp->t_maxseg; 2611 } else 2612 tp->snd_cwnd = tp->t_maxseg; 2613 tp->snd_cwnd += tp->t_maxseg; 2614 tp->t_flags |= TF_NEEDOUTPUT; 2615 } 2616 2617 /* 2618 * Pull out of band byte out of a segment so 2619 * it doesn't appear in the user's data queue. 2620 * It is still reflected in the segment length for 2621 * sequencing purposes. 2622 */ 2623 void 2624 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2625 { 2626 int cnt = off + urgent - 1; 2627 2628 while (cnt >= 0) { 2629 if (m->m_len > cnt) { 2630 char *cp = mtod(m, caddr_t) + cnt; 2631 struct tcpcb *tp = sototcpcb(so); 2632 2633 tp->t_iobc = *cp; 2634 tp->t_oobflags |= TCPOOB_HAVEDATA; 2635 memmove(cp, cp + 1, m->m_len - cnt - 1); 2636 m->m_len--; 2637 return; 2638 } 2639 cnt -= m->m_len; 2640 m = m->m_next; 2641 if (m == NULL) 2642 break; 2643 } 2644 panic("tcp_pulloutofband"); 2645 } 2646 2647 /* 2648 * Collect new round-trip time estimate 2649 * and update averages and current timeout. 2650 */ 2651 void 2652 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2653 { 2654 short delta; 2655 short rttmin; 2656 2657 if (rtt < 0) 2658 rtt = 0; 2659 else if (rtt > TCP_RTT_MAX) 2660 rtt = TCP_RTT_MAX; 2661 2662 tcpstat_inc(tcps_rttupdated); 2663 if (tp->t_srtt != 0) { 2664 /* 2665 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2666 * after the binary point (scaled by 4), whereas 2667 * srtt is stored as fixed point with 5 bits after the 2668 * binary point (i.e., scaled by 32). The following magic 2669 * is equivalent to the smoothing algorithm in rfc793 with 2670 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2671 * point). 2672 */ 2673 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2674 (tp->t_srtt >> TCP_RTT_SHIFT); 2675 if ((tp->t_srtt += delta) <= 0) 2676 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2677 /* 2678 * We accumulate a smoothed rtt variance (actually, a 2679 * smoothed mean difference), then set the retransmit 2680 * timer to smoothed rtt + 4 times the smoothed variance. 2681 * rttvar is stored as fixed point with 4 bits after the 2682 * binary point (scaled by 16). The following is 2683 * equivalent to rfc793 smoothing with an alpha of .75 2684 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2685 * rfc793's wired-in beta. 2686 */ 2687 if (delta < 0) 2688 delta = -delta; 2689 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2690 if ((tp->t_rttvar += delta) <= 0) 2691 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2692 } else { 2693 /* 2694 * No rtt measurement yet - use the unsmoothed rtt. 2695 * Set the variance to half the rtt (so our first 2696 * retransmit happens at 3*rtt). 2697 */ 2698 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2699 tp->t_rttvar = (rtt + 1) << 2700 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2701 } 2702 tp->t_rtttime = 0; 2703 tp->t_rxtshift = 0; 2704 2705 /* 2706 * the retransmit should happen at rtt + 4 * rttvar. 2707 * Because of the way we do the smoothing, srtt and rttvar 2708 * will each average +1/2 tick of bias. When we compute 2709 * the retransmit timer, we want 1/2 tick of rounding and 2710 * 1 extra tick because of +-1/2 tick uncertainty in the 2711 * firing of the timer. The bias will give us exactly the 2712 * 1.5 tick we need. But, because the bias is 2713 * statistical, we have to test that we don't drop below 2714 * the minimum feasible timer (which is 2 ticks). 
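 * Worked example of the smoothing above (tick values illustrative):
 * with srtt at 8 ticks (stored as 8 << 5 = 256) and a new sample of
 * 4 ticks, delta = (4 << 2) - (256 >> 3) = -16, so the stored srtt
 * becomes 240, i.e. 7.5 ticks -- exactly 7/8 * 8 + 1/8 * 4.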
2715 */ 2716 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2717 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2718 2719 /* 2720 * We received an ack for a packet that wasn't retransmitted; 2721 * it is probably safe to discard any error indications we've 2722 * received recently. This isn't quite right, but close enough 2723 * for now (a route might have failed after we sent a segment, 2724 * and the return path might not be symmetrical). 2725 */ 2726 tp->t_softerror = 0; 2727 } 2728 2729 /* 2730 * Determine a reasonable value for maxseg size. 2731 * If the route is known, check route for mtu. 2732 * If none, use an mss that can be handled on the outgoing 2733 * interface without forcing IP to fragment; if bigger than 2734 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2735 * to utilize large mbufs. If no route is found, route has no mtu, 2736 * or the destination isn't local, use a default, hopefully conservative 2737 * size (usually 512 or the default IP max size, but no more than the mtu 2738 * of the interface), as we can't discover anything about intervening 2739 * gateways or networks. We also initialize the congestion/slow start 2740 * window to be a single segment if the destination isn't local. 2741 * While looking at the routing entry, we also initialize other path-dependent 2742 * parameters from pre-set or cached values in the routing entry. 2743 * 2744 * Also take into account the space needed for options that we 2745 * send regularly. Make maxseg shorter by that amount to assure 2746 * that we can send maxseg amount of data even when the options 2747 * are present. Store the upper limit of the length of options plus 2748 * data in maxopd. 2749 * 2750 * NOTE: offer == -1 indicates that the maxseg size changed due to 2751 * Path MTU discovery. 2752 */ 2753 int 2754 tcp_mss(struct tcpcb *tp, int offer) 2755 { 2756 struct rtentry *rt; 2757 struct ifnet *ifp = NULL; 2758 int mss, mssopt; 2759 int iphlen; 2760 struct inpcb *inp; 2761 2762 inp = tp->t_inpcb; 2763 2764 mssopt = mss = tcp_mssdflt; 2765 2766 rt = in_pcbrtentry(inp); 2767 2768 if (rt == NULL) 2769 goto out; 2770 2771 ifp = if_get(rt->rt_ifidx); 2772 if (ifp == NULL) 2773 goto out; 2774 2775 switch (tp->pf) { 2776 #ifdef INET6 2777 case AF_INET6: 2778 iphlen = sizeof(struct ip6_hdr); 2779 break; 2780 #endif 2781 case AF_INET: 2782 iphlen = sizeof(struct ip); 2783 break; 2784 default: 2785 /* the family does not support path MTU discovery */ 2786 goto out; 2787 } 2788 2789 /* 2790 * if there's an mtu associated with the route and we support 2791 * path MTU discovery for the underlying protocol family, use it. 2792 */ 2793 if (rt->rt_mtu) { 2794 /* 2795 * One may wish to lower MSS to take into account options, 2796 * especially security-related options. 2797 */ 2798 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2799 /* 2800 * RFC2460 section 5, last paragraph: if path MTU is 2801 * smaller than 1280, use 1280 as packet size and 2802 * attach fragment header. 
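 * With the minimal IPv6 MTU this works out to 1280 - 40 (ip6_hdr)
 * - 8 (ip6_frag) - 20 (tcphdr) = 1212 bytes of payload per segment.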
2803 */ 2804 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2805 sizeof(struct tcphdr); 2806 } else { 2807 mss = rt->rt_mtu - iphlen - 2808 sizeof(struct tcphdr); 2809 } 2810 } else if (ifp->if_flags & IFF_LOOPBACK) { 2811 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2812 } else if (tp->pf == AF_INET) { 2813 if (ip_mtudisc) 2814 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2815 } 2816 #ifdef INET6 2817 else if (tp->pf == AF_INET6) { 2818 /* 2819 * for IPv6, path MTU discovery is always turned on, 2820 * or the node must use packet size <= 1280. 2821 */ 2822 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2823 } 2824 #endif /* INET6 */ 2825 2826 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2827 if (offer != -1) { 2828 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2829 mssopt = max(tcp_mssdflt, mssopt); 2830 } 2831 out: 2832 if_put(ifp); 2833 /* 2834 * The current mss, t_maxseg, is initialized to the default value. 2835 * If we compute a smaller value, reduce the current mss. 2836 * If we compute a larger value, return it for use in sending 2837 * a max seg size option, but don't store it for use 2838 * unless we received an offer at least that large from peer. 2839 * 2840 * However, do not accept offers lower than the minimum of 2841 * the interface MTU and 216. 2842 */ 2843 if (offer > 0) 2844 tp->t_peermss = offer; 2845 if (tp->t_peermss) 2846 mss = min(mss, max(tp->t_peermss, 216)); 2847 2848 /* sanity - at least max opt. space */ 2849 mss = max(mss, 64); 2850 2851 /* 2852 * maxopd stores the maximum length of data AND options 2853 * in a segment; maxseg is the amount of data in a normal 2854 * segment. We need to store this value (maxopd) apart 2855 * from maxseg, because now every segment carries options 2856 * and thus we normally have somewhat less data in segments. 2857 */ 2858 tp->t_maxopd = mss; 2859 2860 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2861 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2862 mss -= TCPOLEN_TSTAMP_APPA; 2863 #ifdef TCP_SIGNATURE 2864 if (tp->t_flags & TF_SIGNATURE) 2865 mss -= TCPOLEN_SIGLEN; 2866 #endif 2867 2868 if (offer == -1) { 2869 /* mss changed due to Path MTU discovery */ 2870 tp->t_flags &= ~TF_PMTUD_PEND; 2871 tp->t_pmtud_mtu_sent = 0; 2872 tp->t_pmtud_mss_acked = 0; 2873 if (mss < tp->t_maxseg) { 2874 /* 2875 * Follow suggestion in RFC 2414 to reduce the 2876 * congestion window by the ratio of the old 2877 * segment size to the new segment size. 2878 */ 2879 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2880 mss, mss); 2881 } 2882 } else if (tcp_do_rfc3390 == 2) { 2883 /* increase initial window */ 2884 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2885 } else if (tcp_do_rfc3390) { 2886 /* increase initial window */ 2887 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2888 } else 2889 tp->snd_cwnd = mss; 2890 2891 tp->t_maxseg = mss; 2892 2893 return (offer != -1 ? 
mssopt : mss);
2894 }
2895
2896 u_int
2897 tcp_hdrsz(struct tcpcb *tp)
2898 {
2899 u_int hlen;
2900
2901 switch (tp->pf) {
2902 #ifdef INET6
2903 case AF_INET6:
2904 hlen = sizeof(struct ip6_hdr);
2905 break;
2906 #endif
2907 case AF_INET:
2908 hlen = sizeof(struct ip);
2909 break;
2910 default:
2911 hlen = 0;
2912 break;
2913 }
2914 hlen += sizeof(struct tcphdr);
2915
2916 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2917 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
2918 hlen += TCPOLEN_TSTAMP_APPA;
2919 #ifdef TCP_SIGNATURE
2920 if (tp->t_flags & TF_SIGNATURE)
2921 hlen += TCPOLEN_SIGLEN;
2922 #endif
2923 return (hlen);
2924 }
2925
2926 /*
2927 * Set connection variables based on the effective MSS.
2928 * We are passed the TCPCB for the actual connection. If we
2929 * are the server, we are called by the compressed state engine
2930 * when the 3-way handshake is complete. If we are the client,
2931 * we are called when we receive the SYN,ACK from the server.
2932 *
2933 * NOTE: The t_maxseg value must be initialized in the TCPCB
2934 * before this routine is called!
2935 */
2936 void
2937 tcp_mss_update(struct tcpcb *tp)
2938 {
2939 int mss;
2940 u_long bufsize;
2941 struct rtentry *rt;
2942 struct socket *so;
2943
2944 so = tp->t_inpcb->inp_socket;
2945 mss = tp->t_maxseg;
2946
2947 rt = in_pcbrtentry(tp->t_inpcb);
2948
2949 if (rt == NULL)
2950 return;
2951
2952 bufsize = so->so_snd.sb_hiwat;
2953 if (bufsize < mss) {
2954 mss = bufsize;
2955 /* Update t_maxseg and t_maxopd */
2956 tcp_mss(tp, mss);
2957 } else {
2958 bufsize = roundup(bufsize, mss);
2959 if (bufsize > sb_max)
2960 bufsize = sb_max;
2961 (void)sbreserve(so, &so->so_snd, bufsize);
2962 }
2963
2964 bufsize = so->so_rcv.sb_hiwat;
2965 if (bufsize > mss) {
2966 bufsize = roundup(bufsize, mss);
2967 if (bufsize > sb_max)
2968 bufsize = sb_max;
2969 (void)sbreserve(so, &so->so_rcv, bufsize);
2970 }
2971
2972 }
2973
2974 /*
2975 * When a partial ack arrives, force the retransmission of the
2976 * next unacknowledged segment. Do not clear tp->t_dupacks.
2977 * By setting snd_nxt to th_ack, this forces the retransmission timer
2978 * to be started again.
2979 */
2980 void
2981 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th)
2982 {
2983 /*
2984 * snd_una has not been updated and the socket send buffer
2985 * not yet drained of the acked data, so we have to leave
2986 * snd_una as it was to get the correct data offset in
2987 * tcp_output().
2988 */
2989 tcp_seq onxt = tp->snd_nxt;
2990 u_long ocwnd = tp->snd_cwnd;
2991
2992 TCP_TIMER_DISARM(tp, TCPT_REXMT);
2993 tp->t_rtttime = 0;
2994 tp->snd_nxt = th->th_ack;
2995 /*
2996 * Set snd_cwnd to one segment beyond acknowledged offset
2997 * (tp->snd_una not yet updated when this function is called)
2998 */
2999 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3000 (void)tcp_output(tp);
3001 tp->snd_cwnd = ocwnd;
3002 if (SEQ_GT(onxt, tp->snd_nxt))
3003 tp->snd_nxt = onxt;
3004 /*
3005 * Partial window deflation. Relies on fact that tp->snd_una
3006 * not updated yet.
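 * E.g. (illustrative) a partial ack for three segments against a
 * ten-segment snd_cwnd leaves 10 - 3 + 1 = 8 segments after the
 * single t_maxseg addition below.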
3007 */ 3008 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3009 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3010 else 3011 tp->snd_cwnd = 0; 3012 tp->snd_cwnd += tp->t_maxseg; 3013 } 3014 3015 int 3016 tcp_mss_adv(struct mbuf *m, int af) 3017 { 3018 int mss = 0; 3019 int iphlen; 3020 struct ifnet *ifp = NULL; 3021 3022 if (m && (m->m_flags & M_PKTHDR)) 3023 ifp = if_get(m->m_pkthdr.ph_ifidx); 3024 3025 switch (af) { 3026 case AF_INET: 3027 if (ifp != NULL) 3028 mss = ifp->if_mtu; 3029 iphlen = sizeof(struct ip); 3030 break; 3031 #ifdef INET6 3032 case AF_INET6: 3033 if (ifp != NULL) 3034 mss = ifp->if_mtu; 3035 iphlen = sizeof(struct ip6_hdr); 3036 break; 3037 #endif 3038 default: 3039 unhandled_af(af); 3040 } 3041 if_put(ifp); 3042 mss = mss - iphlen - sizeof(struct tcphdr); 3043 return (max(mss, tcp_mssdflt)); 3044 } 3045 3046 /* 3047 * TCP compressed state engine. Currently used to hold compressed 3048 * state for SYN_RECEIVED. 3049 */ 3050 3051 /* syn hash parameters */ 3052 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; 3053 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3054 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3055 int tcp_syn_use_limit = 100000; 3056 3057 struct syn_cache_set tcp_syn_cache[2]; 3058 int tcp_syn_cache_active; 3059 3060 #define SYN_HASH(sa, sp, dp, rand) \ 3061 (((sa)->s_addr ^ (rand)[0]) * \ 3062 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3063 #ifndef INET6 3064 #define SYN_HASHALL(hash, src, dst, rand) \ 3065 do { \ 3066 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3067 satosin(src)->sin_port, \ 3068 satosin(dst)->sin_port, (rand)); \ 3069 } while (/*CONSTCOND*/ 0) 3070 #else 3071 #define SYN_HASH6(sa, sp, dp, rand) \ 3072 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3073 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3074 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3075 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3076 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3077 3078 #define SYN_HASHALL(hash, src, dst, rand) \ 3079 do { \ 3080 switch ((src)->sa_family) { \ 3081 case AF_INET: \ 3082 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3083 satosin(src)->sin_port, \ 3084 satosin(dst)->sin_port, (rand)); \ 3085 break; \ 3086 case AF_INET6: \ 3087 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3088 satosin6(src)->sin6_port, \ 3089 satosin6(dst)->sin6_port, (rand)); \ 3090 break; \ 3091 default: \ 3092 hash = 0; \ 3093 } \ 3094 } while (/*CONSTCOND*/0) 3095 #endif /* INET6 */ 3096 3097 void 3098 syn_cache_rm(struct syn_cache *sc) 3099 { 3100 sc->sc_flags |= SCF_DEAD; 3101 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3102 sc->sc_tp = NULL; 3103 LIST_REMOVE(sc, sc_tpq); 3104 sc->sc_buckethead->sch_length--; 3105 timeout_del(&sc->sc_timer); 3106 sc->sc_set->scs_count--; 3107 } 3108 3109 void 3110 syn_cache_put(struct syn_cache *sc) 3111 { 3112 m_free(sc->sc_ipopts); 3113 if (sc->sc_route4.ro_rt != NULL) { 3114 rtfree(sc->sc_route4.ro_rt); 3115 sc->sc_route4.ro_rt = NULL; 3116 } 3117 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3118 timeout_add(&sc->sc_timer, 0); 3119 } 3120 3121 struct pool syn_cache_pool; 3122 3123 /* 3124 * We don't estimate RTT with SYNs, so each packet starts with the default 3125 * RTT and each timer step has a fixed timeout value. 
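 * Concretely: each SYN_CACHE_TIMER_ARM() below multiplies the default
 * RTT by tcp_backoff[sc_rxtshift], an exponential backoff table, and
 * clamps the result to [TCPTV_MIN, TCPTV_REXMTMAX]; the entry is
 * dropped once sc_rxtshift reaches TCP_MAXRXTSHIFT or the accumulated
 * time passes tcptv_keep_init (see syn_cache_timer()).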
3126 */
3127 #define SYN_CACHE_TIMER_ARM(sc) \
3128 do { \
3129 TCPT_RANGESET((sc)->sc_rxtcur, \
3130 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
3131 TCPTV_REXMTMAX); \
3132 if (!timeout_initialized(&(sc)->sc_timer)) \
3133 timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \
3134 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
3135 } while (/*CONSTCOND*/0)
3136
3137 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate
3138
3139 void
3140 syn_cache_init(void)
3141 {
3142 int i;
3143
3144 /* Initialize the hash buckets. */
3145 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size,
3146 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO);
3147 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size,
3148 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO);
3149 tcp_syn_cache[0].scs_size = tcp_syn_hash_size;
3150 tcp_syn_cache[1].scs_size = tcp_syn_hash_size;
3151 for (i = 0; i < tcp_syn_hash_size; i++) {
3152 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket);
3153 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket);
3154 }
3155
3156 /* Initialize the syn cache pool. */
3157 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET,
3158 0, "syncache", NULL);
3159 }
3160
3161 void
3162 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
3163 {
3164 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active];
3165 struct syn_cache_head *scp;
3166 struct syn_cache *sc2;
3167 int i;
3168
3169 NET_ASSERT_LOCKED();
3170
3171 /*
3172 * If there are no entries in the hash table, reinitialize
3173 * the hash secrets. To avoid useless cache swaps and
3174 * reinitialization, use it until the limit is reached.
3175 * An empty cache is also an opportunity to resize the hash.
3176 */
3177 if (set->scs_count == 0 && set->scs_use <= 0) {
3178 set->scs_use = tcp_syn_use_limit;
3179 if (set->scs_size != tcp_syn_hash_size) {
3180 scp = mallocarray(tcp_syn_hash_size, sizeof(struct
3181 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO);
3182 if (scp == NULL) {
3183 /* Try again next time. */
3184 set->scs_use = 0;
3185 } else {
3186 free(set->scs_buckethead, M_SYNCACHE,
3187 set->scs_size *
3188 sizeof(struct syn_cache_head));
3189 set->scs_buckethead = scp;
3190 set->scs_size = tcp_syn_hash_size;
3191 for (i = 0; i < tcp_syn_hash_size; i++)
3192 TAILQ_INIT(&scp[i].sch_bucket);
3193 }
3194 }
3195 arc4random_buf(set->scs_random, sizeof(set->scs_random));
3196 tcpstat_inc(tcps_sc_seedrandom);
3197 }
3198
3199 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa,
3200 set->scs_random);
3201 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size];
3202 sc->sc_buckethead = scp;
3203
3204 /*
3205 * Make sure that we don't overflow the per-bucket
3206 * limit or the total cache size limit.
3207 */
3208 if (scp->sch_length >= tcp_syn_bucket_limit) {
3209 tcpstat_inc(tcps_sc_bucketoverflow);
3210 /*
3211 * Someone might attack our bucket hash function. Reseed
3212 * with random as soon as the passive syn cache gets empty.
3213 */
3214 set->scs_use = 0;
3215 /*
3216 * The bucket is full. Toss the oldest element in the
3217 * bucket. This will be the first entry in the bucket.
3218 */
3219 sc2 = TAILQ_FIRST(&scp->sch_bucket);
3220 #ifdef DIAGNOSTIC
3221 /*
3222 * This should never happen; we should always find an
3223 * entry in our bucket.
3224 */ 3225 if (sc2 == NULL) 3226 panic("%s: bucketoverflow: impossible", __func__); 3227 #endif 3228 syn_cache_rm(sc2); 3229 syn_cache_put(sc2); 3230 } else if (set->scs_count >= tcp_syn_cache_limit) { 3231 struct syn_cache_head *scp2, *sce; 3232 3233 tcpstat_inc(tcps_sc_overflowed); 3234 /* 3235 * The cache is full. Toss the oldest entry in the 3236 * first non-empty bucket we can find. 3237 * 3238 * XXX We would really like to toss the oldest 3239 * entry in the cache, but we hope that this 3240 * condition doesn't happen very often. 3241 */ 3242 scp2 = scp; 3243 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3244 sce = &set->scs_buckethead[set->scs_size]; 3245 for (++scp2; scp2 != scp; scp2++) { 3246 if (scp2 >= sce) 3247 scp2 = &set->scs_buckethead[0]; 3248 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3249 break; 3250 } 3251 #ifdef DIAGNOSTIC 3252 /* 3253 * This should never happen; we should always find a 3254 * non-empty bucket. 3255 */ 3256 if (scp2 == scp) 3257 panic("%s: cacheoverflow: impossible", 3258 __func__); 3259 #endif 3260 } 3261 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3262 syn_cache_rm(sc2); 3263 syn_cache_put(sc2); 3264 } 3265 3266 /* 3267 * Initialize the entry's timer. 3268 */ 3269 sc->sc_rxttot = 0; 3270 sc->sc_rxtshift = 0; 3271 SYN_CACHE_TIMER_ARM(sc); 3272 3273 /* Link it from tcpcb entry */ 3274 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3275 3276 /* Put it into the bucket. */ 3277 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3278 scp->sch_length++; 3279 sc->sc_set = set; 3280 set->scs_count++; 3281 set->scs_use--; 3282 3283 tcpstat_inc(tcps_sc_added); 3284 3285 /* 3286 * If the active cache has exceeded its use limit and 3287 * the passive syn cache is empty, exchange their roles. 3288 */ 3289 if (set->scs_use <= 0 && 3290 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3291 tcp_syn_cache_active = !tcp_syn_cache_active; 3292 } 3293 3294 /* 3295 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3296 * If we have retransmitted an entry the maximum number of times, expire 3297 * that entry. 3298 */ 3299 void 3300 syn_cache_timer(void *arg) 3301 { 3302 struct syn_cache *sc = arg; 3303 3304 NET_LOCK(); 3305 if (sc->sc_flags & SCF_DEAD) 3306 goto out; 3307 3308 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3309 /* Drop it -- too many retransmissions. */ 3310 goto dropit; 3311 } 3312 3313 /* 3314 * Compute the total amount of time this entry has 3315 * been on a queue. If this entry has been on longer 3316 * than the keep alive timer would allow, expire it. 3317 */ 3318 sc->sc_rxttot += sc->sc_rxtcur; 3319 if (sc->sc_rxttot >= tcptv_keep_init) 3320 goto dropit; 3321 3322 tcpstat_inc(tcps_sc_retransmitted); 3323 (void) syn_cache_respond(sc, NULL); 3324 3325 /* Advance the timer back-off. 
*/
3326 sc->sc_rxtshift++;
3327 SYN_CACHE_TIMER_ARM(sc);
3328
3329 out:
3330 NET_UNLOCK();
3331 return;
3332
3333 dropit:
3334 tcpstat_inc(tcps_sc_timed_out);
3335 syn_cache_rm(sc);
3336 syn_cache_put(sc);
3337 NET_UNLOCK();
3338 }
3339
3340 void
3341 syn_cache_reaper(void *arg)
3342 {
3343 struct syn_cache *sc = arg;
3344
3345 pool_put(&syn_cache_pool, (sc));
3346 return;
3347 }
3348
3349 /*
3350 * Remove the syn cache entries created by the specified tcb entry,
3351 * since it makes no sense to keep them around
3352 * (if there's no tcb entry, a syn cache entry will never be used).
3353 */
3354 void
3355 syn_cache_cleanup(struct tcpcb *tp)
3356 {
3357 struct syn_cache *sc, *nsc;
3358
3359 NET_ASSERT_LOCKED();
3360
3361 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
3362 #ifdef DIAGNOSTIC
3363 if (sc->sc_tp != tp)
3364 panic("invalid sc_tp in syn_cache_cleanup");
3365 #endif
3366 syn_cache_rm(sc);
3367 syn_cache_put(sc);
3368 }
3369 /* just for safety */
3370 LIST_INIT(&tp->t_sc);
3371 }
3372
3373 /*
3374 * Find an entry in the syn cache.
3375 */
3376 struct syn_cache *
3377 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3378 struct syn_cache_head **headp, u_int rtableid)
3379 {
3380 struct syn_cache_set *sets[2];
3381 struct syn_cache *sc;
3382 struct syn_cache_head *scp;
3383 u_int32_t hash;
3384 int i;
3385
3386 NET_ASSERT_LOCKED();
3387
3388 /* Check the active cache first, the passive cache is likely empty. */
3389 sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
3390 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
3391 for (i = 0; i < 2; i++) {
3392 if (sets[i]->scs_count == 0)
3393 continue;
3394 SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
3395 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
3396 *headp = scp;
3397 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
3398 if (sc->sc_hash != hash)
3399 continue;
3400 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3401 !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3402 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
3403 return (sc);
3404 }
3405 }
3406 return (NULL);
3407 }
3408
3409 /*
3410 * This function gets called when we receive an ACK for a
3411 * socket in the LISTEN state. We look up the connection
3412 * in the syn cache, and if it's there, we pull it out of
3413 * the cache and turn it into a full-blown connection in
3414 * the SYN-RECEIVED state.
3415 *
3416 * The return values may not be immediately obvious, and their effects
3417 * can be subtle, so here they are:
3418 *
3419 * NULL SYN was not found in cache; caller should drop the
3420 * packet and send an RST.
3421 *
3422 * -1 We were unable to create the new connection, and are
3423 * aborting it. An ACK,RST is being sent to the peer
3424 * (unless we got screwy sequence numbers; see below),
3425 * because the 3-way handshake has been completed. Caller
3426 * should not free the mbuf, since we may be using it. If
3427 * we are not, we will free it.
3428 *
3429 * Otherwise, the return value is a pointer to the new socket
3430 * associated with the connection.
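 *
 * A sketch of the expected caller pattern (illustrative, not the
 * literal tcp_input() code):
 *
 *	so = syn_cache_get(src, dst, th, hlen, tlen, so, m);
 *	if (so == NULL)
 *		drop the segment and send an RST;
 *	else if (so == (struct socket *)(-1))
 *		do nothing more -- the mbuf must not be touched again;
 *	else
 *		continue input processing on the new socket.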
3431 */ 3432 struct socket * 3433 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3434 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m) 3435 { 3436 struct syn_cache *sc; 3437 struct syn_cache_head *scp; 3438 struct inpcb *inp, *oldinp; 3439 struct tcpcb *tp = NULL; 3440 struct mbuf *am; 3441 struct socket *oso; 3442 3443 NET_ASSERT_LOCKED(); 3444 3445 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3446 if (sc == NULL) 3447 return (NULL); 3448 3449 /* 3450 * Verify the sequence and ack numbers. Try getting the correct 3451 * response again. 3452 */ 3453 if ((th->th_ack != sc->sc_iss + 1) || 3454 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3455 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3456 (void) syn_cache_respond(sc, m); 3457 return ((struct socket *)(-1)); 3458 } 3459 3460 /* Remove this cache entry */ 3461 syn_cache_rm(sc); 3462 3463 /* 3464 * Ok, create the full blown connection, and set things up 3465 * as they would have been set up if we had created the 3466 * connection when the SYN arrived. If we can't create 3467 * the connection, abort it. 3468 */ 3469 oso = so; 3470 so = sonewconn(so, SS_ISCONNECTED); 3471 if (so == NULL) 3472 goto resetandabort; 3473 3474 oldinp = sotoinpcb(oso); 3475 inp = sotoinpcb(so); 3476 3477 #ifdef IPSEC 3478 /* 3479 * We need to copy the required security levels 3480 * from the old pcb. Ditto for any other 3481 * IPsec-related information. 3482 */ 3483 memcpy(inp->inp_seclevel, oldinp->inp_seclevel, 3484 sizeof(oldinp->inp_seclevel)); 3485 #endif /* IPSEC */ 3486 #ifdef INET6 3487 /* 3488 * inp still has the OLD in_pcb stuff, set the 3489 * v6-related flags on the new guy, too. 3490 */ 3491 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3492 if (inp->inp_flags & INP_IPV6) { 3493 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3494 inp->inp_hops = oldinp->inp_hops; 3495 } else 3496 #endif /* INET6 */ 3497 { 3498 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3499 } 3500 3501 #if NPF > 0 3502 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3503 struct pf_divert *divert; 3504 3505 divert = pf_find_divert(m); 3506 KASSERT(divert != NULL); 3507 inp->inp_rtableid = divert->rdomain; 3508 } else 3509 #endif 3510 /* inherit rtable from listening socket */ 3511 inp->inp_rtableid = sc->sc_rtableid; 3512 3513 inp->inp_lport = th->th_dport; 3514 switch (src->sa_family) { 3515 #ifdef INET6 3516 case AF_INET6: 3517 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3518 break; 3519 #endif /* INET6 */ 3520 case AF_INET: 3521 inp->inp_laddr = satosin(dst)->sin_addr; 3522 inp->inp_options = ip_srcroute(m); 3523 if (inp->inp_options == NULL) { 3524 inp->inp_options = sc->sc_ipopts; 3525 sc->sc_ipopts = NULL; 3526 } 3527 break; 3528 } 3529 in_pcbrehash(inp); 3530 3531 /* 3532 * Give the new socket our cached route reference. 
3533 */
3534 if (src->sa_family == AF_INET)
3535 inp->inp_route = sc->sc_route4; /* struct assignment */
3536 #ifdef INET6
3537 else
3538 inp->inp_route6 = sc->sc_route6;
3539 #endif
3540 sc->sc_route4.ro_rt = NULL;
3541
3542 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
3543 if (am == NULL)
3544 goto resetandabort;
3545 am->m_len = src->sa_len;
3546 memcpy(mtod(am, caddr_t), src, src->sa_len);
3547 if (in_pcbconnect(inp, am)) {
3548 (void) m_free(am);
3549 goto resetandabort;
3550 }
3551 (void) m_free(am);
3552
3553 tp = intotcpcb(inp);
3554 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY);
3555 if (sc->sc_request_r_scale != 15) {
3556 tp->requested_s_scale = sc->sc_requested_s_scale;
3557 tp->request_r_scale = sc->sc_request_r_scale;
3558 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3559 }
3560 if (sc->sc_flags & SCF_TIMESTAMP)
3561 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3562
3563 tp->t_template = tcp_template(tp);
3564 if (tp->t_template == 0) {
3565 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
3566 so = NULL;
3567 goto abort;
3568 }
3569 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
3570 tp->ts_modulate = sc->sc_modulate;
3571 tp->ts_recent = sc->sc_timestamp;
3572 tp->iss = sc->sc_iss;
3573 tp->irs = sc->sc_irs;
3574 tcp_sendseqinit(tp);
3575 tp->snd_last = tp->snd_una;
3576 #ifdef TCP_ECN
3577 if (sc->sc_flags & SCF_ECN_PERMIT) {
3578 tp->t_flags |= TF_ECN_PERMIT;
3579 tcpstat_inc(tcps_ecn_accepts);
3580 }
3581 #endif
3582 if (sc->sc_flags & SCF_SACK_PERMIT)
3583 tp->t_flags |= TF_SACK_PERMIT;
3584 #ifdef TCP_SIGNATURE
3585 if (sc->sc_flags & SCF_SIGNATURE)
3586 tp->t_flags |= TF_SIGNATURE;
3587 #endif
3588 tcp_rcvseqinit(tp);
3589 tp->t_state = TCPS_SYN_RECEIVED;
3590 tp->t_rcvtime = tcp_now;
3591 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
3592 tcpstat_inc(tcps_accepts);
3593
3594 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
3595 if (sc->sc_peermaxseg)
3596 tcp_mss_update(tp);
3597 /* Reset initial window to 1 segment for retransmit */
3598 if (sc->sc_rxtshift > 0)
3599 tp->snd_cwnd = tp->t_maxseg;
3600 tp->snd_wl1 = sc->sc_irs;
3601 tp->rcv_up = sc->sc_irs + 1;
3602
3603 /*
3604 * This is what would have happened in tcp_output() when
3605 * the SYN,ACK was sent.
3606 */
3607 tp->snd_up = tp->snd_una;
3608 tp->snd_max = tp->snd_nxt = tp->iss+1;
3609 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
3610 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
3611 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
3612 tp->last_ack_sent = tp->rcv_nxt;
3613
3614 tcpstat_inc(tcps_sc_completed);
3615 syn_cache_put(sc);
3616 return (so);
3617
3618 resetandabort:
3619 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
3620 m->m_pkthdr.ph_rtableid);
3621 abort:
3622 m_freem(m);
3623 if (so != NULL)
3624 (void) soabort(so);
3625 syn_cache_put(sc);
3626 tcpstat_inc(tcps_sc_aborted);
3627 return ((struct socket *)(-1));
3628 }
3629
3630 /*
3631 * This function is called when we get a RST for a
3632 * non-existent connection, so that we can see if the
3633 * connection is in the syn cache. If it is, zap it.
3634 */ 3635 3636 void 3637 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3638 u_int rtableid) 3639 { 3640 struct syn_cache *sc; 3641 struct syn_cache_head *scp; 3642 3643 NET_ASSERT_LOCKED(); 3644 3645 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3646 return; 3647 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3648 SEQ_GT(th->th_seq, sc->sc_irs + 1)) 3649 return; 3650 syn_cache_rm(sc); 3651 tcpstat_inc(tcps_sc_reset); 3652 syn_cache_put(sc); 3653 } 3654 3655 void 3656 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3657 u_int rtableid) 3658 { 3659 struct syn_cache *sc; 3660 struct syn_cache_head *scp; 3661 3662 NET_ASSERT_LOCKED(); 3663 3664 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3665 return; 3666 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3667 if (ntohl (th->th_seq) != sc->sc_iss) { 3668 return; 3669 } 3670 3671 /* 3672 * If we've retransmitted 3 times and this is our second error, 3673 * we remove the entry. Otherwise, we allow it to continue on. 3674 * This prevents us from incorrectly nuking an entry during a 3675 * spurious network outage. 3676 * 3677 * See tcp_notify(). 3678 */ 3679 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3680 sc->sc_flags |= SCF_UNREACH; 3681 return; 3682 } 3683 3684 syn_cache_rm(sc); 3685 tcpstat_inc(tcps_sc_unreach); 3686 syn_cache_put(sc); 3687 } 3688 3689 /* 3690 * Given a LISTEN socket and an inbound SYN request, add 3691 * this to the syn cache, and send back a segment: 3692 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3693 * to the source. 3694 * 3695 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3696 * Doing so would require that we hold onto the data and deliver it 3697 * to the application. However, if we are the target of a SYN-flood 3698 * DoS attack, an attacker could send data which would eventually 3699 * consume all available buffer space if it were ACKed. By not ACKing 3700 * the data, we avoid this DoS scenario. 3701 */ 3702 3703 int 3704 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3705 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3706 struct tcp_opt_info *oi, tcp_seq *issp) 3707 { 3708 struct tcpcb tb, *tp; 3709 long win; 3710 struct syn_cache *sc; 3711 struct syn_cache_head *scp; 3712 struct mbuf *ipopts; 3713 3714 tp = sototcpcb(so); 3715 3716 /* 3717 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3718 * 3719 * Note this check is performed in tcp_input() very early on. 3720 */ 3721 3722 /* 3723 * Initialize some local state. 3724 */ 3725 win = sbspace(so, &so->so_rcv); 3726 if (win > TCP_MAXWIN) 3727 win = TCP_MAXWIN; 3728 3729 bzero(&tb, sizeof(tb)); 3730 #ifdef TCP_SIGNATURE 3731 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3732 #else 3733 if (optp) { 3734 #endif 3735 tb.pf = tp->pf; 3736 tb.sack_enable = tp->sack_enable; 3737 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3738 #ifdef TCP_SIGNATURE 3739 if (tp->t_flags & TF_SIGNATURE) 3740 tb.t_flags |= TF_SIGNATURE; 3741 #endif 3742 tb.t_state = TCPS_LISTEN; 3743 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3744 sotoinpcb(so)->inp_rtableid)) 3745 return (-1); 3746 } 3747 3748 switch (src->sa_family) { 3749 case AF_INET: 3750 /* 3751 * Remember the IP options, if any. 3752 */ 3753 ipopts = ip_srcroute(m); 3754 break; 3755 default: 3756 ipopts = NULL; 3757 } 3758 3759 /* 3760 * See if we already have an entry for this connection. 
3761 * If we do, resend the SYN,ACK. We do not count this
3762 * as a retransmission (XXX though maybe we should).
3763 */
3764 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
3765 if (sc != NULL) {
3766 tcpstat_inc(tcps_sc_dupesyn);
3767 if (ipopts) {
3768 /*
3769 * If we were remembering a previous source route,
3770 * forget it and use the new one we've been given.
3771 */
3772 m_free(sc->sc_ipopts);
3773 sc->sc_ipopts = ipopts;
3774 }
3775 sc->sc_timestamp = tb.ts_recent;
3776 if (syn_cache_respond(sc, m) == 0) {
3777 tcpstat_inc(tcps_sndacks);
3778 tcpstat_inc(tcps_sndtotal);
3779 }
3780 return (0);
3781 }
3782
3783 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
3784 if (sc == NULL) {
3785 m_free(ipopts);
3786 return (-1);
3787 }
3788
3789 /*
3790 * Fill in the cache, and put the necessary IP and TCP
3791 * options into the reply.
3792 */
3793 memcpy(&sc->sc_src, src, src->sa_len);
3794 memcpy(&sc->sc_dst, dst, dst->sa_len);
3795 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid;
3796 sc->sc_flags = 0;
3797 sc->sc_ipopts = ipopts;
3798 sc->sc_irs = th->th_seq;
3799
3800 sc->sc_iss = issp ? *issp : arc4random();
3801 sc->sc_peermaxseg = oi->maxseg;
3802 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family);
3803 sc->sc_win = win;
3804 sc->sc_timestamp = tb.ts_recent;
3805 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
3806 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
3807 sc->sc_flags |= SCF_TIMESTAMP;
3808 sc->sc_modulate = arc4random();
3809 }
3810 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3811 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
3812 sc->sc_requested_s_scale = tb.requested_s_scale;
3813 sc->sc_request_r_scale = 0;
3814 /*
3815 * Pick the smallest possible scaling factor that
3816 * will still allow us to scale up to sb_max.
3817 *
3818 * We do this because there are broken firewalls that
3819 * will corrupt the window scale option, leading to
3820 * the other endpoint believing that our advertised
3821 * window is unscaled. At scale factors larger than
3822 * 5 the unscaled window will drop below 1500 bytes,
3823 * leading to serious problems when traversing these
3824 * broken firewalls.
3825 *
3826 * With the default sb_max of 256K, a scale factor
3827 * of 3 will be chosen by this algorithm. Those who
3828 * choose a larger sb_max should watch out
3829 * for the compatibility problems mentioned above.
3830 *
3831 * RFC1323: The Window field in a SYN (i.e., a <SYN>
3832 * or <SYN,ACK>) segment itself is never scaled.
3833 */
3834 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
3835 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
3836 sc->sc_request_r_scale++;
3837 } else {
3838 sc->sc_requested_s_scale = 15;
3839 sc->sc_request_r_scale = 15;
3840 }
3841 #ifdef TCP_ECN
3842 /*
3843 * if both ECE and CWR flag bits are set, peer is ECN capable.
3844 */
3845 if (tcp_do_ecn &&
3846 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
3847 sc->sc_flags |= SCF_ECN_PERMIT;
3848 #endif
3849 /*
3850 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
3851 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
3852 */ 3853 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) 3854 sc->sc_flags |= SCF_SACK_PERMIT; 3855 #ifdef TCP_SIGNATURE 3856 if (tb.t_flags & TF_SIGNATURE) 3857 sc->sc_flags |= SCF_SIGNATURE; 3858 #endif 3859 sc->sc_tp = tp; 3860 if (syn_cache_respond(sc, m) == 0) { 3861 syn_cache_insert(sc, tp); 3862 tcpstat_inc(tcps_sndacks); 3863 tcpstat_inc(tcps_sndtotal); 3864 } else { 3865 syn_cache_put(sc); 3866 tcpstat_inc(tcps_sc_dropped); 3867 } 3868 3869 return (0); 3870 } 3871 3872 int 3873 syn_cache_respond(struct syn_cache *sc, struct mbuf *m) 3874 { 3875 u_int8_t *optp; 3876 int optlen, error; 3877 u_int16_t tlen; 3878 struct ip *ip = NULL; 3879 #ifdef INET6 3880 struct ip6_hdr *ip6 = NULL; 3881 #endif 3882 struct tcphdr *th; 3883 u_int hlen; 3884 struct inpcb *inp; 3885 3886 switch (sc->sc_src.sa.sa_family) { 3887 case AF_INET: 3888 hlen = sizeof(struct ip); 3889 break; 3890 #ifdef INET6 3891 case AF_INET6: 3892 hlen = sizeof(struct ip6_hdr); 3893 break; 3894 #endif 3895 default: 3896 m_freem(m); 3897 return (EAFNOSUPPORT); 3898 } 3899 3900 /* Compute the size of the TCP options. */ 3901 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3902 ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) + 3903 #ifdef TCP_SIGNATURE 3904 ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + 3905 #endif 3906 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3907 3908 tlen = hlen + sizeof(struct tcphdr) + optlen; 3909 3910 /* 3911 * Create the IP+TCP header from scratch. 3912 */ 3913 m_freem(m); 3914 #ifdef DIAGNOSTIC 3915 if (max_linkhdr + tlen > MCLBYTES) 3916 return (ENOBUFS); 3917 #endif 3918 MGETHDR(m, M_DONTWAIT, MT_DATA); 3919 if (m && max_linkhdr + tlen > MHLEN) { 3920 MCLGET(m, M_DONTWAIT); 3921 if ((m->m_flags & M_EXT) == 0) { 3922 m_freem(m); 3923 m = NULL; 3924 } 3925 } 3926 if (m == NULL) 3927 return (ENOBUFS); 3928 3929 /* Fixup the mbuf. */ 3930 m->m_data += max_linkhdr; 3931 m->m_len = m->m_pkthdr.len = tlen; 3932 m->m_pkthdr.ph_ifidx = 0; 3933 m->m_pkthdr.ph_rtableid = sc->sc_rtableid; 3934 memset(mtod(m, u_char *), 0, tlen); 3935 3936 switch (sc->sc_src.sa.sa_family) { 3937 case AF_INET: 3938 ip = mtod(m, struct ip *); 3939 ip->ip_dst = sc->sc_src.sin.sin_addr; 3940 ip->ip_src = sc->sc_dst.sin.sin_addr; 3941 ip->ip_p = IPPROTO_TCP; 3942 th = (struct tcphdr *)(ip + 1); 3943 th->th_dport = sc->sc_src.sin.sin_port; 3944 th->th_sport = sc->sc_dst.sin.sin_port; 3945 break; 3946 #ifdef INET6 3947 case AF_INET6: 3948 ip6 = mtod(m, struct ip6_hdr *); 3949 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3950 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3951 ip6->ip6_nxt = IPPROTO_TCP; 3952 /* ip6_plen will be updated in ip6_output() */ 3953 th = (struct tcphdr *)(ip6 + 1); 3954 th->th_dport = sc->sc_src.sin6.sin6_port; 3955 th->th_sport = sc->sc_dst.sin6.sin6_port; 3956 break; 3957 #endif 3958 default: 3959 unhandled_af(sc->sc_src.sa.sa_family); 3960 } 3961 3962 th->th_seq = htonl(sc->sc_iss); 3963 th->th_ack = htonl(sc->sc_irs + 1); 3964 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 3965 th->th_flags = TH_SYN|TH_ACK; 3966 #ifdef TCP_ECN 3967 /* Set ECE for SYN-ACK if peer supports ECN. */ 3968 if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT)) 3969 th->th_flags |= TH_ECE; 3970 #endif 3971 th->th_win = htons(sc->sc_win); 3972 /* th_sum already 0 */ 3973 /* th_urp already 0 */ 3974 3975 /* Tack on the TCP options. 
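 * The MSS option built first below is kind 2, length 4, with the
 * 16-bit MSS in network byte order; e.g. an MSS of 1460 is encoded
 * as the bytes 02 04 05 b4.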
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}
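	/*
	 * A note on the checksum step above (our reading of the code,
	 * not new logic): th_sum is zeroed first because the checksum
	 * field itself is covered by the sum.  In the AF_INET case the
	 * rest of the IP header is still zero at this point, so with
	 * only ip_src, ip_dst, ip_p and ip_len (temporarily set to the
	 * TCP length) filled in, in_cksum() over all tlen bytes yields
	 * exactly the TCP pseudo-header checksum plus the segment; the
	 * remaining IP header fields are only filled in below.
	 */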
	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  ip_len now becomes the
	 * full packet length, in network byte order (it was temporarily
	 * the TCP length for the checksum above).
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(inp);

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
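/*
 * A note on the send paths above (a sketch of our understanding, not
 * a statement of the ip_output() contract): when ip_mtudisc is
 * enabled, the IP_MTUDISC flag asks ip_output() to mark the SYN,ACK
 * don't-fragment so that path MTU discovery applies to it, and
 * sc_route4/sc_route6 let retransmits of the same cache entry reuse a
 * cached route.  A zero return from syn_cache_respond() is what allows
 * syn_cache_add() above to insert the entry into the cache; any other
 * return causes the entry to be released with syn_cache_put().
 */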