/*	$OpenBSD: tcp_input.c,v 1.352 2017/11/20 10:35:24 mpi Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
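/*
 * A worked example of the modulo comparisons above: with 32-bit
 * wraparound, a = 0x00000002 and b = 0xfffffffe give
 * (int)(a - b) == 4, so TSTMP_GEQ(a, b) holds and the wrapped
 * value correctly counts as the newer one.
 */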
/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *);
void	 syn_cache_timer(void *);
void	 syn_cache_reaper(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *);
struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
		struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
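	/*
	 * Example: if the preceding entry covers [100, 150) and the new
	 * segment starts at 120 with *tlen == 60, then i == 30; the 30
	 * overlapping bytes are trimmed from the front below, leaving
	 * the segment as [150, 180).
	 */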
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}
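/*
 * Note that sorwakeup() can call back into TCP through the socket
 * upcall; TF_BLOCKOUTPUT around the wakeup in effect defers any
 * resulting tcp_output() until the wakeup has returned.
 */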
/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
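	/*
	 * The lookup below goes from most to least specific: pf's cached
	 * pcb if available, then an exact four-tuple match, and finally
	 * a listening socket bound to the destination port.
	 */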
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		int	inpl_reverse = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_reverse = 1;
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == NULL) {
			tcpstat_inc(tcps_noport);
			goto dropwithreset_ratelim;
		}
	}
	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
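	/*
	 * Example: with snd_scale == 7 an advertised th_win of 60000
	 * yields a send window of 60000 << 7 == 7680000 bytes.  A SYN
	 * is taken unscaled: scaling is only in effect once both sides
	 * have agreed on it (RFC 1323).
	 */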
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
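				/*
				 * No full TCB is allocated here;
				 * syn_cache_add() below keeps only
				 * compressed state until the handshake
				 * completes (see the TH_ACK case
				 * above), bounding the memory a SYN
				 * flood can tie up.
				 */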
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for peer to send a RST, and
				 * RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says like
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions for
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat_inc(tcps_rcvnosec);
		goto drop;
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */

	/*
	 * Process options.
	 */
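	/*
	 * If the timestamp fast path above already consumed the options,
	 * optp was reset to NULL and tcp_dooptions() is skipped, unless
	 * TCP_SIGNATURE still requires a full option walk.
	 */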
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				ND6_HINT(tp);
				sbdrop(so, &so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
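				/*
				 * Under TCP_ECN, snd_last also marks the
				 * end of a recovery phase and must not
				 * move backward, hence the guard below;
				 * without ECN it simply mirrors snd_una.
				 */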
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(so, &so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;
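	/*
	 * In SYN_RECEIVED snd_una still equals our ISS and snd_max is
	 * normally ISS + 1 (the SYN takes one sequence number), so the
	 * range check above in effect accepts only the ACK of our
	 * SYN,ACK.
	 */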
	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		     SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
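		/*
		 * The peer's SYN occupies one sequence number, so the
		 * first data byte is at th_seq + 1; hence the increment
		 * below before any window trimming.
		 */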
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
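		/*
		 * Example: with rcv_nxt == 1000, a segment with seq 900
		 * and tlen 150 gives todrop == 100; the 100 duplicate
		 * bytes are cut from the head below and only the 50 new
		 * bytes survive.
		 */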
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
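	/*
	 * The rate-limited ACK acts as a challenge: a peer that really
	 * lost its state will answer it with a valid RST, while a blind
	 * attacker guessing in-window sequence numbers learns nothing.
	 */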
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
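			/*
			 * Example: with tcprexmtthresh == 3, the third
			 * pure duplicate of the highest ACK triggers the
			 * fast retransmit below; ssthresh drops to half
			 * the effective window and each further dup ack
			 * inflates cwnd by one maxseg, since it signals
			 * that a segment has left the network.
			 */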
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)) {
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh +
						    tp->t_maxseg * tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
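		/*
		 * A partial ACK keeps recovery going (per SACK or
		 * NewReno below); a full ACK ends fast recovery, so cwnd
		 * deflates to ssthresh, clamped to the amount of data
		 * still outstanding.
		 */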
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
		if (tp->t_dupacks < tcprexmtthresh)
			tp->snd_cwnd = ulmin(cw + incr,
			    TCP_MAXWIN << tp->snd_scale);
		}
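		/*
		 * E.g. in congestion avoidance with cwnd == 10 * maxseg,
		 * each ACK adds maxseg / 10 bytes, so a whole window of
		 * ACKs grows cwnd by roughly one maxseg per round trip.
		 */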
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(so, &so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
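		/*
		 * th_urp is a 16-bit field, so the mark can point at most
		 * 65535 bytes ahead; the check below rejects marks that
		 * would land beyond the socket buffer limit.
		 */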
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.
		 * This can happen independent of advancing the URG pointer,
		 * but if two URG's are pending at once, some out-of-band
		 * data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know
	 * that the connection is closing.  Ignore a FIN received before
	 * the connection is fully established.
	 */
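	/*
	 * Like the SYN, a FIN occupies one sequence number, which is why
	 * rcv_nxt is advanced past it below so that our ACK covers the FIN.
	 */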
Ignore a FIN received before 1960 * the connection is fully established. 1961 */ 1962 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1963 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1964 tp->t_flags |= TF_BLOCKOUTPUT; 1965 socantrcvmore(so); 1966 tp->t_flags &= ~TF_BLOCKOUTPUT; 1967 tp->t_flags |= TF_ACKNOW; 1968 tp->rcv_nxt++; 1969 } 1970 switch (tp->t_state) { 1971 1972 /* 1973 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1974 */ 1975 case TCPS_ESTABLISHED: 1976 tp->t_state = TCPS_CLOSE_WAIT; 1977 break; 1978 1979 /* 1980 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1981 * enter the CLOSING state. 1982 */ 1983 case TCPS_FIN_WAIT_1: 1984 tp->t_state = TCPS_CLOSING; 1985 break; 1986 1987 /* 1988 * In FIN_WAIT_2 state enter the TIME_WAIT state, 1989 * starting the time-wait timer, turning off the other 1990 * standard timers. 1991 */ 1992 case TCPS_FIN_WAIT_2: 1993 tp->t_state = TCPS_TIME_WAIT; 1994 tcp_canceltimers(tp); 1995 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1996 tp->t_flags |= TF_BLOCKOUTPUT; 1997 soisdisconnected(so); 1998 tp->t_flags &= ~TF_BLOCKOUTPUT; 1999 break; 2000 2001 /* 2002 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2003 */ 2004 case TCPS_TIME_WAIT: 2005 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2006 break; 2007 } 2008 } 2009 if (so->so_options & SO_DEBUG) { 2010 switch (tp->pf) { 2011 #ifdef INET6 2012 case PF_INET6: 2013 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2014 0, tlen); 2015 break; 2016 #endif /* INET6 */ 2017 case PF_INET: 2018 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2019 0, tlen); 2020 break; 2021 } 2022 } 2023 2024 /* 2025 * Return any desired output. 2026 */ 2027 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2028 (void) tcp_output(tp); 2029 return IPPROTO_DONE; 2030 2031 badsyn: 2032 /* 2033 * Received a bad SYN. Increment counters and dropwithreset. 2034 */ 2035 tcpstat_inc(tcps_badsyn); 2036 tp = NULL; 2037 goto dropwithreset; 2038 2039 dropafterack_ratelim: 2040 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2041 tcp_ackdrop_ppslim) == 0) { 2042 /* XXX stat */ 2043 goto drop; 2044 } 2045 /* ...fall into dropafterack... */ 2046 2047 dropafterack: 2048 /* 2049 * Generate an ACK dropping incoming segment if it occupies 2050 * sequence space, where the ACK reflects our state. 2051 */ 2052 if (tiflags & TH_RST) 2053 goto drop; 2054 m_freem(m); 2055 tp->t_flags |= TF_ACKNOW; 2056 (void) tcp_output(tp); 2057 return IPPROTO_DONE; 2058 2059 dropwithreset_ratelim: 2060 /* 2061 * We may want to rate-limit RSTs in certain situations, 2062 * particularly if we are sending an RST in response to 2063 * an attempt to connect to or otherwise communicate with 2064 * a port for which we have no socket. 2065 */ 2066 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2067 tcp_rst_ppslim) == 0) { 2068 /* XXX stat */ 2069 goto drop; 2070 } 2071 /* ...fall into dropwithreset... */ 2072 2073 dropwithreset: 2074 /* 2075 * Generate a RST, dropping incoming segment. 2076 * Make ACK acceptable to originator of segment. 2077 * Don't bother to respond to RST. 
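If the offending segment carried an ACK, the RST below uses that ACK value as its own sequence number so the originator will accept it; otherwise we send RST,ACK acknowledging the segment's sequence space, counting a SYN as one octet.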
2078 */ 2079 if (tiflags & TH_RST) 2080 goto drop; 2081 if (tiflags & TH_ACK) { 2082 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2083 TH_RST, m->m_pkthdr.ph_rtableid); 2084 } else { 2085 if (tiflags & TH_SYN) 2086 tlen++; 2087 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2088 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2089 } 2090 m_freem(m); 2091 return IPPROTO_DONE; 2092 2093 drop: 2094 /* 2095 * Drop space held by incoming segment and return. 2096 */ 2097 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2098 switch (tp->pf) { 2099 #ifdef INET6 2100 case PF_INET6: 2101 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2102 0, tlen); 2103 break; 2104 #endif /* INET6 */ 2105 case PF_INET: 2106 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2107 0, tlen); 2108 break; 2109 } 2110 } 2111 2112 m_freem(m); 2113 return IPPROTO_DONE; 2114 } 2115 2116 int 2117 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2118 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2119 u_int rtableid) 2120 { 2121 u_int16_t mss = 0; 2122 int opt, optlen; 2123 #ifdef TCP_SIGNATURE 2124 caddr_t sigp = NULL; 2125 struct tdb *tdb = NULL; 2126 #endif /* TCP_SIGNATURE */ 2127 2128 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2129 opt = cp[0]; 2130 if (opt == TCPOPT_EOL) 2131 break; 2132 if (opt == TCPOPT_NOP) 2133 optlen = 1; 2134 else { 2135 if (cnt < 2) 2136 break; 2137 optlen = cp[1]; 2138 if (optlen < 2 || optlen > cnt) 2139 break; 2140 } 2141 switch (opt) { 2142 2143 default: 2144 continue; 2145 2146 case TCPOPT_MAXSEG: 2147 if (optlen != TCPOLEN_MAXSEG) 2148 continue; 2149 if (!(th->th_flags & TH_SYN)) 2150 continue; 2151 if (TCPS_HAVERCVDSYN(tp->t_state)) 2152 continue; 2153 memcpy(&mss, cp + 2, sizeof(mss)); 2154 mss = ntohs(mss); 2155 oi->maxseg = mss; 2156 break; 2157 2158 case TCPOPT_WINDOW: 2159 if (optlen != TCPOLEN_WINDOW) 2160 continue; 2161 if (!(th->th_flags & TH_SYN)) 2162 continue; 2163 if (TCPS_HAVERCVDSYN(tp->t_state)) 2164 continue; 2165 tp->t_flags |= TF_RCVD_SCALE; 2166 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2167 break; 2168 2169 case TCPOPT_TIMESTAMP: 2170 if (optlen != TCPOLEN_TIMESTAMP) 2171 continue; 2172 oi->ts_present = 1; 2173 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2174 oi->ts_val = ntohl(oi->ts_val); 2175 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2176 oi->ts_ecr = ntohl(oi->ts_ecr); 2177 2178 if (!(th->th_flags & TH_SYN)) 2179 continue; 2180 if (TCPS_HAVERCVDSYN(tp->t_state)) 2181 continue; 2182 /* 2183 * A timestamp received in a SYN makes 2184 * it ok to send timestamp requests and replies. 
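The value is also copied into ts_recent below, seeding the PAWS check and our timestamp echoes from the very first segment.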
2185 */ 2186 tp->t_flags |= TF_RCVD_TSTMP; 2187 tp->ts_recent = oi->ts_val; 2188 tp->ts_recent_age = tcp_now; 2189 break; 2190 2191 case TCPOPT_SACK_PERMITTED: 2192 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2193 continue; 2194 if (!(th->th_flags & TH_SYN)) 2195 continue; 2196 if (TCPS_HAVERCVDSYN(tp->t_state)) 2197 continue; 2198 /* MUST only be set on SYN */ 2199 tp->t_flags |= TF_SACK_PERMIT; 2200 break; 2201 case TCPOPT_SACK: 2202 tcp_sack_option(tp, th, cp, optlen); 2203 break; 2204 #ifdef TCP_SIGNATURE 2205 case TCPOPT_SIGNATURE: 2206 if (optlen != TCPOLEN_SIGNATURE) 2207 continue; 2208 2209 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2210 return (-1); 2211 2212 sigp = cp + 2; 2213 break; 2214 #endif /* TCP_SIGNATURE */ 2215 } 2216 } 2217 2218 #ifdef TCP_SIGNATURE 2219 if (tp->t_flags & TF_SIGNATURE) { 2220 union sockaddr_union src, dst; 2221 2222 memset(&src, 0, sizeof(union sockaddr_union)); 2223 memset(&dst, 0, sizeof(union sockaddr_union)); 2224 2225 switch (tp->pf) { 2226 case 0: 2227 case AF_INET: 2228 src.sa.sa_len = sizeof(struct sockaddr_in); 2229 src.sa.sa_family = AF_INET; 2230 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2231 dst.sa.sa_len = sizeof(struct sockaddr_in); 2232 dst.sa.sa_family = AF_INET; 2233 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2234 break; 2235 #ifdef INET6 2236 case AF_INET6: 2237 src.sa.sa_len = sizeof(struct sockaddr_in6); 2238 src.sa.sa_family = AF_INET6; 2239 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2240 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2241 dst.sa.sa_family = AF_INET6; 2242 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2243 break; 2244 #endif /* INET6 */ 2245 } 2246 2247 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2248 0, &src, &dst, IPPROTO_TCP); 2249 2250 /* 2251 * We don't have an SA for this peer, so we turn off 2252 * TF_SIGNATURE on the listen socket 2253 */ 2254 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2255 tp->t_flags &= ~TF_SIGNATURE; 2256 2257 } 2258 2259 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2260 tcpstat_inc(tcps_rcvbadsig); 2261 return (-1); 2262 } 2263 2264 if (sigp) { 2265 char sig[16]; 2266 2267 if (tdb == NULL) { 2268 tcpstat_inc(tcps_rcvbadsig); 2269 return (-1); 2270 } 2271 2272 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2273 return (-1); 2274 2275 if (timingsafe_bcmp(sig, sigp, 16)) { 2276 tcpstat_inc(tcps_rcvbadsig); 2277 return (-1); 2278 } 2279 2280 tcpstat_inc(tcps_rcvgoodsig); 2281 } 2282 #endif /* TCP_SIGNATURE */ 2283 2284 return (0); 2285 } 2286 2287 u_long 2288 tcp_seq_subtract(u_long a, u_long b) 2289 { 2290 return ((long)(a - b)); 2291 } 2292 2293 /* 2294 * This function is called upon receipt of new valid data (while not in header 2295 * prediction mode), and it updates the ordered list of sacks. 2296 */ 2297 void 2298 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2299 tcp_seq rcv_lastend) 2300 { 2301 /* 2302 * First reported block MUST be the most recent one. Subsequent 2303 * blocks SHOULD be in the order in which they arrived at the 2304 * receiver. These two conditions make the implementation fully 2305 * compliant with RFC 2018. 
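As an example of the coalescing done below: with blocks {5,10} and {15,20} already on the list, a newly arrived segment spanning 10 to 15 merges all three into the single block {5,20}, which is then reported first.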
2306 */ 2307 int i, j = 0, count = 0, lastpos = -1; 2308 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2309 2310 /* First clean up current list of sacks */ 2311 for (i = 0; i < tp->rcv_numsacks; i++) { 2312 sack = tp->sackblks[i]; 2313 if (sack.start == 0 && sack.end == 0) { 2314 count++; /* count = number of blocks to be discarded */ 2315 continue; 2316 } 2317 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2318 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2319 count++; 2320 } else { 2321 temp[j].start = tp->sackblks[i].start; 2322 temp[j++].end = tp->sackblks[i].end; 2323 } 2324 } 2325 tp->rcv_numsacks -= count; 2326 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2327 tcp_clean_sackreport(tp); 2328 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2329 /* ==> need first sack block */ 2330 tp->sackblks[0].start = rcv_laststart; 2331 tp->sackblks[0].end = rcv_lastend; 2332 tp->rcv_numsacks = 1; 2333 } 2334 return; 2335 } 2336 /* Otherwise, sack blocks are already present. */ 2337 for (i = 0; i < tp->rcv_numsacks; i++) 2338 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2339 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2340 return; /* sack list remains unchanged */ 2341 /* 2342 * From here, segment just received should be (part of) the 1st sack. 2343 * Go through list, possibly coalescing sack block entries. 2344 */ 2345 firstsack.start = rcv_laststart; 2346 firstsack.end = rcv_lastend; 2347 for (i = 0; i < tp->rcv_numsacks; i++) { 2348 sack = tp->sackblks[i]; 2349 if (SEQ_LT(sack.end, firstsack.start) || 2350 SEQ_GT(sack.start, firstsack.end)) 2351 continue; /* no overlap */ 2352 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2353 /* 2354 * identical block; delete it here since we will 2355 * move it to the front of the list. 2356 */ 2357 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2358 lastpos = i; /* last posn with a zero entry */ 2359 continue; 2360 } 2361 if (SEQ_LEQ(sack.start, firstsack.start)) 2362 firstsack.start = sack.start; /* merge blocks */ 2363 if (SEQ_GEQ(sack.end, firstsack.end)) 2364 firstsack.end = sack.end; /* merge blocks */ 2365 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2366 lastpos = i; /* last posn with a zero entry */ 2367 } 2368 if (lastpos != -1) { /* at least one merge */ 2369 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2370 sack = tp->sackblks[i]; 2371 if (sack.start == 0 && sack.end == 0) 2372 continue; 2373 temp[j++] = sack; 2374 } 2375 tp->rcv_numsacks = j; /* including first blk (added later) */ 2376 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2377 tp->sackblks[i] = temp[i]; 2378 } else { /* no merges -- shift sacks by 1 */ 2379 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2380 tp->rcv_numsacks++; 2381 for (i = tp->rcv_numsacks-1; i > 0; i--) 2382 tp->sackblks[i] = tp->sackblks[i-1]; 2383 } 2384 tp->sackblks[0] = firstsack; 2385 return; 2386 } 2387 2388 /* 2389 * Process the TCP SACK option. tp->snd_holes is an ordered list 2390 * of holes (oldest to newest, in terms of the sequence space). 2391 */ 2392 void 2393 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2394 { 2395 int tmp_olen; 2396 u_char *tmp_cp; 2397 struct sackhole *cur, *p, *temp; 2398 2399 if (!tp->sack_enable) 2400 return; 2401 /* SACK without ACK doesn't make sense. */ 2402 if ((th->th_flags & TH_ACK) == 0) 2403 return; 2404 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
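An ACK outside that range is either stale or claims data we never sent, so any SACK blocks riding on it cannot be trusted.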
*/ 2405 if (SEQ_LT(th->th_ack, tp->snd_una) || 2406 SEQ_GT(th->th_ack, tp->snd_max)) 2407 return; 2408 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2409 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2410 return; 2411 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2412 tmp_cp = cp + 2; 2413 tmp_olen = optlen - 2; 2414 tcpstat_inc(tcps_sack_rcv_opts); 2415 if (tp->snd_numholes < 0) 2416 tp->snd_numholes = 0; 2417 if (tp->t_maxseg == 0) 2418 panic("tcp_sack_option"); /* Should never happen */ 2419 while (tmp_olen > 0) { 2420 struct sackblk sack; 2421 2422 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2423 sack.start = ntohl(sack.start); 2424 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2425 sack.end = ntohl(sack.end); 2426 tmp_olen -= TCPOLEN_SACK; 2427 tmp_cp += TCPOLEN_SACK; 2428 if (SEQ_LEQ(sack.end, sack.start)) 2429 continue; /* bad SACK fields */ 2430 if (SEQ_LEQ(sack.end, tp->snd_una)) 2431 continue; /* old block */ 2432 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2433 if (SEQ_LT(sack.start, th->th_ack)) 2434 continue; 2435 } 2436 if (SEQ_GT(sack.end, tp->snd_max)) 2437 continue; 2438 if (tp->snd_holes == NULL) { /* first hole */ 2439 tp->snd_holes = (struct sackhole *) 2440 pool_get(&sackhl_pool, PR_NOWAIT); 2441 if (tp->snd_holes == NULL) { 2442 /* ENOBUFS, so ignore SACKed block for now*/ 2443 goto done; 2444 } 2445 cur = tp->snd_holes; 2446 cur->start = th->th_ack; 2447 cur->end = sack.start; 2448 cur->rxmit = cur->start; 2449 cur->next = NULL; 2450 tp->snd_numholes = 1; 2451 tp->rcv_lastsack = sack.end; 2452 /* 2453 * dups is at least one. If more data has been 2454 * SACKed, it can be greater than one. 2455 */ 2456 cur->dups = min(tcprexmtthresh, 2457 ((sack.end - cur->end)/tp->t_maxseg)); 2458 if (cur->dups < 1) 2459 cur->dups = 1; 2460 continue; /* with next sack block */ 2461 } 2462 /* Go thru list of holes: p = previous, cur = current */ 2463 p = cur = tp->snd_holes; 2464 while (cur) { 2465 if (SEQ_LEQ(sack.end, cur->start)) 2466 /* SACKs data before the current hole */ 2467 break; /* no use going through more holes */ 2468 if (SEQ_GEQ(sack.start, cur->end)) { 2469 /* SACKs data beyond the current hole */ 2470 cur->dups++; 2471 if (((sack.end - cur->end)/tp->t_maxseg) >= 2472 tcprexmtthresh) 2473 cur->dups = tcprexmtthresh; 2474 p = cur; 2475 cur = cur->next; 2476 continue; 2477 } 2478 if (SEQ_LEQ(sack.start, cur->start)) { 2479 /* Data acks at least the beginning of hole */ 2480 if (SEQ_GEQ(sack.end, cur->end)) { 2481 /* Acks entire hole, so delete hole */ 2482 if (p != cur) { 2483 p->next = cur->next; 2484 pool_put(&sackhl_pool, cur); 2485 cur = p->next; 2486 } else { 2487 cur = cur->next; 2488 pool_put(&sackhl_pool, p); 2489 p = cur; 2490 tp->snd_holes = p; 2491 } 2492 tp->snd_numholes--; 2493 continue; 2494 } 2495 /* otherwise, move start of hole forward */ 2496 cur->start = sack.end; 2497 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2498 p = cur; 2499 cur = cur->next; 2500 continue; 2501 } 2502 /* move end of hole backward */ 2503 if (SEQ_GEQ(sack.end, cur->end)) { 2504 cur->end = sack.start; 2505 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2506 cur->dups++; 2507 if (((sack.end - cur->end)/tp->t_maxseg) >= 2508 tcprexmtthresh) 2509 cur->dups = tcprexmtthresh; 2510 p = cur; 2511 cur = cur->next; 2512 continue; 2513 } 2514 if (SEQ_LT(cur->start, sack.start) && 2515 SEQ_GT(cur->end, sack.end)) { 2516 /* 2517 * ACKs some data in middle of a hole; need to 2518 * split current hole 2519 */ 2520 temp = (struct sackhole *) 2521 
pool_get(&sackhl_pool, PR_NOWAIT); 2522 if (temp == NULL) 2523 goto done; /* ENOBUFS */ 2524 temp->next = cur->next; 2525 temp->start = sack.end; 2526 temp->end = cur->end; 2527 temp->dups = cur->dups; 2528 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2529 cur->end = sack.start; 2530 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2531 cur->dups++; 2532 if (((sack.end - cur->end)/tp->t_maxseg) >= 2533 tcprexmtthresh) 2534 cur->dups = tcprexmtthresh; 2535 cur->next = temp; 2536 p = temp; 2537 cur = p->next; 2538 tp->snd_numholes++; 2539 } 2540 } 2541 /* At this point, p points to the last hole on the list */ 2542 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2543 /* 2544 * Need to append new hole at end. 2545 * Last hole is p (and it's not NULL). 2546 */ 2547 temp = (struct sackhole *) 2548 pool_get(&sackhl_pool, PR_NOWAIT); 2549 if (temp == NULL) 2550 goto done; /* ENOBUFS */ 2551 temp->start = tp->rcv_lastsack; 2552 temp->end = sack.start; 2553 temp->dups = min(tcprexmtthresh, 2554 ((sack.end - sack.start)/tp->t_maxseg)); 2555 if (temp->dups < 1) 2556 temp->dups = 1; 2557 temp->rxmit = temp->start; 2558 temp->next = 0; 2559 p->next = temp; 2560 tp->rcv_lastsack = sack.end; 2561 tp->snd_numholes++; 2562 } 2563 } 2564 done: 2565 return; 2566 } 2567 2568 /* 2569 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if 2570 * it is completely acked; otherwise, tcp_sack_option(), called from 2571 * tcp_dooptions(), will fix up the hole. 2572 */ 2573 void 2574 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2575 { 2576 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2577 /* max because this could be an older ack just arrived */ 2578 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2579 th->th_ack : tp->snd_una; 2580 struct sackhole *cur = tp->snd_holes; 2581 struct sackhole *prev; 2582 while (cur) 2583 if (SEQ_LEQ(cur->end, lastack)) { 2584 prev = cur; 2585 cur = cur->next; 2586 pool_put(&sackhl_pool, prev); 2587 tp->snd_numholes--; 2588 } else if (SEQ_LT(cur->start, lastack)) { 2589 cur->start = lastack; 2590 if (SEQ_LT(cur->rxmit, cur->start)) 2591 cur->rxmit = cur->start; 2592 break; 2593 } else 2594 break; 2595 tp->snd_holes = cur; 2596 } 2597 } 2598 2599 /* 2600 * Delete all receiver-side SACK information. 2601 */ 2602 void 2603 tcp_clean_sackreport(struct tcpcb *tp) 2604 { 2605 int i; 2606 2607 tp->rcv_numsacks = 0; 2608 for (i = 0; i < MAX_SACK_BLKS; i++) 2609 tp->sackblks[i].start = tp->sackblks[i].end=0; 2610 2611 } 2612 2613 /* 2614 * Partial ack handling within a sack recovery episode. When a partial ack 2615 * arrives, turn off retransmission timer, deflate the window, do not clear 2616 * tp->t_dupacks. 2617 */ 2618 void 2619 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2620 { 2621 /* Turn off retx. timer (will start again next segment) */ 2622 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2623 tp->t_rtttime = 0; 2624 /* 2625 * Partial window deflation. This statement relies on the 2626 * fact that tp->snd_una has not been updated yet. 2627 */ 2628 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2629 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2630 tp->snd_cwnd += tp->t_maxseg; 2631 } else 2632 tp->snd_cwnd = tp->t_maxseg; 2633 tp->snd_cwnd += tp->t_maxseg; 2634 tp->t_flags |= TF_NEEDOUTPUT; 2635 } 2636 2637 /* 2638 * Pull out of band byte out of a segment so 2639 * it doesn't appear in the user's data queue. 2640 * It is still reflected in the segment length for 2641 * sequencing purposes. 
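The byte sits at offset off + urgent - 1 into the segment; it is saved in t_iobc and then spliced out of the mbuf chain with memmove().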
2642 */ 2643 void 2644 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2645 { 2646 int cnt = off + urgent - 1; 2647 2648 while (cnt >= 0) { 2649 if (m->m_len > cnt) { 2650 char *cp = mtod(m, caddr_t) + cnt; 2651 struct tcpcb *tp = sototcpcb(so); 2652 2653 tp->t_iobc = *cp; 2654 tp->t_oobflags |= TCPOOB_HAVEDATA; 2655 memmove(cp, cp + 1, m->m_len - cnt - 1); 2656 m->m_len--; 2657 return; 2658 } 2659 cnt -= m->m_len; 2660 m = m->m_next; 2661 if (m == NULL) 2662 break; 2663 } 2664 panic("tcp_pulloutofband"); 2665 } 2666 2667 /* 2668 * Collect new round-trip time estimate 2669 * and update averages and current timeout. 2670 */ 2671 void 2672 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2673 { 2674 short delta; 2675 short rttmin; 2676 2677 if (rtt < 0) 2678 rtt = 0; 2679 else if (rtt > TCP_RTT_MAX) 2680 rtt = TCP_RTT_MAX; 2681 2682 tcpstat_inc(tcps_rttupdated); 2683 if (tp->t_srtt != 0) { 2684 /* 2685 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2686 * after the binary point (scaled by 4), whereas 2687 * srtt is stored as fixed point with 5 bits after the 2688 * binary point (i.e., scaled by 32). The following magic 2689 * is equivalent to the smoothing algorithm in rfc793 with 2690 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2691 * point). 2692 */ 2693 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2694 (tp->t_srtt >> TCP_RTT_SHIFT); 2695 if ((tp->t_srtt += delta) <= 0) 2696 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2697 /* 2698 * We accumulate a smoothed rtt variance (actually, a 2699 * smoothed mean difference), then set the retransmit 2700 * timer to smoothed rtt + 4 times the smoothed variance. 2701 * rttvar is stored as fixed point with 4 bits after the 2702 * binary point (scaled by 16). The following is 2703 * equivalent to rfc793 smoothing with an alpha of .75 2704 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2705 * rfc793's wired-in beta. 2706 */ 2707 if (delta < 0) 2708 delta = -delta; 2709 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2710 if ((tp->t_rttvar += delta) <= 0) 2711 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2712 } else { 2713 /* 2714 * No rtt measurement yet - use the unsmoothed rtt. 2715 * Set the variance to half the rtt (so our first 2716 * retransmit happens at 3*rtt). 2717 */ 2718 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2719 tp->t_rttvar = (rtt + 1) << 2720 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2721 } 2722 tp->t_rtttime = 0; 2723 tp->t_rxtshift = 0; 2724 2725 /* 2726 * the retransmit should happen at rtt + 4 * rttvar. 2727 * Because of the way we do the smoothing, srtt and rttvar 2728 * will each average +1/2 tick of bias. When we compute 2729 * the retransmit timer, we want 1/2 tick of rounding and 2730 * 1 extra tick because of +-1/2 tick uncertainty in the 2731 * firing of the timer. The bias will give us exactly the 2732 * 1.5 tick we need. But, because the bias is 2733 * statistical, we have to test that we don't drop below 2734 * the minimum feasible timer (which is 2 ticks). 2735 */ 2736 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2737 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2738 2739 /* 2740 * We received an ack for a packet that wasn't retransmitted; 2741 * it is probably safe to discard any error indications we've 2742 * received recently. This isn't quite right, but close enough 2743 * for now (a route might have failed after we sent a segment, 2744 * and the return path might not be symmetrical). 
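Any such soft error would have been recorded in t_softerror by tcp_notify() rather than acted on immediately.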
2745 */ 2746 tp->t_softerror = 0; 2747 } 2748 2749 /* 2750 * Determine a reasonable value for maxseg size. 2751 * If the route is known, check route for mtu. 2752 * If none, use an mss that can be handled on the outgoing 2753 * interface without forcing IP to fragment; if bigger than 2754 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2755 * to utilize large mbufs. If no route is found, route has no mtu, 2756 * or the destination isn't local, use a default, hopefully conservative 2757 * size (usually 512 or the default IP max size, but no more than the mtu 2758 * of the interface), as we can't discover anything about intervening 2759 * gateways or networks. We also initialize the congestion/slow start 2760 * window to be a single segment if the destination isn't local. 2761 * While looking at the routing entry, we also initialize other path-dependent 2762 * parameters from pre-set or cached values in the routing entry. 2763 * 2764 * Also take into account the space needed for options that we 2765 * send regularly. Make maxseg shorter by that amount to assure 2766 * that we can send maxseg amount of data even when the options 2767 * are present. Store the upper limit of the length of options plus 2768 * data in maxopd. 2769 * 2770 * NOTE: offer == -1 indicates that the maxseg size changed due to 2771 * Path MTU discovery. 2772 */ 2773 int 2774 tcp_mss(struct tcpcb *tp, int offer) 2775 { 2776 struct rtentry *rt; 2777 struct ifnet *ifp = NULL; 2778 int mss, mssopt; 2779 int iphlen; 2780 struct inpcb *inp; 2781 2782 inp = tp->t_inpcb; 2783 2784 mssopt = mss = tcp_mssdflt; 2785 2786 rt = in_pcbrtentry(inp); 2787 2788 if (rt == NULL) 2789 goto out; 2790 2791 ifp = if_get(rt->rt_ifidx); 2792 if (ifp == NULL) 2793 goto out; 2794 2795 switch (tp->pf) { 2796 #ifdef INET6 2797 case AF_INET6: 2798 iphlen = sizeof(struct ip6_hdr); 2799 break; 2800 #endif 2801 case AF_INET: 2802 iphlen = sizeof(struct ip); 2803 break; 2804 default: 2805 /* the family does not support path MTU discovery */ 2806 goto out; 2807 } 2808 2809 /* 2810 * if there's an mtu associated with the route and we support 2811 * path MTU discovery for the underlying protocol family, use it. 2812 */ 2813 if (rt->rt_mtu) { 2814 /* 2815 * One may wish to lower MSS to take into account options, 2816 * especially security-related options. 2817 */ 2818 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2819 /* 2820 * RFC2460 section 5, last paragraph: if path MTU is 2821 * smaller than 1280, use 1280 as packet size and 2822 * attach fragment header. 2823 */ 2824 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2825 sizeof(struct tcphdr); 2826 } else { 2827 mss = rt->rt_mtu - iphlen - 2828 sizeof(struct tcphdr); 2829 } 2830 } else if (ifp->if_flags & IFF_LOOPBACK) { 2831 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2832 } else if (tp->pf == AF_INET) { 2833 if (ip_mtudisc) 2834 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2835 } 2836 #ifdef INET6 2837 else if (tp->pf == AF_INET6) { 2838 /* 2839 * for IPv6, path MTU discovery is always turned on, 2840 * or the node must use packet size <= 1280. 2841 */ 2842 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2843 } 2844 #endif /* INET6 */ 2845 2846 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2847 if (offer != -1) { 2848 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2849 mssopt = max(tcp_mssdflt, mssopt); 2850 } 2851 out: 2852 if_put(ifp); 2853 /* 2854 * The current mss, t_maxseg, is initialized to the default value. 
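(That default is tcp_mssdflt, normally 512.)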
2855 * If we compute a smaller value, reduce the current mss. 2856 * If we compute a larger value, return it for use in sending 2857 * a max seg size option, but don't store it for use 2858 * unless we received an offer at least that large from peer. 2859 * 2860 * However, do not accept offers lower than the minimum of 2861 * the interface MTU and 216. 2862 */ 2863 if (offer > 0) 2864 tp->t_peermss = offer; 2865 if (tp->t_peermss) 2866 mss = min(mss, max(tp->t_peermss, 216)); 2867 2868 /* sanity - at least max opt. space */ 2869 mss = max(mss, 64); 2870 2871 /* 2872 * maxopd stores the maximum length of data AND options 2873 * in a segment; maxseg is the amount of data in a normal 2874 * segment. We need to store this value (maxopd) apart 2875 * from maxseg, because now every segment carries options 2876 * and thus we normally have somewhat less data in segments. 2877 */ 2878 tp->t_maxopd = mss; 2879 2880 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2881 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2882 mss -= TCPOLEN_TSTAMP_APPA; 2883 #ifdef TCP_SIGNATURE 2884 if (tp->t_flags & TF_SIGNATURE) 2885 mss -= TCPOLEN_SIGLEN; 2886 #endif 2887 2888 if (offer == -1) { 2889 /* mss changed due to Path MTU discovery */ 2890 tp->t_flags &= ~TF_PMTUD_PEND; 2891 tp->t_pmtud_mtu_sent = 0; 2892 tp->t_pmtud_mss_acked = 0; 2893 if (mss < tp->t_maxseg) { 2894 /* 2895 * Follow suggestion in RFC 2414 to reduce the 2896 * congestion window by the ratio of the old 2897 * segment size to the new segment size. 2898 */ 2899 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2900 mss, mss); 2901 } 2902 } else if (tcp_do_rfc3390 == 2) { 2903 /* increase initial window */ 2904 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2905 } else if (tcp_do_rfc3390) { 2906 /* increase initial window */ 2907 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2908 } else 2909 tp->snd_cwnd = mss; 2910 2911 tp->t_maxseg = mss; 2912 2913 return (offer != -1 ? mssopt : mss); 2914 } 2915 2916 u_int 2917 tcp_hdrsz(struct tcpcb *tp) 2918 { 2919 u_int hlen; 2920 2921 switch (tp->pf) { 2922 #ifdef INET6 2923 case AF_INET6: 2924 hlen = sizeof(struct ip6_hdr); 2925 break; 2926 #endif 2927 case AF_INET: 2928 hlen = sizeof(struct ip); 2929 break; 2930 default: 2931 hlen = 0; 2932 break; 2933 } 2934 hlen += sizeof(struct tcphdr); 2935 2936 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2937 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2938 hlen += TCPOLEN_TSTAMP_APPA; 2939 #ifdef TCP_SIGNATURE 2940 if (tp->t_flags & TF_SIGNATURE) 2941 hlen += TCPOLEN_SIGLEN; 2942 #endif 2943 return (hlen); 2944 } 2945 2946 /* 2947 * Set connection variables based on the effective MSS. 2948 * We are passed the TCPCB for the actual connection. If we 2949 * are the server, we are called by the compressed state engine 2950 * when the 3-way handshake is complete. If we are the client, 2951 * we are called when we receive the SYN,ACK from the server. 2952 * 2953 * NOTE: The t_maxseg value must be initialized in the TCPCB 2954 * before this routine is called! 
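Below, the send buffer is either shrunk to fit (driving a new, smaller MSS through tcp_mss()) or rounded up to a multiple of the MSS, capped at sb_max; the receive buffer is rounded the same way, so each buffer holds a whole number of segments.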
2955 */ 2956 void 2957 tcp_mss_update(struct tcpcb *tp) 2958 { 2959 int mss; 2960 u_long bufsize; 2961 struct rtentry *rt; 2962 struct socket *so; 2963 2964 so = tp->t_inpcb->inp_socket; 2965 mss = tp->t_maxseg; 2966 2967 rt = in_pcbrtentry(tp->t_inpcb); 2968 2969 if (rt == NULL) 2970 return; 2971 2972 bufsize = so->so_snd.sb_hiwat; 2973 if (bufsize < mss) { 2974 mss = bufsize; 2975 /* Update t_maxseg and t_maxopd */ 2976 tcp_mss(tp, mss); 2977 } else { 2978 bufsize = roundup(bufsize, mss); 2979 if (bufsize > sb_max) 2980 bufsize = sb_max; 2981 (void)sbreserve(so, &so->so_snd, bufsize); 2982 } 2983 2984 bufsize = so->so_rcv.sb_hiwat; 2985 if (bufsize > mss) { 2986 bufsize = roundup(bufsize, mss); 2987 if (bufsize > sb_max) 2988 bufsize = sb_max; 2989 (void)sbreserve(so, &so->so_rcv, bufsize); 2990 } 2991 2992 } 2993 2994 /* 2995 * When a partial ack arrives, force the retransmission of the 2996 * next unacknowledged segment. Do not clear tp->t_dupacks. 2997 * By setting snd_nxt to ti_ack, this forces retransmission timer 2998 * to be started again. 2999 */ 3000 void 3001 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 3002 { 3003 /* 3004 * snd_una has not been updated and the socket send buffer 3005 * not yet drained of the acked data, so we have to leave 3006 * snd_una as it was to get the correct data offset in 3007 * tcp_output(). 3008 */ 3009 tcp_seq onxt = tp->snd_nxt; 3010 u_long ocwnd = tp->snd_cwnd; 3011 3012 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3013 tp->t_rtttime = 0; 3014 tp->snd_nxt = th->th_ack; 3015 /* 3016 * Set snd_cwnd to one segment beyond acknowledged offset 3017 * (tp->snd_una not yet updated when this function is called) 3018 */ 3019 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3020 (void)tcp_output(tp); 3021 tp->snd_cwnd = ocwnd; 3022 if (SEQ_GT(onxt, tp->snd_nxt)) 3023 tp->snd_nxt = onxt; 3024 /* 3025 * Partial window deflation. Relies on fact that tp->snd_una 3026 * not updated yet. 3027 */ 3028 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3029 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3030 else 3031 tp->snd_cwnd = 0; 3032 tp->snd_cwnd += tp->t_maxseg; 3033 } 3034 3035 int 3036 tcp_mss_adv(struct mbuf *m, int af) 3037 { 3038 int mss = 0; 3039 int iphlen; 3040 struct ifnet *ifp = NULL; 3041 3042 if (m && (m->m_flags & M_PKTHDR)) 3043 ifp = if_get(m->m_pkthdr.ph_ifidx); 3044 3045 switch (af) { 3046 case AF_INET: 3047 if (ifp != NULL) 3048 mss = ifp->if_mtu; 3049 iphlen = sizeof(struct ip); 3050 break; 3051 #ifdef INET6 3052 case AF_INET6: 3053 if (ifp != NULL) 3054 mss = ifp->if_mtu; 3055 iphlen = sizeof(struct ip6_hdr); 3056 break; 3057 #endif 3058 default: 3059 unhandled_af(af); 3060 } 3061 if_put(ifp); 3062 mss = mss - iphlen - sizeof(struct tcphdr); 3063 return (max(mss, tcp_mssdflt)); 3064 } 3065 3066 /* 3067 * TCP compressed state engine. Currently used to hold compressed 3068 * state for SYN_RECEIVED. 
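Each cache entry records just enough of the SYN (addresses and ports, ISS and IRS, the negotiated options) to build the real tcpcb once the handshake completes, so a flood of SYNs does not tie up full protocol control blocks.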
3069 */ 3070 3071 /* syn hash parameters */ 3072 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; 3073 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3074 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3075 int tcp_syn_use_limit = 100000; 3076 3077 struct syn_cache_set tcp_syn_cache[2]; 3078 int tcp_syn_cache_active; 3079 3080 #define SYN_HASH(sa, sp, dp, rand) \ 3081 (((sa)->s_addr ^ (rand)[0]) * \ 3082 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3083 #ifndef INET6 3084 #define SYN_HASHALL(hash, src, dst, rand) \ 3085 do { \ 3086 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3087 satosin(src)->sin_port, \ 3088 satosin(dst)->sin_port, (rand)); \ 3089 } while (/*CONSTCOND*/ 0) 3090 #else 3091 #define SYN_HASH6(sa, sp, dp, rand) \ 3092 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3093 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3094 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3095 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3096 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3097 3098 #define SYN_HASHALL(hash, src, dst, rand) \ 3099 do { \ 3100 switch ((src)->sa_family) { \ 3101 case AF_INET: \ 3102 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3103 satosin(src)->sin_port, \ 3104 satosin(dst)->sin_port, (rand)); \ 3105 break; \ 3106 case AF_INET6: \ 3107 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3108 satosin6(src)->sin6_port, \ 3109 satosin6(dst)->sin6_port, (rand)); \ 3110 break; \ 3111 default: \ 3112 hash = 0; \ 3113 } \ 3114 } while (/*CONSTCOND*/0) 3115 #endif /* INET6 */ 3116 3117 void 3118 syn_cache_rm(struct syn_cache *sc) 3119 { 3120 sc->sc_flags |= SCF_DEAD; 3121 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3122 sc->sc_tp = NULL; 3123 LIST_REMOVE(sc, sc_tpq); 3124 sc->sc_buckethead->sch_length--; 3125 timeout_del(&sc->sc_timer); 3126 sc->sc_set->scs_count--; 3127 } 3128 3129 void 3130 syn_cache_put(struct syn_cache *sc) 3131 { 3132 m_free(sc->sc_ipopts); 3133 if (sc->sc_route4.ro_rt != NULL) { 3134 rtfree(sc->sc_route4.ro_rt); 3135 sc->sc_route4.ro_rt = NULL; 3136 } 3137 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3138 timeout_add(&sc->sc_timer, 0); 3139 } 3140 3141 struct pool syn_cache_pool; 3142 3143 /* 3144 * We don't estimate RTT with SYNs, so each packet starts with the default 3145 * RTT and each timer step has a fixed timeout value. 3146 */ 3147 #define SYN_CACHE_TIMER_ARM(sc) \ 3148 do { \ 3149 TCPT_RANGESET((sc)->sc_rxtcur, \ 3150 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3151 TCPTV_REXMTMAX); \ 3152 if (!timeout_initialized(&(sc)->sc_timer)) \ 3153 timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3154 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3155 } while (/*CONSTCOND*/0) 3156 3157 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3158 3159 void 3160 syn_cache_init(void) 3161 { 3162 int i; 3163 3164 /* Initialize the hash buckets. */ 3165 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3166 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3167 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3168 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3169 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3170 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3171 for (i = 0; i < tcp_syn_hash_size; i++) { 3172 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3173 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3174 } 3175 3176 /* Initialize the syn cache pool. 
*/ 3177 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3178 0, "syncache", NULL); 3179 } 3180 3181 void 3182 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3183 { 3184 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active]; 3185 struct syn_cache_head *scp; 3186 struct syn_cache *sc2; 3187 int i; 3188 3189 NET_ASSERT_LOCKED(); 3190 3191 /* 3192 * If there are no entries in the hash table, reinitialize 3193 * the hash secrets. To avoid useless cache swaps and 3194 * reinitialization, use it until the limit is reached. 3195 * An empty cache is also an opportunity to resize the hash. 3196 */ 3197 if (set->scs_count == 0 && set->scs_use <= 0) { 3198 set->scs_use = tcp_syn_use_limit; 3199 if (set->scs_size != tcp_syn_hash_size) { 3200 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3201 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3202 if (scp == NULL) { 3203 /* Try again next time. */ 3204 set->scs_use = 0; 3205 } else { 3206 free(set->scs_buckethead, M_SYNCACHE, 3207 set->scs_size * 3208 sizeof(struct syn_cache_head)); 3209 set->scs_buckethead = scp; 3210 set->scs_size = tcp_syn_hash_size; 3211 for (i = 0; i < tcp_syn_hash_size; i++) 3212 TAILQ_INIT(&scp[i].sch_bucket); 3213 } 3214 } 3215 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3216 tcpstat_inc(tcps_sc_seedrandom); 3217 } 3218 3219 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3220 set->scs_random); 3221 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3222 sc->sc_buckethead = scp; 3223 3224 /* 3225 * Make sure that we don't overflow the per-bucket 3226 * limit or the total cache size limit. 3227 */ 3228 if (scp->sch_length >= tcp_syn_bucket_limit) { 3229 tcpstat_inc(tcps_sc_bucketoverflow); 3230 /* 3231 * Someone might attack our bucket hash function. Reseed 3232 * with random as soon as the passive syn cache gets empty. 3233 */ 3234 set->scs_use = 0; 3235 /* 3236 * The bucket is full. Toss the oldest element in the 3237 * bucket. This will be the first entry in the bucket. 3238 */ 3239 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3240 #ifdef DIAGNOSTIC 3241 /* 3242 * This should never happen; we should always find an 3243 * entry in our bucket. 3244 */ 3245 if (sc2 == NULL) 3246 panic("%s: bucketoverflow: impossible", __func__); 3247 #endif 3248 syn_cache_rm(sc2); 3249 syn_cache_put(sc2); 3250 } else if (set->scs_count >= tcp_syn_cache_limit) { 3251 struct syn_cache_head *scp2, *sce; 3252 3253 tcpstat_inc(tcps_sc_overflowed); 3254 /* 3255 * The cache is full. Toss the oldest entry in the 3256 * first non-empty bucket we can find. 3257 * 3258 * XXX We would really like to toss the oldest 3259 * entry in the cache, but we hope that this 3260 * condition doesn't happen very often. 3261 */ 3262 scp2 = scp; 3263 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3264 sce = &set->scs_buckethead[set->scs_size]; 3265 for (++scp2; scp2 != scp; scp2++) { 3266 if (scp2 >= sce) 3267 scp2 = &set->scs_buckethead[0]; 3268 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3269 break; 3270 } 3271 #ifdef DIAGNOSTIC 3272 /* 3273 * This should never happen; we should always find a 3274 * non-empty bucket. 3275 */ 3276 if (scp2 == scp) 3277 panic("%s: cacheoverflow: impossible", 3278 __func__); 3279 #endif 3280 } 3281 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3282 syn_cache_rm(sc2); 3283 syn_cache_put(sc2); 3284 } 3285 3286 /* 3287 * Initialize the entry's timer.
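The timer drives SYN,ACK retransmission; since no RTT estimate exists yet, SYN_CACHE_TIMER_ARM() applies a fixed backoff per retransmit shift.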
3288 */ 3289 sc->sc_rxttot = 0; 3290 sc->sc_rxtshift = 0; 3291 SYN_CACHE_TIMER_ARM(sc); 3292 3293 /* Link it from tcpcb entry */ 3294 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3295 3296 /* Put it into the bucket. */ 3297 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3298 scp->sch_length++; 3299 sc->sc_set = set; 3300 set->scs_count++; 3301 set->scs_use--; 3302 3303 tcpstat_inc(tcps_sc_added); 3304 3305 /* 3306 * If the active cache has exceeded its use limit and 3307 * the passive syn cache is empty, exchange their roles. 3308 */ 3309 if (set->scs_use <= 0 && 3310 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3311 tcp_syn_cache_active = !tcp_syn_cache_active; 3312 } 3313 3314 /* 3315 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3316 * If we have retransmitted an entry the maximum number of times, expire 3317 * that entry. 3318 */ 3319 void 3320 syn_cache_timer(void *arg) 3321 { 3322 struct syn_cache *sc = arg; 3323 3324 NET_LOCK(); 3325 if (sc->sc_flags & SCF_DEAD) 3326 goto out; 3327 3328 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3329 /* Drop it -- too many retransmissions. */ 3330 goto dropit; 3331 } 3332 3333 /* 3334 * Compute the total amount of time this entry has 3335 * been on a queue. If this entry has been on longer 3336 * than the keep alive timer would allow, expire it. 3337 */ 3338 sc->sc_rxttot += sc->sc_rxtcur; 3339 if (sc->sc_rxttot >= tcptv_keep_init) 3340 goto dropit; 3341 3342 tcpstat_inc(tcps_sc_retransmitted); 3343 (void) syn_cache_respond(sc, NULL); 3344 3345 /* Advance the timer back-off. */ 3346 sc->sc_rxtshift++; 3347 SYN_CACHE_TIMER_ARM(sc); 3348 3349 out: 3350 NET_UNLOCK(); 3351 return; 3352 3353 dropit: 3354 tcpstat_inc(tcps_sc_timed_out); 3355 syn_cache_rm(sc); 3356 syn_cache_put(sc); 3357 NET_UNLOCK(); 3358 } 3359 3360 void 3361 syn_cache_reaper(void *arg) 3362 { 3363 struct syn_cache *sc = arg; 3364 3365 pool_put(&syn_cache_pool, (sc)); 3366 return; 3367 } 3368 3369 /* 3370 * Remove the syn cache entries created by the specified tcb entry; 3371 * it makes no sense to keep them 3372 * (if there's no tcb entry, a syn cache entry will never be used) 3373 */ 3374 void 3375 syn_cache_cleanup(struct tcpcb *tp) 3376 { 3377 struct syn_cache *sc, *nsc; 3378 3379 NET_ASSERT_LOCKED(); 3380 3381 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) { 3382 #ifdef DIAGNOSTIC 3383 if (sc->sc_tp != tp) 3384 panic("invalid sc_tp in syn_cache_cleanup"); 3385 #endif 3386 syn_cache_rm(sc); 3387 syn_cache_put(sc); 3388 } 3389 /* just for safety */ 3390 LIST_INIT(&tp->t_sc); 3391 } 3392 3393 /* 3394 * Find an entry in the syn cache. 3395 */ 3396 struct syn_cache * 3397 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst, 3398 struct syn_cache_head **headp, u_int rtableid) 3399 { 3400 struct syn_cache_set *sets[2]; 3401 struct syn_cache *sc; 3402 struct syn_cache_head *scp; 3403 u_int32_t hash; 3404 int i; 3405 3406 NET_ASSERT_LOCKED(); 3407 3408 /* Check the active cache first; the passive cache is likely empty.
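Entries created before the most recent role swap still live in the passive set until they complete or expire, so both generations are probed.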
*/ 3409 sets[0] = &tcp_syn_cache[tcp_syn_cache_active]; 3410 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active]; 3411 for (i = 0; i < 2; i++) { 3412 if (sets[i]->scs_count == 0) 3413 continue; 3414 SYN_HASHALL(hash, src, dst, sets[i]->scs_random); 3415 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size]; 3416 *headp = scp; 3417 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) { 3418 if (sc->sc_hash != hash) 3419 continue; 3420 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3421 !bcmp(&sc->sc_dst, dst, dst->sa_len) && 3422 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) 3423 return (sc); 3424 } 3425 } 3426 return (NULL); 3427 } 3428 3429 /* 3430 * This function gets called when we receive an ACK for a 3431 * socket in the LISTEN state. We look up the connection 3432 * in the syn cache, and if it's there, we pull it out of 3433 * the cache and turn it into a full-blown connection in 3434 * the SYN-RECEIVED state. 3435 * 3436 * The return values may not be immediately obvious, and their effects 3437 * can be subtle, so here they are: 3438 * 3439 * NULL SYN was not found in cache; caller should drop the 3440 * packet and send an RST. 3441 * 3442 * -1 We were unable to create the new connection, and are 3443 * aborting it. An ACK,RST is being sent to the peer 3444 * (unless we got screwy sequence numbers; see below), 3445 * because the 3-way handshake has been completed. Caller 3446 * should not free the mbuf, since we may be using it. If 3447 * we are not, we will free it. 3448 * 3449 * Otherwise, the return value is a pointer to the new socket 3450 * associated with the connection. 3451 */ 3452 struct socket * 3453 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3454 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m) 3455 { 3456 struct syn_cache *sc; 3457 struct syn_cache_head *scp; 3458 struct inpcb *inp, *oldinp; 3459 struct tcpcb *tp = NULL; 3460 struct mbuf *am; 3461 struct socket *oso; 3462 #if NPF > 0 3463 struct pf_divert *divert = NULL; 3464 #endif 3465 3466 NET_ASSERT_LOCKED(); 3467 3468 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3469 if (sc == NULL) 3470 return (NULL); 3471 3472 /* 3473 * Verify the sequence and ack numbers. Try getting the correct 3474 * response again. 3475 */ 3476 if ((th->th_ack != sc->sc_iss + 1) || 3477 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3478 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3479 (void) syn_cache_respond(sc, m); 3480 return ((struct socket *)(-1)); 3481 } 3482 3483 /* Remove this cache entry */ 3484 syn_cache_rm(sc); 3485 3486 /* 3487 * Ok, create the full blown connection, and set things up 3488 * as they would have been set up if we had created the 3489 * connection when the SYN arrived. If we can't create 3490 * the connection, abort it. 3491 */ 3492 oso = so; 3493 so = sonewconn(so, SS_ISCONNECTED); 3494 if (so == NULL) 3495 goto resetandabort; 3496 3497 oldinp = sotoinpcb(oso); 3498 inp = sotoinpcb(so); 3499 3500 #ifdef IPSEC 3501 /* 3502 * We need to copy the required security levels 3503 * from the old pcb. Ditto for any other 3504 * IPsec-related information. 3505 */ 3506 memcpy(inp->inp_seclevel, oldinp->inp_seclevel, 3507 sizeof(oldinp->inp_seclevel)); 3508 #endif /* IPSEC */ 3509 #ifdef INET6 3510 /* 3511 * inp still has the OLD in_pcb stuff, set the 3512 * v6-related flags on the new guy, too.
3513 */ 3514 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3515 if (inp->inp_flags & INP_IPV6) { 3516 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3517 inp->inp_hops = oldinp->inp_hops; 3518 } else 3519 #endif /* INET6 */ 3520 { 3521 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3522 } 3523 3524 #if NPF > 0 3525 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED && 3526 (divert = pf_find_divert(m)) != NULL) 3527 inp->inp_rtableid = divert->rdomain; 3528 else 3529 #endif 3530 /* inherit rtable from listening socket */ 3531 inp->inp_rtableid = sc->sc_rtableid; 3532 3533 inp->inp_lport = th->th_dport; 3534 switch (src->sa_family) { 3535 #ifdef INET6 3536 case AF_INET6: 3537 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3538 break; 3539 #endif /* INET6 */ 3540 case AF_INET: 3541 inp->inp_laddr = satosin(dst)->sin_addr; 3542 inp->inp_options = ip_srcroute(m); 3543 if (inp->inp_options == NULL) { 3544 inp->inp_options = sc->sc_ipopts; 3545 sc->sc_ipopts = NULL; 3546 } 3547 break; 3548 } 3549 in_pcbrehash(inp); 3550 3551 /* 3552 * Give the new socket our cached route reference. 3553 */ 3554 if (src->sa_family == AF_INET) 3555 inp->inp_route = sc->sc_route4; /* struct assignment */ 3556 #ifdef INET6 3557 else 3558 inp->inp_route6 = sc->sc_route6; 3559 #endif 3560 sc->sc_route4.ro_rt = NULL; 3561 3562 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3563 if (am == NULL) 3564 goto resetandabort; 3565 am->m_len = src->sa_len; 3566 memcpy(mtod(am, caddr_t), src, src->sa_len); 3567 3568 switch (src->sa_family) { 3569 case AF_INET: 3570 /* drop IPv4 packet to AF_INET6 socket */ 3571 if (inp->inp_flags & INP_IPV6) { 3572 (void) m_free(am); 3573 goto resetandabort; 3574 } 3575 if (in_pcbconnect(inp, am)) { 3576 (void) m_free(am); 3577 goto resetandabort; 3578 } 3579 break; 3580 #ifdef INET6 3581 case AF_INET6: 3582 if (in6_pcbconnect(inp, am)) { 3583 (void) m_free(am); 3584 goto resetandabort; 3585 } 3586 break; 3587 #endif 3588 } 3589 (void) m_free(am); 3590 3591 tp = intotcpcb(inp); 3592 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3593 if (sc->sc_request_r_scale != 15) { 3594 tp->requested_s_scale = sc->sc_requested_s_scale; 3595 tp->request_r_scale = sc->sc_request_r_scale; 3596 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3597 } 3598 if (sc->sc_flags & SCF_TIMESTAMP) 3599 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3600 3601 tp->t_template = tcp_template(tp); 3602 if (tp->t_template == 0) { 3603 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3604 so = NULL; 3605 m_freem(m); 3606 goto abort; 3607 } 3608 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3609 tp->ts_modulate = sc->sc_modulate; 3610 tp->ts_recent = sc->sc_timestamp; 3611 tp->iss = sc->sc_iss; 3612 tp->irs = sc->sc_irs; 3613 tcp_sendseqinit(tp); 3614 tp->snd_last = tp->snd_una; 3615 #ifdef TCP_ECN 3616 if (sc->sc_flags & SCF_ECN_PERMIT) { 3617 tp->t_flags |= TF_ECN_PERMIT; 3618 tcpstat_inc(tcps_ecn_accepts); 3619 } 3620 #endif 3621 if (sc->sc_flags & SCF_SACK_PERMIT) 3622 tp->t_flags |= TF_SACK_PERMIT; 3623 #ifdef TCP_SIGNATURE 3624 if (sc->sc_flags & SCF_SIGNATURE) 3625 tp->t_flags |= TF_SIGNATURE; 3626 #endif 3627 tcp_rcvseqinit(tp); 3628 tp->t_state = TCPS_SYN_RECEIVED; 3629 tp->t_rcvtime = tcp_now; 3630 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3631 tcpstat_inc(tcps_accepts); 3632 3633 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3634 if (sc->sc_peermaxseg) 3635 tcp_mss_update(tp); 3636 /* Reset initial window to 1 segment for retransmit */ 3637 if (sc->sc_rxtshift > 0) 3638 tp->snd_cwnd = tp->t_maxseg; 
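/* Seed the window-update bookkeeping: snd_wl1 holds the peer's initial sequence number, so the very first segment from the peer passes the snd_wl1 test in step6 and its window is accepted; the urgent pointer starts one past the IRS. */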
3639 tp->snd_wl1 = sc->sc_irs; 3640 tp->rcv_up = sc->sc_irs + 1; 3641 3642 /* 3643 * This is what would have happened in tcp_output() when 3644 * the SYN,ACK was sent. 3645 */ 3646 tp->snd_up = tp->snd_una; 3647 tp->snd_max = tp->snd_nxt = tp->iss+1; 3648 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3649 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3650 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3651 tp->last_ack_sent = tp->rcv_nxt; 3652 3653 tcpstat_inc(tcps_sc_completed); 3654 syn_cache_put(sc); 3655 return (so); 3656 3657 resetandabort: 3658 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3659 m->m_pkthdr.ph_rtableid); 3660 m_freem(m); 3661 abort: 3662 if (so != NULL) 3663 (void) soabort(so); 3664 syn_cache_put(sc); 3665 tcpstat_inc(tcps_sc_aborted); 3666 return ((struct socket *)(-1)); 3667 } 3668 3669 /* 3670 * This function is called when we get an RST for a 3671 * non-existent connection, so that we can see if the 3672 * connection is in the syn cache. If it is, zap it. 3673 */ 3674 3675 void 3676 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3677 u_int rtableid) 3678 { 3679 struct syn_cache *sc; 3680 struct syn_cache_head *scp; 3681 3682 NET_ASSERT_LOCKED(); 3683 3684 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3685 return; 3686 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3687 SEQ_GT(th->th_seq, sc->sc_irs + 1)) 3688 return; 3689 syn_cache_rm(sc); 3690 tcpstat_inc(tcps_sc_reset); 3691 syn_cache_put(sc); 3692 } 3693 3694 void 3695 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3696 u_int rtableid) 3697 { 3698 struct syn_cache *sc; 3699 struct syn_cache_head *scp; 3700 3701 NET_ASSERT_LOCKED(); 3702 3703 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3704 return; 3705 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3706 if (ntohl (th->th_seq) != sc->sc_iss) { 3707 return; 3708 } 3709 3710 /* 3711 * If we've retransmitted 3 times and this is our second error, 3712 * we remove the entry. Otherwise, we allow it to continue on. 3713 * This prevents us from incorrectly nuking an entry during a 3714 * spurious network outage. 3715 * 3716 * See tcp_notify(). 3717 */ 3718 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3719 sc->sc_flags |= SCF_UNREACH; 3720 return; 3721 } 3722 3723 syn_cache_rm(sc); 3724 tcpstat_inc(tcps_sc_unreach); 3725 syn_cache_put(sc); 3726 } 3727 3728 /* 3729 * Given a LISTEN socket and an inbound SYN request, add 3730 * this to the syn cache, and send back a segment: 3731 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3732 * to the source. 3733 * 3734 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3735 * Doing so would require that we hold onto the data and deliver it 3736 * to the application. However, if we are the target of a SYN-flood 3737 * DoS attack, an attacker could send data which would eventually 3738 * consume all available buffer space if it were ACKed. By not ACKing 3739 * the data, we avoid this DoS scenario. 3740 */ 3741 3742 int 3743 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3744 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3745 struct tcp_opt_info *oi, tcp_seq *issp) 3746 { 3747 struct tcpcb tb, *tp; 3748 long win; 3749 struct syn_cache *sc; 3750 struct syn_cache_head *scp; 3751 struct mbuf *ipopts; 3752 3753 tp = sototcpcb(so); 3754 3755 /* 3756 * RFC1122 4.2.3.10, p.
104: discard bcast/mcast SYN 3757 * 3758 * Note this check is performed in tcp_input() very early on. 3759 */ 3760 3761 /* 3762 * Initialize some local state. 3763 */ 3764 win = sbspace(so, &so->so_rcv); 3765 if (win > TCP_MAXWIN) 3766 win = TCP_MAXWIN; 3767 3768 bzero(&tb, sizeof(tb)); 3769 #ifdef TCP_SIGNATURE 3770 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3771 #else 3772 if (optp) { 3773 #endif 3774 tb.pf = tp->pf; 3775 tb.sack_enable = tp->sack_enable; 3776 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3777 #ifdef TCP_SIGNATURE 3778 if (tp->t_flags & TF_SIGNATURE) 3779 tb.t_flags |= TF_SIGNATURE; 3780 #endif 3781 tb.t_state = TCPS_LISTEN; 3782 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3783 sotoinpcb(so)->inp_rtableid)) 3784 return (-1); 3785 } 3786 3787 switch (src->sa_family) { 3788 case AF_INET: 3789 /* 3790 * Remember the IP options, if any. 3791 */ 3792 ipopts = ip_srcroute(m); 3793 break; 3794 default: 3795 ipopts = NULL; 3796 } 3797 3798 /* 3799 * See if we already have an entry for this connection. 3800 * If we do, resend the SYN,ACK. We do not count this 3801 * as a retransmission (XXX though maybe we should). 3802 */ 3803 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3804 if (sc != NULL) { 3805 tcpstat_inc(tcps_sc_dupesyn); 3806 if (ipopts) { 3807 /* 3808 * If we were remembering a previous source route, 3809 * forget it and use the new one we've been given. 3810 */ 3811 m_free(sc->sc_ipopts); 3812 sc->sc_ipopts = ipopts; 3813 } 3814 sc->sc_timestamp = tb.ts_recent; 3815 if (syn_cache_respond(sc, m) == 0) { 3816 tcpstat_inc(tcps_sndacks); 3817 tcpstat_inc(tcps_sndtotal); 3818 } 3819 return (0); 3820 } 3821 3822 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3823 if (sc == NULL) { 3824 m_free(ipopts); 3825 return (-1); 3826 } 3827 3828 /* 3829 * Fill in the cache, and put the necessary IP and TCP 3830 * options into the reply. 3831 */ 3832 memcpy(&sc->sc_src, src, src->sa_len); 3833 memcpy(&sc->sc_dst, dst, dst->sa_len); 3834 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3835 sc->sc_flags = 0; 3836 sc->sc_ipopts = ipopts; 3837 sc->sc_irs = th->th_seq; 3838 3839 sc->sc_iss = issp ? *issp : arc4random(); 3840 sc->sc_peermaxseg = oi->maxseg; 3841 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3842 sc->sc_win = win; 3843 sc->sc_timestamp = tb.ts_recent; 3844 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3845 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3846 sc->sc_flags |= SCF_TIMESTAMP; 3847 sc->sc_modulate = arc4random(); 3848 } 3849 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3850 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3851 sc->sc_requested_s_scale = tb.requested_s_scale; 3852 sc->sc_request_r_scale = 0; 3853 /* 3854 * Pick the smallest possible scaling factor that 3855 * will still allow us to scale up to sb_max. 3856 * 3857 * We do this because there are broken firewalls that 3858 * will corrupt the window scale option, leading to 3859 * the other endpoint believing that our advertised 3860 * window is unscaled. At scale factors larger than 3861 * 5 the unscaled window will drop below 1500 bytes, 3862 * leading to serious problems when traversing these 3863 * broken firewalls. 3864 * 3865 * With the default sb_max of 256K, a scale factor 3866 * of 3 will be chosen by this algorithm. Those who 3867 * choose a larger sb_max should watch out 3868 * for the compatibility problems mentioned above. 3869 * 3870 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3871 * or <SYN,ACK>) segment itself is never scaled.
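For instance, with the 256K default, 65535 << 2 is 262140, just short of 262144, so the loop below settles on a shift of 3.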
3872 */ 3873 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3874 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) 3875 sc->sc_request_r_scale++; 3876 } else { 3877 sc->sc_requested_s_scale = 15; 3878 sc->sc_request_r_scale = 15; 3879 } 3880 #ifdef TCP_ECN 3881 /* 3882 * if both ECE and CWR flag bits are set, peer is ECN capable. 3883 */ 3884 if (tcp_do_ecn && 3885 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) 3886 sc->sc_flags |= SCF_ECN_PERMIT; 3887 #endif 3888 /* 3889 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option 3890 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 3891 */ 3892 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) 3893 sc->sc_flags |= SCF_SACK_PERMIT; 3894 #ifdef TCP_SIGNATURE 3895 if (tb.t_flags & TF_SIGNATURE) 3896 sc->sc_flags |= SCF_SIGNATURE; 3897 #endif 3898 sc->sc_tp = tp; 3899 if (syn_cache_respond(sc, m) == 0) { 3900 syn_cache_insert(sc, tp); 3901 tcpstat_inc(tcps_sndacks); 3902 tcpstat_inc(tcps_sndtotal); 3903 } else { 3904 syn_cache_put(sc); 3905 tcpstat_inc(tcps_sc_dropped); 3906 } 3907 3908 return (0); 3909 } 3910 3911 int 3912 syn_cache_respond(struct syn_cache *sc, struct mbuf *m) 3913 { 3914 u_int8_t *optp; 3915 int optlen, error; 3916 u_int16_t tlen; 3917 struct ip *ip = NULL; 3918 #ifdef INET6 3919 struct ip6_hdr *ip6 = NULL; 3920 #endif 3921 struct tcphdr *th; 3922 u_int hlen; 3923 struct inpcb *inp; 3924 3925 switch (sc->sc_src.sa.sa_family) { 3926 case AF_INET: 3927 hlen = sizeof(struct ip); 3928 break; 3929 #ifdef INET6 3930 case AF_INET6: 3931 hlen = sizeof(struct ip6_hdr); 3932 break; 3933 #endif 3934 default: 3935 m_freem(m); 3936 return (EAFNOSUPPORT); 3937 } 3938 3939 /* Compute the size of the TCP options. */ 3940 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3941 ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) + 3942 #ifdef TCP_SIGNATURE 3943 ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + 3944 #endif 3945 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3946 3947 tlen = hlen + sizeof(struct tcphdr) + optlen; 3948 3949 /* 3950 * Create the IP+TCP header from scratch. 3951 */ 3952 m_freem(m); 3953 #ifdef DIAGNOSTIC 3954 if (max_linkhdr + tlen > MCLBYTES) 3955 return (ENOBUFS); 3956 #endif 3957 MGETHDR(m, M_DONTWAIT, MT_DATA); 3958 if (m && max_linkhdr + tlen > MHLEN) { 3959 MCLGET(m, M_DONTWAIT); 3960 if ((m->m_flags & M_EXT) == 0) { 3961 m_freem(m); 3962 m = NULL; 3963 } 3964 } 3965 if (m == NULL) 3966 return (ENOBUFS); 3967 3968 /* Fixup the mbuf. 
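m_data is advanced past max_linkhdr so the link-layer header can later be prepended without another allocation, and the whole area is zeroed so that unused header fields and option padding are well defined.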

	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		unhandled_af(sc->sc_src.sa.sa_family);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}
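
	/*
	 * The appendix-A timestamp block below is three 32-bit words: a
	 * header word packing NOP, NOP, kind (8) and length (10), then
	 * our timestamp value (TSval), then the peer's echoed timestamp
	 * (TSecr), which repeats the ts_recent we parsed from the SYN.
	 */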
	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in network byte order, as it is stored here.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(inp);

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
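
/*
 * Note for callers: syn_cache_respond() consumes the passed mbuf in all
 * cases, replacing it with a freshly built SYN,ACK.  syn_cache_add()
 * above treats a nonzero return as fatal for a new entry, releasing it
 * with syn_cache_put() and counting it in tcps_sc_dropped rather than
 * inserting it into the cache.
 */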