/*	$OpenBSD: tcp_input.c,v 1.354 2017/12/04 13:40:34 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct tcpiphdr tcp_saveti;

int tcp_mss_adv(struct mbuf *, int);
int tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int tcprexmtthresh = 3;
int tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
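
/*
 * Illustrative sketch (annotation, not part of the original file): the
 * TSTMP_* and SEQ_* macros rely on 32-bit modular arithmetic, so they
 * stay correct across counter wraparound.
 */
#if 0	/* example only, never compiled */
	/*
	 * With a just past the wraparound point and b just before it,
	 * the unsigned difference a - b is small and positive when cast
	 * to int, so a is correctly treated as "later" than b even
	 * though a < b as plain unsigned values:
	 */
	u_int32_t a = 0x00000010, b = 0xfffffff0;
	KASSERT(TSTMP_GEQ(a, b));	/* (int)(a - b) == 0x20 > 0 */
	KASSERT(TSTMP_LT(b, a));
#endif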
/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *);
void	 syn_cache_timer(void *);
void	 syn_cache_reaper(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *);
struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
		struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}
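
/*
 * Worked example for the overlap trimming above (annotation, not part
 * of the original code): suppose the queue holds a segment with
 * th_seq = 100 and th_reseqlen = 50, and a new segment arrives with
 * th_seq = 120 and *tlen = 100.  Then i = 100 + 50 - 120 = 30 bytes of
 * the new segment duplicate data we already hold, so 30 bytes are
 * trimmed from its head and it is enqueued as seq 150, length 70.  Had
 * i been >= *tlen, the new segment would have been a full duplicate
 * and dropped outright.
 */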
int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
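
	/*
	 * Annotation (not part of the original code): th_off counts
	 * 32-bit words, so a valid header runs from 5 words (20 bytes,
	 * no options) to 15 words (60 bytes, i.e. up to 40 bytes of
	 * options).  E.g. th_off = 8 means a 32-byte header carrying
	 * 12 bytes of options; off > tlen catches truncated packets.
	 */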
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);
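
	/*
	 * Annotation (not part of the original code): the fast path
	 * above matches the RFC 1323 appendix A layout, where the
	 * 12-byte option block is
	 *	NOP, NOP, TIMESTAMP, len 10, 4-byte ts_val, 4-byte ts_ecr
	 * so TCPOPT_TSTAMP_HDR is the first four bytes (01 01 08 0a)
	 * read as one 32-bit word and ts_val/ts_ecr sit at byte
	 * offsets 4 and 8 of the option block.
	 */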
	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == NULL) {
			tcpstat_inc(tcps_noport);
			goto dropwithreset_ratelim;
		}
	}
	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;
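
	/*
	 * Annotation (not part of the original code): th_win is only
	 * 16 bits, so an unscaled window tops out at 65535 bytes.  With
	 * snd_scale = 7, for example, th_win = 0x2000 (8192) advertises
	 * 8192 << 7 = 1 MB; the RFC 1323 maximum shift of 14 allows
	 * windows up to about 1 GB.  A SYN never carries a scaled
	 * window, which is why the shift below is skipped when TH_SYN
	 * is set.
	 */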
	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for peer to send a RST, and
				 * RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says like
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				return IPPROTO_DONE;
			}
		}
	}
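
	/*
	 * Annotation (not part of the original code): a listening
	 * socket never grows a full tcpcb here; embryonic connections
	 * live as compressed state in the syn cache until the final
	 * ACK of the handshake arrives, which bounds the memory an
	 * unanswered SYN can pin down during a SYN flood.
	 */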
#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat_inc(tcps_rcvnosec);
		goto drop;
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th);	/* Delete stale SACK holes */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
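
	/*
	 * Annotation (not part of the original code): the ts_ecr sanity
	 * check above guards the RTT estimator.  tcp_now - ts_ecr is
	 * the measured round trip in slow ticks; if the peer echoes a
	 * garbage timestamp the difference comes out negative or larger
	 * than TCP_RTT_MAX, ts_ecr is zeroed, and RTT measurement falls
	 * back to the t_rtttime mechanism instead.
	 */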
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				ND6_HINT(tp);
				sbdrop(so, &so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(so, &so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;
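
	/*
	 * Worked example for the clamp above (annotation, not part of
	 * the original code): if sbspace() reports 8192 bytes free but
	 * we have already advertised up to rcv_adv = rcv_nxt + 16384,
	 * rcv_wnd stays at 16384; we never take back window space that
	 * was already promised to the peer.
	 */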
	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif
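
		/*
		 * Annotation (not part of the original code): per the
		 * RFC 3168 handshake, our SYN offered ECN by carrying
		 * ECE|CWR.  A SYN-ACK answering with ECE but not CWR
		 * (or, for simultaneous open, a SYN with both bits) is
		 * the peer's acceptance, which is what the switch above
		 * checks before setting TF_ECN_PERMIT.
		 */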
		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}
	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than tp->ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}
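
	/*
	 * Worked example for the head trimming above (annotation, not
	 * part of the original code): with rcv_nxt = 1000, an arriving
	 * segment with th_seq = 900 and tlen = 300 gives todrop = 100;
	 * the first 100 bytes are duplicates, so the segment is
	 * advanced to seq 1000 with 200 bytes left.  Had tlen been 100
	 * or less (and no FIN), the whole segment would have been a
	 * duplicate and only an ACK is scheduled.
	 */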
	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If the window is closed we can only take
			 * segments at the window edge, and have to drop
			 * data and PUSH from incoming segments.  Continue
			 * processing, but remember to ack.  Otherwise,
			 * drop segment and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}
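
	/*
	 * Annotation (not part of the original code): the sequence
	 * check at the top of the RST handling accepts a reset only at
	 * last_ack_sent, rcv_nxt, or rcv_nxt + 1, which limits how
	 * easily an off-path attacker can tear down a connection with
	 * a blindly spoofed RST.
	 */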
	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
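
			/*
			 * Worked example for the duplicate ACK handling
			 * below (annotation, not part of the original
			 * code): with t_maxseg = 1460 and
			 * min(snd_wnd, snd_cwnd) = 65536, the third
			 * duplicate ACK sets ssthresh to
			 * (65536 / 2 / 1460) * 1460 = 32120 bytes,
			 * retransmits the missing segment with cwnd cut
			 * to one segment, then inflates cwnd to
			 * ssthresh + 3 * 1460 to account for the
			 * segments the duplicate ACKs say have left the
			 * network.
			 */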
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
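
		/*
		 * Annotation (not part of the original code): when a
		 * full ACK ends fast recovery below, the inflated cwnd
		 * is deflated to ssthresh and further capped at the
		 * amount of data actually outstanding
		 * (snd_max - th_ack), so that leaving recovery cannot
		 * release a burst of back-to-back segments.
		 */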
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
		if (tp->t_dupacks < tcprexmtthresh)
			tp->snd_cwnd = ulmin(cw + incr,
			    TCP_MAXWIN << tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(so, &so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}
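
		/*
		 * Worked example for the window opening above
		 * (annotation, not part of the original code): in
		 * congestion avoidance with t_maxseg = 1460 and
		 * cwnd = 14600 (10 segments), each ACK adds
		 * incr = 1460 * 1460 / 14600 = 146 bytes, so a full
		 * window of ACKs grows cwnd by roughly one segment per
		 * round trip; below ssthresh the full 1460 is added per
		 * ACK (slow start).
		 */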
		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}
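
	/*
	 * Annotation (not part of the original code): the
	 * snd_wl1/snd_wl2 bookkeeping above implements the freshness
	 * test from the specification: the window is taken only from a
	 * segment with a newer sequence number, the same sequence with
	 * a newer ACK, or the same sequence and ACK with a larger
	 * window, so stale reordered segments cannot shrink or inflate
	 * snd_wnd.
	 */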
	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;		/* XXX */
			tiflags &= ~TH_URG;	/* XXX */
			goto dodata;		/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.  This can happen independent of advancing
		 * the URG pointer, but if two URG's are pending at once,
		 * some out-of-band data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}
Ignore a FIN received before 1955 * the connection is fully established. 1956 */ 1957 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1958 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1959 tp->t_flags |= TF_BLOCKOUTPUT; 1960 socantrcvmore(so); 1961 tp->t_flags &= ~TF_BLOCKOUTPUT; 1962 tp->t_flags |= TF_ACKNOW; 1963 tp->rcv_nxt++; 1964 } 1965 switch (tp->t_state) { 1966 1967 /* 1968 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1969 */ 1970 case TCPS_ESTABLISHED: 1971 tp->t_state = TCPS_CLOSE_WAIT; 1972 break; 1973 1974 /* 1975 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1976 * enter the CLOSING state. 1977 */ 1978 case TCPS_FIN_WAIT_1: 1979 tp->t_state = TCPS_CLOSING; 1980 break; 1981 1982 /* 1983 * In FIN_WAIT_2 state enter the TIME_WAIT state, 1984 * starting the time-wait timer, turning off the other 1985 * standard timers. 1986 */ 1987 case TCPS_FIN_WAIT_2: 1988 tp->t_state = TCPS_TIME_WAIT; 1989 tcp_canceltimers(tp); 1990 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1991 tp->t_flags |= TF_BLOCKOUTPUT; 1992 soisdisconnected(so); 1993 tp->t_flags &= ~TF_BLOCKOUTPUT; 1994 break; 1995 1996 /* 1997 * In TIME_WAIT state restart the 2 MSL time_wait timer. 1998 */ 1999 case TCPS_TIME_WAIT: 2000 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2001 break; 2002 } 2003 } 2004 if (so->so_options & SO_DEBUG) { 2005 switch (tp->pf) { 2006 #ifdef INET6 2007 case PF_INET6: 2008 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2009 0, tlen); 2010 break; 2011 #endif /* INET6 */ 2012 case PF_INET: 2013 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2014 0, tlen); 2015 break; 2016 } 2017 } 2018 2019 /* 2020 * Return any desired output. 2021 */ 2022 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2023 (void) tcp_output(tp); 2024 return IPPROTO_DONE; 2025 2026 badsyn: 2027 /* 2028 * Received a bad SYN. Increment counters and dropwithreset. 2029 */ 2030 tcpstat_inc(tcps_badsyn); 2031 tp = NULL; 2032 goto dropwithreset; 2033 2034 dropafterack_ratelim: 2035 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2036 tcp_ackdrop_ppslim) == 0) { 2037 /* XXX stat */ 2038 goto drop; 2039 } 2040 /* ...fall into dropafterack... */ 2041 2042 dropafterack: 2043 /* 2044 * Generate an ACK dropping incoming segment if it occupies 2045 * sequence space, where the ACK reflects our state. 2046 */ 2047 if (tiflags & TH_RST) 2048 goto drop; 2049 m_freem(m); 2050 tp->t_flags |= TF_ACKNOW; 2051 (void) tcp_output(tp); 2052 return IPPROTO_DONE; 2053 2054 dropwithreset_ratelim: 2055 /* 2056 * We may want to rate-limit RSTs in certain situations, 2057 * particularly if we are sending an RST in response to 2058 * an attempt to connect to or otherwise communicate with 2059 * a port for which we have no socket. 2060 */ 2061 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2062 tcp_rst_ppslim) == 0) { 2063 /* XXX stat */ 2064 goto drop; 2065 } 2066 /* ...fall into dropwithreset... */ 2067 2068 dropwithreset: 2069 /* 2070 * Generate a RST, dropping incoming segment. 2071 * Make ACK acceptable to originator of segment. 2072 * Don't bother to respond to RST. 
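* (Answering a RST with a RST could set off an endless exchange of resets between the two hosts, so a RST never triggers another RST.)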
2073 */ 2074 if (tiflags & TH_RST) 2075 goto drop; 2076 if (tiflags & TH_ACK) { 2077 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2078 TH_RST, m->m_pkthdr.ph_rtableid); 2079 } else { 2080 if (tiflags & TH_SYN) 2081 tlen++; 2082 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2083 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2084 } 2085 m_freem(m); 2086 return IPPROTO_DONE; 2087 2088 drop: 2089 /* 2090 * Drop space held by incoming segment and return. 2091 */ 2092 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2093 switch (tp->pf) { 2094 #ifdef INET6 2095 case PF_INET6: 2096 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2097 0, tlen); 2098 break; 2099 #endif /* INET6 */ 2100 case PF_INET: 2101 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2102 0, tlen); 2103 break; 2104 } 2105 } 2106 2107 m_freem(m); 2108 return IPPROTO_DONE; 2109 } 2110 2111 int 2112 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2113 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2114 u_int rtableid) 2115 { 2116 u_int16_t mss = 0; 2117 int opt, optlen; 2118 #ifdef TCP_SIGNATURE 2119 caddr_t sigp = NULL; 2120 struct tdb *tdb = NULL; 2121 #endif /* TCP_SIGNATURE */ 2122 2123 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2124 opt = cp[0]; 2125 if (opt == TCPOPT_EOL) 2126 break; 2127 if (opt == TCPOPT_NOP) 2128 optlen = 1; 2129 else { 2130 if (cnt < 2) 2131 break; 2132 optlen = cp[1]; 2133 if (optlen < 2 || optlen > cnt) 2134 break; 2135 } 2136 switch (opt) { 2137 2138 default: 2139 continue; 2140 2141 case TCPOPT_MAXSEG: 2142 if (optlen != TCPOLEN_MAXSEG) 2143 continue; 2144 if (!(th->th_flags & TH_SYN)) 2145 continue; 2146 if (TCPS_HAVERCVDSYN(tp->t_state)) 2147 continue; 2148 memcpy(&mss, cp + 2, sizeof(mss)); 2149 mss = ntohs(mss); 2150 oi->maxseg = mss; 2151 break; 2152 2153 case TCPOPT_WINDOW: 2154 if (optlen != TCPOLEN_WINDOW) 2155 continue; 2156 if (!(th->th_flags & TH_SYN)) 2157 continue; 2158 if (TCPS_HAVERCVDSYN(tp->t_state)) 2159 continue; 2160 tp->t_flags |= TF_RCVD_SCALE; 2161 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2162 break; 2163 2164 case TCPOPT_TIMESTAMP: 2165 if (optlen != TCPOLEN_TIMESTAMP) 2166 continue; 2167 oi->ts_present = 1; 2168 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2169 oi->ts_val = ntohl(oi->ts_val); 2170 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2171 oi->ts_ecr = ntohl(oi->ts_ecr); 2172 2173 if (!(th->th_flags & TH_SYN)) 2174 continue; 2175 if (TCPS_HAVERCVDSYN(tp->t_state)) 2176 continue; 2177 /* 2178 * A timestamp received in a SYN makes 2179 * it ok to send timestamp requests and replies. 
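* (TF_RCVD_TSTMP, set just below, pairs with TF_REQ_TSTMP so that tcp_output() places a timestamp option on every subsequent segment; tcp_hdrsz() further down performs the same flag test.)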
2180 */ 2181 tp->t_flags |= TF_RCVD_TSTMP; 2182 tp->ts_recent = oi->ts_val; 2183 tp->ts_recent_age = tcp_now; 2184 break; 2185 2186 case TCPOPT_SACK_PERMITTED: 2187 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2188 continue; 2189 if (!(th->th_flags & TH_SYN)) 2190 continue; 2191 if (TCPS_HAVERCVDSYN(tp->t_state)) 2192 continue; 2193 /* MUST only be set on SYN */ 2194 tp->t_flags |= TF_SACK_PERMIT; 2195 break; 2196 case TCPOPT_SACK: 2197 tcp_sack_option(tp, th, cp, optlen); 2198 break; 2199 #ifdef TCP_SIGNATURE 2200 case TCPOPT_SIGNATURE: 2201 if (optlen != TCPOLEN_SIGNATURE) 2202 continue; 2203 2204 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2205 return (-1); 2206 2207 sigp = cp + 2; 2208 break; 2209 #endif /* TCP_SIGNATURE */ 2210 } 2211 } 2212 2213 #ifdef TCP_SIGNATURE 2214 if (tp->t_flags & TF_SIGNATURE) { 2215 union sockaddr_union src, dst; 2216 2217 memset(&src, 0, sizeof(union sockaddr_union)); 2218 memset(&dst, 0, sizeof(union sockaddr_union)); 2219 2220 switch (tp->pf) { 2221 case 0: 2222 case AF_INET: 2223 src.sa.sa_len = sizeof(struct sockaddr_in); 2224 src.sa.sa_family = AF_INET; 2225 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2226 dst.sa.sa_len = sizeof(struct sockaddr_in); 2227 dst.sa.sa_family = AF_INET; 2228 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2229 break; 2230 #ifdef INET6 2231 case AF_INET6: 2232 src.sa.sa_len = sizeof(struct sockaddr_in6); 2233 src.sa.sa_family = AF_INET6; 2234 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2235 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2236 dst.sa.sa_family = AF_INET6; 2237 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2238 break; 2239 #endif /* INET6 */ 2240 } 2241 2242 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2243 0, &src, &dst, IPPROTO_TCP); 2244 2245 /* 2246 * We don't have an SA for this peer, so we turn off 2247 * TF_SIGNATURE on the listen socket 2248 */ 2249 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2250 tp->t_flags &= ~TF_SIGNATURE; 2251 2252 } 2253 2254 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2255 tcpstat_inc(tcps_rcvbadsig); 2256 return (-1); 2257 } 2258 2259 if (sigp) { 2260 char sig[16]; 2261 2262 if (tdb == NULL) { 2263 tcpstat_inc(tcps_rcvbadsig); 2264 return (-1); 2265 } 2266 2267 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2268 return (-1); 2269 2270 if (timingsafe_bcmp(sig, sigp, 16)) { 2271 tcpstat_inc(tcps_rcvbadsig); 2272 return (-1); 2273 } 2274 2275 tcpstat_inc(tcps_rcvgoodsig); 2276 } 2277 #endif /* TCP_SIGNATURE */ 2278 2279 return (0); 2280 } 2281 2282 u_long 2283 tcp_seq_subtract(u_long a, u_long b) 2284 { 2285 return ((long)(a - b)); 2286 } 2287 2288 /* 2289 * This function is called upon receipt of new valid data (while not in header 2290 * prediction mode), and it updates the ordered list of sacks. 2291 */ 2292 void 2293 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2294 tcp_seq rcv_lastend) 2295 { 2296 /* 2297 * First reported block MUST be the most recent one. Subsequent 2298 * blocks SHOULD be in the order in which they arrived at the 2299 * receiver. These two conditions make the implementation fully 2300 * compliant with RFC 2018. 
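* For example (hypothetical sequence numbers): with out-of-order blocks [5,7) and [9,11) queued, a segment filling [7,9) coalesces everything into [5,11), which must then be reported as the first block.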
2301 */ 2302 int i, j = 0, count = 0, lastpos = -1; 2303 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2304 2305 /* First clean up current list of sacks */ 2306 for (i = 0; i < tp->rcv_numsacks; i++) { 2307 sack = tp->sackblks[i]; 2308 if (sack.start == 0 && sack.end == 0) { 2309 count++; /* count = number of blocks to be discarded */ 2310 continue; 2311 } 2312 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2313 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2314 count++; 2315 } else { 2316 temp[j].start = tp->sackblks[i].start; 2317 temp[j++].end = tp->sackblks[i].end; 2318 } 2319 } 2320 tp->rcv_numsacks -= count; 2321 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2322 tcp_clean_sackreport(tp); 2323 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2324 /* ==> need first sack block */ 2325 tp->sackblks[0].start = rcv_laststart; 2326 tp->sackblks[0].end = rcv_lastend; 2327 tp->rcv_numsacks = 1; 2328 } 2329 return; 2330 } 2331 /* Otherwise, sack blocks are already present. */ 2332 for (i = 0; i < tp->rcv_numsacks; i++) 2333 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2334 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2335 return; /* sack list remains unchanged */ 2336 /* 2337 * From here, segment just received should be (part of) the 1st sack. 2338 * Go through list, possibly coalescing sack block entries. 2339 */ 2340 firstsack.start = rcv_laststart; 2341 firstsack.end = rcv_lastend; 2342 for (i = 0; i < tp->rcv_numsacks; i++) { 2343 sack = tp->sackblks[i]; 2344 if (SEQ_LT(sack.end, firstsack.start) || 2345 SEQ_GT(sack.start, firstsack.end)) 2346 continue; /* no overlap */ 2347 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2348 /* 2349 * identical block; delete it here since we will 2350 * move it to the front of the list. 2351 */ 2352 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2353 lastpos = i; /* last posn with a zero entry */ 2354 continue; 2355 } 2356 if (SEQ_LEQ(sack.start, firstsack.start)) 2357 firstsack.start = sack.start; /* merge blocks */ 2358 if (SEQ_GEQ(sack.end, firstsack.end)) 2359 firstsack.end = sack.end; /* merge blocks */ 2360 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2361 lastpos = i; /* last posn with a zero entry */ 2362 } 2363 if (lastpos != -1) { /* at least one merge */ 2364 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2365 sack = tp->sackblks[i]; 2366 if (sack.start == 0 && sack.end == 0) 2367 continue; 2368 temp[j++] = sack; 2369 } 2370 tp->rcv_numsacks = j; /* including first blk (added later) */ 2371 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2372 tp->sackblks[i] = temp[i]; 2373 } else { /* no merges -- shift sacks by 1 */ 2374 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2375 tp->rcv_numsacks++; 2376 for (i = tp->rcv_numsacks-1; i > 0; i--) 2377 tp->sackblks[i] = tp->sackblks[i-1]; 2378 } 2379 tp->sackblks[0] = firstsack; 2380 return; 2381 } 2382 2383 /* 2384 * Process the TCP SACK option. tp->snd_holes is an ordered list 2385 * of holes (oldest to newest, in terms of the sequence space). 2386 */ 2387 void 2388 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2389 { 2390 int tmp_olen; 2391 u_char *tmp_cp; 2392 struct sackhole *cur, *p, *temp; 2393 2394 if (!tp->sack_enable) 2395 return; 2396 /* SACK without ACK doesn't make sense. */ 2397 if ((th->th_flags & TH_ACK) == 0) 2398 return; 2399 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
*/ 2400 if (SEQ_LT(th->th_ack, tp->snd_una) || 2401 SEQ_GT(th->th_ack, tp->snd_max)) 2402 return; 2403 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2404 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2405 return; 2406 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2407 tmp_cp = cp + 2; 2408 tmp_olen = optlen - 2; 2409 tcpstat_inc(tcps_sack_rcv_opts); 2410 if (tp->snd_numholes < 0) 2411 tp->snd_numholes = 0; 2412 if (tp->t_maxseg == 0) 2413 panic("tcp_sack_option"); /* Should never happen */ 2414 while (tmp_olen > 0) { 2415 struct sackblk sack; 2416 2417 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2418 sack.start = ntohl(sack.start); 2419 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2420 sack.end = ntohl(sack.end); 2421 tmp_olen -= TCPOLEN_SACK; 2422 tmp_cp += TCPOLEN_SACK; 2423 if (SEQ_LEQ(sack.end, sack.start)) 2424 continue; /* bad SACK fields */ 2425 if (SEQ_LEQ(sack.end, tp->snd_una)) 2426 continue; /* old block */ 2427 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2428 if (SEQ_LT(sack.start, th->th_ack)) 2429 continue; 2430 } 2431 if (SEQ_GT(sack.end, tp->snd_max)) 2432 continue; 2433 if (tp->snd_holes == NULL) { /* first hole */ 2434 tp->snd_holes = (struct sackhole *) 2435 pool_get(&sackhl_pool, PR_NOWAIT); 2436 if (tp->snd_holes == NULL) { 2437 /* ENOBUFS, so ignore SACKed block for now*/ 2438 goto done; 2439 } 2440 cur = tp->snd_holes; 2441 cur->start = th->th_ack; 2442 cur->end = sack.start; 2443 cur->rxmit = cur->start; 2444 cur->next = NULL; 2445 tp->snd_numholes = 1; 2446 tp->rcv_lastsack = sack.end; 2447 /* 2448 * dups is at least one. If more data has been 2449 * SACKed, it can be greater than one. 2450 */ 2451 cur->dups = min(tcprexmtthresh, 2452 ((sack.end - cur->end)/tp->t_maxseg)); 2453 if (cur->dups < 1) 2454 cur->dups = 1; 2455 continue; /* with next sack block */ 2456 } 2457 /* Go thru list of holes: p = previous, cur = current */ 2458 p = cur = tp->snd_holes; 2459 while (cur) { 2460 if (SEQ_LEQ(sack.end, cur->start)) 2461 /* SACKs data before the current hole */ 2462 break; /* no use going through more holes */ 2463 if (SEQ_GEQ(sack.start, cur->end)) { 2464 /* SACKs data beyond the current hole */ 2465 cur->dups++; 2466 if (((sack.end - cur->end)/tp->t_maxseg) >= 2467 tcprexmtthresh) 2468 cur->dups = tcprexmtthresh; 2469 p = cur; 2470 cur = cur->next; 2471 continue; 2472 } 2473 if (SEQ_LEQ(sack.start, cur->start)) { 2474 /* Data acks at least the beginning of hole */ 2475 if (SEQ_GEQ(sack.end, cur->end)) { 2476 /* Acks entire hole, so delete hole */ 2477 if (p != cur) { 2478 p->next = cur->next; 2479 pool_put(&sackhl_pool, cur); 2480 cur = p->next; 2481 } else { 2482 cur = cur->next; 2483 pool_put(&sackhl_pool, p); 2484 p = cur; 2485 tp->snd_holes = p; 2486 } 2487 tp->snd_numholes--; 2488 continue; 2489 } 2490 /* otherwise, move start of hole forward */ 2491 cur->start = sack.end; 2492 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2493 p = cur; 2494 cur = cur->next; 2495 continue; 2496 } 2497 /* move end of hole backward */ 2498 if (SEQ_GEQ(sack.end, cur->end)) { 2499 cur->end = sack.start; 2500 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2501 cur->dups++; 2502 if (((sack.end - cur->end)/tp->t_maxseg) >= 2503 tcprexmtthresh) 2504 cur->dups = tcprexmtthresh; 2505 p = cur; 2506 cur = cur->next; 2507 continue; 2508 } 2509 if (SEQ_LT(cur->start, sack.start) && 2510 SEQ_GT(cur->end, sack.end)) { 2511 /* 2512 * ACKs some data in middle of a hole; need to 2513 * split current hole 2514 */ 2515 temp = (struct sackhole *) 2516 
pool_get(&sackhl_pool, PR_NOWAIT); 2517 if (temp == NULL) 2518 goto done; /* ENOBUFS */ 2519 temp->next = cur->next; 2520 temp->start = sack.end; 2521 temp->end = cur->end; 2522 temp->dups = cur->dups; 2523 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2524 cur->end = sack.start; 2525 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2526 cur->dups++; 2527 if (((sack.end - cur->end)/tp->t_maxseg) >= 2528 tcprexmtthresh) 2529 cur->dups = tcprexmtthresh; 2530 cur->next = temp; 2531 p = temp; 2532 cur = p->next; 2533 tp->snd_numholes++; 2534 } 2535 } 2536 /* At this point, p points to the last hole on the list */ 2537 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2538 /* 2539 * Need to append new hole at end. 2540 * Last hole is p (and it's not NULL). 2541 */ 2542 temp = (struct sackhole *) 2543 pool_get(&sackhl_pool, PR_NOWAIT); 2544 if (temp == NULL) 2545 goto done; /* ENOBUFS */ 2546 temp->start = tp->rcv_lastsack; 2547 temp->end = sack.start; 2548 temp->dups = min(tcprexmtthresh, 2549 ((sack.end - sack.start)/tp->t_maxseg)); 2550 if (temp->dups < 1) 2551 temp->dups = 1; 2552 temp->rxmit = temp->start; 2553 temp->next = 0; 2554 p->next = temp; 2555 tp->rcv_lastsack = sack.end; 2556 tp->snd_numholes++; 2557 } 2558 } 2559 done: 2560 return; 2561 } 2562 2563 /* 2564 * Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if 2565 * it is completely acked; otherwise, tcp_sack_option(), called from 2566 * tcp_dooptions(), will fix up the hole. 2567 */ 2568 void 2569 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2570 { 2571 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2572 /* max because this could be an older ack that just arrived */ 2573 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2574 th->th_ack : tp->snd_una; 2575 struct sackhole *cur = tp->snd_holes; 2576 struct sackhole *prev; 2577 while (cur) 2578 if (SEQ_LEQ(cur->end, lastack)) { 2579 prev = cur; 2580 cur = cur->next; 2581 pool_put(&sackhl_pool, prev); 2582 tp->snd_numholes--; 2583 } else if (SEQ_LT(cur->start, lastack)) { 2584 cur->start = lastack; 2585 if (SEQ_LT(cur->rxmit, cur->start)) 2586 cur->rxmit = cur->start; 2587 break; 2588 } else 2589 break; 2590 tp->snd_holes = cur; 2591 } 2592 } 2593 2594 /* 2595 * Delete all receiver-side SACK information. 2596 */ 2597 void 2598 tcp_clean_sackreport(struct tcpcb *tp) 2599 { 2600 int i; 2601 2602 tp->rcv_numsacks = 0; 2603 for (i = 0; i < MAX_SACK_BLKS; i++) 2604 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2605 2606 } 2607 2608 /* 2609 * Partial ack handling within a sack recovery episode. When a partial ack 2610 * arrives, turn off the retransmission timer, deflate the window, and do 2611 * not clear tp->t_dupacks. 2612 */ 2613 void 2614 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2615 { 2616 /* Turn off retx. timer (will start again next segment) */ 2617 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2618 tp->t_rtttime = 0; 2619 /* 2620 * Partial window deflation. This statement relies on the 2621 * fact that tp->snd_una has not been updated yet. 2622 */ 2623 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2624 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2625 tp->snd_cwnd += tp->t_maxseg; 2626 } else 2627 tp->snd_cwnd = tp->t_maxseg; 2628 tp->snd_cwnd += tp->t_maxseg; 2629 tp->t_flags |= TF_NEEDOUTPUT; 2630 } 2631 2632 /* 2633 * Pull the out-of-band byte out of a segment so 2634 * it doesn't appear in the user's data queue. 2635 * It is still reflected in the segment length for 2636 * sequencing purposes.
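* For example, with SO_OOBINLINE off, the single urgent byte is copied into t_iobc below and spliced out of the mbuf chain; the sequence number it occupied is still counted, so rcv_nxt advances past it.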
2637 */ 2638 void 2639 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2640 { 2641 int cnt = off + urgent - 1; 2642 2643 while (cnt >= 0) { 2644 if (m->m_len > cnt) { 2645 char *cp = mtod(m, caddr_t) + cnt; 2646 struct tcpcb *tp = sototcpcb(so); 2647 2648 tp->t_iobc = *cp; 2649 tp->t_oobflags |= TCPOOB_HAVEDATA; 2650 memmove(cp, cp + 1, m->m_len - cnt - 1); 2651 m->m_len--; 2652 return; 2653 } 2654 cnt -= m->m_len; 2655 m = m->m_next; 2656 if (m == NULL) 2657 break; 2658 } 2659 panic("tcp_pulloutofband"); 2660 } 2661 2662 /* 2663 * Collect new round-trip time estimate 2664 * and update averages and current timeout. 2665 */ 2666 void 2667 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2668 { 2669 short delta; 2670 short rttmin; 2671 2672 if (rtt < 0) 2673 rtt = 0; 2674 else if (rtt > TCP_RTT_MAX) 2675 rtt = TCP_RTT_MAX; 2676 2677 tcpstat_inc(tcps_rttupdated); 2678 if (tp->t_srtt != 0) { 2679 /* 2680 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2681 * after the binary point (scaled by 4), whereas 2682 * srtt is stored as fixed point with 5 bits after the 2683 * binary point (i.e., scaled by 32). The following magic 2684 * is equivalent to the smoothing algorithm in rfc793 with 2685 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2686 * point). 2687 */ 2688 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2689 (tp->t_srtt >> TCP_RTT_SHIFT); 2690 if ((tp->t_srtt += delta) <= 0) 2691 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2692 /* 2693 * We accumulate a smoothed rtt variance (actually, a 2694 * smoothed mean difference), then set the retransmit 2695 * timer to smoothed rtt + 4 times the smoothed variance. 2696 * rttvar is stored as fixed point with 4 bits after the 2697 * binary point (scaled by 16). The following is 2698 * equivalent to rfc793 smoothing with an alpha of .75 2699 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2700 * rfc793's wired-in beta. 2701 */ 2702 if (delta < 0) 2703 delta = -delta; 2704 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2705 if ((tp->t_rttvar += delta) <= 0) 2706 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2707 } else { 2708 /* 2709 * No rtt measurement yet - use the unsmoothed rtt. 2710 * Set the variance to half the rtt (so our first 2711 * retransmit happens at 3*rtt). 2712 */ 2713 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2714 tp->t_rttvar = (rtt + 1) << 2715 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2716 } 2717 tp->t_rtttime = 0; 2718 tp->t_rxtshift = 0; 2719 2720 /* 2721 * the retransmit should happen at rtt + 4 * rttvar. 2722 * Because of the way we do the smoothing, srtt and rttvar 2723 * will each average +1/2 tick of bias. When we compute 2724 * the retransmit timer, we want 1/2 tick of rounding and 2725 * 1 extra tick because of +-1/2 tick uncertainty in the 2726 * firing of the timer. The bias will give us exactly the 2727 * 1.5 tick we need. But, because the bias is 2728 * statistical, we have to test that we don't drop below 2729 * the minimum feasible timer (which is 2 ticks). 2730 */ 2731 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2732 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2733 2734 /* 2735 * We received an ack for a packet that wasn't retransmitted; 2736 * it is probably safe to discard any error indications we've 2737 * received recently. This isn't quite right, but close enough 2738 * for now (a route might have failed after we sent a segment, 2739 * and the return path might not be symmetrical). 
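* (For example, a host-unreachable ICMP error noted in t_softerror while this segment was in flight need not tear down a connection that is now demonstrably making progress.)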
2740 */ 2741 tp->t_softerror = 0; 2742 } 2743 2744 /* 2745 * Determine a reasonable value for maxseg size. 2746 * If the route is known, check route for mtu. 2747 * If none, use an mss that can be handled on the outgoing 2748 * interface without forcing IP to fragment; if bigger than 2749 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2750 * to utilize large mbufs. If no route is found, route has no mtu, 2751 * or the destination isn't local, use a default, hopefully conservative 2752 * size (usually 512 or the default IP max size, but no more than the mtu 2753 * of the interface), as we can't discover anything about intervening 2754 * gateways or networks. We also initialize the congestion/slow start 2755 * window to be a single segment if the destination isn't local. 2756 * While looking at the routing entry, we also initialize other path-dependent 2757 * parameters from pre-set or cached values in the routing entry. 2758 * 2759 * Also take into account the space needed for options that we 2760 * send regularly. Make maxseg shorter by that amount to assure 2761 * that we can send maxseg amount of data even when the options 2762 * are present. Store the upper limit of the length of options plus 2763 * data in maxopd. 2764 * 2765 * NOTE: offer == -1 indicates that the maxseg size changed due to 2766 * Path MTU discovery. 2767 */ 2768 int 2769 tcp_mss(struct tcpcb *tp, int offer) 2770 { 2771 struct rtentry *rt; 2772 struct ifnet *ifp = NULL; 2773 int mss, mssopt; 2774 int iphlen; 2775 struct inpcb *inp; 2776 2777 inp = tp->t_inpcb; 2778 2779 mssopt = mss = tcp_mssdflt; 2780 2781 rt = in_pcbrtentry(inp); 2782 2783 if (rt == NULL) 2784 goto out; 2785 2786 ifp = if_get(rt->rt_ifidx); 2787 if (ifp == NULL) 2788 goto out; 2789 2790 switch (tp->pf) { 2791 #ifdef INET6 2792 case AF_INET6: 2793 iphlen = sizeof(struct ip6_hdr); 2794 break; 2795 #endif 2796 case AF_INET: 2797 iphlen = sizeof(struct ip); 2798 break; 2799 default: 2800 /* the family does not support path MTU discovery */ 2801 goto out; 2802 } 2803 2804 /* 2805 * if there's an mtu associated with the route and we support 2806 * path MTU discovery for the underlying protocol family, use it. 2807 */ 2808 if (rt->rt_mtu) { 2809 /* 2810 * One may wish to lower MSS to take into account options, 2811 * especially security-related options. 2812 */ 2813 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2814 /* 2815 * RFC2460 section 5, last paragraph: if path MTU is 2816 * smaller than 1280, use 1280 as packet size and 2817 * attach fragment header. 2818 */ 2819 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2820 sizeof(struct tcphdr); 2821 } else { 2822 mss = rt->rt_mtu - iphlen - 2823 sizeof(struct tcphdr); 2824 } 2825 } else if (ifp->if_flags & IFF_LOOPBACK) { 2826 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2827 } else if (tp->pf == AF_INET) { 2828 if (ip_mtudisc) 2829 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2830 } 2831 #ifdef INET6 2832 else if (tp->pf == AF_INET6) { 2833 /* 2834 * for IPv6, path MTU discovery is always turned on, 2835 * or the node must use packet size <= 1280. 2836 */ 2837 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2838 } 2839 #endif /* INET6 */ 2840 2841 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2842 if (offer != -1) { 2843 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2844 mssopt = max(tcp_mssdflt, mssopt); 2845 } 2846 out: 2847 if_put(ifp); 2848 /* 2849 * The current mss, t_maxseg, is initialized to the default value. 
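* (The default being tcp_mssdflt; both mss and mssopt are seeded with it above.)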
2850 * If we compute a smaller value, reduce the current mss. 2851 * If we compute a larger value, return it for use in sending 2852 * a max seg size option, but don't store it for use 2853 * unless we received an offer at least that large from peer. 2854 * 2855 * However, do not accept offers lower than the minimum of 2856 * the interface MTU and 216. 2857 */ 2858 if (offer > 0) 2859 tp->t_peermss = offer; 2860 if (tp->t_peermss) 2861 mss = min(mss, max(tp->t_peermss, 216)); 2862 2863 /* sanity - at least max opt. space */ 2864 mss = max(mss, 64); 2865 2866 /* 2867 * maxopd stores the maximum length of data AND options 2868 * in a segment; maxseg is the amount of data in a normal 2869 * segment. We need to store this value (maxopd) apart 2870 * from maxseg, because now every segment carries options 2871 * and thus we normally have somewhat less data in segments. 2872 */ 2873 tp->t_maxopd = mss; 2874 2875 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2876 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2877 mss -= TCPOLEN_TSTAMP_APPA; 2878 #ifdef TCP_SIGNATURE 2879 if (tp->t_flags & TF_SIGNATURE) 2880 mss -= TCPOLEN_SIGLEN; 2881 #endif 2882 2883 if (offer == -1) { 2884 /* mss changed due to Path MTU discovery */ 2885 tp->t_flags &= ~TF_PMTUD_PEND; 2886 tp->t_pmtud_mtu_sent = 0; 2887 tp->t_pmtud_mss_acked = 0; 2888 if (mss < tp->t_maxseg) { 2889 /* 2890 * Follow suggestion in RFC 2414 to reduce the 2891 * congestion window by the ratio of the old 2892 * segment size to the new segment size. 2893 */ 2894 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2895 mss, mss); 2896 } 2897 } else if (tcp_do_rfc3390 == 2) { 2898 /* increase initial window */ 2899 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2900 } else if (tcp_do_rfc3390) { 2901 /* increase initial window */ 2902 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2903 } else 2904 tp->snd_cwnd = mss; 2905 2906 tp->t_maxseg = mss; 2907 2908 return (offer != -1 ? mssopt : mss); 2909 } 2910 2911 u_int 2912 tcp_hdrsz(struct tcpcb *tp) 2913 { 2914 u_int hlen; 2915 2916 switch (tp->pf) { 2917 #ifdef INET6 2918 case AF_INET6: 2919 hlen = sizeof(struct ip6_hdr); 2920 break; 2921 #endif 2922 case AF_INET: 2923 hlen = sizeof(struct ip); 2924 break; 2925 default: 2926 hlen = 0; 2927 break; 2928 } 2929 hlen += sizeof(struct tcphdr); 2930 2931 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2932 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2933 hlen += TCPOLEN_TSTAMP_APPA; 2934 #ifdef TCP_SIGNATURE 2935 if (tp->t_flags & TF_SIGNATURE) 2936 hlen += TCPOLEN_SIGLEN; 2937 #endif 2938 return (hlen); 2939 } 2940 2941 /* 2942 * Set connection variables based on the effective MSS. 2943 * We are passed the TCPCB for the actual connection. If we 2944 * are the server, we are called by the compressed state engine 2945 * when the 3-way handshake is complete. If we are the client, 2946 * we are called when we receive the SYN,ACK from the server. 2947 * 2948 * NOTE: The t_maxseg value must be initialized in the TCPCB 2949 * before this routine is called! 
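* A sketch of the effect: mss is clamped to the send buffer size (shrinking t_maxseg via tcp_mss() when the buffer is smaller); otherwise both socket buffers are rounded up to a multiple of mss, capped at sb_max.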
2950 */ 2951 void 2952 tcp_mss_update(struct tcpcb *tp) 2953 { 2954 int mss; 2955 u_long bufsize; 2956 struct rtentry *rt; 2957 struct socket *so; 2958 2959 so = tp->t_inpcb->inp_socket; 2960 mss = tp->t_maxseg; 2961 2962 rt = in_pcbrtentry(tp->t_inpcb); 2963 2964 if (rt == NULL) 2965 return; 2966 2967 bufsize = so->so_snd.sb_hiwat; 2968 if (bufsize < mss) { 2969 mss = bufsize; 2970 /* Update t_maxseg and t_maxopd */ 2971 tcp_mss(tp, mss); 2972 } else { 2973 bufsize = roundup(bufsize, mss); 2974 if (bufsize > sb_max) 2975 bufsize = sb_max; 2976 (void)sbreserve(so, &so->so_snd, bufsize); 2977 } 2978 2979 bufsize = so->so_rcv.sb_hiwat; 2980 if (bufsize > mss) { 2981 bufsize = roundup(bufsize, mss); 2982 if (bufsize > sb_max) 2983 bufsize = sb_max; 2984 (void)sbreserve(so, &so->so_rcv, bufsize); 2985 } 2986 2987 } 2988 2989 /* 2990 * When a partial ack arrives, force the retransmission of the 2991 * next unacknowledged segment. Do not clear tp->t_dupacks. 2992 * By setting snd_nxt to th_ack, this forces the retransmission timer 2993 * to be started again. 2994 */ 2995 void 2996 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 2997 { 2998 /* 2999 * snd_una has not been updated and the socket send buffer 3000 * not yet drained of the acked data, so we have to leave 3001 * snd_una as it was to get the correct data offset in 3002 * tcp_output(). 3003 */ 3004 tcp_seq onxt = tp->snd_nxt; 3005 u_long ocwnd = tp->snd_cwnd; 3006 3007 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3008 tp->t_rtttime = 0; 3009 tp->snd_nxt = th->th_ack; 3010 /* 3011 * Set snd_cwnd to one segment beyond the acknowledged offset 3012 * (tp->snd_una is not yet updated when this function is called). 3013 */ 3014 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3015 (void)tcp_output(tp); 3016 tp->snd_cwnd = ocwnd; 3017 if (SEQ_GT(onxt, tp->snd_nxt)) 3018 tp->snd_nxt = onxt; 3019 /* 3020 * Partial window deflation. Relies on the fact that tp->snd_una 3021 * has not been updated yet. 3022 */ 3023 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3024 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3025 else 3026 tp->snd_cwnd = 0; 3027 tp->snd_cwnd += tp->t_maxseg; 3028 } 3029 3030 int 3031 tcp_mss_adv(struct mbuf *m, int af) 3032 { 3033 int mss = 0; 3034 int iphlen; 3035 struct ifnet *ifp = NULL; 3036 3037 if (m && (m->m_flags & M_PKTHDR)) 3038 ifp = if_get(m->m_pkthdr.ph_ifidx); 3039 3040 switch (af) { 3041 case AF_INET: 3042 if (ifp != NULL) 3043 mss = ifp->if_mtu; 3044 iphlen = sizeof(struct ip); 3045 break; 3046 #ifdef INET6 3047 case AF_INET6: 3048 if (ifp != NULL) 3049 mss = ifp->if_mtu; 3050 iphlen = sizeof(struct ip6_hdr); 3051 break; 3052 #endif 3053 default: 3054 unhandled_af(af); 3055 } 3056 if_put(ifp); 3057 mss = mss - iphlen - sizeof(struct tcphdr); 3058 return (max(mss, tcp_mssdflt)); 3059 } 3060 3061 /* 3062 * TCP compressed state engine. Currently used to hold compressed 3063 * state for SYN_RECEIVED.
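* Keeping embryonic connections in small fixed-size syn_cache entries instead of full socket/tcpcb pairs bounds the memory an unauthenticated SYN flood can tie up.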
3064 */ 3065 3066 /* syn hash parameters */ 3067 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; 3068 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3069 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3070 int tcp_syn_use_limit = 100000; 3071 3072 struct syn_cache_set tcp_syn_cache[2]; 3073 int tcp_syn_cache_active; 3074 3075 #define SYN_HASH(sa, sp, dp, rand) \ 3076 (((sa)->s_addr ^ (rand)[0]) * \ 3077 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3078 #ifndef INET6 3079 #define SYN_HASHALL(hash, src, dst, rand) \ 3080 do { \ 3081 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3082 satosin(src)->sin_port, \ 3083 satosin(dst)->sin_port, (rand)); \ 3084 } while (/*CONSTCOND*/ 0) 3085 #else 3086 #define SYN_HASH6(sa, sp, dp, rand) \ 3087 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3088 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3089 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3090 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3091 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3092 3093 #define SYN_HASHALL(hash, src, dst, rand) \ 3094 do { \ 3095 switch ((src)->sa_family) { \ 3096 case AF_INET: \ 3097 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3098 satosin(src)->sin_port, \ 3099 satosin(dst)->sin_port, (rand)); \ 3100 break; \ 3101 case AF_INET6: \ 3102 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3103 satosin6(src)->sin6_port, \ 3104 satosin6(dst)->sin6_port, (rand)); \ 3105 break; \ 3106 default: \ 3107 hash = 0; \ 3108 } \ 3109 } while (/*CONSTCOND*/0) 3110 #endif /* INET6 */ 3111 3112 void 3113 syn_cache_rm(struct syn_cache *sc) 3114 { 3115 sc->sc_flags |= SCF_DEAD; 3116 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3117 sc->sc_tp = NULL; 3118 LIST_REMOVE(sc, sc_tpq); 3119 sc->sc_buckethead->sch_length--; 3120 timeout_del(&sc->sc_timer); 3121 sc->sc_set->scs_count--; 3122 } 3123 3124 void 3125 syn_cache_put(struct syn_cache *sc) 3126 { 3127 m_free(sc->sc_ipopts); 3128 if (sc->sc_route4.ro_rt != NULL) { 3129 rtfree(sc->sc_route4.ro_rt); 3130 sc->sc_route4.ro_rt = NULL; 3131 } 3132 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3133 timeout_add(&sc->sc_timer, 0); 3134 } 3135 3136 struct pool syn_cache_pool; 3137 3138 /* 3139 * We don't estimate RTT with SYNs, so each packet starts with the default 3140 * RTT and each timer step has a fixed timeout value. 3141 */ 3142 #define SYN_CACHE_TIMER_ARM(sc) \ 3143 do { \ 3144 TCPT_RANGESET((sc)->sc_rxtcur, \ 3145 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3146 TCPTV_REXMTMAX); \ 3147 if (!timeout_initialized(&(sc)->sc_timer)) \ 3148 timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3149 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3150 } while (/*CONSTCOND*/0) 3151 3152 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3153 3154 void 3155 syn_cache_init(void) 3156 { 3157 int i; 3158 3159 /* Initialize the hash buckets. */ 3160 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3161 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3162 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3163 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3164 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3165 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3166 for (i = 0; i < tcp_syn_hash_size; i++) { 3167 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3168 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3169 } 3170 3171 /* Initialize the syn cache pool. 
*/ 3172 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3173 0, "syncache", NULL); 3174 } 3175 3176 void 3177 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3178 { 3179 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active]; 3180 struct syn_cache_head *scp; 3181 struct syn_cache *sc2; 3182 int i; 3183 3184 NET_ASSERT_LOCKED(); 3185 3186 /* 3187 * If there are no entries in the hash table, reinitialize 3188 * the hash secrets. To avoid useless cache swaps and 3189 * reinitialization, use it until the limit is reached. 3190 * An empty cache is also the opportunity to resize the hash. 3191 */ 3192 if (set->scs_count == 0 && set->scs_use <= 0) { 3193 set->scs_use = tcp_syn_use_limit; 3194 if (set->scs_size != tcp_syn_hash_size) { 3195 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3196 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3197 if (scp == NULL) { 3198 /* Try again next time. */ 3199 set->scs_use = 0; 3200 } else { 3201 free(set->scs_buckethead, M_SYNCACHE, 3202 set->scs_size * 3203 sizeof(struct syn_cache_head)); 3204 set->scs_buckethead = scp; 3205 set->scs_size = tcp_syn_hash_size; 3206 for (i = 0; i < tcp_syn_hash_size; i++) 3207 TAILQ_INIT(&scp[i].sch_bucket); 3208 } 3209 } 3210 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3211 tcpstat_inc(tcps_sc_seedrandom); 3212 } 3213 3214 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3215 set->scs_random); 3216 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3217 sc->sc_buckethead = scp; 3218 3219 /* 3220 * Make sure that we don't overflow the per-bucket 3221 * limit or the total cache size limit. 3222 */ 3223 if (scp->sch_length >= tcp_syn_bucket_limit) { 3224 tcpstat_inc(tcps_sc_bucketoverflow); 3225 /* 3226 * Someone might attack our bucket hash function. Reseed 3227 * with random as soon as the passive syn cache gets empty. 3228 */ 3229 set->scs_use = 0; 3230 /* 3231 * The bucket is full. Toss the oldest element in the 3232 * bucket. This will be the first entry in the bucket. 3233 */ 3234 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3235 #ifdef DIAGNOSTIC 3236 /* 3237 * This should never happen; we should always find an 3238 * entry in our bucket. 3239 */ 3240 if (sc2 == NULL) 3241 panic("%s: bucketoverflow: impossible", __func__); 3242 #endif 3243 syn_cache_rm(sc2); 3244 syn_cache_put(sc2); 3245 } else if (set->scs_count >= tcp_syn_cache_limit) { 3246 struct syn_cache_head *scp2, *sce; 3247 3248 tcpstat_inc(tcps_sc_overflowed); 3249 /* 3250 * The cache is full. Toss the oldest entry in the 3251 * first non-empty bucket we can find. 3252 * 3253 * XXX We would really like to toss the oldest 3254 * entry in the cache, but we hope that this 3255 * condition doesn't happen very often. 3256 */ 3257 scp2 = scp; 3258 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3259 sce = &set->scs_buckethead[set->scs_size]; 3260 for (++scp2; scp2 != scp; scp2++) { 3261 if (scp2 >= sce) 3262 scp2 = &set->scs_buckethead[0]; 3263 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3264 break; 3265 } 3266 #ifdef DIAGNOSTIC 3267 /* 3268 * This should never happen; we should always find a 3269 * non-empty bucket. 3270 */ 3271 if (scp2 == scp) 3272 panic("%s: cacheoverflow: impossible", 3273 __func__); 3274 #endif 3275 } 3276 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3277 syn_cache_rm(sc2); 3278 syn_cache_put(sc2); 3279 } 3280 3281 /* 3282 * Initialize the entry's timer.
3283 */ 3284 sc->sc_rxttot = 0; 3285 sc->sc_rxtshift = 0; 3286 SYN_CACHE_TIMER_ARM(sc); 3287 3288 /* Link it from tcpcb entry */ 3289 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3290 3291 /* Put it into the bucket. */ 3292 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3293 scp->sch_length++; 3294 sc->sc_set = set; 3295 set->scs_count++; 3296 set->scs_use--; 3297 3298 tcpstat_inc(tcps_sc_added); 3299 3300 /* 3301 * If the active cache has exceeded its use limit and 3302 * the passive syn cache is empty, exchange their roles. 3303 */ 3304 if (set->scs_use <= 0 && 3305 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3306 tcp_syn_cache_active = !tcp_syn_cache_active; 3307 } 3308 3309 /* 3310 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3311 * If we have retransmitted an entry the maximum number of times, expire 3312 * that entry. 3313 */ 3314 void 3315 syn_cache_timer(void *arg) 3316 { 3317 struct syn_cache *sc = arg; 3318 3319 NET_LOCK(); 3320 if (sc->sc_flags & SCF_DEAD) 3321 goto out; 3322 3323 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3324 /* Drop it -- too many retransmissions. */ 3325 goto dropit; 3326 } 3327 3328 /* 3329 * Compute the total amount of time this entry has 3330 * been on a queue. If this entry has been on longer 3331 * than the keep alive timer would allow, expire it. 3332 */ 3333 sc->sc_rxttot += sc->sc_rxtcur; 3334 if (sc->sc_rxttot >= tcptv_keep_init) 3335 goto dropit; 3336 3337 tcpstat_inc(tcps_sc_retransmitted); 3338 (void) syn_cache_respond(sc, NULL); 3339 3340 /* Advance the timer back-off. */ 3341 sc->sc_rxtshift++; 3342 SYN_CACHE_TIMER_ARM(sc); 3343 3344 out: 3345 NET_UNLOCK(); 3346 return; 3347 3348 dropit: 3349 tcpstat_inc(tcps_sc_timed_out); 3350 syn_cache_rm(sc); 3351 syn_cache_put(sc); 3352 NET_UNLOCK(); 3353 } 3354 3355 void 3356 syn_cache_reaper(void *arg) 3357 { 3358 struct syn_cache *sc = arg; 3359 3360 pool_put(&syn_cache_pool, (sc)); 3361 return; 3362 } 3363 3364 /* 3365 * Remove the syn cache entries created by the specified tcb entry, 3366 * since it makes no sense to keep them 3367 * (if there's no tcb entry, the syn cache entries will never be used). 3368 */ 3369 void 3370 syn_cache_cleanup(struct tcpcb *tp) 3371 { 3372 struct syn_cache *sc, *nsc; 3373 3374 NET_ASSERT_LOCKED(); 3375 3376 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) { 3377 #ifdef DIAGNOSTIC 3378 if (sc->sc_tp != tp) 3379 panic("invalid sc_tp in syn_cache_cleanup"); 3380 #endif 3381 syn_cache_rm(sc); 3382 syn_cache_put(sc); 3383 } 3384 /* just for safety */ 3385 LIST_INIT(&tp->t_sc); 3386 } 3387 3388 /* 3389 * Find an entry in the syn cache. 3390 */ 3391 struct syn_cache * 3392 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst, 3393 struct syn_cache_head **headp, u_int rtableid) 3394 { 3395 struct syn_cache_set *sets[2]; 3396 struct syn_cache *sc; 3397 struct syn_cache_head *scp; 3398 u_int32_t hash; 3399 int i; 3400 3401 NET_ASSERT_LOCKED(); 3402 3403 /* Check the active cache first, the passive cache is likely empty.
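* Both sets are still searched, though, since entries created before the most recent role swap remain in the other set until they expire.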
*/ 3404 sets[0] = &tcp_syn_cache[tcp_syn_cache_active]; 3405 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active]; 3406 for (i = 0; i < 2; i++) { 3407 if (sets[i]->scs_count == 0) 3408 continue; 3409 SYN_HASHALL(hash, src, dst, sets[i]->scs_random); 3410 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size]; 3411 *headp = scp; 3412 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) { 3413 if (sc->sc_hash != hash) 3414 continue; 3415 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3416 !bcmp(&sc->sc_dst, dst, dst->sa_len) && 3417 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) 3418 return (sc); 3419 } 3420 } 3421 return (NULL); 3422 } 3423 3424 /* 3425 * This function gets called when we receive an ACK for a 3426 * socket in the LISTEN state. We look up the connection 3427 * in the syn cache, and if it's there, we pull it out of 3428 * the cache and turn it into a full-blown connection in 3429 * the SYN-RECEIVED state. 3430 * 3431 * The return values may not be immediately obvious, and their effects 3432 * can be subtle, so here they are: 3433 * 3434 * NULL SYN was not found in cache; caller should drop the 3435 * packet and send an RST. 3436 * 3437 * -1 We were unable to create the new connection, and are 3438 * aborting it. An ACK,RST is being sent to the peer 3439 * (unless we got screwy sequence numbers; see below), 3440 * because the 3-way handshake has been completed. Caller 3441 * should not free the mbuf, since we may be using it. If 3442 * we are not, we will free it. 3443 * 3444 * Otherwise, the return value is a pointer to the new socket 3445 * associated with the connection. 3446 */ 3447 struct socket * 3448 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3449 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m) 3450 { 3451 struct syn_cache *sc; 3452 struct syn_cache_head *scp; 3453 struct inpcb *inp, *oldinp; 3454 struct tcpcb *tp = NULL; 3455 struct mbuf *am; 3456 struct socket *oso; 3457 3458 NET_ASSERT_LOCKED(); 3459 3460 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3461 if (sc == NULL) 3462 return (NULL); 3463 3464 /* 3465 * Verify the sequence and ack numbers. Try getting the correct 3466 * response again. 3467 */ 3468 if ((th->th_ack != sc->sc_iss + 1) || 3469 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3470 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3471 (void) syn_cache_respond(sc, m); 3472 return ((struct socket *)(-1)); 3473 } 3474 3475 /* Remove this cache entry */ 3476 syn_cache_rm(sc); 3477 3478 /* 3479 * Ok, create the full blown connection, and set things up 3480 * as they would have been set up if we had created the 3481 * connection when the SYN arrived. If we can't create 3482 * the connection, abort it. 3483 */ 3484 oso = so; 3485 so = sonewconn(so, SS_ISCONNECTED); 3486 if (so == NULL) 3487 goto resetandabort; 3488 3489 oldinp = sotoinpcb(oso); 3490 inp = sotoinpcb(so); 3491 3492 #ifdef IPSEC 3493 /* 3494 * We need to copy the required security levels 3495 * from the old pcb. Ditto for any other 3496 * IPsec-related information. 3497 */ 3498 memcpy(inp->inp_seclevel, oldinp->inp_seclevel, 3499 sizeof(oldinp->inp_seclevel)); 3500 #endif /* IPSEC */ 3501 #ifdef INET6 3502 /* 3503 * inp still has the OLD in_pcb stuff, set the 3504 * v6-related flags on the new guy, too.
3505 */ 3506 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3507 if (inp->inp_flags & INP_IPV6) { 3508 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3509 inp->inp_hops = oldinp->inp_hops; 3510 } else 3511 #endif /* INET6 */ 3512 { 3513 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3514 } 3515 3516 #if NPF > 0 3517 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3518 struct pf_divert *divert; 3519 3520 divert = pf_find_divert(m); 3521 KASSERT(divert != NULL); 3522 inp->inp_rtableid = divert->rdomain; 3523 } else 3524 #endif 3525 /* inherit rtable from listening socket */ 3526 inp->inp_rtableid = sc->sc_rtableid; 3527 3528 inp->inp_lport = th->th_dport; 3529 switch (src->sa_family) { 3530 #ifdef INET6 3531 case AF_INET6: 3532 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3533 break; 3534 #endif /* INET6 */ 3535 case AF_INET: 3536 inp->inp_laddr = satosin(dst)->sin_addr; 3537 inp->inp_options = ip_srcroute(m); 3538 if (inp->inp_options == NULL) { 3539 inp->inp_options = sc->sc_ipopts; 3540 sc->sc_ipopts = NULL; 3541 } 3542 break; 3543 } 3544 in_pcbrehash(inp); 3545 3546 /* 3547 * Give the new socket our cached route reference. 3548 */ 3549 if (src->sa_family == AF_INET) 3550 inp->inp_route = sc->sc_route4; /* struct assignment */ 3551 #ifdef INET6 3552 else 3553 inp->inp_route6 = sc->sc_route6; 3554 #endif 3555 sc->sc_route4.ro_rt = NULL; 3556 3557 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3558 if (am == NULL) 3559 goto resetandabort; 3560 am->m_len = src->sa_len; 3561 memcpy(mtod(am, caddr_t), src, src->sa_len); 3562 3563 switch (src->sa_family) { 3564 case AF_INET: 3565 /* drop IPv4 packet to AF_INET6 socket */ 3566 if (inp->inp_flags & INP_IPV6) { 3567 (void) m_free(am); 3568 goto resetandabort; 3569 } 3570 if (in_pcbconnect(inp, am)) { 3571 (void) m_free(am); 3572 goto resetandabort; 3573 } 3574 break; 3575 #ifdef INET6 3576 case AF_INET6: 3577 if (in6_pcbconnect(inp, am)) { 3578 (void) m_free(am); 3579 goto resetandabort; 3580 } 3581 break; 3582 #endif 3583 } 3584 (void) m_free(am); 3585 3586 tp = intotcpcb(inp); 3587 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3588 if (sc->sc_request_r_scale != 15) { 3589 tp->requested_s_scale = sc->sc_requested_s_scale; 3590 tp->request_r_scale = sc->sc_request_r_scale; 3591 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3592 } 3593 if (sc->sc_flags & SCF_TIMESTAMP) 3594 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3595 3596 tp->t_template = tcp_template(tp); 3597 if (tp->t_template == 0) { 3598 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3599 so = NULL; 3600 m_freem(m); 3601 goto abort; 3602 } 3603 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3604 tp->ts_modulate = sc->sc_modulate; 3605 tp->ts_recent = sc->sc_timestamp; 3606 tp->iss = sc->sc_iss; 3607 tp->irs = sc->sc_irs; 3608 tcp_sendseqinit(tp); 3609 tp->snd_last = tp->snd_una; 3610 #ifdef TCP_ECN 3611 if (sc->sc_flags & SCF_ECN_PERMIT) { 3612 tp->t_flags |= TF_ECN_PERMIT; 3613 tcpstat_inc(tcps_ecn_accepts); 3614 } 3615 #endif 3616 if (sc->sc_flags & SCF_SACK_PERMIT) 3617 tp->t_flags |= TF_SACK_PERMIT; 3618 #ifdef TCP_SIGNATURE 3619 if (sc->sc_flags & SCF_SIGNATURE) 3620 tp->t_flags |= TF_SIGNATURE; 3621 #endif 3622 tcp_rcvseqinit(tp); 3623 tp->t_state = TCPS_SYN_RECEIVED; 3624 tp->t_rcvtime = tcp_now; 3625 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3626 tcpstat_inc(tcps_accepts); 3627 3628 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3629 if (sc->sc_peermaxseg) 3630 tcp_mss_update(tp); 3631 /* Reset initial window to 1 segment for retransmit */ 3632 
if (sc->sc_rxtshift > 0) 3633 tp->snd_cwnd = tp->t_maxseg; 3634 tp->snd_wl1 = sc->sc_irs; 3635 tp->rcv_up = sc->sc_irs + 1; 3636 3637 /* 3638 * This is what would have happened in tcp_output() when 3639 * the SYN,ACK was sent. 3640 */ 3641 tp->snd_up = tp->snd_una; 3642 tp->snd_max = tp->snd_nxt = tp->iss+1; 3643 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3644 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3645 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3646 tp->last_ack_sent = tp->rcv_nxt; 3647 3648 tcpstat_inc(tcps_sc_completed); 3649 syn_cache_put(sc); 3650 return (so); 3651 3652 resetandabort: 3653 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3654 m->m_pkthdr.ph_rtableid); 3655 m_freem(m); 3656 abort: 3657 if (so != NULL) 3658 (void) soabort(so); 3659 syn_cache_put(sc); 3660 tcpstat_inc(tcps_sc_aborted); 3661 return ((struct socket *)(-1)); 3662 } 3663 3664 /* 3665 * This function is called when we get a RST for a 3666 * non-existent connection, so that we can see if the 3667 * connection is in the syn cache. If it is, zap it. 3668 */ 3669 3670 void 3671 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3672 u_int rtableid) 3673 { 3674 struct syn_cache *sc; 3675 struct syn_cache_head *scp; 3676 3677 NET_ASSERT_LOCKED(); 3678 3679 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3680 return; 3681 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3682 SEQ_GT(th->th_seq, sc->sc_irs + 1)) 3683 return; 3684 syn_cache_rm(sc); 3685 tcpstat_inc(tcps_sc_reset); 3686 syn_cache_put(sc); 3687 } 3688 3689 void 3690 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3691 u_int rtableid) 3692 { 3693 struct syn_cache *sc; 3694 struct syn_cache_head *scp; 3695 3696 NET_ASSERT_LOCKED(); 3697 3698 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3699 return; 3700 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3701 if (ntohl (th->th_seq) != sc->sc_iss) { 3702 return; 3703 } 3704 3705 /* 3706 * If we've retransmitted 3 times and this is our second error, 3707 * we remove the entry. Otherwise, we allow it to continue on. 3708 * This prevents us from incorrectly nuking an entry during a 3709 * spurious network outage. 3710 * 3711 * See tcp_notify(). 3712 */ 3713 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3714 sc->sc_flags |= SCF_UNREACH; 3715 return; 3716 } 3717 3718 syn_cache_rm(sc); 3719 tcpstat_inc(tcps_sc_unreach); 3720 syn_cache_put(sc); 3721 } 3722 3723 /* 3724 * Given a LISTEN socket and an inbound SYN request, add 3725 * this to the syn cache, and send back a segment: 3726 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3727 * to the source. 3728 * 3729 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3730 * Doing so would require that we hold onto the data and deliver it 3731 * to the application. However, if we are the target of a SYN-flood 3732 * DoS attack, an attacker could send data which would eventually 3733 * consume all available buffer space if it were ACKed. By not ACKing 3734 * the data, we avoid this DoS scenario.
3735 */ 3736 3737 int 3738 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3739 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3740 struct tcp_opt_info *oi, tcp_seq *issp) 3741 { 3742 struct tcpcb tb, *tp; 3743 long win; 3744 struct syn_cache *sc; 3745 struct syn_cache_head *scp; 3746 struct mbuf *ipopts; 3747 3748 tp = sototcpcb(so); 3749 3750 /* 3751 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3752 * 3753 * Note this check is performed in tcp_input() very early on. 3754 */ 3755 3756 /* 3757 * Initialize some local state. 3758 */ 3759 win = sbspace(so, &so->so_rcv); 3760 if (win > TCP_MAXWIN) 3761 win = TCP_MAXWIN; 3762 3763 bzero(&tb, sizeof(tb)); 3764 #ifdef TCP_SIGNATURE 3765 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3766 #else 3767 if (optp) { 3768 #endif 3769 tb.pf = tp->pf; 3770 tb.sack_enable = tp->sack_enable; 3771 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3772 #ifdef TCP_SIGNATURE 3773 if (tp->t_flags & TF_SIGNATURE) 3774 tb.t_flags |= TF_SIGNATURE; 3775 #endif 3776 tb.t_state = TCPS_LISTEN; 3777 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3778 sotoinpcb(so)->inp_rtableid)) 3779 return (-1); 3780 } 3781 3782 switch (src->sa_family) { 3783 case AF_INET: 3784 /* 3785 * Remember the IP options, if any. 3786 */ 3787 ipopts = ip_srcroute(m); 3788 break; 3789 default: 3790 ipopts = NULL; 3791 } 3792 3793 /* 3794 * See if we already have an entry for this connection. 3795 * If we do, resend the SYN,ACK. We do not count this 3796 * as a retransmission (XXX though maybe we should). 3797 */ 3798 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3799 if (sc != NULL) { 3800 tcpstat_inc(tcps_sc_dupesyn); 3801 if (ipopts) { 3802 /* 3803 * If we were remembering a previous source route, 3804 * forget it and use the new one we've been given. 3805 */ 3806 m_free(sc->sc_ipopts); 3807 sc->sc_ipopts = ipopts; 3808 } 3809 sc->sc_timestamp = tb.ts_recent; 3810 if (syn_cache_respond(sc, m) == 0) { 3811 tcpstat_inc(tcps_sndacks); 3812 tcpstat_inc(tcps_sndtotal); 3813 } 3814 return (0); 3815 } 3816 3817 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3818 if (sc == NULL) { 3819 m_free(ipopts); 3820 return (-1); 3821 } 3822 3823 /* 3824 * Fill in the cache, and put the necessary IP and TCP 3825 * options into the reply. 3826 */ 3827 memcpy(&sc->sc_src, src, src->sa_len); 3828 memcpy(&sc->sc_dst, dst, dst->sa_len); 3829 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3830 sc->sc_flags = 0; 3831 sc->sc_ipopts = ipopts; 3832 sc->sc_irs = th->th_seq; 3833 3834 sc->sc_iss = issp ? *issp : arc4random(); 3835 sc->sc_peermaxseg = oi->maxseg; 3836 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3837 sc->sc_win = win; 3838 sc->sc_timestamp = tb.ts_recent; 3839 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3840 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3841 sc->sc_flags |= SCF_TIMESTAMP; 3842 sc->sc_modulate = arc4random(); 3843 } 3844 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3845 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3846 sc->sc_requested_s_scale = tb.requested_s_scale; 3847 sc->sc_request_r_scale = 0; 3848 /* 3849 * Pick the smallest possible scaling factor that 3850 * will still allow us to scale up to sb_max. 3851 * 3852 * We do this because there are broken firewalls that 3853 * will corrupt the window scale option, leading to 3854 * the other endpoint believing that our advertised 3855 * window is unscaled. 
At scale factors larger than 3856 * 5 the unscaled window will drop below 1500 bytes, 3857 * leading to serious problems when traversing these 3858 * broken firewalls. 3859 * 3860 * With the default sb_max of 256K, a scale factor 3861 * of 3 will be chosen by this algorithm. Those who 3862 * choose a larger sb_max should watch out 3863 * for the compatibility problems mentioned above. 3864 * 3865 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3866 * or <SYN,ACK>) segment itself is never scaled. 3867 */ 3868 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3869 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) 3870 sc->sc_request_r_scale++; 3871 } else { 3872 sc->sc_requested_s_scale = 15; 3873 sc->sc_request_r_scale = 15; 3874 } 3875 #ifdef TCP_ECN 3876 /* 3877 * if both ECE and CWR flag bits are set, peer is ECN capable. 3878 */ 3879 if (tcp_do_ecn && 3880 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) 3881 sc->sc_flags |= SCF_ECN_PERMIT; 3882 #endif 3883 /* 3884 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option 3885 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 3886 */ 3887 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) 3888 sc->sc_flags |= SCF_SACK_PERMIT; 3889 #ifdef TCP_SIGNATURE 3890 if (tb.t_flags & TF_SIGNATURE) 3891 sc->sc_flags |= SCF_SIGNATURE; 3892 #endif 3893 sc->sc_tp = tp; 3894 if (syn_cache_respond(sc, m) == 0) { 3895 syn_cache_insert(sc, tp); 3896 tcpstat_inc(tcps_sndacks); 3897 tcpstat_inc(tcps_sndtotal); 3898 } else { 3899 syn_cache_put(sc); 3900 tcpstat_inc(tcps_sc_dropped); 3901 } 3902 3903 return (0); 3904 } 3905 3906 int 3907 syn_cache_respond(struct syn_cache *sc, struct mbuf *m) 3908 { 3909 u_int8_t *optp; 3910 int optlen, error; 3911 u_int16_t tlen; 3912 struct ip *ip = NULL; 3913 #ifdef INET6 3914 struct ip6_hdr *ip6 = NULL; 3915 #endif 3916 struct tcphdr *th; 3917 u_int hlen; 3918 struct inpcb *inp; 3919 3920 switch (sc->sc_src.sa.sa_family) { 3921 case AF_INET: 3922 hlen = sizeof(struct ip); 3923 break; 3924 #ifdef INET6 3925 case AF_INET6: 3926 hlen = sizeof(struct ip6_hdr); 3927 break; 3928 #endif 3929 default: 3930 m_freem(m); 3931 return (EAFNOSUPPORT); 3932 } 3933 3934 /* Compute the size of the TCP options. */ 3935 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3936 ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) + 3937 #ifdef TCP_SIGNATURE 3938 ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + 3939 #endif 3940 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3941 3942 tlen = hlen + sizeof(struct tcphdr) + optlen; 3943 3944 /* 3945 * Create the IP+TCP header from scratch. 3946 */ 3947 m_freem(m); 3948 #ifdef DIAGNOSTIC 3949 if (max_linkhdr + tlen > MCLBYTES) 3950 return (ENOBUFS); 3951 #endif 3952 MGETHDR(m, M_DONTWAIT, MT_DATA); 3953 if (m && max_linkhdr + tlen > MHLEN) { 3954 MCLGET(m, M_DONTWAIT); 3955 if ((m->m_flags & M_EXT) == 0) { 3956 m_freem(m); 3957 m = NULL; 3958 } 3959 } 3960 if (m == NULL) 3961 return (ENOBUFS); 3962 3963 /* Fixup the mbuf.
	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		unhandled_af(sc->sc_src.sa.sa_family);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}
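	/*
	 * Wire-format sketch (illustrative; assumes the standard BSD
	 * TCPOPT_TSTAMP_HDR of NOP,NOP,kind 8,len 10 == 0x0101080a):
	 * the twelve bytes above are 01 01 08 0a, a four-byte TSval
	 * from SYN_CACHE_TIMESTAMP(sc), and a four-byte TSecr echoing
	 * the peer's sc_timestamp.
	 */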
#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad the options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from the listening socket, on the SYN,ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  ip_len was set to the TCP
	 * segment length above for checksumming; restore the full
	 * packet length (in network byte order) here.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(inp);

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
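/*
 * Checksum note (illustrative): in the IPv4 case of syn_cache_respond()
 * every IP header field other than ip_len, ip_p, ip_src and ip_dst is
 * still zero when in_cksum() runs, and ip_len temporarily holds the TCP
 * segment length, so summing all tlen bytes folds the IP header into
 * exactly the TCP pseudo-header.  With MSS, window scale, SACK-permitted
 * and timestamp options the resulting SYN,ACK is 20 (IP) + 20 (TCP) +
 * 24 (options) = 64 bytes on the wire.
 */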