/*	$OpenBSD: tcp_input.c,v 1.366 2021/02/03 13:40:06 jan Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
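 *
 *   a reduction therefore happens at most once per recovery phase:
 *   the SEQ_GEQ(snd_una, snd_last) test in the ECE processing below
 *   ignores further ECE marks until the phase ends.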
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK until
 * a read from the socket buffer or the delayed ACK timer causes one.
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if ((tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_TIMER_ARM_MSEC(tp, TCPT_DELACK, tcp_delack_msecs); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *);
void	 syn_cache_timer(void *);
void	 syn_cache_reaper(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *);
struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
		struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
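	 *
	 * Example: with a queued segment at seq 100 carrying 200 bytes,
	 * an arriving segment at seq 250 overlaps by
	 * i = 100 + 200 - 250 = 50 bytes, so its first 50 bytes are
	 * trimmed; had it carried no more than 50 bytes (i >= *tlen),
	 * it would be a complete duplicate and be dropped.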
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *otp = NULL, *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate;
	caddr_t saveti;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
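	 * (a TCP must silently discard an incoming SYN segment that is
	 * addressed to a broadcast or multicast address; no connection
	 * can legitimately be set up to such an address)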
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about unspecified IPv6 addresses in the
		 * source.  As we use all-zero to indicate an unbound or
		 * unconnected pcb, an unspecified IPv6 address can be
		 * used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Locate pcb for segment.
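	 * A socket already attached to the mbuf by pf is honored first,
	 * then an exact four-tuple match is tried, and finally a
	 * listening socket bound to the destination port.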
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
	}
#ifdef IPSEC
	if (ipsec_in_use) {
		/* Find most recent IPsec tag */
		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
		if (mtag != NULL) {
			tdbi = (struct tdb_ident *)(mtag + 1);
			tdb = gettdb(tdbi->rdomain, tdbi->spi,
			    &tdbi->dst, tdbi->proto);
		} else
			tdb = NULL;
		ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
		    tdb, inp, 0);
		if (error) {
			tcpstat_inc(tcps_rcvnosec);
			goto drop;
		}
	}
#endif /* IPSEC */

	if (inp == NULL) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
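	/*
	 * Per RFC 1323, the window field of a segment with SYN set is
	 * never scaled; scaling takes effect only after both ends have
	 * agreed to it during the handshake.
	 */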
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		}

		if (so->so_options & SO_DEBUG) {
			otp = tp;
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				saveti = (caddr_t) &tcp_saveti6;
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				saveti = (caddr_t) &tcp_saveti;
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated addresses are forbidden, we
				 * do not accept a SYN to a deprecated
				 * interface address, to prevent any new
				 * inbound connection from getting
				 * established.  When we do not accept the
				 * SYN, we send a TCP RST with the deprecated
				 * source address (instead of dropping it).
				 * This is a compromise: it is much better
				 * for the peer to receive an RST, and the
				 * RST will be the final packet of the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses,
				 * we accept the SYN packet.  RFC2462 does
				 * not suggest dropping a SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				ND6_HINT(tp);
				sbdrop(so, &so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
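				 * The retransmit timer is restarted only
				 * while the persist timer is idle; the two
				 * timers are mutually exclusive.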
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(so, &so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
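		 * The SYN itself occupies one sequence number, so the
		 * first data byte sits at th_seq + 1.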
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
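			 * E.g. with rcv_nxt = 1000, a 100-byte segment at
			 * seq 800 gives todrop = 200 > tlen: the data is
			 * entirely old and is dropped below.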
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
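	 * The ACK is rate limited (dropafterack_ratelim) so a flood of
	 * spoofed in-window SYNs cannot make us generate an unbounded
	 * stream of ACKs.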
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
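		 * Coming out of fast recovery, cwnd is deflated to
		 * ssthresh, or to the amount of data still outstanding
		 * if that is smaller.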
		 */
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = max(incr * incr / cw, 1);
		if (tp->t_dupacks < tcprexmtthresh)
			tp->snd_cwnd = ulmin(cw + incr,
			    TCP_MAXWIN << tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			if (tp->snd_wnd > so->so_snd.sb_cc)
				tp->snd_wnd -= so->so_snd.sb_cc;
			else
				tp->snd_wnd = 0;
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(so, &so->so_snd, acked);
			if (tp->snd_wnd > acked)
				tp->snd_wnd -= acked;
			else
				tp->snd_wnd = 0;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
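		 * (t_pmtud_th_seq holds the sequence number taken from the
		 * TCP header quoted in the ICMP message; once everything up
		 * to it has been acked, the message is stale.)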
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.  This can happen independent of advancing
		 * the URG pointer, but if two URG's are pending at once,
		 * some out-of-band data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know
	 * that the connection is closing.  Ignore a FIN received before
Ignore a FIN received before 1967 * the connection is fully established. 1968 */ 1969 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1970 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1971 tp->t_flags |= TF_BLOCKOUTPUT; 1972 socantrcvmore(so); 1973 tp->t_flags &= ~TF_BLOCKOUTPUT; 1974 tp->t_flags |= TF_ACKNOW; 1975 tp->rcv_nxt++; 1976 } 1977 switch (tp->t_state) { 1978 1979 /* 1980 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1981 */ 1982 case TCPS_ESTABLISHED: 1983 tp->t_state = TCPS_CLOSE_WAIT; 1984 break; 1985 1986 /* 1987 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1988 * enter the CLOSING state. 1989 */ 1990 case TCPS_FIN_WAIT_1: 1991 tp->t_state = TCPS_CLOSING; 1992 break; 1993 1994 /* 1995 * In FIN_WAIT_2 state enter the TIME_WAIT state, 1996 * starting the time-wait timer, turning off the other 1997 * standard timers. 1998 */ 1999 case TCPS_FIN_WAIT_2: 2000 tp->t_state = TCPS_TIME_WAIT; 2001 tcp_canceltimers(tp); 2002 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2003 tp->t_flags |= TF_BLOCKOUTPUT; 2004 soisdisconnected(so); 2005 tp->t_flags &= ~TF_BLOCKOUTPUT; 2006 break; 2007 2008 /* 2009 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2010 */ 2011 case TCPS_TIME_WAIT: 2012 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2013 break; 2014 } 2015 } 2016 if (otp) 2017 tcp_trace(TA_INPUT, ostate, tp, otp, saveti, 0, tlen); 2018 2019 /* 2020 * Return any desired output. 2021 */ 2022 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2023 (void) tcp_output(tp); 2024 return IPPROTO_DONE; 2025 2026 badsyn: 2027 /* 2028 * Received a bad SYN. Increment counters and dropwithreset. 2029 */ 2030 tcpstat_inc(tcps_badsyn); 2031 tp = NULL; 2032 goto dropwithreset; 2033 2034 dropafterack_ratelim: 2035 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2036 tcp_ackdrop_ppslim) == 0) { 2037 /* XXX stat */ 2038 goto drop; 2039 } 2040 /* ...fall into dropafterack... */ 2041 2042 dropafterack: 2043 /* 2044 * Generate an ACK dropping incoming segment if it occupies 2045 * sequence space, where the ACK reflects our state. 2046 */ 2047 if (tiflags & TH_RST) 2048 goto drop; 2049 m_freem(m); 2050 tp->t_flags |= TF_ACKNOW; 2051 (void) tcp_output(tp); 2052 return IPPROTO_DONE; 2053 2054 dropwithreset_ratelim: 2055 /* 2056 * We may want to rate-limit RSTs in certain situations, 2057 * particularly if we are sending an RST in response to 2058 * an attempt to connect to or otherwise communicate with 2059 * a port for which we have no socket. 2060 */ 2061 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2062 tcp_rst_ppslim) == 0) { 2063 /* XXX stat */ 2064 goto drop; 2065 } 2066 /* ...fall into dropwithreset... */ 2067 2068 dropwithreset: 2069 /* 2070 * Generate a RST, dropping incoming segment. 2071 * Make ACK acceptable to originator of segment. 2072 * Don't bother to respond to RST. 2073 */ 2074 if (tiflags & TH_RST) 2075 goto drop; 2076 if (tiflags & TH_ACK) { 2077 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2078 TH_RST, m->m_pkthdr.ph_rtableid); 2079 } else { 2080 if (tiflags & TH_SYN) 2081 tlen++; 2082 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2083 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2084 } 2085 m_freem(m); 2086 return IPPROTO_DONE; 2087 2088 drop: 2089 /* 2090 * Drop space held by incoming segment and return. 
2091 */ 2092 if (otp) 2093 tcp_trace(TA_DROP, ostate, tp, otp, saveti, 0, tlen); 2094 2095 m_freem(m); 2096 return IPPROTO_DONE; 2097 } 2098 2099 int 2100 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2101 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2102 u_int rtableid) 2103 { 2104 u_int16_t mss = 0; 2105 int opt, optlen; 2106 #ifdef TCP_SIGNATURE 2107 caddr_t sigp = NULL; 2108 struct tdb *tdb = NULL; 2109 #endif /* TCP_SIGNATURE */ 2110 2111 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2112 opt = cp[0]; 2113 if (opt == TCPOPT_EOL) 2114 break; 2115 if (opt == TCPOPT_NOP) 2116 optlen = 1; 2117 else { 2118 if (cnt < 2) 2119 break; 2120 optlen = cp[1]; 2121 if (optlen < 2 || optlen > cnt) 2122 break; 2123 } 2124 switch (opt) { 2125 2126 default: 2127 continue; 2128 2129 case TCPOPT_MAXSEG: 2130 if (optlen != TCPOLEN_MAXSEG) 2131 continue; 2132 if (!(th->th_flags & TH_SYN)) 2133 continue; 2134 if (TCPS_HAVERCVDSYN(tp->t_state)) 2135 continue; 2136 memcpy(&mss, cp + 2, sizeof(mss)); 2137 mss = ntohs(mss); 2138 oi->maxseg = mss; 2139 break; 2140 2141 case TCPOPT_WINDOW: 2142 if (optlen != TCPOLEN_WINDOW) 2143 continue; 2144 if (!(th->th_flags & TH_SYN)) 2145 continue; 2146 if (TCPS_HAVERCVDSYN(tp->t_state)) 2147 continue; 2148 tp->t_flags |= TF_RCVD_SCALE; 2149 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2150 break; 2151 2152 case TCPOPT_TIMESTAMP: 2153 if (optlen != TCPOLEN_TIMESTAMP) 2154 continue; 2155 oi->ts_present = 1; 2156 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2157 oi->ts_val = ntohl(oi->ts_val); 2158 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2159 oi->ts_ecr = ntohl(oi->ts_ecr); 2160 2161 if (!(th->th_flags & TH_SYN)) 2162 continue; 2163 if (TCPS_HAVERCVDSYN(tp->t_state)) 2164 continue; 2165 /* 2166 * A timestamp received in a SYN makes 2167 * it ok to send timestamp requests and replies. 
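 * (Both sides must have seen the option in the SYN exchange before
 * timestamps are carried on data segments, per the RFC 1323
 * negotiation rules; that is why the flags are only set while a SYN
 * is being processed.)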
2168 */ 2169 tp->t_flags |= TF_RCVD_TSTMP; 2170 tp->ts_recent = oi->ts_val; 2171 tp->ts_recent_age = tcp_now; 2172 break; 2173 2174 case TCPOPT_SACK_PERMITTED: 2175 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2176 continue; 2177 if (!(th->th_flags & TH_SYN)) 2178 continue; 2179 if (TCPS_HAVERCVDSYN(tp->t_state)) 2180 continue; 2181 /* MUST only be set on SYN */ 2182 tp->t_flags |= TF_SACK_PERMIT; 2183 break; 2184 case TCPOPT_SACK: 2185 tcp_sack_option(tp, th, cp, optlen); 2186 break; 2187 #ifdef TCP_SIGNATURE 2188 case TCPOPT_SIGNATURE: 2189 if (optlen != TCPOLEN_SIGNATURE) 2190 continue; 2191 2192 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2193 return (-1); 2194 2195 sigp = cp + 2; 2196 break; 2197 #endif /* TCP_SIGNATURE */ 2198 } 2199 } 2200 2201 #ifdef TCP_SIGNATURE 2202 if (tp->t_flags & TF_SIGNATURE) { 2203 union sockaddr_union src, dst; 2204 2205 memset(&src, 0, sizeof(union sockaddr_union)); 2206 memset(&dst, 0, sizeof(union sockaddr_union)); 2207 2208 switch (tp->pf) { 2209 case 0: 2210 case AF_INET: 2211 src.sa.sa_len = sizeof(struct sockaddr_in); 2212 src.sa.sa_family = AF_INET; 2213 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2214 dst.sa.sa_len = sizeof(struct sockaddr_in); 2215 dst.sa.sa_family = AF_INET; 2216 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2217 break; 2218 #ifdef INET6 2219 case AF_INET6: 2220 src.sa.sa_len = sizeof(struct sockaddr_in6); 2221 src.sa.sa_family = AF_INET6; 2222 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2223 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2224 dst.sa.sa_family = AF_INET6; 2225 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2226 break; 2227 #endif /* INET6 */ 2228 } 2229 2230 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2231 0, &src, &dst, IPPROTO_TCP); 2232 2233 /* 2234 * We don't have an SA for this peer, so we turn off 2235 * TF_SIGNATURE on the listen socket 2236 */ 2237 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2238 tp->t_flags &= ~TF_SIGNATURE; 2239 2240 } 2241 2242 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2243 tcpstat_inc(tcps_rcvbadsig); 2244 return (-1); 2245 } 2246 2247 if (sigp) { 2248 char sig[16]; 2249 2250 if (tdb == NULL) { 2251 tcpstat_inc(tcps_rcvbadsig); 2252 return (-1); 2253 } 2254 2255 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2256 return (-1); 2257 2258 if (timingsafe_bcmp(sig, sigp, 16)) { 2259 tcpstat_inc(tcps_rcvbadsig); 2260 return (-1); 2261 } 2262 2263 tcpstat_inc(tcps_rcvgoodsig); 2264 } 2265 #endif /* TCP_SIGNATURE */ 2266 2267 return (0); 2268 } 2269 2270 u_long 2271 tcp_seq_subtract(u_long a, u_long b) 2272 { 2273 return ((long)(a - b)); 2274 } 2275 2276 /* 2277 * This function is called upon receipt of new valid data (while not in header 2278 * prediction mode), and it updates the ordered list of sacks. 2279 */ 2280 void 2281 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2282 tcp_seq rcv_lastend) 2283 { 2284 /* 2285 * First reported block MUST be the most recent one. Subsequent 2286 * blocks SHOULD be in the order in which they arrived at the 2287 * receiver. These two conditions make the implementation fully 2288 * compliant with RFC 2018. 
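 * Worked example (sequence numbers are illustrative): with [100,200)
 * and [300,400) already SACKed, a new segment [250,300) overlaps the
 * second block, so the two are coalesced and reported as the first
 * block [250,400), followed by [100,200).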
2289 */ 2290 int i, j = 0, count = 0, lastpos = -1; 2291 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2292 2293 /* First clean up current list of sacks */ 2294 for (i = 0; i < tp->rcv_numsacks; i++) { 2295 sack = tp->sackblks[i]; 2296 if (sack.start == 0 && sack.end == 0) { 2297 count++; /* count = number of blocks to be discarded */ 2298 continue; 2299 } 2300 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2301 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2302 count++; 2303 } else { 2304 temp[j].start = tp->sackblks[i].start; 2305 temp[j++].end = tp->sackblks[i].end; 2306 } 2307 } 2308 tp->rcv_numsacks -= count; 2309 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2310 tcp_clean_sackreport(tp); 2311 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2312 /* ==> need first sack block */ 2313 tp->sackblks[0].start = rcv_laststart; 2314 tp->sackblks[0].end = rcv_lastend; 2315 tp->rcv_numsacks = 1; 2316 } 2317 return; 2318 } 2319 /* Otherwise, sack blocks are already present. */ 2320 for (i = 0; i < tp->rcv_numsacks; i++) 2321 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2322 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2323 return; /* sack list remains unchanged */ 2324 /* 2325 * From here, segment just received should be (part of) the 1st sack. 2326 * Go through list, possibly coalescing sack block entries. 2327 */ 2328 firstsack.start = rcv_laststart; 2329 firstsack.end = rcv_lastend; 2330 for (i = 0; i < tp->rcv_numsacks; i++) { 2331 sack = tp->sackblks[i]; 2332 if (SEQ_LT(sack.end, firstsack.start) || 2333 SEQ_GT(sack.start, firstsack.end)) 2334 continue; /* no overlap */ 2335 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2336 /* 2337 * identical block; delete it here since we will 2338 * move it to the front of the list. 2339 */ 2340 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2341 lastpos = i; /* last posn with a zero entry */ 2342 continue; 2343 } 2344 if (SEQ_LEQ(sack.start, firstsack.start)) 2345 firstsack.start = sack.start; /* merge blocks */ 2346 if (SEQ_GEQ(sack.end, firstsack.end)) 2347 firstsack.end = sack.end; /* merge blocks */ 2348 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2349 lastpos = i; /* last posn with a zero entry */ 2350 } 2351 if (lastpos != -1) { /* at least one merge */ 2352 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2353 sack = tp->sackblks[i]; 2354 if (sack.start == 0 && sack.end == 0) 2355 continue; 2356 temp[j++] = sack; 2357 } 2358 tp->rcv_numsacks = j; /* including first blk (added later) */ 2359 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2360 tp->sackblks[i] = temp[i]; 2361 } else { /* no merges -- shift sacks by 1 */ 2362 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2363 tp->rcv_numsacks++; 2364 for (i = tp->rcv_numsacks-1; i > 0; i--) 2365 tp->sackblks[i] = tp->sackblks[i-1]; 2366 } 2367 tp->sackblks[0] = firstsack; 2368 return; 2369 } 2370 2371 /* 2372 * Process the TCP SACK option. tp->snd_holes is an ordered list 2373 * of holes (oldest to newest, in terms of the sequence space). 2374 */ 2375 void 2376 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2377 { 2378 int tmp_olen; 2379 u_char *tmp_cp; 2380 struct sackhole *cur, *p, *temp; 2381 2382 if (!tp->sack_enable) 2383 return; 2384 /* SACK without ACK doesn't make sense. */ 2385 if ((th->th_flags & TH_ACK) == 0) 2386 return; 2387 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
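 * An ACK outside that range is either stale or forged; the SACK
 * blocks it carries say nothing reliable about the current send
 * queue, so the whole option is ignored.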
*/ 2388 if (SEQ_LT(th->th_ack, tp->snd_una) || 2389 SEQ_GT(th->th_ack, tp->snd_max)) 2390 return; 2391 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2392 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2393 return; 2394 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2395 tmp_cp = cp + 2; 2396 tmp_olen = optlen - 2; 2397 tcpstat_inc(tcps_sack_rcv_opts); 2398 if (tp->snd_numholes < 0) 2399 tp->snd_numholes = 0; 2400 if (tp->t_maxseg == 0) 2401 panic("tcp_sack_option"); /* Should never happen */ 2402 while (tmp_olen > 0) { 2403 struct sackblk sack; 2404 2405 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2406 sack.start = ntohl(sack.start); 2407 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2408 sack.end = ntohl(sack.end); 2409 tmp_olen -= TCPOLEN_SACK; 2410 tmp_cp += TCPOLEN_SACK; 2411 if (SEQ_LEQ(sack.end, sack.start)) 2412 continue; /* bad SACK fields */ 2413 if (SEQ_LEQ(sack.end, tp->snd_una)) 2414 continue; /* old block */ 2415 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2416 if (SEQ_LT(sack.start, th->th_ack)) 2417 continue; 2418 } 2419 if (SEQ_GT(sack.end, tp->snd_max)) 2420 continue; 2421 if (tp->snd_holes == NULL) { /* first hole */ 2422 tp->snd_holes = (struct sackhole *) 2423 pool_get(&sackhl_pool, PR_NOWAIT); 2424 if (tp->snd_holes == NULL) { 2425 /* ENOBUFS, so ignore SACKed block for now */ 2426 goto dropped; 2427 } 2428 cur = tp->snd_holes; 2429 cur->start = th->th_ack; 2430 cur->end = sack.start; 2431 cur->rxmit = cur->start; 2432 cur->next = NULL; 2433 tp->snd_numholes = 1; 2434 tp->rcv_lastsack = sack.end; 2435 /* 2436 * dups is at least one. If more data has been 2437 * SACKed, it can be greater than one. 2438 */ 2439 cur->dups = min(tcprexmtthresh, 2440 ((sack.end - cur->end)/tp->t_maxseg)); 2441 if (cur->dups < 1) 2442 cur->dups = 1; 2443 continue; /* with next sack block */ 2444 } 2445 /* Go thru list of holes: p = previous, cur = current */ 2446 p = cur = tp->snd_holes; 2447 while (cur) { 2448 if (SEQ_LEQ(sack.end, cur->start)) 2449 /* SACKs data before the current hole */ 2450 break; /* no use going through more holes */ 2451 if (SEQ_GEQ(sack.start, cur->end)) { 2452 /* SACKs data beyond the current hole */ 2453 cur->dups++; 2454 if (((sack.end - cur->end)/tp->t_maxseg) >= 2455 tcprexmtthresh) 2456 cur->dups = tcprexmtthresh; 2457 p = cur; 2458 cur = cur->next; 2459 continue; 2460 } 2461 if (SEQ_LEQ(sack.start, cur->start)) { 2462 /* Data acks at least the beginning of hole */ 2463 if (SEQ_GEQ(sack.end, cur->end)) { 2464 /* Acks entire hole, so delete hole */ 2465 if (p != cur) { 2466 p->next = cur->next; 2467 pool_put(&sackhl_pool, cur); 2468 cur = p->next; 2469 } else { 2470 cur = cur->next; 2471 pool_put(&sackhl_pool, p); 2472 p = cur; 2473 tp->snd_holes = p; 2474 } 2475 tp->snd_numholes--; 2476 continue; 2477 } 2478 /* otherwise, move start of hole forward */ 2479 cur->start = sack.end; 2480 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2481 p = cur; 2482 cur = cur->next; 2483 continue; 2484 } 2485 /* move end of hole backward */ 2486 if (SEQ_GEQ(sack.end, cur->end)) { 2487 cur->end = sack.start; 2488 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2489 cur->dups++; 2490 if (((sack.end - cur->end)/tp->t_maxseg) >= 2491 tcprexmtthresh) 2492 cur->dups = tcprexmtthresh; 2493 p = cur; 2494 cur = cur->next; 2495 continue; 2496 } 2497 if (SEQ_LT(cur->start, sack.start) && 2498 SEQ_GT(cur->end, sack.end)) { 2499 /* 2500 * ACKs some data in middle of a hole; need to 2501 * split current hole 2502 */ 2503 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2504 
goto dropped;
2505 temp = (struct sackhole *)
2506 pool_get(&sackhl_pool, PR_NOWAIT);
2507 if (temp == NULL)
2508 goto dropped; /* ENOBUFS */
2509 temp->next = cur->next;
2510 temp->start = sack.end;
2511 temp->end = cur->end;
2512 temp->dups = cur->dups;
2513 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
2514 cur->end = sack.start;
2515 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
2516 cur->dups++;
2517 if (((sack.end - cur->end)/tp->t_maxseg) >=
2518 tcprexmtthresh)
2519 cur->dups = tcprexmtthresh;
2520 cur->next = temp;
2521 p = temp;
2522 cur = p->next;
2523 tp->snd_numholes++;
2524 }
2525 }
2526 /* At this point, p points to the last hole on the list */
2527 if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
2528 /*
2529 * Need to append new hole at end.
2530 * Last hole is p (and it's not NULL).
2531 */
2532 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT)
2533 goto dropped;
2534 temp = (struct sackhole *)
2535 pool_get(&sackhl_pool, PR_NOWAIT);
2536 if (temp == NULL)
2537 goto dropped; /* ENOBUFS */
2538 temp->start = tp->rcv_lastsack;
2539 temp->end = sack.start;
2540 temp->dups = min(tcprexmtthresh,
2541 ((sack.end - sack.start)/tp->t_maxseg));
2542 if (temp->dups < 1)
2543 temp->dups = 1;
2544 temp->rxmit = temp->start;
2545 temp->next = 0;
2546 p->next = temp;
2547 tp->rcv_lastsack = sack.end;
2548 tp->snd_numholes++;
2549 }
2550 }
2551 return;
2552 dropped:
2553 tcpstat_inc(tcps_sack_drop_opts);
2554 }
2555
2556 /*
2557 * Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if
2558 * it is completely acked; otherwise, tcp_sack_option(), called from
2559 * tcp_dooptions(), will fix up the hole.
2560 */
2561 void
2562 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
2563 {
2564 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2565 /* max because this could be an older ack just arrived */
2566 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2567 th->th_ack : tp->snd_una;
2568 struct sackhole *cur = tp->snd_holes;
2569 struct sackhole *prev;
2570 while (cur)
2571 if (SEQ_LEQ(cur->end, lastack)) {
2572 prev = cur;
2573 cur = cur->next;
2574 pool_put(&sackhl_pool, prev);
2575 tp->snd_numholes--;
2576 } else if (SEQ_LT(cur->start, lastack)) {
2577 cur->start = lastack;
2578 if (SEQ_LT(cur->rxmit, cur->start))
2579 cur->rxmit = cur->start;
2580 break;
2581 } else
2582 break;
2583 tp->snd_holes = cur;
2584 }
2585 }
2586
2587 /*
2588 * Delete all receiver-side SACK information.
2589 */
2590 void
2591 tcp_clean_sackreport(struct tcpcb *tp)
2592 {
2593 int i;
2594
2595 tp->rcv_numsacks = 0;
2596 for (i = 0; i < MAX_SACK_BLKS; i++)
2597 tp->sackblks[i].start = tp->sackblks[i].end=0;
2598
2599 }
2600
2601 /*
2602 * Partial ack handling within a sack recovery episode. When a partial ack
2603 * arrives, turn off the retransmission timer, deflate the window, and do
2604 * not clear tp->t_dupacks.
2605 */
2606 void
2607 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
2608 {
2609 /* Turn off retx. timer (will start again next segment) */
2610 TCP_TIMER_DISARM(tp, TCPT_REXMT);
2611 tp->t_rtttime = 0;
2612 /*
2613 * Partial window deflation. This statement relies on the
2614 * fact that tp->snd_una has not been updated yet.
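 * Illustration (values are hypothetical): with snd_cwnd at
 * 10 * t_maxseg and a partial ack covering 4 * t_maxseg, the code
 * below yields 10 - 4 + 1 + 1 = 8 segments, one t_maxseg being
 * added in each of the two statements.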
2615 */ 2616 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2617 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2618 tp->snd_cwnd += tp->t_maxseg; 2619 } else 2620 tp->snd_cwnd = tp->t_maxseg; 2621 tp->snd_cwnd += tp->t_maxseg; 2622 tp->t_flags |= TF_NEEDOUTPUT; 2623 } 2624 2625 /* 2626 * Pull out of band byte out of a segment so 2627 * it doesn't appear in the user's data queue. 2628 * It is still reflected in the segment length for 2629 * sequencing purposes. 2630 */ 2631 void 2632 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2633 { 2634 int cnt = off + urgent - 1; 2635 2636 while (cnt >= 0) { 2637 if (m->m_len > cnt) { 2638 char *cp = mtod(m, caddr_t) + cnt; 2639 struct tcpcb *tp = sototcpcb(so); 2640 2641 tp->t_iobc = *cp; 2642 tp->t_oobflags |= TCPOOB_HAVEDATA; 2643 memmove(cp, cp + 1, m->m_len - cnt - 1); 2644 m->m_len--; 2645 return; 2646 } 2647 cnt -= m->m_len; 2648 m = m->m_next; 2649 if (m == NULL) 2650 break; 2651 } 2652 panic("tcp_pulloutofband"); 2653 } 2654 2655 /* 2656 * Collect new round-trip time estimate 2657 * and update averages and current timeout. 2658 */ 2659 void 2660 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2661 { 2662 short delta; 2663 short rttmin; 2664 2665 if (rtt < 0) 2666 rtt = 0; 2667 else if (rtt > TCP_RTT_MAX) 2668 rtt = TCP_RTT_MAX; 2669 2670 tcpstat_inc(tcps_rttupdated); 2671 if (tp->t_srtt != 0) { 2672 /* 2673 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2674 * after the binary point (scaled by 4), whereas 2675 * srtt is stored as fixed point with 5 bits after the 2676 * binary point (i.e., scaled by 32). The following magic 2677 * is equivalent to the smoothing algorithm in rfc793 with 2678 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2679 * point). 2680 */ 2681 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2682 (tp->t_srtt >> TCP_RTT_SHIFT); 2683 if ((tp->t_srtt += delta) <= 0) 2684 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2685 /* 2686 * We accumulate a smoothed rtt variance (actually, a 2687 * smoothed mean difference), then set the retransmit 2688 * timer to smoothed rtt + 4 times the smoothed variance. 2689 * rttvar is stored as fixed point with 4 bits after the 2690 * binary point (scaled by 16). The following is 2691 * equivalent to rfc793 smoothing with an alpha of .75 2692 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2693 * rfc793's wired-in beta. 2694 */ 2695 if (delta < 0) 2696 delta = -delta; 2697 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2698 if ((tp->t_rttvar += delta) <= 0) 2699 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2700 } else { 2701 /* 2702 * No rtt measurement yet - use the unsmoothed rtt. 2703 * Set the variance to half the rtt (so our first 2704 * retransmit happens at 3*rtt). 2705 */ 2706 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2707 tp->t_rttvar = (rtt + 1) << 2708 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2709 } 2710 tp->t_rtttime = 0; 2711 tp->t_rxtshift = 0; 2712 2713 /* 2714 * the retransmit should happen at rtt + 4 * rttvar. 2715 * Because of the way we do the smoothing, srtt and rttvar 2716 * will each average +1/2 tick of bias. When we compute 2717 * the retransmit timer, we want 1/2 tick of rounding and 2718 * 1 extra tick because of +-1/2 tick uncertainty in the 2719 * firing of the timer. The bias will give us exactly the 2720 * 1.5 tick we need. But, because the bias is 2721 * statistical, we have to test that we don't drop below 2722 * the minimum feasible timer (which is 2 ticks). 
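 * Illustration (values are hypothetical, the fixed-point scaling is
 * ignored here): a first measurement of rtt = 10 ticks gives
 * srtt = 10 and rttvar = 5, so srtt + 4 * rttvar comes to
 * 3 * rtt = 30 ticks, subject to the clamping below.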
2723 */ 2724 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2725 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2726 2727 /* 2728 * We received an ack for a packet that wasn't retransmitted; 2729 * it is probably safe to discard any error indications we've 2730 * received recently. This isn't quite right, but close enough 2731 * for now (a route might have failed after we sent a segment, 2732 * and the return path might not be symmetrical). 2733 */ 2734 tp->t_softerror = 0; 2735 } 2736 2737 /* 2738 * Determine a reasonable value for maxseg size. 2739 * If the route is known, check route for mtu. 2740 * If none, use an mss that can be handled on the outgoing 2741 * interface without forcing IP to fragment; if bigger than 2742 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2743 * to utilize large mbufs. If no route is found, route has no mtu, 2744 * or the destination isn't local, use a default, hopefully conservative 2745 * size (usually 512 or the default IP max size, but no more than the mtu 2746 * of the interface), as we can't discover anything about intervening 2747 * gateways or networks. We also initialize the congestion/slow start 2748 * window to be a single segment if the destination isn't local. 2749 * While looking at the routing entry, we also initialize other path-dependent 2750 * parameters from pre-set or cached values in the routing entry. 2751 * 2752 * Also take into account the space needed for options that we 2753 * send regularly. Make maxseg shorter by that amount to assure 2754 * that we can send maxseg amount of data even when the options 2755 * are present. Store the upper limit of the length of options plus 2756 * data in maxopd. 2757 * 2758 * NOTE: offer == -1 indicates that the maxseg size changed due to 2759 * Path MTU discovery. 2760 */ 2761 int 2762 tcp_mss(struct tcpcb *tp, int offer) 2763 { 2764 struct rtentry *rt; 2765 struct ifnet *ifp = NULL; 2766 int mss, mssopt; 2767 int iphlen; 2768 struct inpcb *inp; 2769 2770 inp = tp->t_inpcb; 2771 2772 mssopt = mss = tcp_mssdflt; 2773 2774 rt = in_pcbrtentry(inp); 2775 2776 if (rt == NULL) 2777 goto out; 2778 2779 ifp = if_get(rt->rt_ifidx); 2780 if (ifp == NULL) 2781 goto out; 2782 2783 switch (tp->pf) { 2784 #ifdef INET6 2785 case AF_INET6: 2786 iphlen = sizeof(struct ip6_hdr); 2787 break; 2788 #endif 2789 case AF_INET: 2790 iphlen = sizeof(struct ip); 2791 break; 2792 default: 2793 /* the family does not support path MTU discovery */ 2794 goto out; 2795 } 2796 2797 /* 2798 * if there's an mtu associated with the route and we support 2799 * path MTU discovery for the underlying protocol family, use it. 2800 */ 2801 if (rt->rt_mtu) { 2802 /* 2803 * One may wish to lower MSS to take into account options, 2804 * especially security-related options. 2805 */ 2806 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2807 /* 2808 * RFC2460 section 5, last paragraph: if path MTU is 2809 * smaller than 1280, use 1280 as packet size and 2810 * attach fragment header. 
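 * With the usual header sizes this works out to 1280 - 40 (ip6_hdr)
 * - 8 (ip6_frag) - 20 (tcphdr) = 1212 bytes of payload per segment.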
2811 */ 2812 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2813 sizeof(struct tcphdr); 2814 } else { 2815 mss = rt->rt_mtu - iphlen - 2816 sizeof(struct tcphdr); 2817 } 2818 } else if (ifp->if_flags & IFF_LOOPBACK) { 2819 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2820 } else if (tp->pf == AF_INET) { 2821 if (ip_mtudisc) 2822 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2823 } 2824 #ifdef INET6 2825 else if (tp->pf == AF_INET6) { 2826 /* 2827 * for IPv6, path MTU discovery is always turned on, 2828 * or the node must use packet size <= 1280. 2829 */ 2830 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2831 } 2832 #endif /* INET6 */ 2833 2834 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2835 if (offer != -1) { 2836 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2837 mssopt = max(tcp_mssdflt, mssopt); 2838 } 2839 out: 2840 if_put(ifp); 2841 /* 2842 * The current mss, t_maxseg, is initialized to the default value. 2843 * If we compute a smaller value, reduce the current mss. 2844 * If we compute a larger value, return it for use in sending 2845 * a max seg size option, but don't store it for use 2846 * unless we received an offer at least that large from peer. 2847 * 2848 * However, do not accept offers lower than the minimum of 2849 * the interface MTU and 216. 2850 */ 2851 if (offer > 0) 2852 tp->t_peermss = offer; 2853 if (tp->t_peermss) 2854 mss = min(mss, max(tp->t_peermss, 216)); 2855 2856 /* sanity - at least max opt. space */ 2857 mss = max(mss, 64); 2858 2859 /* 2860 * maxopd stores the maximum length of data AND options 2861 * in a segment; maxseg is the amount of data in a normal 2862 * segment. We need to store this value (maxopd) apart 2863 * from maxseg, because now every segment carries options 2864 * and thus we normally have somewhat less data in segments. 2865 */ 2866 tp->t_maxopd = mss; 2867 2868 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2869 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2870 mss -= TCPOLEN_TSTAMP_APPA; 2871 #ifdef TCP_SIGNATURE 2872 if (tp->t_flags & TF_SIGNATURE) 2873 mss -= TCPOLEN_SIGLEN; 2874 #endif 2875 2876 if (offer == -1) { 2877 /* mss changed due to Path MTU discovery */ 2878 tp->t_flags &= ~TF_PMTUD_PEND; 2879 tp->t_pmtud_mtu_sent = 0; 2880 tp->t_pmtud_mss_acked = 0; 2881 if (mss < tp->t_maxseg) { 2882 /* 2883 * Follow suggestion in RFC 2414 to reduce the 2884 * congestion window by the ratio of the old 2885 * segment size to the new segment size. 2886 */ 2887 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2888 mss, mss); 2889 } 2890 } else if (tcp_do_rfc3390 == 2) { 2891 /* increase initial window */ 2892 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2893 } else if (tcp_do_rfc3390) { 2894 /* increase initial window */ 2895 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2896 } else 2897 tp->snd_cwnd = mss; 2898 2899 tp->t_maxseg = mss; 2900 2901 return (offer != -1 ? 
mssopt : mss); 2902 } 2903 2904 u_int 2905 tcp_hdrsz(struct tcpcb *tp) 2906 { 2907 u_int hlen; 2908 2909 switch (tp->pf) { 2910 #ifdef INET6 2911 case AF_INET6: 2912 hlen = sizeof(struct ip6_hdr); 2913 break; 2914 #endif 2915 case AF_INET: 2916 hlen = sizeof(struct ip); 2917 break; 2918 default: 2919 hlen = 0; 2920 break; 2921 } 2922 hlen += sizeof(struct tcphdr); 2923 2924 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2925 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2926 hlen += TCPOLEN_TSTAMP_APPA; 2927 #ifdef TCP_SIGNATURE 2928 if (tp->t_flags & TF_SIGNATURE) 2929 hlen += TCPOLEN_SIGLEN; 2930 #endif 2931 return (hlen); 2932 } 2933 2934 /* 2935 * Set connection variables based on the effective MSS. 2936 * We are passed the TCPCB for the actual connection. If we 2937 * are the server, we are called by the compressed state engine 2938 * when the 3-way handshake is complete. If we are the client, 2939 * we are called when we receive the SYN,ACK from the server. 2940 * 2941 * NOTE: The t_maxseg value must be initialized in the TCPCB 2942 * before this routine is called! 2943 */ 2944 void 2945 tcp_mss_update(struct tcpcb *tp) 2946 { 2947 int mss; 2948 u_long bufsize; 2949 struct rtentry *rt; 2950 struct socket *so; 2951 2952 so = tp->t_inpcb->inp_socket; 2953 mss = tp->t_maxseg; 2954 2955 rt = in_pcbrtentry(tp->t_inpcb); 2956 2957 if (rt == NULL) 2958 return; 2959 2960 bufsize = so->so_snd.sb_hiwat; 2961 if (bufsize < mss) { 2962 mss = bufsize; 2963 /* Update t_maxseg and t_maxopd */ 2964 tcp_mss(tp, mss); 2965 } else { 2966 bufsize = roundup(bufsize, mss); 2967 if (bufsize > sb_max) 2968 bufsize = sb_max; 2969 (void)sbreserve(so, &so->so_snd, bufsize); 2970 } 2971 2972 bufsize = so->so_rcv.sb_hiwat; 2973 if (bufsize > mss) { 2974 bufsize = roundup(bufsize, mss); 2975 if (bufsize > sb_max) 2976 bufsize = sb_max; 2977 (void)sbreserve(so, &so->so_rcv, bufsize); 2978 } 2979 2980 } 2981 2982 /* 2983 * When a partial ack arrives, force the retransmission of the 2984 * next unacknowledged segment. Do not clear tp->t_dupacks. 2985 * By setting snd_nxt to ti_ack, this forces retransmission timer 2986 * to be started again. 2987 */ 2988 void 2989 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 2990 { 2991 /* 2992 * snd_una has not been updated and the socket send buffer 2993 * not yet drained of the acked data, so we have to leave 2994 * snd_una as it was to get the correct data offset in 2995 * tcp_output(). 2996 */ 2997 tcp_seq onxt = tp->snd_nxt; 2998 u_long ocwnd = tp->snd_cwnd; 2999 3000 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3001 tp->t_rtttime = 0; 3002 tp->snd_nxt = th->th_ack; 3003 /* 3004 * Set snd_cwnd to one segment beyond acknowledged offset 3005 * (tp->snd_una not yet updated when this function is called) 3006 */ 3007 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3008 (void)tcp_output(tp); 3009 tp->snd_cwnd = ocwnd; 3010 if (SEQ_GT(onxt, tp->snd_nxt)) 3011 tp->snd_nxt = onxt; 3012 /* 3013 * Partial window deflation. Relies on fact that tp->snd_una 3014 * not updated yet. 
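 * Illustration (values are hypothetical): with snd_cwnd at
 * 10 * t_maxseg and a partial ack covering 4 * t_maxseg, snd_cwnd
 * ends up at 10 - 4 + 1 = 7 segments after the statements below.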
3015 */ 3016 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3017 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3018 else 3019 tp->snd_cwnd = 0; 3020 tp->snd_cwnd += tp->t_maxseg; 3021 } 3022 3023 int 3024 tcp_mss_adv(struct mbuf *m, int af) 3025 { 3026 int mss = 0; 3027 int iphlen; 3028 struct ifnet *ifp = NULL; 3029 3030 if (m && (m->m_flags & M_PKTHDR)) 3031 ifp = if_get(m->m_pkthdr.ph_ifidx); 3032 3033 switch (af) { 3034 case AF_INET: 3035 if (ifp != NULL) 3036 mss = ifp->if_mtu; 3037 iphlen = sizeof(struct ip); 3038 break; 3039 #ifdef INET6 3040 case AF_INET6: 3041 if (ifp != NULL) 3042 mss = ifp->if_mtu; 3043 iphlen = sizeof(struct ip6_hdr); 3044 break; 3045 #endif 3046 default: 3047 unhandled_af(af); 3048 } 3049 if_put(ifp); 3050 mss = mss - iphlen - sizeof(struct tcphdr); 3051 return (max(mss, tcp_mssdflt)); 3052 } 3053 3054 /* 3055 * TCP compressed state engine. Currently used to hold compressed 3056 * state for SYN_RECEIVED. 3057 */ 3058 3059 /* syn hash parameters */ 3060 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; 3061 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3062 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3063 int tcp_syn_use_limit = 100000; 3064 3065 struct syn_cache_set tcp_syn_cache[2]; 3066 int tcp_syn_cache_active; 3067 3068 #define SYN_HASH(sa, sp, dp, rand) \ 3069 (((sa)->s_addr ^ (rand)[0]) * \ 3070 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3071 #ifndef INET6 3072 #define SYN_HASHALL(hash, src, dst, rand) \ 3073 do { \ 3074 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3075 satosin(src)->sin_port, \ 3076 satosin(dst)->sin_port, (rand)); \ 3077 } while (/*CONSTCOND*/ 0) 3078 #else 3079 #define SYN_HASH6(sa, sp, dp, rand) \ 3080 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3081 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3082 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3083 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3084 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3085 3086 #define SYN_HASHALL(hash, src, dst, rand) \ 3087 do { \ 3088 switch ((src)->sa_family) { \ 3089 case AF_INET: \ 3090 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3091 satosin(src)->sin_port, \ 3092 satosin(dst)->sin_port, (rand)); \ 3093 break; \ 3094 case AF_INET6: \ 3095 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3096 satosin6(src)->sin6_port, \ 3097 satosin6(dst)->sin6_port, (rand)); \ 3098 break; \ 3099 default: \ 3100 hash = 0; \ 3101 } \ 3102 } while (/*CONSTCOND*/0) 3103 #endif /* INET6 */ 3104 3105 void 3106 syn_cache_rm(struct syn_cache *sc) 3107 { 3108 sc->sc_flags |= SCF_DEAD; 3109 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3110 sc->sc_tp = NULL; 3111 LIST_REMOVE(sc, sc_tpq); 3112 sc->sc_buckethead->sch_length--; 3113 timeout_del(&sc->sc_timer); 3114 sc->sc_set->scs_count--; 3115 } 3116 3117 void 3118 syn_cache_put(struct syn_cache *sc) 3119 { 3120 m_free(sc->sc_ipopts); 3121 if (sc->sc_route4.ro_rt != NULL) { 3122 rtfree(sc->sc_route4.ro_rt); 3123 sc->sc_route4.ro_rt = NULL; 3124 } 3125 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3126 timeout_add(&sc->sc_timer, 0); 3127 } 3128 3129 struct pool syn_cache_pool; 3130 3131 /* 3132 * We don't estimate RTT with SYNs, so each packet starts with the default 3133 * RTT and each timer step has a fixed timeout value. 
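 * The total time an entry spends retransmitting is still bounded:
 * syn_cache_timer() accumulates sc_rxtcur into sc_rxttot and expires
 * the entry once that exceeds tcptv_keep_init.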
3134 */
3135 #define SYN_CACHE_TIMER_ARM(sc) \
3136 do { \
3137 TCPT_RANGESET((sc)->sc_rxtcur, \
3138 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
3139 TCPTV_REXMTMAX); \
3140 if (!timeout_initialized(&(sc)->sc_timer)) \
3141 timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \
3142 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
3143 } while (/*CONSTCOND*/0)
3144
3145 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate
3146
3147 void
3148 syn_cache_init(void)
3149 {
3150 int i;
3151
3152 /* Initialize the hash buckets. */
3153 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size,
3154 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO);
3155 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size,
3156 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO);
3157 tcp_syn_cache[0].scs_size = tcp_syn_hash_size;
3158 tcp_syn_cache[1].scs_size = tcp_syn_hash_size;
3159 for (i = 0; i < tcp_syn_hash_size; i++) {
3160 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket);
3161 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket);
3162 }
3163
3164 /* Initialize the syn cache pool. */
3165 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET,
3166 0, "syncache", NULL);
3167 }
3168
3169 void
3170 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
3171 {
3172 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active];
3173 struct syn_cache_head *scp;
3174 struct syn_cache *sc2;
3175 int i;
3176
3177 NET_ASSERT_LOCKED();
3178
3179 /*
3180 * If there are no entries in the hash table, reinitialize
3181 * the hash secrets. To avoid useless cache swaps and
3182 * reinitialization, use the same secrets until the use limit
3183 * is reached. An empty cache is also an opportunity to resize
3184 * the hash.
3185 */
3186 if (set->scs_count == 0 && set->scs_use <= 0) {
3187 set->scs_use = tcp_syn_use_limit;
3188 if (set->scs_size != tcp_syn_hash_size) {
3189 scp = mallocarray(tcp_syn_hash_size, sizeof(struct
3190 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO);
3191 if (scp == NULL) {
3192 /* Try again next time. */
3193 set->scs_use = 0;
3194 } else {
3195 free(set->scs_buckethead, M_SYNCACHE,
3196 set->scs_size *
3197 sizeof(struct syn_cache_head));
3198 set->scs_buckethead = scp;
3199 set->scs_size = tcp_syn_hash_size;
3200 for (i = 0; i < tcp_syn_hash_size; i++)
3201 TAILQ_INIT(&scp[i].sch_bucket);
3202 }
3203 }
3204 arc4random_buf(set->scs_random, sizeof(set->scs_random));
3205 tcpstat_inc(tcps_sc_seedrandom);
3206 }
3207
3208 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa,
3209 set->scs_random);
3210 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size];
3211 sc->sc_buckethead = scp;
3212
3213 /*
3214 * Make sure that we don't overflow the per-bucket
3215 * limit or the total cache size limit.
3216 */
3217 if (scp->sch_length >= tcp_syn_bucket_limit) {
3218 tcpstat_inc(tcps_sc_bucketoverflow);
3219 /*
3220 * Someone might attack our bucket hash function. Reseed
3221 * with random as soon as the passive syn cache gets empty.
3222 */
3223 set->scs_use = 0;
3224 /*
3225 * The bucket is full. Toss the oldest element in the
3226 * bucket. This will be the first entry in the bucket.
3227 */
3228 sc2 = TAILQ_FIRST(&scp->sch_bucket);
3229 #ifdef DIAGNOSTIC
3230 /*
3231 * This should never happen; we should always find an
3232 * entry in our bucket.
3232 */ 3233 if (sc2 == NULL) 3234 panic("%s: bucketoverflow: impossible", __func__); 3235 #endif 3236 syn_cache_rm(sc2); 3237 syn_cache_put(sc2); 3238 } else if (set->scs_count >= tcp_syn_cache_limit) { 3239 struct syn_cache_head *scp2, *sce; 3240 3241 tcpstat_inc(tcps_sc_overflowed); 3242 /* 3243 * The cache is full. Toss the oldest entry in the 3244 * first non-empty bucket we can find. 3245 * 3246 * XXX We would really like to toss the oldest 3247 * entry in the cache, but we hope that this 3248 * condition doesn't happen very often. 3249 */ 3250 scp2 = scp; 3251 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3252 sce = &set->scs_buckethead[set->scs_size]; 3253 for (++scp2; scp2 != scp; scp2++) { 3254 if (scp2 >= sce) 3255 scp2 = &set->scs_buckethead[0]; 3256 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3257 break; 3258 } 3259 #ifdef DIAGNOSTIC 3260 /* 3261 * This should never happen; we should always find a 3262 * non-empty bucket. 3263 */ 3264 if (scp2 == scp) 3265 panic("%s: cacheoverflow: impossible", 3266 __func__); 3267 #endif 3268 } 3269 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3270 syn_cache_rm(sc2); 3271 syn_cache_put(sc2); 3272 } 3273 3274 /* 3275 * Initialize the entry's timer. 3276 */ 3277 sc->sc_rxttot = 0; 3278 sc->sc_rxtshift = 0; 3279 SYN_CACHE_TIMER_ARM(sc); 3280 3281 /* Link it from tcpcb entry */ 3282 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3283 3284 /* Put it into the bucket. */ 3285 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3286 scp->sch_length++; 3287 sc->sc_set = set; 3288 set->scs_count++; 3289 set->scs_use--; 3290 3291 tcpstat_inc(tcps_sc_added); 3292 3293 /* 3294 * If the active cache has exceeded its use limit and 3295 * the passive syn cache is empty, exchange their roles. 3296 */ 3297 if (set->scs_use <= 0 && 3298 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3299 tcp_syn_cache_active = !tcp_syn_cache_active; 3300 } 3301 3302 /* 3303 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3304 * If we have retransmitted an entry the maximum number of times, expire 3305 * that entry. 3306 */ 3307 void 3308 syn_cache_timer(void *arg) 3309 { 3310 struct syn_cache *sc = arg; 3311 3312 NET_LOCK(); 3313 if (sc->sc_flags & SCF_DEAD) 3314 goto out; 3315 3316 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3317 /* Drop it -- too many retransmissions. */ 3318 goto dropit; 3319 } 3320 3321 /* 3322 * Compute the total amount of time this entry has 3323 * been on a queue. If this entry has been on longer 3324 * than the keep alive timer would allow, expire it. 3325 */ 3326 sc->sc_rxttot += sc->sc_rxtcur; 3327 if (sc->sc_rxttot >= tcptv_keep_init) 3328 goto dropit; 3329 3330 tcpstat_inc(tcps_sc_retransmitted); 3331 (void) syn_cache_respond(sc, NULL); 3332 3333 /* Advance the timer back-off. 
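 * Each retransmission bumps sc_rxtshift, indexing further into
 * tcp_backoff[] for a longer interval; an entry that reaches
 * TCP_MAXRXTSHIFT is dropped at the top of this function.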
*/
3334 sc->sc_rxtshift++;
3335 SYN_CACHE_TIMER_ARM(sc);
3336
3337 out:
3338 NET_UNLOCK();
3339 return;
3340
3341 dropit:
3342 tcpstat_inc(tcps_sc_timed_out);
3343 syn_cache_rm(sc);
3344 syn_cache_put(sc);
3345 NET_UNLOCK();
3346 }
3347
3348 void
3349 syn_cache_reaper(void *arg)
3350 {
3351 struct syn_cache *sc = arg;
3352
3353 pool_put(&syn_cache_pool, (sc));
3354 return;
3355 }
3356
3357 /*
3358 * Remove the syn cache entries created by the specified tcb entry,
3359 * because it makes no sense to keep them
3360 * (if there is no tcb entry, a syn cache entry will never be used)
3361 */
3362 void
3363 syn_cache_cleanup(struct tcpcb *tp)
3364 {
3365 struct syn_cache *sc, *nsc;
3366
3367 NET_ASSERT_LOCKED();
3368
3369 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
3370 #ifdef DIAGNOSTIC
3371 if (sc->sc_tp != tp)
3372 panic("invalid sc_tp in syn_cache_cleanup");
3373 #endif
3374 syn_cache_rm(sc);
3375 syn_cache_put(sc);
3376 }
3377 /* just for safety */
3378 LIST_INIT(&tp->t_sc);
3379 }
3380
3381 /*
3382 * Find an entry in the syn cache.
3383 */
3384 struct syn_cache *
3385 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3386 struct syn_cache_head **headp, u_int rtableid)
3387 {
3388 struct syn_cache_set *sets[2];
3389 struct syn_cache *sc;
3390 struct syn_cache_head *scp;
3391 u_int32_t hash;
3392 int i;
3393
3394 NET_ASSERT_LOCKED();
3395
3396 /* Check the active cache first, the passive cache is likely empty. */
3397 sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
3398 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
3399 for (i = 0; i < 2; i++) {
3400 if (sets[i]->scs_count == 0)
3401 continue;
3402 SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
3403 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
3404 *headp = scp;
3405 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
3406 if (sc->sc_hash != hash)
3407 continue;
3408 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3409 !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3410 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
3411 return (sc);
3412 }
3413 }
3414 return (NULL);
3415 }
3416
3417 /*
3418 * This function gets called when we receive an ACK for a
3419 * socket in the LISTEN state. We look up the connection
3420 * in the syn cache, and if it's there, we pull it out of
3421 * the cache and turn it into a full-blown connection in
3422 * the SYN-RECEIVED state.
3423 *
3424 * The return values may not be immediately obvious, and their effects
3425 * can be subtle, so here they are:
3426 *
3427 * NULL SYN was not found in cache; caller should drop the
3428 * packet and send an RST.
3429 *
3430 * -1 We were unable to create the new connection, and are
3431 * aborting it. An ACK,RST is being sent to the peer
3432 * (unless we got screwy sequence numbers; see below),
3433 * because the 3-way handshake has been completed. Caller
3434 * should not free the mbuf, since we may be using it. If
3435 * we are not, we will free it.
3436 *
3437 * Otherwise, the return value is a pointer to the new socket
3438 * associated with the connection.
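 * Callers must therefore distinguish three results: NULL,
 * (struct socket *)(-1) and a genuine socket pointer; only the
 * last may be dereferenced.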
3439 */ 3440 struct socket * 3441 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3442 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m) 3443 { 3444 struct syn_cache *sc; 3445 struct syn_cache_head *scp; 3446 struct inpcb *inp, *oldinp; 3447 struct tcpcb *tp = NULL; 3448 struct mbuf *am; 3449 struct socket *oso; 3450 3451 NET_ASSERT_LOCKED(); 3452 3453 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3454 if (sc == NULL) 3455 return (NULL); 3456 3457 /* 3458 * Verify the sequence and ack numbers. Try getting the correct 3459 * response again. 3460 */ 3461 if ((th->th_ack != sc->sc_iss + 1) || 3462 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3463 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3464 (void) syn_cache_respond(sc, m); 3465 return ((struct socket *)(-1)); 3466 } 3467 3468 /* Remove this cache entry */ 3469 syn_cache_rm(sc); 3470 3471 /* 3472 * Ok, create the full blown connection, and set things up 3473 * as they would have been set up if we had created the 3474 * connection when the SYN arrived. If we can't create 3475 * the connection, abort it. 3476 */ 3477 oso = so; 3478 so = sonewconn(so, SS_ISCONNECTED); 3479 if (so == NULL) 3480 goto resetandabort; 3481 3482 oldinp = sotoinpcb(oso); 3483 inp = sotoinpcb(so); 3484 3485 #ifdef IPSEC 3486 /* 3487 * We need to copy the required security levels 3488 * from the old pcb. Ditto for any other 3489 * IPsec-related information. 3490 */ 3491 memcpy(inp->inp_seclevel, oldinp->inp_seclevel, 3492 sizeof(oldinp->inp_seclevel)); 3493 #endif /* IPSEC */ 3494 #ifdef INET6 3495 /* 3496 * inp still has the OLD in_pcb stuff, set the 3497 * v6-related flags on the new guy, too. 3498 */ 3499 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3500 if (inp->inp_flags & INP_IPV6) { 3501 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3502 inp->inp_hops = oldinp->inp_hops; 3503 } else 3504 #endif /* INET6 */ 3505 { 3506 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3507 } 3508 3509 #if NPF > 0 3510 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3511 struct pf_divert *divert; 3512 3513 divert = pf_find_divert(m); 3514 KASSERT(divert != NULL); 3515 inp->inp_rtableid = divert->rdomain; 3516 } else 3517 #endif 3518 /* inherit rtable from listening socket */ 3519 inp->inp_rtableid = sc->sc_rtableid; 3520 3521 inp->inp_lport = th->th_dport; 3522 switch (src->sa_family) { 3523 #ifdef INET6 3524 case AF_INET6: 3525 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3526 break; 3527 #endif /* INET6 */ 3528 case AF_INET: 3529 inp->inp_laddr = satosin(dst)->sin_addr; 3530 inp->inp_options = ip_srcroute(m); 3531 if (inp->inp_options == NULL) { 3532 inp->inp_options = sc->sc_ipopts; 3533 sc->sc_ipopts = NULL; 3534 } 3535 break; 3536 } 3537 in_pcbrehash(inp); 3538 3539 /* 3540 * Give the new socket our cached route reference. 
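 * sc_route4 and sc_route6 share storage in the syn cache entry
 * (they are members of a union), so clearing sc_route4.ro_rt below
 * gives up our reference for either address family.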
3541 */
3542 if (src->sa_family == AF_INET)
3543 inp->inp_route = sc->sc_route4; /* struct assignment */
3544 #ifdef INET6
3545 else
3546 inp->inp_route6 = sc->sc_route6;
3547 #endif
3548 sc->sc_route4.ro_rt = NULL;
3549
3550 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
3551 if (am == NULL)
3552 goto resetandabort;
3553 am->m_len = src->sa_len;
3554 memcpy(mtod(am, caddr_t), src, src->sa_len);
3555 if (in_pcbconnect(inp, am)) {
3556 (void) m_free(am);
3557 goto resetandabort;
3558 }
3559 (void) m_free(am);
3560
3561 tp = intotcpcb(inp);
3562 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY);
3563 if (sc->sc_request_r_scale != 15) {
3564 tp->requested_s_scale = sc->sc_requested_s_scale;
3565 tp->request_r_scale = sc->sc_request_r_scale;
3566 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3567 }
3568 if (sc->sc_flags & SCF_TIMESTAMP)
3569 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3570
3571 tp->t_template = tcp_template(tp);
3572 if (tp->t_template == 0) {
3573 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
3574 so = NULL;
3575 goto abort;
3576 }
3577 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
3578 tp->ts_modulate = sc->sc_modulate;
3579 tp->ts_recent = sc->sc_timestamp;
3580 tp->iss = sc->sc_iss;
3581 tp->irs = sc->sc_irs;
3582 tcp_sendseqinit(tp);
3583 tp->snd_last = tp->snd_una;
3584 #ifdef TCP_ECN
3585 if (sc->sc_flags & SCF_ECN_PERMIT) {
3586 tp->t_flags |= TF_ECN_PERMIT;
3587 tcpstat_inc(tcps_ecn_accepts);
3588 }
3589 #endif
3590 if (sc->sc_flags & SCF_SACK_PERMIT)
3591 tp->t_flags |= TF_SACK_PERMIT;
3592 #ifdef TCP_SIGNATURE
3593 if (sc->sc_flags & SCF_SIGNATURE)
3594 tp->t_flags |= TF_SIGNATURE;
3595 #endif
3596 tcp_rcvseqinit(tp);
3597 tp->t_state = TCPS_SYN_RECEIVED;
3598 tp->t_rcvtime = tcp_now;
3599 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
3600 tcpstat_inc(tcps_accepts);
3601
3602 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
3603 if (sc->sc_peermaxseg)
3604 tcp_mss_update(tp);
3605 /* Reset initial window to 1 segment for retransmit */
3606 if (sc->sc_rxtshift > 0)
3607 tp->snd_cwnd = tp->t_maxseg;
3608 tp->snd_wl1 = sc->sc_irs;
3609 tp->rcv_up = sc->sc_irs + 1;
3610
3611 /*
3612 * This is what would have happened in tcp_output() when
3613 * the SYN,ACK was sent.
3614 */
3615 tp->snd_up = tp->snd_una;
3616 tp->snd_max = tp->snd_nxt = tp->iss+1;
3617 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
3618 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
3619 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
3620 tp->last_ack_sent = tp->rcv_nxt;
3621
3622 tcpstat_inc(tcps_sc_completed);
3623 syn_cache_put(sc);
3624 return (so);
3625
3626 resetandabort:
3627 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
3628 m->m_pkthdr.ph_rtableid);
3629 abort:
3630 m_freem(m);
3631 if (so != NULL)
3632 (void) soabort(so);
3633 syn_cache_put(sc);
3634 tcpstat_inc(tcps_sc_aborted);
3635 return ((struct socket *)(-1));
3636 }
3637
3638 /*
3639 * This function is called when we get a RST for a
3640 * non-existent connection, so that we can see if the
3641 * connection is in the syn cache. If it is, zap it.
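 * Only a RST whose sequence number matches the peer's initial
 * sequence number, or the octet just after it, is honored below;
 * anything else is treated as a blind reset attempt and ignored.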
3642 */ 3643 3644 void 3645 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3646 u_int rtableid) 3647 { 3648 struct syn_cache *sc; 3649 struct syn_cache_head *scp; 3650 3651 NET_ASSERT_LOCKED(); 3652 3653 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3654 return; 3655 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3656 SEQ_GT(th->th_seq, sc->sc_irs + 1)) 3657 return; 3658 syn_cache_rm(sc); 3659 tcpstat_inc(tcps_sc_reset); 3660 syn_cache_put(sc); 3661 } 3662 3663 void 3664 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3665 u_int rtableid) 3666 { 3667 struct syn_cache *sc; 3668 struct syn_cache_head *scp; 3669 3670 NET_ASSERT_LOCKED(); 3671 3672 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3673 return; 3674 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3675 if (ntohl (th->th_seq) != sc->sc_iss) { 3676 return; 3677 } 3678 3679 /* 3680 * If we've retransmitted 3 times and this is our second error, 3681 * we remove the entry. Otherwise, we allow it to continue on. 3682 * This prevents us from incorrectly nuking an entry during a 3683 * spurious network outage. 3684 * 3685 * See tcp_notify(). 3686 */ 3687 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3688 sc->sc_flags |= SCF_UNREACH; 3689 return; 3690 } 3691 3692 syn_cache_rm(sc); 3693 tcpstat_inc(tcps_sc_unreach); 3694 syn_cache_put(sc); 3695 } 3696 3697 /* 3698 * Given a LISTEN socket and an inbound SYN request, add 3699 * this to the syn cache, and send back a segment: 3700 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3701 * to the source. 3702 * 3703 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3704 * Doing so would require that we hold onto the data and deliver it 3705 * to the application. However, if we are the target of a SYN-flood 3706 * DoS attack, an attacker could send data which would eventually 3707 * consume all available buffer space if it were ACKed. By not ACKing 3708 * the data, we avoid this DoS scenario. 3709 */ 3710 3711 int 3712 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3713 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3714 struct tcp_opt_info *oi, tcp_seq *issp) 3715 { 3716 struct tcpcb tb, *tp; 3717 long win; 3718 struct syn_cache *sc; 3719 struct syn_cache_head *scp; 3720 struct mbuf *ipopts; 3721 3722 tp = sototcpcb(so); 3723 3724 /* 3725 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3726 * 3727 * Note this check is performed in tcp_input() very early on. 3728 */ 3729 3730 /* 3731 * Initialize some local state. 3732 */ 3733 win = sbspace(so, &so->so_rcv); 3734 if (win > TCP_MAXWIN) 3735 win = TCP_MAXWIN; 3736 3737 bzero(&tb, sizeof(tb)); 3738 #ifdef TCP_SIGNATURE 3739 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3740 #else 3741 if (optp) { 3742 #endif 3743 tb.pf = tp->pf; 3744 tb.sack_enable = tp->sack_enable; 3745 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3746 #ifdef TCP_SIGNATURE 3747 if (tp->t_flags & TF_SIGNATURE) 3748 tb.t_flags |= TF_SIGNATURE; 3749 #endif 3750 tb.t_state = TCPS_LISTEN; 3751 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3752 sotoinpcb(so)->inp_rtableid)) 3753 return (-1); 3754 } 3755 3756 switch (src->sa_family) { 3757 case AF_INET: 3758 /* 3759 * Remember the IP options, if any. 3760 */ 3761 ipopts = ip_srcroute(m); 3762 break; 3763 default: 3764 ipopts = NULL; 3765 } 3766 3767 /* 3768 * See if we already have an entry for this connection. 
3769 * If we do, resend the SYN,ACK. We do not count this
3770 * as a retransmission (XXX though maybe we should).
3771 */
3772 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
3773 if (sc != NULL) {
3774 tcpstat_inc(tcps_sc_dupesyn);
3775 if (ipopts) {
3776 /*
3777 * If we were remembering a previous source route,
3778 * forget it and use the new one we've been given.
3779 */
3780 m_free(sc->sc_ipopts);
3781 sc->sc_ipopts = ipopts;
3782 }
3783 sc->sc_timestamp = tb.ts_recent;
3784 if (syn_cache_respond(sc, m) == 0) {
3785 tcpstat_inc(tcps_sndacks);
3786 tcpstat_inc(tcps_sndtotal);
3787 }
3788 return (0);
3789 }
3790
3791 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
3792 if (sc == NULL) {
3793 m_free(ipopts);
3794 return (-1);
3795 }
3796
3797 /*
3798 * Fill in the cache, and put the necessary IP and TCP
3799 * options into the reply.
3800 */
3801 memcpy(&sc->sc_src, src, src->sa_len);
3802 memcpy(&sc->sc_dst, dst, dst->sa_len);
3803 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid;
3804 sc->sc_flags = 0;
3805 sc->sc_ipopts = ipopts;
3806 sc->sc_irs = th->th_seq;
3807
3808 sc->sc_iss = issp ? *issp : arc4random();
3809 sc->sc_peermaxseg = oi->maxseg;
3810 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family);
3811 sc->sc_win = win;
3812 sc->sc_timestamp = tb.ts_recent;
3813 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
3814 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
3815 sc->sc_flags |= SCF_TIMESTAMP;
3816 sc->sc_modulate = arc4random();
3817 }
3818 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3819 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
3820 sc->sc_requested_s_scale = tb.requested_s_scale;
3821 sc->sc_request_r_scale = 0;
3822 /*
3823 * Pick the smallest possible scaling factor that
3824 * will still allow us to scale up to sb_max.
3825 *
3826 * We do this because there are broken firewalls that
3827 * will corrupt the window scale option, leading to
3828 * the other endpoint believing that our advertised
3829 * window is unscaled. At scale factors larger than
3830 * 5 the unscaled window will drop below 1500 bytes,
3831 * leading to serious problems when traversing these
3832 * broken firewalls.
3833 *
3834 * With the default sb_max of 256K, a scale factor
3835 * of 3 will be chosen by this algorithm. Those who
3836 * choose a larger sb_max should watch out
3837 * for the compatibility problems mentioned above.
3838 *
3839 * RFC1323: The Window field in a SYN (i.e., a <SYN>
3840 * or <SYN,ACK>) segment itself is never scaled.
3841 */
3842 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
3843 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
3844 sc->sc_request_r_scale++;
3845 } else {
3846 sc->sc_requested_s_scale = 15;
3847 sc->sc_request_r_scale = 15;
3848 }
3849 #ifdef TCP_ECN
3850 /*
3851 * if both ECE and CWR flag bits are set, peer is ECN capable.
3852 */
3853 if (tcp_do_ecn &&
3854 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
3855 sc->sc_flags |= SCF_ECN_PERMIT;
3856 #endif
3857 /*
3858 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
3859 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
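 * The flag is echoed back to the peer as a SACK_PERMITTED option in
 * the SYN,ACK constructed by syn_cache_respond().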
3860 */ 3861 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) 3862 sc->sc_flags |= SCF_SACK_PERMIT; 3863 #ifdef TCP_SIGNATURE 3864 if (tb.t_flags & TF_SIGNATURE) 3865 sc->sc_flags |= SCF_SIGNATURE; 3866 #endif 3867 sc->sc_tp = tp; 3868 if (syn_cache_respond(sc, m) == 0) { 3869 syn_cache_insert(sc, tp); 3870 tcpstat_inc(tcps_sndacks); 3871 tcpstat_inc(tcps_sndtotal); 3872 } else { 3873 syn_cache_put(sc); 3874 tcpstat_inc(tcps_sc_dropped); 3875 } 3876 3877 return (0); 3878 } 3879 3880 int 3881 syn_cache_respond(struct syn_cache *sc, struct mbuf *m) 3882 { 3883 u_int8_t *optp; 3884 int optlen, error; 3885 u_int16_t tlen; 3886 struct ip *ip = NULL; 3887 #ifdef INET6 3888 struct ip6_hdr *ip6 = NULL; 3889 #endif 3890 struct tcphdr *th; 3891 u_int hlen; 3892 struct inpcb *inp; 3893 3894 switch (sc->sc_src.sa.sa_family) { 3895 case AF_INET: 3896 hlen = sizeof(struct ip); 3897 break; 3898 #ifdef INET6 3899 case AF_INET6: 3900 hlen = sizeof(struct ip6_hdr); 3901 break; 3902 #endif 3903 default: 3904 m_freem(m); 3905 return (EAFNOSUPPORT); 3906 } 3907 3908 /* Compute the size of the TCP options. */ 3909 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3910 ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) + 3911 #ifdef TCP_SIGNATURE 3912 ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + 3913 #endif 3914 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3915 3916 tlen = hlen + sizeof(struct tcphdr) + optlen; 3917 3918 /* 3919 * Create the IP+TCP header from scratch. 3920 */ 3921 m_freem(m); 3922 #ifdef DIAGNOSTIC 3923 if (max_linkhdr + tlen > MCLBYTES) 3924 return (ENOBUFS); 3925 #endif 3926 MGETHDR(m, M_DONTWAIT, MT_DATA); 3927 if (m && max_linkhdr + tlen > MHLEN) { 3928 MCLGET(m, M_DONTWAIT); 3929 if ((m->m_flags & M_EXT) == 0) { 3930 m_freem(m); 3931 m = NULL; 3932 } 3933 } 3934 if (m == NULL) 3935 return (ENOBUFS); 3936 3937 /* Fixup the mbuf. */ 3938 m->m_data += max_linkhdr; 3939 m->m_len = m->m_pkthdr.len = tlen; 3940 m->m_pkthdr.ph_ifidx = 0; 3941 m->m_pkthdr.ph_rtableid = sc->sc_rtableid; 3942 memset(mtod(m, u_char *), 0, tlen); 3943 3944 switch (sc->sc_src.sa.sa_family) { 3945 case AF_INET: 3946 ip = mtod(m, struct ip *); 3947 ip->ip_dst = sc->sc_src.sin.sin_addr; 3948 ip->ip_src = sc->sc_dst.sin.sin_addr; 3949 ip->ip_p = IPPROTO_TCP; 3950 th = (struct tcphdr *)(ip + 1); 3951 th->th_dport = sc->sc_src.sin.sin_port; 3952 th->th_sport = sc->sc_dst.sin.sin_port; 3953 break; 3954 #ifdef INET6 3955 case AF_INET6: 3956 ip6 = mtod(m, struct ip6_hdr *); 3957 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3958 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3959 ip6->ip6_nxt = IPPROTO_TCP; 3960 /* ip6_plen will be updated in ip6_output() */ 3961 th = (struct tcphdr *)(ip6 + 1); 3962 th->th_dport = sc->sc_src.sin6.sin6_port; 3963 th->th_sport = sc->sc_dst.sin6.sin6_port; 3964 break; 3965 #endif 3966 default: 3967 unhandled_af(sc->sc_src.sa.sa_family); 3968 } 3969 3970 th->th_seq = htonl(sc->sc_iss); 3971 th->th_ack = htonl(sc->sc_irs + 1); 3972 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 3973 th->th_flags = TH_SYN|TH_ACK; 3974 #ifdef TCP_ECN 3975 /* Set ECE for SYN-ACK if peer supports ECN. */ 3976 if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT)) 3977 th->th_flags |= TH_ECE; 3978 #endif 3979 th->th_win = htons(sc->sc_win); 3980 /* th_sum already 0 */ 3981 /* th_urp already 0 */ 3982 3983 /* Tack on the TCP options. 
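 * The layout below mirrors the optlen computation above: MSS
 * (4 bytes), SACK permitted (4), window scale (4, NOP padded),
 * RFC 1323 appendix A timestamps (12) and, when negotiated, the
 * signature option padded out to a 32-bit boundary.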
	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

	/* Include the SACK_PERMITTED option if the peer sent one. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/* Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}
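
	/*
	 * Editorial note: the ordering here matters.  The header area
	 * was zeroed by the memset() above, and at this point only the
	 * addresses, the protocol and a temporary ip_len holding the
	 * TCP segment length have been filled in, so for AF_INET the
	 * in_cksum() over the whole packet reduces to exactly the TCP
	 * pseudo-header checksum.  The remaining IPv4 fields (final
	 * ip_len, ip_ttl, ip_tos) are filled in only below, after the
	 * checksum has been computed.
	 */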

	/* use IPsec policy and TTL from the listening socket on the SYN,ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len in network byte order, hence the htons() here.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flow label = 0; it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(inp);

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
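
#if 0
/*
 * Editorial sketch, not compiled: the caller contract of
 * syn_cache_respond(), mirroring its use in syn_cache_add() above.
 * A return of 0 means the SYN,ACK was handed to the IP layer and the
 * cache entry is worth keeping; any errno means the reply is gone and
 * the entry should be released.  The mbuf is consumed in either case.
 * The function name below is hypothetical.
 */
static void
syn_cache_respond_example(struct syn_cache *sc, struct tcpcb *tp,
    struct mbuf *m)
{
	if (syn_cache_respond(sc, m) == 0) {
		/* reply sent: keep the entry and count the segment */
		syn_cache_insert(sc, tp);
		tcpstat_inc(tcps_sndacks);
		tcpstat_inc(tcps_sndtotal);
	} else {
		/* reply failed: release the entry */
		syn_cache_put(sc);
		tcpstat_inc(tcps_sc_dropped);
	}
}
#endif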