1 /* $OpenBSD: tcp_input.c,v 1.382 2022/11/07 11:22:55 yasuoka Exp $ */ 2 /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include "pf.h" 72 73 #include <sys/param.h> 74 #include <sys/systm.h> 75 #include <sys/mbuf.h> 76 #include <sys/protosw.h> 77 #include <sys/socket.h> 78 #include <sys/socketvar.h> 79 #include <sys/timeout.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 #include <net/route.h> 86 87 #include <netinet/in.h> 88 #include <netinet/ip.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/ip_var.h> 91 #include <netinet/tcp.h> 92 #include <netinet/tcp_fsm.h> 93 #include <netinet/tcp_seq.h> 94 #include <netinet/tcp_timer.h> 95 #include <netinet/tcp_var.h> 96 #include <netinet/tcp_debug.h> 97 98 #if NPF > 0 99 #include <net/pfvar.h> 100 #endif 101 102 struct tcpiphdr tcp_saveti; 103 104 int tcp_mss_adv(struct mbuf *, int); 105 int tcp_flush_queue(struct tcpcb *); 106 107 #ifdef INET6 108 #include <netinet6/in6_var.h> 109 #include <netinet6/nd6.h> 110 111 struct tcpipv6hdr tcp_saveti6; 112 113 /* for the packet header length in the mbuf */ 114 #define M_PH_LEN(m) (((struct mbuf *)(m))->m_pkthdr.len) 115 #define M_V6_LEN(m) (M_PH_LEN(m) - sizeof(struct ip6_hdr)) 116 #define M_V4_LEN(m) (M_PH_LEN(m) - sizeof(struct ip)) 117 #endif /* INET6 */ 118 119 int tcprexmtthresh = 3; 120 int tcptv_keep_init = TCPTV_KEEP_INIT; 121 122 int tcp_rst_ppslim = 100; /* 100pps */ 123 int tcp_rst_ppslim_count = 0; 124 struct timeval tcp_rst_ppslim_last; 125 126 int tcp_ackdrop_ppslim = 100; /* 100pps */ 127 int tcp_ackdrop_ppslim_count = 0; 128 struct timeval tcp_ackdrop_ppslim_last; 129 130 #define TCP_PAWS_IDLE TCP_TIME(24 * 24 * 60 * 60) 131 132 /* for modulo comparisons of timestamps */ 133 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 134 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 135 136 /* for TCP SACK comparisons */ 137 #define SEQ_MIN(a,b) (SEQ_LT(a,b) ? (a) : (b)) 138 #define SEQ_MAX(a,b) (SEQ_GT(a,b) ? (a) : (b)) 139 140 /* 141 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 142 */ 143 #ifdef INET6 144 #define ND6_HINT(tp) \ 145 do { \ 146 if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \ 147 rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \ 148 nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \ 149 } \ 150 } while (0) 151 #else 152 #define ND6_HINT(tp) 153 #endif 154 155 #ifdef TCP_ECN 156 /* 157 * ECN (Explicit Congestion Notification) support based on RFC3168 158 * implementation note: 159 * snd_last is used to track a recovery phase. 160 * when cwnd is reduced, snd_last is set to snd_max. 161 * while snd_last > snd_una, the sender is in a recovery phase and 162 * its cwnd should not be reduced again. 163 * snd_last follows snd_una when not in a recovery phase. 164 */ 165 #endif 166 167 /* 168 * Macro to compute ACK transmission behavior. Delay the ACK unless 169 * we have already delayed an ACK (must send an ACK every two segments). 170 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 171 * option is enabled or when the packet is coming from a loopback 172 * interface. 173 */ 174 #define TCP_SETUP_ACK(tp, tiflags, m) \ 175 do { \ 176 struct ifnet *ifp = NULL; \ 177 if (m && (m->m_flags & M_PKTHDR)) \ 178 ifp = if_get(m->m_pkthdr.ph_ifidx); \ 179 if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \ 180 (tcp_ack_on_push && (tiflags) & TH_PUSH) || \ 181 (ifp && (ifp->if_flags & IFF_LOOPBACK))) \ 182 tp->t_flags |= TF_ACKNOW; \ 183 else \ 184 TCP_TIMER_ARM(tp, TCPT_DELACK, tcp_delack_msecs); \ 185 if_put(ifp); \ 186 } while (0) 187 188 void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); 189 void tcp_newreno_partialack(struct tcpcb *, struct tcphdr *); 190 191 void syn_cache_put(struct syn_cache *); 192 void syn_cache_rm(struct syn_cache *); 193 int syn_cache_respond(struct syn_cache *, struct mbuf *, uint32_t); 194 void syn_cache_timer(void *); 195 void syn_cache_reaper(void *); 196 void syn_cache_insert(struct syn_cache *, struct tcpcb *); 197 void syn_cache_reset(struct sockaddr *, struct sockaddr *, 198 struct tcphdr *, u_int); 199 int syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *, 200 unsigned int, struct socket *, struct mbuf *, u_char *, int, 201 struct tcp_opt_info *, tcp_seq *, uint32_t); 202 struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *, 203 struct tcphdr *, unsigned int, unsigned int, struct socket *, 204 struct mbuf *, uint32_t); 205 struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *, 206 struct syn_cache_head **, u_int); 207 208 /* 209 * Insert segment ti into reassembly queue of tcp with 210 * control block tp. Return TH_FIN if reassembly now includes 211 * a segment with FIN. The macro form does the common case inline 212 * (segment is the next to be received on an established connection, 213 * and the queue is empty), avoiding linkage into and removal 214 * from the queue and repetition of various conversions. 215 * Set DELACK for segments received in order, but ack immediately 216 * when segments are out of order (so fast retransmit can work). 217 */ 218 219 int 220 tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen) 221 { 222 struct tcpqent *p, *q, *nq, *tiqe; 223 224 /* 225 * Allocate a new queue entry, before we throw away any data. 226 * If we can't, just drop the packet. XXX 227 */ 228 tiqe = pool_get(&tcpqe_pool, PR_NOWAIT); 229 if (tiqe == NULL) { 230 tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead); 231 if (tiqe != NULL && th->th_seq == tp->rcv_nxt) { 232 /* Reuse last entry since new segment fills a hole */ 233 m_freem(tiqe->tcpqe_m); 234 TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q); 235 } 236 if (tiqe == NULL || th->th_seq != tp->rcv_nxt) { 237 /* Flush segment queue for this connection */ 238 tcp_freeq(tp); 239 tcpstat_inc(tcps_rcvmemdrop); 240 m_freem(m); 241 return (0); 242 } 243 } 244 245 /* 246 * Find a segment which begins after this one does. 247 */ 248 for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL; 249 p = q, q = TAILQ_NEXT(q, tcpqe_q)) 250 if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq)) 251 break; 252 253 /* 254 * If there is a preceding segment, it may provide some of 255 * our data already. If so, drop the data from the incoming 256 * segment. If it provides all of our data, drop us. 257 */ 258 if (p != NULL) { 259 struct tcphdr *phdr = p->tcpqe_tcp; 260 int i; 261 262 /* conversion to int (in i) handles seq wraparound */ 263 i = phdr->th_seq + phdr->th_reseqlen - th->th_seq; 264 if (i > 0) { 265 if (i >= *tlen) { 266 tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, 267 *tlen); 268 m_freem(m); 269 pool_put(&tcpqe_pool, tiqe); 270 return (0); 271 } 272 m_adj(m, i); 273 *tlen -= i; 274 th->th_seq += i; 275 } 276 } 277 tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen); 278 tp->t_rcvoopack++; 279 280 /* 281 * While we overlap succeeding segments trim them or, 282 * if they are completely covered, dequeue them. 283 */ 284 for (; q != NULL; q = nq) { 285 struct tcphdr *qhdr = q->tcpqe_tcp; 286 int i = (th->th_seq + *tlen) - qhdr->th_seq; 287 288 if (i <= 0) 289 break; 290 if (i < qhdr->th_reseqlen) { 291 qhdr->th_seq += i; 292 qhdr->th_reseqlen -= i; 293 m_adj(q->tcpqe_m, i); 294 break; 295 } 296 nq = TAILQ_NEXT(q, tcpqe_q); 297 m_freem(q->tcpqe_m); 298 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q); 299 pool_put(&tcpqe_pool, q); 300 } 301 302 /* Insert the new segment queue entry into place. */ 303 tiqe->tcpqe_m = m; 304 th->th_reseqlen = *tlen; 305 tiqe->tcpqe_tcp = th; 306 if (p == NULL) { 307 TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q); 308 } else { 309 TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q); 310 } 311 312 if (th->th_seq != tp->rcv_nxt) 313 return (0); 314 315 return (tcp_flush_queue(tp)); 316 } 317 318 int 319 tcp_flush_queue(struct tcpcb *tp) 320 { 321 struct socket *so = tp->t_inpcb->inp_socket; 322 struct tcpqent *q, *nq; 323 int flags; 324 325 /* 326 * Present data to user, advancing rcv_nxt through 327 * completed sequence space. 328 */ 329 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 330 return (0); 331 q = TAILQ_FIRST(&tp->t_segq); 332 if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt) 333 return (0); 334 if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen) 335 return (0); 336 do { 337 tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen; 338 flags = q->tcpqe_tcp->th_flags & TH_FIN; 339 340 nq = TAILQ_NEXT(q, tcpqe_q); 341 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q); 342 ND6_HINT(tp); 343 if (so->so_state & SS_CANTRCVMORE) 344 m_freem(q->tcpqe_m); 345 else 346 sbappendstream(so, &so->so_rcv, q->tcpqe_m); 347 pool_put(&tcpqe_pool, q); 348 q = nq; 349 } while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt); 350 tp->t_flags |= TF_BLOCKOUTPUT; 351 sorwakeup(so); 352 tp->t_flags &= ~TF_BLOCKOUTPUT; 353 return (flags); 354 } 355 356 /* 357 * TCP input routine, follows pages 65-76 of the 358 * protocol specification dated September, 1981 very closely. 359 */ 360 int 361 tcp_input(struct mbuf **mp, int *offp, int proto, int af) 362 { 363 struct mbuf *m = *mp; 364 int iphlen = *offp; 365 struct ip *ip = NULL; 366 struct inpcb *inp = NULL; 367 u_int8_t *optp = NULL; 368 int optlen = 0; 369 int tlen, off; 370 struct tcpcb *otp = NULL, *tp = NULL; 371 int tiflags; 372 struct socket *so = NULL; 373 int todrop, acked, ourfinisacked; 374 int hdroptlen = 0; 375 short ostate; 376 caddr_t saveti; 377 tcp_seq iss, *reuse = NULL; 378 uint32_t now; 379 u_long tiwin; 380 struct tcp_opt_info opti; 381 struct tcphdr *th; 382 #ifdef INET6 383 struct ip6_hdr *ip6 = NULL; 384 #endif /* INET6 */ 385 #ifdef TCP_ECN 386 u_char iptos; 387 #endif 388 389 tcpstat_inc(tcps_rcvtotal); 390 391 opti.ts_present = 0; 392 opti.maxseg = 0; 393 now = tcp_now(); 394 395 /* 396 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 397 */ 398 if (m->m_flags & (M_BCAST|M_MCAST)) 399 goto drop; 400 401 /* 402 * Get IP and TCP header together in first mbuf. 403 * Note: IP leaves IP header in first mbuf. 404 */ 405 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th)); 406 if (!th) { 407 tcpstat_inc(tcps_rcvshort); 408 return IPPROTO_DONE; 409 } 410 411 tlen = m->m_pkthdr.len - iphlen; 412 switch (af) { 413 case AF_INET: 414 ip = mtod(m, struct ip *); 415 #ifdef TCP_ECN 416 /* save ip_tos before clearing it for checksum */ 417 iptos = ip->ip_tos; 418 #endif 419 break; 420 #ifdef INET6 421 case AF_INET6: 422 ip6 = mtod(m, struct ip6_hdr *); 423 #ifdef TCP_ECN 424 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 425 #endif 426 427 /* 428 * Be proactive about unspecified IPv6 address in source. 429 * As we use all-zero to indicate unbounded/unconnected pcb, 430 * unspecified IPv6 address can be used to confuse us. 431 * 432 * Note that packets with unspecified IPv6 destination is 433 * already dropped in ip6_input. 434 */ 435 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 436 /* XXX stat */ 437 goto drop; 438 } 439 440 /* Discard packets to multicast */ 441 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 442 /* XXX stat */ 443 goto drop; 444 } 445 break; 446 #endif 447 default: 448 unhandled_af(af); 449 } 450 451 /* 452 * Checksum extended TCP header and data. 453 */ 454 if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) { 455 int sum; 456 457 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) { 458 tcpstat_inc(tcps_rcvbadsum); 459 goto drop; 460 } 461 tcpstat_inc(tcps_inswcsum); 462 switch (af) { 463 case AF_INET: 464 sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen); 465 break; 466 #ifdef INET6 467 case AF_INET6: 468 sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 469 tlen); 470 break; 471 #endif 472 } 473 if (sum != 0) { 474 tcpstat_inc(tcps_rcvbadsum); 475 goto drop; 476 } 477 } 478 479 /* 480 * Check that TCP offset makes sense, 481 * pull out TCP options and adjust length. XXX 482 */ 483 off = th->th_off << 2; 484 if (off < sizeof(struct tcphdr) || off > tlen) { 485 tcpstat_inc(tcps_rcvbadoff); 486 goto drop; 487 } 488 tlen -= off; 489 if (off > sizeof(struct tcphdr)) { 490 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off); 491 if (!th) { 492 tcpstat_inc(tcps_rcvshort); 493 return IPPROTO_DONE; 494 } 495 optlen = off - sizeof(struct tcphdr); 496 optp = (u_int8_t *)(th + 1); 497 /* 498 * Do quick retrieval of timestamp options ("options 499 * prediction?"). If timestamp is the only option and it's 500 * formatted as recommended in RFC 1323 appendix A, we 501 * quickly get the values now and not bother calling 502 * tcp_dooptions(), etc. 503 */ 504 if ((optlen == TCPOLEN_TSTAMP_APPA || 505 (optlen > TCPOLEN_TSTAMP_APPA && 506 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 507 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 508 (th->th_flags & TH_SYN) == 0) { 509 opti.ts_present = 1; 510 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 511 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 512 optp = NULL; /* we've parsed the options */ 513 } 514 } 515 tiflags = th->th_flags; 516 517 /* 518 * Convert TCP protocol specific fields to host format. 519 */ 520 th->th_seq = ntohl(th->th_seq); 521 th->th_ack = ntohl(th->th_ack); 522 th->th_win = ntohs(th->th_win); 523 th->th_urp = ntohs(th->th_urp); 524 525 /* 526 * Locate pcb for segment. 527 */ 528 #if NPF > 0 529 inp = pf_inp_lookup(m); 530 #endif 531 findpcb: 532 if (inp == NULL) { 533 switch (af) { 534 #ifdef INET6 535 case AF_INET6: 536 inp = in6_pcblookup(&tcbtable, &ip6->ip6_src, 537 th->th_sport, &ip6->ip6_dst, th->th_dport, 538 m->m_pkthdr.ph_rtableid); 539 break; 540 #endif 541 case AF_INET: 542 inp = in_pcblookup(&tcbtable, ip->ip_src, 543 th->th_sport, ip->ip_dst, th->th_dport, 544 m->m_pkthdr.ph_rtableid); 545 break; 546 } 547 } 548 if (inp == NULL) { 549 tcpstat_inc(tcps_pcbhashmiss); 550 switch (af) { 551 #ifdef INET6 552 case AF_INET6: 553 inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst, 554 th->th_dport, m, m->m_pkthdr.ph_rtableid); 555 break; 556 #endif /* INET6 */ 557 case AF_INET: 558 inp = in_pcblookup_listen(&tcbtable, ip->ip_dst, 559 th->th_dport, m, m->m_pkthdr.ph_rtableid); 560 break; 561 } 562 /* 563 * If the state is CLOSED (i.e., TCB does not exist) then 564 * all data in the incoming segment is discarded. 565 * If the TCB exists but is in CLOSED state, it is embryonic, 566 * but should either do a listen or a connect soon. 567 */ 568 } 569 #ifdef IPSEC 570 if (ipsec_in_use) { 571 struct m_tag *mtag; 572 struct tdb *tdb = NULL; 573 int error; 574 575 /* Find most recent IPsec tag */ 576 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); 577 if (mtag != NULL) { 578 struct tdb_ident *tdbi; 579 580 tdbi = (struct tdb_ident *)(mtag + 1); 581 tdb = gettdb(tdbi->rdomain, tdbi->spi, 582 &tdbi->dst, tdbi->proto); 583 } 584 error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN, 585 tdb, inp, NULL, NULL); 586 tdb_unref(tdb); 587 if (error) { 588 tcpstat_inc(tcps_rcvnosec); 589 goto drop; 590 } 591 } 592 #endif /* IPSEC */ 593 594 if (inp == NULL) { 595 tcpstat_inc(tcps_noport); 596 goto dropwithreset_ratelim; 597 } 598 599 KASSERT(sotoinpcb(inp->inp_socket) == inp); 600 KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp); 601 soassertlocked(inp->inp_socket); 602 603 /* Check the minimum TTL for socket. */ 604 switch (af) { 605 case AF_INET: 606 if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) 607 goto drop; 608 break; 609 #ifdef INET6 610 case AF_INET6: 611 if (inp->inp_ip6_minhlim && 612 inp->inp_ip6_minhlim > ip6->ip6_hlim) 613 goto drop; 614 break; 615 #endif 616 } 617 618 tp = intotcpcb(inp); 619 if (tp == NULL) 620 goto dropwithreset_ratelim; 621 if (tp->t_state == TCPS_CLOSED) 622 goto drop; 623 624 /* Unscale the window into a 32-bit value. */ 625 if ((tiflags & TH_SYN) == 0) 626 tiwin = th->th_win << tp->snd_scale; 627 else 628 tiwin = th->th_win; 629 630 so = inp->inp_socket; 631 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 632 union syn_cache_sa src; 633 union syn_cache_sa dst; 634 635 bzero(&src, sizeof(src)); 636 bzero(&dst, sizeof(dst)); 637 switch (af) { 638 case AF_INET: 639 src.sin.sin_len = sizeof(struct sockaddr_in); 640 src.sin.sin_family = AF_INET; 641 src.sin.sin_addr = ip->ip_src; 642 src.sin.sin_port = th->th_sport; 643 644 dst.sin.sin_len = sizeof(struct sockaddr_in); 645 dst.sin.sin_family = AF_INET; 646 dst.sin.sin_addr = ip->ip_dst; 647 dst.sin.sin_port = th->th_dport; 648 break; 649 #ifdef INET6 650 case AF_INET6: 651 src.sin6.sin6_len = sizeof(struct sockaddr_in6); 652 src.sin6.sin6_family = AF_INET6; 653 src.sin6.sin6_addr = ip6->ip6_src; 654 src.sin6.sin6_port = th->th_sport; 655 656 dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 657 dst.sin6.sin6_family = AF_INET6; 658 dst.sin6.sin6_addr = ip6->ip6_dst; 659 dst.sin6.sin6_port = th->th_dport; 660 break; 661 #endif /* INET6 */ 662 } 663 664 if (so->so_options & SO_DEBUG) { 665 otp = tp; 666 ostate = tp->t_state; 667 switch (af) { 668 #ifdef INET6 669 case AF_INET6: 670 saveti = (caddr_t) &tcp_saveti6; 671 memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6)); 672 memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th)); 673 break; 674 #endif 675 case AF_INET: 676 saveti = (caddr_t) &tcp_saveti; 677 memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip)); 678 memcpy(&tcp_saveti.ti_t, th, sizeof(*th)); 679 break; 680 } 681 } 682 if (so->so_options & SO_ACCEPTCONN) { 683 switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) { 684 685 case TH_SYN|TH_ACK|TH_RST: 686 case TH_SYN|TH_RST: 687 case TH_ACK|TH_RST: 688 case TH_RST: 689 syn_cache_reset(&src.sa, &dst.sa, th, 690 inp->inp_rtableid); 691 goto drop; 692 693 case TH_SYN|TH_ACK: 694 /* 695 * Received a SYN,ACK. This should 696 * never happen while we are in 697 * LISTEN. Send an RST. 698 */ 699 goto badsyn; 700 701 case TH_ACK: 702 so = syn_cache_get(&src.sa, &dst.sa, 703 th, iphlen, tlen, so, m, now); 704 if (so == NULL) { 705 /* 706 * We don't have a SYN for 707 * this ACK; send an RST. 708 */ 709 goto badsyn; 710 } else if (so == (struct socket *)(-1)) { 711 /* 712 * We were unable to create 713 * the connection. If the 714 * 3-way handshake was 715 * completed, and RST has 716 * been sent to the peer. 717 * Since the mbuf might be 718 * in use for the reply, 719 * do not free it. 720 */ 721 m = *mp = NULL; 722 goto drop; 723 } else { 724 /* 725 * We have created a 726 * full-blown connection. 727 */ 728 tp = NULL; 729 in_pcbunref(inp); 730 inp = in_pcbref(sotoinpcb(so)); 731 tp = intotcpcb(inp); 732 if (tp == NULL) 733 goto badsyn; /*XXX*/ 734 735 } 736 break; 737 738 default: 739 /* 740 * None of RST, SYN or ACK was set. 741 * This is an invalid packet for a 742 * TCB in LISTEN state. Send a RST. 743 */ 744 goto badsyn; 745 746 case TH_SYN: 747 /* 748 * Received a SYN. 749 */ 750 #ifdef INET6 751 /* 752 * If deprecated address is forbidden, we do 753 * not accept SYN to deprecated interface 754 * address to prevent any new inbound 755 * connection from getting established. 756 * When we do not accept SYN, we send a TCP 757 * RST, with deprecated source address (instead 758 * of dropping it). We compromise it as it is 759 * much better for peer to send a RST, and 760 * RST will be the final packet for the 761 * exchange. 762 * 763 * If we do not forbid deprecated addresses, we 764 * accept the SYN packet. RFC2462 does not 765 * suggest dropping SYN in this case. 766 * If we decipher RFC2462 5.5.4, it says like 767 * this: 768 * 1. use of deprecated addr with existing 769 * communication is okay - "SHOULD continue 770 * to be used" 771 * 2. use of it with new communication: 772 * (2a) "SHOULD NOT be used if alternate 773 * address with sufficient scope is 774 * available" 775 * (2b) nothing mentioned otherwise. 776 * Here we fall into (2b) case as we have no 777 * choice in our source address selection - we 778 * must obey the peer. 779 * 780 * The wording in RFC2462 is confusing, and 781 * there are multiple description text for 782 * deprecated address handling - worse, they 783 * are not exactly the same. I believe 5.5.4 784 * is the best one, so we follow 5.5.4. 785 */ 786 if (ip6 && !ip6_use_deprecated) { 787 struct in6_ifaddr *ia6; 788 struct ifnet *ifp = 789 if_get(m->m_pkthdr.ph_ifidx); 790 791 if (ifp && 792 (ia6 = in6ifa_ifpwithaddr(ifp, 793 &ip6->ip6_dst)) && 794 (ia6->ia6_flags & 795 IN6_IFF_DEPRECATED)) { 796 tp = NULL; 797 if_put(ifp); 798 goto dropwithreset; 799 } 800 if_put(ifp); 801 } 802 #endif 803 804 /* 805 * LISTEN socket received a SYN 806 * from itself? This can't possibly 807 * be valid; drop the packet. 808 */ 809 if (th->th_dport == th->th_sport) { 810 switch (af) { 811 #ifdef INET6 812 case AF_INET6: 813 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, 814 &ip6->ip6_dst)) { 815 tcpstat_inc(tcps_badsyn); 816 goto drop; 817 } 818 break; 819 #endif /* INET6 */ 820 case AF_INET: 821 if (ip->ip_dst.s_addr == ip->ip_src.s_addr) { 822 tcpstat_inc(tcps_badsyn); 823 goto drop; 824 } 825 break; 826 } 827 } 828 829 /* 830 * SYN looks ok; create compressed TCP 831 * state for it. 832 */ 833 if (so->so_qlen > so->so_qlimit || 834 syn_cache_add(&src.sa, &dst.sa, th, iphlen, 835 so, m, optp, optlen, &opti, reuse, now) 836 == -1) { 837 tcpstat_inc(tcps_dropsyn); 838 goto drop; 839 } 840 in_pcbunref(inp); 841 return IPPROTO_DONE; 842 } 843 } 844 } 845 846 #ifdef DIAGNOSTIC 847 /* 848 * Should not happen now that all embryonic connections 849 * are handled with compressed state. 850 */ 851 if (tp->t_state == TCPS_LISTEN) 852 panic("tcp_input: TCPS_LISTEN"); 853 #endif 854 855 #if NPF > 0 856 pf_inp_link(m, inp); 857 #endif 858 859 /* 860 * Segment received on connection. 861 * Reset idle time and keep-alive timer. 862 */ 863 tp->t_rcvtime = now; 864 if (TCPS_HAVEESTABLISHED(tp->t_state)) 865 TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcp_keepidle)); 866 867 if (tp->sack_enable) 868 tcp_del_sackholes(tp, th); /* Delete stale SACK holes */ 869 870 /* 871 * Process options. 872 */ 873 #ifdef TCP_SIGNATURE 874 if (optp || (tp->t_flags & TF_SIGNATURE)) 875 #else 876 if (optp) 877 #endif 878 if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti, 879 m->m_pkthdr.ph_rtableid, now)) 880 goto drop; 881 882 if (opti.ts_present && opti.ts_ecr) { 883 int rtt_test; 884 885 /* subtract out the tcp timestamp modulator */ 886 opti.ts_ecr -= tp->ts_modulate; 887 888 /* make sure ts_ecr is sensible */ 889 rtt_test = now - opti.ts_ecr; 890 if (rtt_test < 0 || rtt_test > TCP_RTT_MAX) 891 opti.ts_ecr = 0; 892 } 893 894 #ifdef TCP_ECN 895 /* if congestion experienced, set ECE bit in subsequent packets. */ 896 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) { 897 tp->t_flags |= TF_RCVD_CE; 898 tcpstat_inc(tcps_ecn_rcvce); 899 } 900 #endif 901 /* 902 * Header prediction: check for the two common cases 903 * of a uni-directional data xfer. If the packet has 904 * no control flags, is in-sequence, the window didn't 905 * change and we're not retransmitting, it's a 906 * candidate. If the length is zero and the ack moved 907 * forward, we're the sender side of the xfer. Just 908 * free the data acked & wake any higher level process 909 * that was blocked waiting for space. If the length 910 * is non-zero and the ack didn't move, we're the 911 * receiver side. If we're getting packets in-order 912 * (the reassembly queue is empty), add the data to 913 * the socket buffer and note that we need a delayed ack. 914 */ 915 if (tp->t_state == TCPS_ESTABLISHED && 916 #ifdef TCP_ECN 917 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK && 918 #else 919 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 920 #endif 921 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 922 th->th_seq == tp->rcv_nxt && 923 tiwin && tiwin == tp->snd_wnd && 924 tp->snd_nxt == tp->snd_max) { 925 926 /* 927 * If last ACK falls within this segment's sequence numbers, 928 * record the timestamp. 929 * Fix from Braden, see Stevens p. 870 930 */ 931 if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 932 tp->ts_recent_age = now; 933 tp->ts_recent = opti.ts_val; 934 } 935 936 if (tlen == 0) { 937 if (SEQ_GT(th->th_ack, tp->snd_una) && 938 SEQ_LEQ(th->th_ack, tp->snd_max) && 939 tp->snd_cwnd >= tp->snd_wnd && 940 tp->t_dupacks == 0) { 941 /* 942 * this is a pure ack for outstanding data. 943 */ 944 tcpstat_inc(tcps_predack); 945 if (opti.ts_present && opti.ts_ecr) 946 tcp_xmit_timer(tp, now - opti.ts_ecr); 947 else if (tp->t_rtttime && 948 SEQ_GT(th->th_ack, tp->t_rtseq)) 949 tcp_xmit_timer(tp, now - tp->t_rtttime); 950 acked = th->th_ack - tp->snd_una; 951 tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, 952 acked); 953 tp->t_rcvacktime = now; 954 ND6_HINT(tp); 955 sbdrop(so, &so->so_snd, acked); 956 957 /* 958 * If we had a pending ICMP message that 959 * refers to data that have just been 960 * acknowledged, disregard the recorded ICMP 961 * message. 962 */ 963 if ((tp->t_flags & TF_PMTUD_PEND) && 964 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 965 tp->t_flags &= ~TF_PMTUD_PEND; 966 967 /* 968 * Keep track of the largest chunk of data 969 * acknowledged since last PMTU update 970 */ 971 if (tp->t_pmtud_mss_acked < acked) 972 tp->t_pmtud_mss_acked = acked; 973 974 tp->snd_una = th->th_ack; 975 /* Pull snd_wl2 up to prevent seq wrap. */ 976 tp->snd_wl2 = th->th_ack; 977 /* 978 * We want snd_last to track snd_una so 979 * as to avoid sequence wraparound problems 980 * for very large transfers. 981 */ 982 #ifdef TCP_ECN 983 if (SEQ_GT(tp->snd_una, tp->snd_last)) 984 #endif 985 tp->snd_last = tp->snd_una; 986 m_freem(m); 987 988 /* 989 * If all outstanding data are acked, stop 990 * retransmit timer, otherwise restart timer 991 * using current (possibly backed-off) value. 992 * If process is waiting for space, 993 * wakeup/selwakeup/signal. If data 994 * are ready to send, let tcp_output 995 * decide between more output or persist. 996 */ 997 if (tp->snd_una == tp->snd_max) 998 TCP_TIMER_DISARM(tp, TCPT_REXMT); 999 else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1000 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1001 1002 tcp_update_sndspace(tp); 1003 if (sb_notify(so, &so->so_snd)) { 1004 tp->t_flags |= TF_BLOCKOUTPUT; 1005 sowwakeup(so); 1006 tp->t_flags &= ~TF_BLOCKOUTPUT; 1007 } 1008 if (so->so_snd.sb_cc || 1009 tp->t_flags & TF_NEEDOUTPUT) 1010 (void) tcp_output(tp); 1011 in_pcbunref(inp); 1012 return IPPROTO_DONE; 1013 } 1014 } else if (th->th_ack == tp->snd_una && 1015 TAILQ_EMPTY(&tp->t_segq) && 1016 tlen <= sbspace(so, &so->so_rcv)) { 1017 /* 1018 * This is a pure, in-sequence data packet 1019 * with nothing on the reassembly queue and 1020 * we have enough buffer space to take it. 1021 */ 1022 /* Clean receiver SACK report if present */ 1023 if (tp->sack_enable && tp->rcv_numsacks) 1024 tcp_clean_sackreport(tp); 1025 tcpstat_inc(tcps_preddat); 1026 tp->rcv_nxt += tlen; 1027 /* Pull snd_wl1 and rcv_up up to prevent seq wrap. */ 1028 tp->snd_wl1 = th->th_seq; 1029 /* Packet has most recent segment, no urgent exists. */ 1030 tp->rcv_up = tp->rcv_nxt; 1031 tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen); 1032 ND6_HINT(tp); 1033 1034 TCP_SETUP_ACK(tp, tiflags, m); 1035 /* 1036 * Drop TCP, IP headers and TCP options then add data 1037 * to socket buffer. 1038 */ 1039 if (so->so_state & SS_CANTRCVMORE) 1040 m_freem(m); 1041 else { 1042 if (tp->t_srtt != 0 && tp->rfbuf_ts != 0 && 1043 now - tp->rfbuf_ts > (tp->t_srtt >> 1044 (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT))) { 1045 tcp_update_rcvspace(tp); 1046 /* Start over with next RTT. */ 1047 tp->rfbuf_cnt = 0; 1048 tp->rfbuf_ts = 0; 1049 } else 1050 tp->rfbuf_cnt += tlen; 1051 m_adj(m, iphlen + off); 1052 sbappendstream(so, &so->so_rcv, m); 1053 } 1054 tp->t_flags |= TF_BLOCKOUTPUT; 1055 sorwakeup(so); 1056 tp->t_flags &= ~TF_BLOCKOUTPUT; 1057 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 1058 (void) tcp_output(tp); 1059 in_pcbunref(inp); 1060 return IPPROTO_DONE; 1061 } 1062 } 1063 1064 /* 1065 * Compute mbuf offset to TCP data segment. 1066 */ 1067 hdroptlen = iphlen + off; 1068 1069 /* 1070 * Calculate amount of space in receive window, 1071 * and then do TCP input processing. 1072 * Receive window is amount of space in rcv queue, 1073 * but not less than advertised window. 1074 */ 1075 { int win; 1076 1077 win = sbspace(so, &so->so_rcv); 1078 if (win < 0) 1079 win = 0; 1080 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1081 } 1082 1083 switch (tp->t_state) { 1084 1085 /* 1086 * If the state is SYN_RECEIVED: 1087 * if seg contains SYN/ACK, send an RST. 1088 * if seg contains an ACK, but not for our SYN/ACK, send an RST 1089 */ 1090 1091 case TCPS_SYN_RECEIVED: 1092 if (tiflags & TH_ACK) { 1093 if (tiflags & TH_SYN) { 1094 tcpstat_inc(tcps_badsyn); 1095 goto dropwithreset; 1096 } 1097 if (SEQ_LEQ(th->th_ack, tp->snd_una) || 1098 SEQ_GT(th->th_ack, tp->snd_max)) 1099 goto dropwithreset; 1100 } 1101 break; 1102 1103 /* 1104 * If the state is SYN_SENT: 1105 * if seg contains an ACK, but not for our SYN, drop the input. 1106 * if seg contains a RST, then drop the connection. 1107 * if seg does not contain SYN, then drop it. 1108 * Otherwise this is an acceptable SYN segment 1109 * initialize tp->rcv_nxt and tp->irs 1110 * if seg contains ack then advance tp->snd_una 1111 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1112 * arrange for segment to be acked (eventually) 1113 * continue processing rest of data/controls, beginning with URG 1114 */ 1115 case TCPS_SYN_SENT: 1116 if ((tiflags & TH_ACK) && 1117 (SEQ_LEQ(th->th_ack, tp->iss) || 1118 SEQ_GT(th->th_ack, tp->snd_max))) 1119 goto dropwithreset; 1120 if (tiflags & TH_RST) { 1121 #ifdef TCP_ECN 1122 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1123 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1124 goto drop; 1125 #endif 1126 if (tiflags & TH_ACK) 1127 tp = tcp_drop(tp, ECONNREFUSED); 1128 goto drop; 1129 } 1130 if ((tiflags & TH_SYN) == 0) 1131 goto drop; 1132 if (tiflags & TH_ACK) { 1133 tp->snd_una = th->th_ack; 1134 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1135 tp->snd_nxt = tp->snd_una; 1136 } 1137 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1138 tp->irs = th->th_seq; 1139 tcp_mss(tp, opti.maxseg); 1140 /* Reset initial window to 1 segment for retransmit */ 1141 if (tp->t_rxtshift > 0) 1142 tp->snd_cwnd = tp->t_maxseg; 1143 tcp_rcvseqinit(tp); 1144 tp->t_flags |= TF_ACKNOW; 1145 /* 1146 * If we've sent a SACK_PERMITTED option, and the peer 1147 * also replied with one, then TF_SACK_PERMIT should have 1148 * been set in tcp_dooptions(). If it was not, disable SACKs. 1149 */ 1150 if (tp->sack_enable) 1151 tp->sack_enable = tp->t_flags & TF_SACK_PERMIT; 1152 #ifdef TCP_ECN 1153 /* 1154 * if ECE is set but CWR is not set for SYN-ACK, or 1155 * both ECE and CWR are set for simultaneous open, 1156 * peer is ECN capable. 1157 */ 1158 if (tcp_do_ecn) { 1159 switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) { 1160 case TH_ACK|TH_ECE: 1161 case TH_ECE|TH_CWR: 1162 tp->t_flags |= TF_ECN_PERMIT; 1163 tiflags &= ~(TH_ECE|TH_CWR); 1164 tcpstat_inc(tcps_ecn_accepts); 1165 } 1166 } 1167 #endif 1168 1169 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { 1170 tcpstat_inc(tcps_connects); 1171 tp->t_flags |= TF_BLOCKOUTPUT; 1172 soisconnected(so); 1173 tp->t_flags &= ~TF_BLOCKOUTPUT; 1174 tp->t_state = TCPS_ESTABLISHED; 1175 TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcp_keepidle)); 1176 /* Do window scaling on this connection? */ 1177 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1178 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1179 tp->snd_scale = tp->requested_s_scale; 1180 tp->rcv_scale = tp->request_r_scale; 1181 } 1182 tcp_flush_queue(tp); 1183 1184 /* 1185 * if we didn't have to retransmit the SYN, 1186 * use its rtt as our initial srtt & rtt var. 1187 */ 1188 if (tp->t_rtttime) 1189 tcp_xmit_timer(tp, now - tp->t_rtttime); 1190 /* 1191 * Since new data was acked (the SYN), open the 1192 * congestion window by one MSS. We do this 1193 * here, because we won't go through the normal 1194 * ACK processing below. And since this is the 1195 * start of the connection, we know we are in 1196 * the exponential phase of slow-start. 1197 */ 1198 tp->snd_cwnd += tp->t_maxseg; 1199 } else 1200 tp->t_state = TCPS_SYN_RECEIVED; 1201 1202 #if 0 1203 trimthenstep6: 1204 #endif 1205 /* 1206 * Advance th->th_seq to correspond to first data byte. 1207 * If data, trim to stay within window, 1208 * dropping FIN if necessary. 1209 */ 1210 th->th_seq++; 1211 if (tlen > tp->rcv_wnd) { 1212 todrop = tlen - tp->rcv_wnd; 1213 m_adj(m, -todrop); 1214 tlen = tp->rcv_wnd; 1215 tiflags &= ~TH_FIN; 1216 tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin, 1217 todrop); 1218 } 1219 tp->snd_wl1 = th->th_seq - 1; 1220 tp->rcv_up = th->th_seq; 1221 goto step6; 1222 /* 1223 * If a new connection request is received while in TIME_WAIT, 1224 * drop the old connection and start over if the if the 1225 * timestamp or the sequence numbers are above the previous 1226 * ones. 1227 */ 1228 case TCPS_TIME_WAIT: 1229 if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) && 1230 ((opti.ts_present && 1231 TSTMP_LT(tp->ts_recent, opti.ts_val)) || 1232 SEQ_GT(th->th_seq, tp->rcv_nxt))) { 1233 #if NPF > 0 1234 /* 1235 * The socket will be recreated but the new state 1236 * has already been linked to the socket. Remove the 1237 * link between old socket and new state. 1238 */ 1239 pf_inp_unlink(inp); 1240 #endif 1241 /* 1242 * Advance the iss by at least 32768, but 1243 * clear the msb in order to make sure 1244 * that SEG_LT(snd_nxt, iss). 1245 */ 1246 iss = tp->snd_nxt + 1247 ((arc4random() & 0x7fffffff) | 0x8000); 1248 reuse = &iss; 1249 tp = tcp_close(tp); 1250 in_pcbunref(inp); 1251 inp = NULL; 1252 goto findpcb; 1253 } 1254 } 1255 1256 /* 1257 * States other than LISTEN or SYN_SENT. 1258 * First check timestamp, if present. 1259 * Then check that at least some bytes of segment are within 1260 * receive window. If segment begins before rcv_nxt, 1261 * drop leading data (and SYN); if nothing left, just ack. 1262 * 1263 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1264 * and it's less than opti.ts_recent, drop it. 1265 */ 1266 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1267 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1268 1269 /* Check to see if ts_recent is over 24 days old. */ 1270 if ((int)(now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1271 /* 1272 * Invalidate ts_recent. If this segment updates 1273 * ts_recent, the age will be reset later and ts_recent 1274 * will get a valid value. If it does not, setting 1275 * ts_recent to zero will at least satisfy the 1276 * requirement that zero be placed in the timestamp 1277 * echo reply when ts_recent isn't valid. The 1278 * age isn't reset until we get a valid ts_recent 1279 * because we don't want out-of-order segments to be 1280 * dropped when ts_recent is old. 1281 */ 1282 tp->ts_recent = 0; 1283 } else { 1284 tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen); 1285 tcpstat_inc(tcps_pawsdrop); 1286 if (tlen) 1287 goto dropafterack; 1288 goto drop; 1289 } 1290 } 1291 1292 todrop = tp->rcv_nxt - th->th_seq; 1293 if (todrop > 0) { 1294 if (tiflags & TH_SYN) { 1295 tiflags &= ~TH_SYN; 1296 th->th_seq++; 1297 if (th->th_urp > 1) 1298 th->th_urp--; 1299 else 1300 tiflags &= ~TH_URG; 1301 todrop--; 1302 } 1303 if (todrop > tlen || 1304 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1305 /* 1306 * Any valid FIN must be to the left of the 1307 * window. At this point, FIN must be a 1308 * duplicate or out-of-sequence, so drop it. 1309 */ 1310 tiflags &= ~TH_FIN; 1311 /* 1312 * Send ACK to resynchronize, and drop any data, 1313 * but keep on processing for RST or ACK. 1314 */ 1315 tp->t_flags |= TF_ACKNOW; 1316 todrop = tlen; 1317 tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop); 1318 } else { 1319 tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte, 1320 todrop); 1321 } 1322 hdroptlen += todrop; /* drop from head afterwards */ 1323 th->th_seq += todrop; 1324 tlen -= todrop; 1325 if (th->th_urp > todrop) 1326 th->th_urp -= todrop; 1327 else { 1328 tiflags &= ~TH_URG; 1329 th->th_urp = 0; 1330 } 1331 } 1332 1333 /* 1334 * If new data are received on a connection after the 1335 * user processes are gone, then RST the other end. 1336 */ 1337 if ((so->so_state & SS_NOFDREF) && 1338 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1339 tp = tcp_close(tp); 1340 tcpstat_inc(tcps_rcvafterclose); 1341 goto dropwithreset; 1342 } 1343 1344 /* 1345 * If segment ends after window, drop trailing data 1346 * (and PUSH and FIN); if nothing left, just ACK. 1347 */ 1348 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1349 if (todrop > 0) { 1350 tcpstat_inc(tcps_rcvpackafterwin); 1351 if (todrop >= tlen) { 1352 tcpstat_add(tcps_rcvbyteafterwin, tlen); 1353 /* 1354 * If window is closed can only take segments at 1355 * window edge, and have to drop data and PUSH from 1356 * incoming segments. Continue processing, but 1357 * remember to ack. Otherwise, drop segment 1358 * and ack. 1359 */ 1360 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1361 tp->t_flags |= TF_ACKNOW; 1362 tcpstat_inc(tcps_rcvwinprobe); 1363 } else 1364 goto dropafterack; 1365 } else 1366 tcpstat_add(tcps_rcvbyteafterwin, todrop); 1367 m_adj(m, -todrop); 1368 tlen -= todrop; 1369 tiflags &= ~(TH_PUSH|TH_FIN); 1370 } 1371 1372 /* 1373 * If last ACK falls within this segment's sequence numbers, 1374 * record its timestamp if it's more recent. 1375 * NOTE that the test is modified according to the latest 1376 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1377 */ 1378 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1379 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1380 tp->ts_recent_age = now; 1381 tp->ts_recent = opti.ts_val; 1382 } 1383 1384 /* 1385 * If the RST bit is set examine the state: 1386 * SYN_RECEIVED STATE: 1387 * If passive open, return to LISTEN state. 1388 * If active open, inform user that connection was refused. 1389 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1390 * Inform user that connection was reset, and close tcb. 1391 * CLOSING, LAST_ACK, TIME_WAIT STATES 1392 * Close the tcb. 1393 */ 1394 if (tiflags & TH_RST) { 1395 if (th->th_seq != tp->last_ack_sent && 1396 th->th_seq != tp->rcv_nxt && 1397 th->th_seq != (tp->rcv_nxt + 1)) 1398 goto drop; 1399 1400 switch (tp->t_state) { 1401 case TCPS_SYN_RECEIVED: 1402 #ifdef TCP_ECN 1403 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1404 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1405 goto drop; 1406 #endif 1407 so->so_error = ECONNREFUSED; 1408 goto close; 1409 1410 case TCPS_ESTABLISHED: 1411 case TCPS_FIN_WAIT_1: 1412 case TCPS_FIN_WAIT_2: 1413 case TCPS_CLOSE_WAIT: 1414 so->so_error = ECONNRESET; 1415 close: 1416 tp->t_state = TCPS_CLOSED; 1417 tcpstat_inc(tcps_drops); 1418 tp = tcp_close(tp); 1419 goto drop; 1420 case TCPS_CLOSING: 1421 case TCPS_LAST_ACK: 1422 case TCPS_TIME_WAIT: 1423 tp = tcp_close(tp); 1424 goto drop; 1425 } 1426 } 1427 1428 /* 1429 * If a SYN is in the window, then this is an 1430 * error and we ACK and drop the packet. 1431 */ 1432 if (tiflags & TH_SYN) 1433 goto dropafterack_ratelim; 1434 1435 /* 1436 * If the ACK bit is off we drop the segment and return. 1437 */ 1438 if ((tiflags & TH_ACK) == 0) { 1439 if (tp->t_flags & TF_ACKNOW) 1440 goto dropafterack; 1441 else 1442 goto drop; 1443 } 1444 1445 /* 1446 * Ack processing. 1447 */ 1448 switch (tp->t_state) { 1449 1450 /* 1451 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1452 * ESTABLISHED state and continue processing. 1453 * The ACK was checked above. 1454 */ 1455 case TCPS_SYN_RECEIVED: 1456 tcpstat_inc(tcps_connects); 1457 tp->t_flags |= TF_BLOCKOUTPUT; 1458 soisconnected(so); 1459 tp->t_flags &= ~TF_BLOCKOUTPUT; 1460 tp->t_state = TCPS_ESTABLISHED; 1461 TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcp_keepidle)); 1462 /* Do window scaling? */ 1463 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1464 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1465 tp->snd_scale = tp->requested_s_scale; 1466 tp->rcv_scale = tp->request_r_scale; 1467 tiwin = th->th_win << tp->snd_scale; 1468 } 1469 tcp_flush_queue(tp); 1470 tp->snd_wl1 = th->th_seq - 1; 1471 /* fall into ... */ 1472 1473 /* 1474 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1475 * ACKs. If the ack is in the range 1476 * tp->snd_una < th->th_ack <= tp->snd_max 1477 * then advance tp->snd_una to th->th_ack and drop 1478 * data from the retransmission queue. If this ACK reflects 1479 * more up to date window information we update our window information. 1480 */ 1481 case TCPS_ESTABLISHED: 1482 case TCPS_FIN_WAIT_1: 1483 case TCPS_FIN_WAIT_2: 1484 case TCPS_CLOSE_WAIT: 1485 case TCPS_CLOSING: 1486 case TCPS_LAST_ACK: 1487 case TCPS_TIME_WAIT: 1488 #ifdef TCP_ECN 1489 /* 1490 * if we receive ECE and are not already in recovery phase, 1491 * reduce cwnd by half but don't slow-start. 1492 * advance snd_last to snd_max not to reduce cwnd again 1493 * until all outstanding packets are acked. 1494 */ 1495 if (tcp_do_ecn && (tiflags & TH_ECE)) { 1496 if ((tp->t_flags & TF_ECN_PERMIT) && 1497 SEQ_GEQ(tp->snd_una, tp->snd_last)) { 1498 u_int win; 1499 1500 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg; 1501 if (win > 1) { 1502 tp->snd_ssthresh = win / 2 * tp->t_maxseg; 1503 tp->snd_cwnd = tp->snd_ssthresh; 1504 tp->snd_last = tp->snd_max; 1505 tp->t_flags |= TF_SEND_CWR; 1506 tcpstat_inc(tcps_cwr_ecn); 1507 } 1508 } 1509 tcpstat_inc(tcps_ecn_rcvece); 1510 } 1511 /* 1512 * if we receive CWR, we know that the peer has reduced 1513 * its congestion window. stop sending ecn-echo. 1514 */ 1515 if ((tiflags & TH_CWR)) { 1516 tp->t_flags &= ~TF_RCVD_CE; 1517 tcpstat_inc(tcps_ecn_rcvcwr); 1518 } 1519 #endif /* TCP_ECN */ 1520 1521 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1522 /* 1523 * Duplicate/old ACK processing. 1524 * Increments t_dupacks: 1525 * Pure duplicate (same seq/ack/window, no data) 1526 * Doesn't affect t_dupacks: 1527 * Data packets. 1528 * Normal window updates (window opens) 1529 * Resets t_dupacks: 1530 * New data ACKed. 1531 * Window shrinks 1532 * Old ACK 1533 */ 1534 if (tlen) { 1535 /* Drop very old ACKs unless th_seq matches */ 1536 if (th->th_seq != tp->rcv_nxt && 1537 SEQ_LT(th->th_ack, 1538 tp->snd_una - tp->max_sndwnd)) { 1539 tcpstat_inc(tcps_rcvacktooold); 1540 goto drop; 1541 } 1542 break; 1543 } 1544 /* 1545 * If we get an old ACK, there is probably packet 1546 * reordering going on. Be conservative and reset 1547 * t_dupacks so that we are less aggressive in 1548 * doing a fast retransmit. 1549 */ 1550 if (th->th_ack != tp->snd_una) { 1551 tp->t_dupacks = 0; 1552 break; 1553 } 1554 if (tiwin == tp->snd_wnd) { 1555 tcpstat_inc(tcps_rcvdupack); 1556 /* 1557 * If we have outstanding data (other than 1558 * a window probe), this is a completely 1559 * duplicate ack (ie, window info didn't 1560 * change), the ack is the biggest we've 1561 * seen and we've seen exactly our rexmt 1562 * threshold of them, assume a packet 1563 * has been dropped and retransmit it. 1564 * Kludge snd_nxt & the congestion 1565 * window so we send only this one 1566 * packet. 1567 * 1568 * We know we're losing at the current 1569 * window size so do congestion avoidance 1570 * (set ssthresh to half the current window 1571 * and pull our congestion window back to 1572 * the new ssthresh). 1573 * 1574 * Dup acks mean that packets have left the 1575 * network (they're now cached at the receiver) 1576 * so bump cwnd by the amount in the receiver 1577 * to keep a constant cwnd packets in the 1578 * network. 1579 */ 1580 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) 1581 tp->t_dupacks = 0; 1582 else if (++tp->t_dupacks == tcprexmtthresh) { 1583 tcp_seq onxt = tp->snd_nxt; 1584 u_long win = 1585 ulmin(tp->snd_wnd, tp->snd_cwnd) / 1586 2 / tp->t_maxseg; 1587 1588 if (SEQ_LT(th->th_ack, tp->snd_last)){ 1589 /* 1590 * False fast retx after 1591 * timeout. Do not cut window. 1592 */ 1593 tp->t_dupacks = 0; 1594 goto drop; 1595 } 1596 if (win < 2) 1597 win = 2; 1598 tp->snd_ssthresh = win * tp->t_maxseg; 1599 tp->snd_last = tp->snd_max; 1600 if (tp->sack_enable) { 1601 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1602 tp->t_rtttime = 0; 1603 #ifdef TCP_ECN 1604 tp->t_flags |= TF_SEND_CWR; 1605 #endif 1606 tcpstat_inc(tcps_cwr_frecovery); 1607 tcpstat_inc(tcps_sack_recovery_episode); 1608 /* 1609 * tcp_output() will send 1610 * oldest SACK-eligible rtx. 1611 */ 1612 (void) tcp_output(tp); 1613 tp->snd_cwnd = tp->snd_ssthresh+ 1614 tp->t_maxseg * tp->t_dupacks; 1615 goto drop; 1616 } 1617 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1618 tp->t_rtttime = 0; 1619 tp->snd_nxt = th->th_ack; 1620 tp->snd_cwnd = tp->t_maxseg; 1621 #ifdef TCP_ECN 1622 tp->t_flags |= TF_SEND_CWR; 1623 #endif 1624 tcpstat_inc(tcps_cwr_frecovery); 1625 tcpstat_inc(tcps_sndrexmitfast); 1626 (void) tcp_output(tp); 1627 1628 tp->snd_cwnd = tp->snd_ssthresh + 1629 tp->t_maxseg * tp->t_dupacks; 1630 if (SEQ_GT(onxt, tp->snd_nxt)) 1631 tp->snd_nxt = onxt; 1632 goto drop; 1633 } else if (tp->t_dupacks > tcprexmtthresh) { 1634 tp->snd_cwnd += tp->t_maxseg; 1635 (void) tcp_output(tp); 1636 goto drop; 1637 } 1638 } else if (tiwin < tp->snd_wnd) { 1639 /* 1640 * The window was retracted! Previous dup 1641 * ACKs may have been due to packets arriving 1642 * after the shrunken window, not a missing 1643 * packet, so play it safe and reset t_dupacks 1644 */ 1645 tp->t_dupacks = 0; 1646 } 1647 break; 1648 } 1649 /* 1650 * If the congestion window was inflated to account 1651 * for the other side's cached packets, retract it. 1652 */ 1653 if (tp->t_dupacks >= tcprexmtthresh) { 1654 /* Check for a partial ACK */ 1655 if (SEQ_LT(th->th_ack, tp->snd_last)) { 1656 if (tp->sack_enable) 1657 tcp_sack_partialack(tp, th); 1658 else 1659 tcp_newreno_partialack(tp, th); 1660 } else { 1661 /* Out of fast recovery */ 1662 tp->snd_cwnd = tp->snd_ssthresh; 1663 if (tcp_seq_subtract(tp->snd_max, th->th_ack) < 1664 tp->snd_ssthresh) 1665 tp->snd_cwnd = 1666 tcp_seq_subtract(tp->snd_max, 1667 th->th_ack); 1668 tp->t_dupacks = 0; 1669 } 1670 } else { 1671 /* 1672 * Reset the duplicate ACK counter if we 1673 * were not in fast recovery. 1674 */ 1675 tp->t_dupacks = 0; 1676 } 1677 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1678 tcpstat_inc(tcps_rcvacktoomuch); 1679 goto dropafterack_ratelim; 1680 } 1681 acked = th->th_ack - tp->snd_una; 1682 tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked); 1683 tp->t_rcvacktime = now; 1684 1685 /* 1686 * If we have a timestamp reply, update smoothed 1687 * round trip time. If no timestamp is present but 1688 * transmit timer is running and timed sequence 1689 * number was acked, update smoothed round trip time. 1690 * Since we now have an rtt measurement, cancel the 1691 * timer backoff (cf., Phil Karn's retransmit alg.). 1692 * Recompute the initial retransmit timer. 1693 */ 1694 if (opti.ts_present && opti.ts_ecr) 1695 tcp_xmit_timer(tp, now - opti.ts_ecr); 1696 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1697 tcp_xmit_timer(tp, now - tp->t_rtttime); 1698 1699 /* 1700 * If all outstanding data is acked, stop retransmit 1701 * timer and remember to restart (more output or persist). 1702 * If there is more data to be acked, restart retransmit 1703 * timer, using current (possibly backed-off) value. 1704 */ 1705 if (th->th_ack == tp->snd_max) { 1706 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1707 tp->t_flags |= TF_NEEDOUTPUT; 1708 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1709 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1710 /* 1711 * When new data is acked, open the congestion window. 1712 * If the window gives us less than ssthresh packets 1713 * in flight, open exponentially (maxseg per packet). 1714 * Otherwise open linearly: maxseg per window 1715 * (maxseg^2 / cwnd per packet). 1716 */ 1717 { 1718 u_int cw = tp->snd_cwnd; 1719 u_int incr = tp->t_maxseg; 1720 1721 if (cw > tp->snd_ssthresh) 1722 incr = max(incr * incr / cw, 1); 1723 if (tp->t_dupacks < tcprexmtthresh) 1724 tp->snd_cwnd = ulmin(cw + incr, 1725 TCP_MAXWIN << tp->snd_scale); 1726 } 1727 ND6_HINT(tp); 1728 if (acked > so->so_snd.sb_cc) { 1729 if (tp->snd_wnd > so->so_snd.sb_cc) 1730 tp->snd_wnd -= so->so_snd.sb_cc; 1731 else 1732 tp->snd_wnd = 0; 1733 sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc); 1734 ourfinisacked = 1; 1735 } else { 1736 sbdrop(so, &so->so_snd, acked); 1737 if (tp->snd_wnd > acked) 1738 tp->snd_wnd -= acked; 1739 else 1740 tp->snd_wnd = 0; 1741 ourfinisacked = 0; 1742 } 1743 1744 tcp_update_sndspace(tp); 1745 if (sb_notify(so, &so->so_snd)) { 1746 tp->t_flags |= TF_BLOCKOUTPUT; 1747 sowwakeup(so); 1748 tp->t_flags &= ~TF_BLOCKOUTPUT; 1749 } 1750 1751 /* 1752 * If we had a pending ICMP message that referred to data 1753 * that have just been acknowledged, disregard the recorded 1754 * ICMP message. 1755 */ 1756 if ((tp->t_flags & TF_PMTUD_PEND) && 1757 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 1758 tp->t_flags &= ~TF_PMTUD_PEND; 1759 1760 /* 1761 * Keep track of the largest chunk of data acknowledged 1762 * since last PMTU update 1763 */ 1764 if (tp->t_pmtud_mss_acked < acked) 1765 tp->t_pmtud_mss_acked = acked; 1766 1767 tp->snd_una = th->th_ack; 1768 #ifdef TCP_ECN 1769 /* sync snd_last with snd_una */ 1770 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1771 tp->snd_last = tp->snd_una; 1772 #endif 1773 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1774 tp->snd_nxt = tp->snd_una; 1775 1776 switch (tp->t_state) { 1777 1778 /* 1779 * In FIN_WAIT_1 STATE in addition to the processing 1780 * for the ESTABLISHED state if our FIN is now acknowledged 1781 * then enter FIN_WAIT_2. 1782 */ 1783 case TCPS_FIN_WAIT_1: 1784 if (ourfinisacked) { 1785 /* 1786 * If we can't receive any more 1787 * data, then closing user can proceed. 1788 * Starting the timer is contrary to the 1789 * specification, but if we don't get a FIN 1790 * we'll hang forever. 1791 */ 1792 if (so->so_state & SS_CANTRCVMORE) { 1793 tp->t_flags |= TF_BLOCKOUTPUT; 1794 soisdisconnected(so); 1795 tp->t_flags &= ~TF_BLOCKOUTPUT; 1796 TCP_TIMER_ARM(tp, TCPT_2MSL, 1797 TCP_TIME(tcp_maxidle)); 1798 } 1799 tp->t_state = TCPS_FIN_WAIT_2; 1800 } 1801 break; 1802 1803 /* 1804 * In CLOSING STATE in addition to the processing for 1805 * the ESTABLISHED state if the ACK acknowledges our FIN 1806 * then enter the TIME-WAIT state, otherwise ignore 1807 * the segment. 1808 */ 1809 case TCPS_CLOSING: 1810 if (ourfinisacked) { 1811 tp->t_state = TCPS_TIME_WAIT; 1812 tcp_canceltimers(tp); 1813 TCP_TIMER_ARM(tp, TCPT_2MSL, 1814 TCP_TIME(2 * TCPTV_MSL)); 1815 tp->t_flags |= TF_BLOCKOUTPUT; 1816 soisdisconnected(so); 1817 tp->t_flags &= ~TF_BLOCKOUTPUT; 1818 } 1819 break; 1820 1821 /* 1822 * In LAST_ACK, we may still be waiting for data to drain 1823 * and/or to be acked, as well as for the ack of our FIN. 1824 * If our FIN is now acknowledged, delete the TCB, 1825 * enter the closed state and return. 1826 */ 1827 case TCPS_LAST_ACK: 1828 if (ourfinisacked) { 1829 tp = tcp_close(tp); 1830 goto drop; 1831 } 1832 break; 1833 1834 /* 1835 * In TIME_WAIT state the only thing that should arrive 1836 * is a retransmission of the remote FIN. Acknowledge 1837 * it and restart the finack timer. 1838 */ 1839 case TCPS_TIME_WAIT: 1840 TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(2 * TCPTV_MSL)); 1841 goto dropafterack; 1842 } 1843 } 1844 1845 step6: 1846 /* 1847 * Update window information. 1848 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1849 */ 1850 if ((tiflags & TH_ACK) && 1851 (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && 1852 (SEQ_LT(tp->snd_wl2, th->th_ack) || 1853 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1854 /* keep track of pure window updates */ 1855 if (tlen == 0 && 1856 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1857 tcpstat_inc(tcps_rcvwinupd); 1858 tp->snd_wnd = tiwin; 1859 tp->snd_wl1 = th->th_seq; 1860 tp->snd_wl2 = th->th_ack; 1861 if (tp->snd_wnd > tp->max_sndwnd) 1862 tp->max_sndwnd = tp->snd_wnd; 1863 tp->t_flags |= TF_NEEDOUTPUT; 1864 } 1865 1866 /* 1867 * Process segments with URG. 1868 */ 1869 if ((tiflags & TH_URG) && th->th_urp && 1870 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1871 /* 1872 * This is a kludge, but if we receive and accept 1873 * random urgent pointers, we'll crash in 1874 * soreceive. It's hard to imagine someone 1875 * actually wanting to send this much urgent data. 1876 */ 1877 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1878 th->th_urp = 0; /* XXX */ 1879 tiflags &= ~TH_URG; /* XXX */ 1880 goto dodata; /* XXX */ 1881 } 1882 /* 1883 * If this segment advances the known urgent pointer, 1884 * then mark the data stream. This should not happen 1885 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1886 * a FIN has been received from the remote side. 1887 * In these states we ignore the URG. 1888 * 1889 * According to RFC961 (Assigned Protocols), 1890 * the urgent pointer points to the last octet 1891 * of urgent data. We continue, however, 1892 * to consider it to indicate the first octet 1893 * of data past the urgent section as the original 1894 * spec states (in one of two places). 1895 */ 1896 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1897 tp->rcv_up = th->th_seq + th->th_urp; 1898 so->so_oobmark = so->so_rcv.sb_cc + 1899 (tp->rcv_up - tp->rcv_nxt) - 1; 1900 if (so->so_oobmark == 0) 1901 so->so_state |= SS_RCVATMARK; 1902 sohasoutofband(so); 1903 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1904 } 1905 /* 1906 * Remove out of band data so doesn't get presented to user. 1907 * This can happen independent of advancing the URG pointer, 1908 * but if two URG's are pending at once, some out-of-band 1909 * data may creep in... ick. 1910 */ 1911 if (th->th_urp <= (u_int16_t) tlen && 1912 (so->so_options & SO_OOBINLINE) == 0) 1913 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 1914 } else 1915 /* 1916 * If no out of band data is expected, 1917 * pull receive urgent pointer along 1918 * with the receive window. 1919 */ 1920 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1921 tp->rcv_up = tp->rcv_nxt; 1922 dodata: /* XXX */ 1923 1924 /* 1925 * Process the segment text, merging it into the TCP sequencing queue, 1926 * and arranging for acknowledgment of receipt if necessary. 1927 * This process logically involves adjusting tp->rcv_wnd as data 1928 * is presented to the user (this happens in tcp_usrreq.c, 1929 * case PRU_RCVD). If a FIN has already been received on this 1930 * connection then we just ignore the text. 1931 */ 1932 if ((tlen || (tiflags & TH_FIN)) && 1933 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1934 tcp_seq laststart = th->th_seq; 1935 tcp_seq lastend = th->th_seq + tlen; 1936 1937 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 1938 tp->t_state == TCPS_ESTABLISHED) { 1939 TCP_SETUP_ACK(tp, tiflags, m); 1940 tp->rcv_nxt += tlen; 1941 tiflags = th->th_flags & TH_FIN; 1942 tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen); 1943 ND6_HINT(tp); 1944 if (so->so_state & SS_CANTRCVMORE) 1945 m_freem(m); 1946 else { 1947 m_adj(m, hdroptlen); 1948 sbappendstream(so, &so->so_rcv, m); 1949 } 1950 tp->t_flags |= TF_BLOCKOUTPUT; 1951 sorwakeup(so); 1952 tp->t_flags &= ~TF_BLOCKOUTPUT; 1953 } else { 1954 m_adj(m, hdroptlen); 1955 tiflags = tcp_reass(tp, th, m, &tlen); 1956 tp->t_flags |= TF_ACKNOW; 1957 } 1958 if (tp->sack_enable) 1959 tcp_update_sack_list(tp, laststart, lastend); 1960 1961 /* 1962 * variable len never referenced again in modern BSD, 1963 * so why bother computing it ?? 1964 */ 1965 #if 0 1966 /* 1967 * Note the amount of data that peer has sent into 1968 * our window, in order to estimate the sender's 1969 * buffer size. 1970 */ 1971 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1972 #endif /* 0 */ 1973 } else { 1974 m_freem(m); 1975 tiflags &= ~TH_FIN; 1976 } 1977 1978 /* 1979 * If FIN is received ACK the FIN and let the user know 1980 * that the connection is closing. Ignore a FIN received before 1981 * the connection is fully established. 1982 */ 1983 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1984 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1985 tp->t_flags |= TF_BLOCKOUTPUT; 1986 socantrcvmore(so); 1987 tp->t_flags &= ~TF_BLOCKOUTPUT; 1988 tp->t_flags |= TF_ACKNOW; 1989 tp->rcv_nxt++; 1990 } 1991 switch (tp->t_state) { 1992 1993 /* 1994 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1995 */ 1996 case TCPS_ESTABLISHED: 1997 tp->t_state = TCPS_CLOSE_WAIT; 1998 break; 1999 2000 /* 2001 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2002 * enter the CLOSING state. 2003 */ 2004 case TCPS_FIN_WAIT_1: 2005 tp->t_state = TCPS_CLOSING; 2006 break; 2007 2008 /* 2009 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2010 * starting the time-wait timer, turning off the other 2011 * standard timers. 2012 */ 2013 case TCPS_FIN_WAIT_2: 2014 tp->t_state = TCPS_TIME_WAIT; 2015 tcp_canceltimers(tp); 2016 TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(2 * TCPTV_MSL)); 2017 tp->t_flags |= TF_BLOCKOUTPUT; 2018 soisdisconnected(so); 2019 tp->t_flags &= ~TF_BLOCKOUTPUT; 2020 break; 2021 2022 /* 2023 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2024 */ 2025 case TCPS_TIME_WAIT: 2026 TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(2 * TCPTV_MSL)); 2027 break; 2028 } 2029 } 2030 if (otp) 2031 tcp_trace(TA_INPUT, ostate, tp, otp, saveti, 0, tlen); 2032 2033 /* 2034 * Return any desired output. 2035 */ 2036 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2037 (void) tcp_output(tp); 2038 in_pcbunref(inp); 2039 return IPPROTO_DONE; 2040 2041 badsyn: 2042 /* 2043 * Received a bad SYN. Increment counters and dropwithreset. 2044 */ 2045 tcpstat_inc(tcps_badsyn); 2046 tp = NULL; 2047 goto dropwithreset; 2048 2049 dropafterack_ratelim: 2050 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2051 tcp_ackdrop_ppslim) == 0) { 2052 /* XXX stat */ 2053 goto drop; 2054 } 2055 /* ...fall into dropafterack... */ 2056 2057 dropafterack: 2058 /* 2059 * Generate an ACK dropping incoming segment if it occupies 2060 * sequence space, where the ACK reflects our state. 2061 */ 2062 if (tiflags & TH_RST) 2063 goto drop; 2064 m_freem(m); 2065 tp->t_flags |= TF_ACKNOW; 2066 (void) tcp_output(tp); 2067 in_pcbunref(inp); 2068 return IPPROTO_DONE; 2069 2070 dropwithreset_ratelim: 2071 /* 2072 * We may want to rate-limit RSTs in certain situations, 2073 * particularly if we are sending an RST in response to 2074 * an attempt to connect to or otherwise communicate with 2075 * a port for which we have no socket. 2076 */ 2077 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2078 tcp_rst_ppslim) == 0) { 2079 /* XXX stat */ 2080 goto drop; 2081 } 2082 /* ...fall into dropwithreset... */ 2083 2084 dropwithreset: 2085 /* 2086 * Generate a RST, dropping incoming segment. 2087 * Make ACK acceptable to originator of segment. 2088 * Don't bother to respond to RST. 2089 */ 2090 if (tiflags & TH_RST) 2091 goto drop; 2092 if (tiflags & TH_ACK) { 2093 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2094 TH_RST, m->m_pkthdr.ph_rtableid, now); 2095 } else { 2096 if (tiflags & TH_SYN) 2097 tlen++; 2098 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2099 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now); 2100 } 2101 m_freem(m); 2102 in_pcbunref(inp); 2103 return IPPROTO_DONE; 2104 2105 drop: 2106 /* 2107 * Drop space held by incoming segment and return. 2108 */ 2109 if (otp) 2110 tcp_trace(TA_DROP, ostate, tp, otp, saveti, 0, tlen); 2111 2112 m_freem(m); 2113 in_pcbunref(inp); 2114 return IPPROTO_DONE; 2115 } 2116 2117 int 2118 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2119 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2120 u_int rtableid, uint32_t now) 2121 { 2122 u_int16_t mss = 0; 2123 int opt, optlen; 2124 #ifdef TCP_SIGNATURE 2125 caddr_t sigp = NULL; 2126 struct tdb *tdb = NULL; 2127 #endif /* TCP_SIGNATURE */ 2128 2129 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2130 opt = cp[0]; 2131 if (opt == TCPOPT_EOL) 2132 break; 2133 if (opt == TCPOPT_NOP) 2134 optlen = 1; 2135 else { 2136 if (cnt < 2) 2137 break; 2138 optlen = cp[1]; 2139 if (optlen < 2 || optlen > cnt) 2140 break; 2141 } 2142 switch (opt) { 2143 2144 default: 2145 continue; 2146 2147 case TCPOPT_MAXSEG: 2148 if (optlen != TCPOLEN_MAXSEG) 2149 continue; 2150 if (!(th->th_flags & TH_SYN)) 2151 continue; 2152 if (TCPS_HAVERCVDSYN(tp->t_state)) 2153 continue; 2154 memcpy(&mss, cp + 2, sizeof(mss)); 2155 mss = ntohs(mss); 2156 oi->maxseg = mss; 2157 break; 2158 2159 case TCPOPT_WINDOW: 2160 if (optlen != TCPOLEN_WINDOW) 2161 continue; 2162 if (!(th->th_flags & TH_SYN)) 2163 continue; 2164 if (TCPS_HAVERCVDSYN(tp->t_state)) 2165 continue; 2166 tp->t_flags |= TF_RCVD_SCALE; 2167 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2168 break; 2169 2170 case TCPOPT_TIMESTAMP: 2171 if (optlen != TCPOLEN_TIMESTAMP) 2172 continue; 2173 oi->ts_present = 1; 2174 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2175 oi->ts_val = ntohl(oi->ts_val); 2176 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2177 oi->ts_ecr = ntohl(oi->ts_ecr); 2178 2179 if (!(th->th_flags & TH_SYN)) 2180 continue; 2181 if (TCPS_HAVERCVDSYN(tp->t_state)) 2182 continue; 2183 /* 2184 * A timestamp received in a SYN makes 2185 * it ok to send timestamp requests and replies. 2186 */ 2187 tp->t_flags |= TF_RCVD_TSTMP; 2188 tp->ts_recent = oi->ts_val; 2189 tp->ts_recent_age = now; 2190 break; 2191 2192 case TCPOPT_SACK_PERMITTED: 2193 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2194 continue; 2195 if (!(th->th_flags & TH_SYN)) 2196 continue; 2197 if (TCPS_HAVERCVDSYN(tp->t_state)) 2198 continue; 2199 /* MUST only be set on SYN */ 2200 tp->t_flags |= TF_SACK_PERMIT; 2201 break; 2202 case TCPOPT_SACK: 2203 tcp_sack_option(tp, th, cp, optlen); 2204 break; 2205 #ifdef TCP_SIGNATURE 2206 case TCPOPT_SIGNATURE: 2207 if (optlen != TCPOLEN_SIGNATURE) 2208 continue; 2209 2210 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2211 goto bad; 2212 2213 sigp = cp + 2; 2214 break; 2215 #endif /* TCP_SIGNATURE */ 2216 } 2217 } 2218 2219 #ifdef TCP_SIGNATURE 2220 if (tp->t_flags & TF_SIGNATURE) { 2221 union sockaddr_union src, dst; 2222 2223 memset(&src, 0, sizeof(union sockaddr_union)); 2224 memset(&dst, 0, sizeof(union sockaddr_union)); 2225 2226 switch (tp->pf) { 2227 case 0: 2228 case AF_INET: 2229 src.sa.sa_len = sizeof(struct sockaddr_in); 2230 src.sa.sa_family = AF_INET; 2231 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2232 dst.sa.sa_len = sizeof(struct sockaddr_in); 2233 dst.sa.sa_family = AF_INET; 2234 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2235 break; 2236 #ifdef INET6 2237 case AF_INET6: 2238 src.sa.sa_len = sizeof(struct sockaddr_in6); 2239 src.sa.sa_family = AF_INET6; 2240 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2241 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2242 dst.sa.sa_family = AF_INET6; 2243 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2244 break; 2245 #endif /* INET6 */ 2246 } 2247 2248 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2249 0, &src, &dst, IPPROTO_TCP); 2250 2251 /* 2252 * We don't have an SA for this peer, so we turn off 2253 * TF_SIGNATURE on the listen socket 2254 */ 2255 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2256 tp->t_flags &= ~TF_SIGNATURE; 2257 2258 } 2259 2260 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2261 tcpstat_inc(tcps_rcvbadsig); 2262 goto bad; 2263 } 2264 2265 if (sigp) { 2266 char sig[16]; 2267 2268 if (tdb == NULL) { 2269 tcpstat_inc(tcps_rcvbadsig); 2270 goto bad; 2271 } 2272 2273 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2274 goto bad; 2275 2276 if (timingsafe_bcmp(sig, sigp, 16)) { 2277 tcpstat_inc(tcps_rcvbadsig); 2278 goto bad; 2279 } 2280 2281 tcpstat_inc(tcps_rcvgoodsig); 2282 } 2283 2284 tdb_unref(tdb); 2285 #endif /* TCP_SIGNATURE */ 2286 2287 return (0); 2288 2289 #ifdef TCP_SIGNATURE 2290 bad: 2291 tdb_unref(tdb); 2292 #endif /* TCP_SIGNATURE */ 2293 return (-1); 2294 } 2295 2296 u_long 2297 tcp_seq_subtract(u_long a, u_long b) 2298 { 2299 return ((long)(a - b)); 2300 } 2301 2302 /* 2303 * This function is called upon receipt of new valid data (while not in header 2304 * prediction mode), and it updates the ordered list of sacks. 2305 */ 2306 void 2307 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2308 tcp_seq rcv_lastend) 2309 { 2310 /* 2311 * First reported block MUST be the most recent one. Subsequent 2312 * blocks SHOULD be in the order in which they arrived at the 2313 * receiver. These two conditions make the implementation fully 2314 * compliant with RFC 2018. 2315 */ 2316 int i, j = 0, count = 0, lastpos = -1; 2317 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2318 2319 /* First clean up current list of sacks */ 2320 for (i = 0; i < tp->rcv_numsacks; i++) { 2321 sack = tp->sackblks[i]; 2322 if (sack.start == 0 && sack.end == 0) { 2323 count++; /* count = number of blocks to be discarded */ 2324 continue; 2325 } 2326 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2327 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2328 count++; 2329 } else { 2330 temp[j].start = tp->sackblks[i].start; 2331 temp[j++].end = tp->sackblks[i].end; 2332 } 2333 } 2334 tp->rcv_numsacks -= count; 2335 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2336 tcp_clean_sackreport(tp); 2337 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2338 /* ==> need first sack block */ 2339 tp->sackblks[0].start = rcv_laststart; 2340 tp->sackblks[0].end = rcv_lastend; 2341 tp->rcv_numsacks = 1; 2342 } 2343 return; 2344 } 2345 /* Otherwise, sack blocks are already present. */ 2346 for (i = 0; i < tp->rcv_numsacks; i++) 2347 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2348 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2349 return; /* sack list remains unchanged */ 2350 /* 2351 * From here, segment just received should be (part of) the 1st sack. 2352 * Go through list, possibly coalescing sack block entries. 2353 */ 2354 firstsack.start = rcv_laststart; 2355 firstsack.end = rcv_lastend; 2356 for (i = 0; i < tp->rcv_numsacks; i++) { 2357 sack = tp->sackblks[i]; 2358 if (SEQ_LT(sack.end, firstsack.start) || 2359 SEQ_GT(sack.start, firstsack.end)) 2360 continue; /* no overlap */ 2361 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2362 /* 2363 * identical block; delete it here since we will 2364 * move it to the front of the list. 2365 */ 2366 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2367 lastpos = i; /* last posn with a zero entry */ 2368 continue; 2369 } 2370 if (SEQ_LEQ(sack.start, firstsack.start)) 2371 firstsack.start = sack.start; /* merge blocks */ 2372 if (SEQ_GEQ(sack.end, firstsack.end)) 2373 firstsack.end = sack.end; /* merge blocks */ 2374 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2375 lastpos = i; /* last posn with a zero entry */ 2376 } 2377 if (lastpos != -1) { /* at least one merge */ 2378 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2379 sack = tp->sackblks[i]; 2380 if (sack.start == 0 && sack.end == 0) 2381 continue; 2382 temp[j++] = sack; 2383 } 2384 tp->rcv_numsacks = j; /* including first blk (added later) */ 2385 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2386 tp->sackblks[i] = temp[i]; 2387 } else { /* no merges -- shift sacks by 1 */ 2388 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2389 tp->rcv_numsacks++; 2390 for (i = tp->rcv_numsacks-1; i > 0; i--) 2391 tp->sackblks[i] = tp->sackblks[i-1]; 2392 } 2393 tp->sackblks[0] = firstsack; 2394 return; 2395 } 2396 2397 /* 2398 * Process the TCP SACK option. tp->snd_holes is an ordered list 2399 * of holes (oldest to newest, in terms of the sequence space). 2400 */ 2401 void 2402 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2403 { 2404 int tmp_olen; 2405 u_char *tmp_cp; 2406 struct sackhole *cur, *p, *temp; 2407 2408 if (!tp->sack_enable) 2409 return; 2410 /* SACK without ACK doesn't make sense. */ 2411 if ((th->th_flags & TH_ACK) == 0) 2412 return; 2413 /* Make sure the ACK on this segment is in [snd_una, snd_max]. */ 2414 if (SEQ_LT(th->th_ack, tp->snd_una) || 2415 SEQ_GT(th->th_ack, tp->snd_max)) 2416 return; 2417 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2418 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2419 return; 2420 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2421 tmp_cp = cp + 2; 2422 tmp_olen = optlen - 2; 2423 tcpstat_inc(tcps_sack_rcv_opts); 2424 if (tp->snd_numholes < 0) 2425 tp->snd_numholes = 0; 2426 if (tp->t_maxseg == 0) 2427 panic("tcp_sack_option"); /* Should never happen */ 2428 while (tmp_olen > 0) { 2429 struct sackblk sack; 2430 2431 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2432 sack.start = ntohl(sack.start); 2433 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2434 sack.end = ntohl(sack.end); 2435 tmp_olen -= TCPOLEN_SACK; 2436 tmp_cp += TCPOLEN_SACK; 2437 if (SEQ_LEQ(sack.end, sack.start)) 2438 continue; /* bad SACK fields */ 2439 if (SEQ_LEQ(sack.end, tp->snd_una)) 2440 continue; /* old block */ 2441 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2442 if (SEQ_LT(sack.start, th->th_ack)) 2443 continue; 2444 } 2445 if (SEQ_GT(sack.end, tp->snd_max)) 2446 continue; 2447 if (tp->snd_holes == NULL) { /* first hole */ 2448 tp->snd_holes = (struct sackhole *) 2449 pool_get(&sackhl_pool, PR_NOWAIT); 2450 if (tp->snd_holes == NULL) { 2451 /* ENOBUFS, so ignore SACKed block for now */ 2452 goto dropped; 2453 } 2454 cur = tp->snd_holes; 2455 cur->start = th->th_ack; 2456 cur->end = sack.start; 2457 cur->rxmit = cur->start; 2458 cur->next = NULL; 2459 tp->snd_numholes = 1; 2460 tp->rcv_lastsack = sack.end; 2461 /* 2462 * dups is at least one. If more data has been 2463 * SACKed, it can be greater than one. 2464 */ 2465 cur->dups = min(tcprexmtthresh, 2466 ((sack.end - cur->end)/tp->t_maxseg)); 2467 if (cur->dups < 1) 2468 cur->dups = 1; 2469 continue; /* with next sack block */ 2470 } 2471 /* Go thru list of holes: p = previous, cur = current */ 2472 p = cur = tp->snd_holes; 2473 while (cur) { 2474 if (SEQ_LEQ(sack.end, cur->start)) 2475 /* SACKs data before the current hole */ 2476 break; /* no use going through more holes */ 2477 if (SEQ_GEQ(sack.start, cur->end)) { 2478 /* SACKs data beyond the current hole */ 2479 cur->dups++; 2480 if (((sack.end - cur->end)/tp->t_maxseg) >= 2481 tcprexmtthresh) 2482 cur->dups = tcprexmtthresh; 2483 p = cur; 2484 cur = cur->next; 2485 continue; 2486 } 2487 if (SEQ_LEQ(sack.start, cur->start)) { 2488 /* Data acks at least the beginning of hole */ 2489 if (SEQ_GEQ(sack.end, cur->end)) { 2490 /* Acks entire hole, so delete hole */ 2491 if (p != cur) { 2492 p->next = cur->next; 2493 pool_put(&sackhl_pool, cur); 2494 cur = p->next; 2495 } else { 2496 cur = cur->next; 2497 pool_put(&sackhl_pool, p); 2498 p = cur; 2499 tp->snd_holes = p; 2500 } 2501 tp->snd_numholes--; 2502 continue; 2503 } 2504 /* otherwise, move start of hole forward */ 2505 cur->start = sack.end; 2506 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2507 p = cur; 2508 cur = cur->next; 2509 continue; 2510 } 2511 /* move end of hole backward */ 2512 if (SEQ_GEQ(sack.end, cur->end)) { 2513 cur->end = sack.start; 2514 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2515 cur->dups++; 2516 if (((sack.end - cur->end)/tp->t_maxseg) >= 2517 tcprexmtthresh) 2518 cur->dups = tcprexmtthresh; 2519 p = cur; 2520 cur = cur->next; 2521 continue; 2522 } 2523 if (SEQ_LT(cur->start, sack.start) && 2524 SEQ_GT(cur->end, sack.end)) { 2525 /* 2526 * ACKs some data in middle of a hole; need to 2527 * split current hole 2528 */ 2529 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2530 goto dropped; 2531 temp = (struct sackhole *) 2532 pool_get(&sackhl_pool, PR_NOWAIT); 2533 if (temp == NULL) 2534 goto dropped; /* ENOBUFS */ 2535 temp->next = cur->next; 2536 temp->start = sack.end; 2537 temp->end = cur->end; 2538 temp->dups = cur->dups; 2539 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2540 cur->end = sack.start; 2541 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2542 cur->dups++; 2543 if (((sack.end - cur->end)/tp->t_maxseg) >= 2544 tcprexmtthresh) 2545 cur->dups = tcprexmtthresh; 2546 cur->next = temp; 2547 p = temp; 2548 cur = p->next; 2549 tp->snd_numholes++; 2550 } 2551 } 2552 /* At this point, p points to the last hole on the list */ 2553 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2554 /* 2555 * Need to append new hole at end. 2556 * Last hole is p (and it's not NULL). 2557 */ 2558 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2559 goto dropped; 2560 temp = (struct sackhole *) 2561 pool_get(&sackhl_pool, PR_NOWAIT); 2562 if (temp == NULL) 2563 goto dropped; /* ENOBUFS */ 2564 temp->start = tp->rcv_lastsack; 2565 temp->end = sack.start; 2566 temp->dups = min(tcprexmtthresh, 2567 ((sack.end - sack.start)/tp->t_maxseg)); 2568 if (temp->dups < 1) 2569 temp->dups = 1; 2570 temp->rxmit = temp->start; 2571 temp->next = 0; 2572 p->next = temp; 2573 tp->rcv_lastsack = sack.end; 2574 tp->snd_numholes++; 2575 } 2576 } 2577 return; 2578 dropped: 2579 tcpstat_inc(tcps_sack_drop_opts); 2580 } 2581 2582 /* 2583 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if 2584 * it is completely acked; otherwise, tcp_sack_option(), called from 2585 * tcp_dooptions(), will fix up the hole. 2586 */ 2587 void 2588 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2589 { 2590 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2591 /* max because this could be an older ack just arrived */ 2592 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2593 th->th_ack : tp->snd_una; 2594 struct sackhole *cur = tp->snd_holes; 2595 struct sackhole *prev; 2596 while (cur) 2597 if (SEQ_LEQ(cur->end, lastack)) { 2598 prev = cur; 2599 cur = cur->next; 2600 pool_put(&sackhl_pool, prev); 2601 tp->snd_numholes--; 2602 } else if (SEQ_LT(cur->start, lastack)) { 2603 cur->start = lastack; 2604 if (SEQ_LT(cur->rxmit, cur->start)) 2605 cur->rxmit = cur->start; 2606 break; 2607 } else 2608 break; 2609 tp->snd_holes = cur; 2610 } 2611 } 2612 2613 /* 2614 * Delete all receiver-side SACK information. 2615 */ 2616 void 2617 tcp_clean_sackreport(struct tcpcb *tp) 2618 { 2619 int i; 2620 2621 tp->rcv_numsacks = 0; 2622 for (i = 0; i < MAX_SACK_BLKS; i++) 2623 tp->sackblks[i].start = tp->sackblks[i].end=0; 2624 2625 } 2626 2627 /* 2628 * Partial ack handling within a sack recovery episode. When a partial ack 2629 * arrives, turn off retransmission timer, deflate the window, do not clear 2630 * tp->t_dupacks. 2631 */ 2632 void 2633 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2634 { 2635 /* Turn off retx. timer (will start again next segment) */ 2636 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2637 tp->t_rtttime = 0; 2638 /* 2639 * Partial window deflation. This statement relies on the 2640 * fact that tp->snd_una has not been updated yet. 2641 */ 2642 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2643 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2644 tp->snd_cwnd += tp->t_maxseg; 2645 } else 2646 tp->snd_cwnd = tp->t_maxseg; 2647 tp->snd_cwnd += tp->t_maxseg; 2648 tp->t_flags |= TF_NEEDOUTPUT; 2649 } 2650 2651 /* 2652 * Pull out of band byte out of a segment so 2653 * it doesn't appear in the user's data queue. 2654 * It is still reflected in the segment length for 2655 * sequencing purposes. 2656 */ 2657 void 2658 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2659 { 2660 int cnt = off + urgent - 1; 2661 2662 while (cnt >= 0) { 2663 if (m->m_len > cnt) { 2664 char *cp = mtod(m, caddr_t) + cnt; 2665 struct tcpcb *tp = sototcpcb(so); 2666 2667 tp->t_iobc = *cp; 2668 tp->t_oobflags |= TCPOOB_HAVEDATA; 2669 memmove(cp, cp + 1, m->m_len - cnt - 1); 2670 m->m_len--; 2671 return; 2672 } 2673 cnt -= m->m_len; 2674 m = m->m_next; 2675 if (m == NULL) 2676 break; 2677 } 2678 panic("tcp_pulloutofband"); 2679 } 2680 2681 /* 2682 * Collect new round-trip time estimate 2683 * and update averages and current timeout. 2684 */ 2685 void 2686 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2687 { 2688 short delta; 2689 short rttmin; 2690 2691 if (rtt < 0) 2692 rtt = 0; 2693 else if (rtt > TCP_RTT_MAX) 2694 rtt = TCP_RTT_MAX; 2695 2696 tcpstat_inc(tcps_rttupdated); 2697 if (tp->t_srtt != 0) { 2698 /* 2699 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2700 * after the binary point (scaled by 4), whereas 2701 * srtt is stored as fixed point with 5 bits after the 2702 * binary point (i.e., scaled by 32). The following magic 2703 * is equivalent to the smoothing algorithm in rfc793 with 2704 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2705 * point). 2706 */ 2707 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2708 (tp->t_srtt >> TCP_RTT_SHIFT); 2709 if ((tp->t_srtt += delta) <= 0) 2710 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2711 /* 2712 * We accumulate a smoothed rtt variance (actually, a 2713 * smoothed mean difference), then set the retransmit 2714 * timer to smoothed rtt + 4 times the smoothed variance. 2715 * rttvar is stored as fixed point with 4 bits after the 2716 * binary point (scaled by 16). The following is 2717 * equivalent to rfc793 smoothing with an alpha of .75 2718 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2719 * rfc793's wired-in beta. 2720 */ 2721 if (delta < 0) 2722 delta = -delta; 2723 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2724 if ((tp->t_rttvar += delta) <= 0) 2725 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2726 } else { 2727 /* 2728 * No rtt measurement yet - use the unsmoothed rtt. 2729 * Set the variance to half the rtt (so our first 2730 * retransmit happens at 3*rtt). 2731 */ 2732 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2733 tp->t_rttvar = (rtt + 1) << 2734 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2735 } 2736 tp->t_rtttime = 0; 2737 tp->t_rxtshift = 0; 2738 2739 /* 2740 * the retransmit should happen at rtt + 4 * rttvar. 2741 * Because of the way we do the smoothing, srtt and rttvar 2742 * will each average +1/2 tick of bias. When we compute 2743 * the retransmit timer, we want 1/2 tick of rounding and 2744 * 1 extra tick because of +-1/2 tick uncertainty in the 2745 * firing of the timer. The bias will give us exactly the 2746 * 1.5 tick we need. But, because the bias is 2747 * statistical, we have to test that we don't drop below 2748 * the minimum feasible timer (which is 2 ticks). 2749 */ 2750 rttmin = min(max(tp->t_rttmin, rtt + 2 * (TCP_TIME(1) / hz)), 2751 TCPTV_REXMTMAX); 2752 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2753 2754 /* 2755 * We received an ack for a packet that wasn't retransmitted; 2756 * it is probably safe to discard any error indications we've 2757 * received recently. This isn't quite right, but close enough 2758 * for now (a route might have failed after we sent a segment, 2759 * and the return path might not be symmetrical). 2760 */ 2761 tp->t_softerror = 0; 2762 } 2763 2764 /* 2765 * Determine a reasonable value for maxseg size. 2766 * If the route is known, check route for mtu. 2767 * If none, use an mss that can be handled on the outgoing 2768 * interface without forcing IP to fragment; if bigger than 2769 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2770 * to utilize large mbufs. If no route is found, route has no mtu, 2771 * or the destination isn't local, use a default, hopefully conservative 2772 * size (usually 512 or the default IP max size, but no more than the mtu 2773 * of the interface), as we can't discover anything about intervening 2774 * gateways or networks. We also initialize the congestion/slow start 2775 * window to be a single segment if the destination isn't local. 2776 * While looking at the routing entry, we also initialize other path-dependent 2777 * parameters from pre-set or cached values in the routing entry. 2778 * 2779 * Also take into account the space needed for options that we 2780 * send regularly. Make maxseg shorter by that amount to assure 2781 * that we can send maxseg amount of data even when the options 2782 * are present. Store the upper limit of the length of options plus 2783 * data in maxopd. 2784 * 2785 * NOTE: offer == -1 indicates that the maxseg size changed due to 2786 * Path MTU discovery. 2787 */ 2788 int 2789 tcp_mss(struct tcpcb *tp, int offer) 2790 { 2791 struct rtentry *rt; 2792 struct ifnet *ifp = NULL; 2793 int mss, mssopt; 2794 int iphlen; 2795 struct inpcb *inp; 2796 2797 inp = tp->t_inpcb; 2798 2799 mssopt = mss = tcp_mssdflt; 2800 2801 rt = in_pcbrtentry(inp); 2802 2803 if (rt == NULL) 2804 goto out; 2805 2806 ifp = if_get(rt->rt_ifidx); 2807 if (ifp == NULL) 2808 goto out; 2809 2810 switch (tp->pf) { 2811 #ifdef INET6 2812 case AF_INET6: 2813 iphlen = sizeof(struct ip6_hdr); 2814 break; 2815 #endif 2816 case AF_INET: 2817 iphlen = sizeof(struct ip); 2818 break; 2819 default: 2820 /* the family does not support path MTU discovery */ 2821 goto out; 2822 } 2823 2824 /* 2825 * if there's an mtu associated with the route and we support 2826 * path MTU discovery for the underlying protocol family, use it. 2827 */ 2828 if (rt->rt_mtu) { 2829 /* 2830 * One may wish to lower MSS to take into account options, 2831 * especially security-related options. 2832 */ 2833 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2834 /* 2835 * RFC2460 section 5, last paragraph: if path MTU is 2836 * smaller than 1280, use 1280 as packet size and 2837 * attach fragment header. 2838 */ 2839 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2840 sizeof(struct tcphdr); 2841 } else { 2842 mss = rt->rt_mtu - iphlen - 2843 sizeof(struct tcphdr); 2844 } 2845 } else if (ifp->if_flags & IFF_LOOPBACK) { 2846 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2847 } else if (tp->pf == AF_INET) { 2848 if (ip_mtudisc) 2849 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2850 } 2851 #ifdef INET6 2852 else if (tp->pf == AF_INET6) { 2853 /* 2854 * for IPv6, path MTU discovery is always turned on, 2855 * or the node must use packet size <= 1280. 2856 */ 2857 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2858 } 2859 #endif /* INET6 */ 2860 2861 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2862 if (offer != -1) { 2863 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2864 mssopt = max(tcp_mssdflt, mssopt); 2865 } 2866 out: 2867 if_put(ifp); 2868 /* 2869 * The current mss, t_maxseg, is initialized to the default value. 2870 * If we compute a smaller value, reduce the current mss. 2871 * If we compute a larger value, return it for use in sending 2872 * a max seg size option, but don't store it for use 2873 * unless we received an offer at least that large from peer. 2874 * 2875 * However, do not accept offers lower than the minimum of 2876 * the interface MTU and 216. 2877 */ 2878 if (offer > 0) 2879 tp->t_peermss = offer; 2880 if (tp->t_peermss) 2881 mss = min(mss, max(tp->t_peermss, 216)); 2882 2883 /* sanity - at least max opt. space */ 2884 mss = max(mss, 64); 2885 2886 /* 2887 * maxopd stores the maximum length of data AND options 2888 * in a segment; maxseg is the amount of data in a normal 2889 * segment. We need to store this value (maxopd) apart 2890 * from maxseg, because now every segment carries options 2891 * and thus we normally have somewhat less data in segments. 2892 */ 2893 tp->t_maxopd = mss; 2894 2895 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2896 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2897 mss -= TCPOLEN_TSTAMP_APPA; 2898 #ifdef TCP_SIGNATURE 2899 if (tp->t_flags & TF_SIGNATURE) 2900 mss -= TCPOLEN_SIGLEN; 2901 #endif 2902 2903 if (offer == -1) { 2904 /* mss changed due to Path MTU discovery */ 2905 tp->t_flags &= ~TF_PMTUD_PEND; 2906 tp->t_pmtud_mtu_sent = 0; 2907 tp->t_pmtud_mss_acked = 0; 2908 if (mss < tp->t_maxseg) { 2909 /* 2910 * Follow suggestion in RFC 2414 to reduce the 2911 * congestion window by the ratio of the old 2912 * segment size to the new segment size. 2913 */ 2914 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2915 mss, mss); 2916 } 2917 } else if (tcp_do_rfc3390 == 2) { 2918 /* increase initial window */ 2919 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2920 } else if (tcp_do_rfc3390) { 2921 /* increase initial window */ 2922 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2923 } else 2924 tp->snd_cwnd = mss; 2925 2926 tp->t_maxseg = mss; 2927 2928 return (offer != -1 ? mssopt : mss); 2929 } 2930 2931 u_int 2932 tcp_hdrsz(struct tcpcb *tp) 2933 { 2934 u_int hlen; 2935 2936 switch (tp->pf) { 2937 #ifdef INET6 2938 case AF_INET6: 2939 hlen = sizeof(struct ip6_hdr); 2940 break; 2941 #endif 2942 case AF_INET: 2943 hlen = sizeof(struct ip); 2944 break; 2945 default: 2946 hlen = 0; 2947 break; 2948 } 2949 hlen += sizeof(struct tcphdr); 2950 2951 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2952 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2953 hlen += TCPOLEN_TSTAMP_APPA; 2954 #ifdef TCP_SIGNATURE 2955 if (tp->t_flags & TF_SIGNATURE) 2956 hlen += TCPOLEN_SIGLEN; 2957 #endif 2958 return (hlen); 2959 } 2960 2961 /* 2962 * Set connection variables based on the effective MSS. 2963 * We are passed the TCPCB for the actual connection. If we 2964 * are the server, we are called by the compressed state engine 2965 * when the 3-way handshake is complete. If we are the client, 2966 * we are called when we receive the SYN,ACK from the server. 2967 * 2968 * NOTE: The t_maxseg value must be initialized in the TCPCB 2969 * before this routine is called! 2970 */ 2971 void 2972 tcp_mss_update(struct tcpcb *tp) 2973 { 2974 int mss; 2975 u_long bufsize; 2976 struct rtentry *rt; 2977 struct socket *so; 2978 2979 so = tp->t_inpcb->inp_socket; 2980 mss = tp->t_maxseg; 2981 2982 rt = in_pcbrtentry(tp->t_inpcb); 2983 2984 if (rt == NULL) 2985 return; 2986 2987 bufsize = so->so_snd.sb_hiwat; 2988 if (bufsize < mss) { 2989 mss = bufsize; 2990 /* Update t_maxseg and t_maxopd */ 2991 tcp_mss(tp, mss); 2992 } else { 2993 bufsize = roundup(bufsize, mss); 2994 if (bufsize > sb_max) 2995 bufsize = sb_max; 2996 (void)sbreserve(so, &so->so_snd, bufsize); 2997 } 2998 2999 bufsize = so->so_rcv.sb_hiwat; 3000 if (bufsize > mss) { 3001 bufsize = roundup(bufsize, mss); 3002 if (bufsize > sb_max) 3003 bufsize = sb_max; 3004 (void)sbreserve(so, &so->so_rcv, bufsize); 3005 } 3006 3007 } 3008 3009 /* 3010 * When a partial ack arrives, force the retransmission of the 3011 * next unacknowledged segment. Do not clear tp->t_dupacks. 3012 * By setting snd_nxt to ti_ack, this forces retransmission timer 3013 * to be started again. 3014 */ 3015 void 3016 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 3017 { 3018 /* 3019 * snd_una has not been updated and the socket send buffer 3020 * not yet drained of the acked data, so we have to leave 3021 * snd_una as it was to get the correct data offset in 3022 * tcp_output(). 3023 */ 3024 tcp_seq onxt = tp->snd_nxt; 3025 u_long ocwnd = tp->snd_cwnd; 3026 3027 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3028 tp->t_rtttime = 0; 3029 tp->snd_nxt = th->th_ack; 3030 /* 3031 * Set snd_cwnd to one segment beyond acknowledged offset 3032 * (tp->snd_una not yet updated when this function is called) 3033 */ 3034 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3035 (void)tcp_output(tp); 3036 tp->snd_cwnd = ocwnd; 3037 if (SEQ_GT(onxt, tp->snd_nxt)) 3038 tp->snd_nxt = onxt; 3039 /* 3040 * Partial window deflation. Relies on fact that tp->snd_una 3041 * not updated yet. 3042 */ 3043 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3044 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3045 else 3046 tp->snd_cwnd = 0; 3047 tp->snd_cwnd += tp->t_maxseg; 3048 } 3049 3050 int 3051 tcp_mss_adv(struct mbuf *m, int af) 3052 { 3053 int mss = 0; 3054 int iphlen; 3055 struct ifnet *ifp = NULL; 3056 3057 if (m && (m->m_flags & M_PKTHDR)) 3058 ifp = if_get(m->m_pkthdr.ph_ifidx); 3059 3060 switch (af) { 3061 case AF_INET: 3062 if (ifp != NULL) 3063 mss = ifp->if_mtu; 3064 iphlen = sizeof(struct ip); 3065 break; 3066 #ifdef INET6 3067 case AF_INET6: 3068 if (ifp != NULL) 3069 mss = ifp->if_mtu; 3070 iphlen = sizeof(struct ip6_hdr); 3071 break; 3072 #endif 3073 default: 3074 unhandled_af(af); 3075 } 3076 if_put(ifp); 3077 mss = mss - iphlen - sizeof(struct tcphdr); 3078 return (max(mss, tcp_mssdflt)); 3079 } 3080 3081 /* 3082 * TCP compressed state engine. Currently used to hold compressed 3083 * state for SYN_RECEIVED. 3084 */ 3085 3086 /* syn hash parameters */ 3087 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; 3088 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3089 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3090 int tcp_syn_use_limit = 100000; 3091 3092 struct syn_cache_set tcp_syn_cache[2]; 3093 int tcp_syn_cache_active; 3094 3095 #define SYN_HASH(sa, sp, dp, rand) \ 3096 (((sa)->s_addr ^ (rand)[0]) * \ 3097 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3098 #ifndef INET6 3099 #define SYN_HASHALL(hash, src, dst, rand) \ 3100 do { \ 3101 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3102 satosin(src)->sin_port, \ 3103 satosin(dst)->sin_port, (rand)); \ 3104 } while (/*CONSTCOND*/ 0) 3105 #else 3106 #define SYN_HASH6(sa, sp, dp, rand) \ 3107 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3108 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3109 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3110 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3111 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3112 3113 #define SYN_HASHALL(hash, src, dst, rand) \ 3114 do { \ 3115 switch ((src)->sa_family) { \ 3116 case AF_INET: \ 3117 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3118 satosin(src)->sin_port, \ 3119 satosin(dst)->sin_port, (rand)); \ 3120 break; \ 3121 case AF_INET6: \ 3122 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3123 satosin6(src)->sin6_port, \ 3124 satosin6(dst)->sin6_port, (rand)); \ 3125 break; \ 3126 default: \ 3127 hash = 0; \ 3128 } \ 3129 } while (/*CONSTCOND*/0) 3130 #endif /* INET6 */ 3131 3132 void 3133 syn_cache_rm(struct syn_cache *sc) 3134 { 3135 sc->sc_flags |= SCF_DEAD; 3136 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3137 sc->sc_tp = NULL; 3138 LIST_REMOVE(sc, sc_tpq); 3139 sc->sc_buckethead->sch_length--; 3140 timeout_del(&sc->sc_timer); 3141 sc->sc_set->scs_count--; 3142 } 3143 3144 void 3145 syn_cache_put(struct syn_cache *sc) 3146 { 3147 m_free(sc->sc_ipopts); 3148 if (sc->sc_route4.ro_rt != NULL) { 3149 rtfree(sc->sc_route4.ro_rt); 3150 sc->sc_route4.ro_rt = NULL; 3151 } 3152 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3153 timeout_add(&sc->sc_timer, 0); 3154 } 3155 3156 struct pool syn_cache_pool; 3157 3158 /* 3159 * We don't estimate RTT with SYNs, so each packet starts with the default 3160 * RTT and each timer step has a fixed timeout value. 3161 */ 3162 #define SYN_CACHE_TIMER_ARM(sc) \ 3163 do { \ 3164 TCPT_RANGESET((sc)->sc_rxtcur, \ 3165 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3166 TCPTV_REXMTMAX); \ 3167 if (!timeout_initialized(&(sc)->sc_timer)) \ 3168 timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3169 timeout_add_msec(&(sc)->sc_timer, (sc)->sc_rxtcur); \ 3170 } while (/*CONSTCOND*/0) 3171 3172 void 3173 syn_cache_init(void) 3174 { 3175 int i; 3176 3177 /* Initialize the hash buckets. */ 3178 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3179 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3180 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3181 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3182 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3183 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3184 for (i = 0; i < tcp_syn_hash_size; i++) { 3185 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3186 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3187 } 3188 3189 /* Initialize the syn cache pool. */ 3190 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3191 0, "syncache", NULL); 3192 } 3193 3194 void 3195 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3196 { 3197 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active]; 3198 struct syn_cache_head *scp; 3199 struct syn_cache *sc2; 3200 int i; 3201 3202 NET_ASSERT_LOCKED(); 3203 3204 /* 3205 * If there are no entries in the hash table, reinitialize 3206 * the hash secrets. To avoid useless cache swaps and 3207 * reinitialization, use it until the limit is reached. 3208 * An empty cache is also the opportunity to resize the hash. 3209 */ 3210 if (set->scs_count == 0 && set->scs_use <= 0) { 3211 set->scs_use = tcp_syn_use_limit; 3212 if (set->scs_size != tcp_syn_hash_size) { 3213 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3214 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3215 if (scp == NULL) { 3216 /* Try again next time. */ 3217 set->scs_use = 0; 3218 } else { 3219 free(set->scs_buckethead, M_SYNCACHE, 3220 set->scs_size * 3221 sizeof(struct syn_cache_head)); 3222 set->scs_buckethead = scp; 3223 set->scs_size = tcp_syn_hash_size; 3224 for (i = 0; i < tcp_syn_hash_size; i++) 3225 TAILQ_INIT(&scp[i].sch_bucket); 3226 } 3227 } 3228 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3229 tcpstat_inc(tcps_sc_seedrandom); 3230 } 3231 3232 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3233 set->scs_random); 3234 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3235 sc->sc_buckethead = scp; 3236 3237 /* 3238 * Make sure that we don't overflow the per-bucket 3239 * limit or the total cache size limit. 3240 */ 3241 if (scp->sch_length >= tcp_syn_bucket_limit) { 3242 tcpstat_inc(tcps_sc_bucketoverflow); 3243 /* 3244 * Someone might attack our bucket hash function. Reseed 3245 * with random as soon as the passive syn cache gets empty. 3246 */ 3247 set->scs_use = 0; 3248 /* 3249 * The bucket is full. Toss the oldest element in the 3250 * bucket. This will be the first entry in the bucket. 3251 */ 3252 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3253 #ifdef DIAGNOSTIC 3254 /* 3255 * This should never happen; we should always find an 3256 * entry in our bucket. 3257 */ 3258 if (sc2 == NULL) 3259 panic("%s: bucketoverflow: impossible", __func__); 3260 #endif 3261 syn_cache_rm(sc2); 3262 syn_cache_put(sc2); 3263 } else if (set->scs_count >= tcp_syn_cache_limit) { 3264 struct syn_cache_head *scp2, *sce; 3265 3266 tcpstat_inc(tcps_sc_overflowed); 3267 /* 3268 * The cache is full. Toss the oldest entry in the 3269 * first non-empty bucket we can find. 3270 * 3271 * XXX We would really like to toss the oldest 3272 * entry in the cache, but we hope that this 3273 * condition doesn't happen very often. 3274 */ 3275 scp2 = scp; 3276 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3277 sce = &set->scs_buckethead[set->scs_size]; 3278 for (++scp2; scp2 != scp; scp2++) { 3279 if (scp2 >= sce) 3280 scp2 = &set->scs_buckethead[0]; 3281 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3282 break; 3283 } 3284 #ifdef DIAGNOSTIC 3285 /* 3286 * This should never happen; we should always find a 3287 * non-empty bucket. 3288 */ 3289 if (scp2 == scp) 3290 panic("%s: cacheoverflow: impossible", 3291 __func__); 3292 #endif 3293 } 3294 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3295 syn_cache_rm(sc2); 3296 syn_cache_put(sc2); 3297 } 3298 3299 /* 3300 * Initialize the entry's timer. 3301 */ 3302 sc->sc_rxttot = 0; 3303 sc->sc_rxtshift = 0; 3304 SYN_CACHE_TIMER_ARM(sc); 3305 3306 /* Link it from tcpcb entry */ 3307 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3308 3309 /* Put it into the bucket. */ 3310 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3311 scp->sch_length++; 3312 sc->sc_set = set; 3313 set->scs_count++; 3314 set->scs_use--; 3315 3316 tcpstat_inc(tcps_sc_added); 3317 3318 /* 3319 * If the active cache has exceeded its use limit and 3320 * the passive syn cache is empty, exchange their roles. 3321 */ 3322 if (set->scs_use <= 0 && 3323 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3324 tcp_syn_cache_active = !tcp_syn_cache_active; 3325 } 3326 3327 /* 3328 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3329 * If we have retransmitted an entry the maximum number of times, expire 3330 * that entry. 3331 */ 3332 void 3333 syn_cache_timer(void *arg) 3334 { 3335 struct syn_cache *sc = arg; 3336 uint32_t now; 3337 3338 NET_LOCK(); 3339 if (sc->sc_flags & SCF_DEAD) 3340 goto out; 3341 3342 now = tcp_now(); 3343 3344 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3345 /* Drop it -- too many retransmissions. */ 3346 goto dropit; 3347 } 3348 3349 /* 3350 * Compute the total amount of time this entry has 3351 * been on a queue. If this entry has been on longer 3352 * than the keep alive timer would allow, expire it. 3353 */ 3354 sc->sc_rxttot += sc->sc_rxtcur; 3355 if (sc->sc_rxttot >= tcptv_keep_init) 3356 goto dropit; 3357 3358 tcpstat_inc(tcps_sc_retransmitted); 3359 (void) syn_cache_respond(sc, NULL, now); 3360 3361 /* Advance the timer back-off. */ 3362 sc->sc_rxtshift++; 3363 SYN_CACHE_TIMER_ARM(sc); 3364 3365 out: 3366 NET_UNLOCK(); 3367 return; 3368 3369 dropit: 3370 tcpstat_inc(tcps_sc_timed_out); 3371 syn_cache_rm(sc); 3372 syn_cache_put(sc); 3373 NET_UNLOCK(); 3374 } 3375 3376 void 3377 syn_cache_reaper(void *arg) 3378 { 3379 struct syn_cache *sc = arg; 3380 3381 pool_put(&syn_cache_pool, (sc)); 3382 return; 3383 } 3384 3385 /* 3386 * Remove syn cache created by the specified tcb entry, 3387 * because this does not make sense to keep them 3388 * (if there's no tcb entry, syn cache entry will never be used) 3389 */ 3390 void 3391 syn_cache_cleanup(struct tcpcb *tp) 3392 { 3393 struct syn_cache *sc, *nsc; 3394 3395 NET_ASSERT_LOCKED(); 3396 3397 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) { 3398 #ifdef DIAGNOSTIC 3399 if (sc->sc_tp != tp) 3400 panic("invalid sc_tp in syn_cache_cleanup"); 3401 #endif 3402 syn_cache_rm(sc); 3403 syn_cache_put(sc); 3404 } 3405 /* just for safety */ 3406 LIST_INIT(&tp->t_sc); 3407 } 3408 3409 /* 3410 * Find an entry in the syn cache. 3411 */ 3412 struct syn_cache * 3413 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst, 3414 struct syn_cache_head **headp, u_int rtableid) 3415 { 3416 struct syn_cache_set *sets[2]; 3417 struct syn_cache *sc; 3418 struct syn_cache_head *scp; 3419 u_int32_t hash; 3420 int i; 3421 3422 NET_ASSERT_LOCKED(); 3423 3424 /* Check the active cache first, the passive cache is likely empty. */ 3425 sets[0] = &tcp_syn_cache[tcp_syn_cache_active]; 3426 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active]; 3427 for (i = 0; i < 2; i++) { 3428 if (sets[i]->scs_count == 0) 3429 continue; 3430 SYN_HASHALL(hash, src, dst, sets[i]->scs_random); 3431 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size]; 3432 *headp = scp; 3433 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) { 3434 if (sc->sc_hash != hash) 3435 continue; 3436 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3437 !bcmp(&sc->sc_dst, dst, dst->sa_len) && 3438 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) 3439 return (sc); 3440 } 3441 } 3442 return (NULL); 3443 } 3444 3445 /* 3446 * This function gets called when we receive an ACK for a 3447 * socket in the LISTEN state. We look up the connection 3448 * in the syn cache, and if its there, we pull it out of 3449 * the cache and turn it into a full-blown connection in 3450 * the SYN-RECEIVED state. 3451 * 3452 * The return values may not be immediately obvious, and their effects 3453 * can be subtle, so here they are: 3454 * 3455 * NULL SYN was not found in cache; caller should drop the 3456 * packet and send an RST. 3457 * 3458 * -1 We were unable to create the new connection, and are 3459 * aborting it. An ACK,RST is being sent to the peer 3460 * (unless we got screwy sequence numbers; see below), 3461 * because the 3-way handshake has been completed. Caller 3462 * should not free the mbuf, since we may be using it. If 3463 * we are not, we will free it. 3464 * 3465 * Otherwise, the return value is a pointer to the new socket 3466 * associated with the connection. 3467 */ 3468 struct socket * 3469 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3470 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint32_t now) 3471 { 3472 struct syn_cache *sc; 3473 struct syn_cache_head *scp; 3474 struct inpcb *inp, *oldinp; 3475 struct tcpcb *tp = NULL; 3476 struct mbuf *am; 3477 struct socket *oso; 3478 3479 NET_ASSERT_LOCKED(); 3480 3481 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3482 if (sc == NULL) 3483 return (NULL); 3484 3485 /* 3486 * Verify the sequence and ack numbers. Try getting the correct 3487 * response again. 3488 */ 3489 if ((th->th_ack != sc->sc_iss + 1) || 3490 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3491 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3492 (void) syn_cache_respond(sc, m, now); 3493 return ((struct socket *)(-1)); 3494 } 3495 3496 /* Remove this cache entry */ 3497 syn_cache_rm(sc); 3498 3499 /* 3500 * Ok, create the full blown connection, and set things up 3501 * as they would have been set up if we had created the 3502 * connection when the SYN arrived. If we can't create 3503 * the connection, abort it. 3504 */ 3505 oso = so; 3506 so = sonewconn(so, SS_ISCONNECTED, M_DONTWAIT); 3507 if (so == NULL) 3508 goto resetandabort; 3509 3510 oldinp = sotoinpcb(oso); 3511 inp = sotoinpcb(so); 3512 3513 #ifdef IPSEC 3514 /* 3515 * We need to copy the required security levels 3516 * from the old pcb. Ditto for any other 3517 * IPsec-related information. 3518 */ 3519 memcpy(inp->inp_seclevel, oldinp->inp_seclevel, 3520 sizeof(oldinp->inp_seclevel)); 3521 #endif /* IPSEC */ 3522 #ifdef INET6 3523 /* 3524 * inp still has the OLD in_pcb stuff, set the 3525 * v6-related flags on the new guy, too. 3526 */ 3527 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3528 if (inp->inp_flags & INP_IPV6) { 3529 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3530 inp->inp_hops = oldinp->inp_hops; 3531 } else 3532 #endif /* INET6 */ 3533 { 3534 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3535 } 3536 3537 #if NPF > 0 3538 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3539 struct pf_divert *divert; 3540 3541 divert = pf_find_divert(m); 3542 KASSERT(divert != NULL); 3543 inp->inp_rtableid = divert->rdomain; 3544 } else 3545 #endif 3546 /* inherit rtable from listening socket */ 3547 inp->inp_rtableid = sc->sc_rtableid; 3548 3549 inp->inp_lport = th->th_dport; 3550 switch (src->sa_family) { 3551 #ifdef INET6 3552 case AF_INET6: 3553 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3554 break; 3555 #endif /* INET6 */ 3556 case AF_INET: 3557 inp->inp_laddr = satosin(dst)->sin_addr; 3558 inp->inp_options = ip_srcroute(m); 3559 if (inp->inp_options == NULL) { 3560 inp->inp_options = sc->sc_ipopts; 3561 sc->sc_ipopts = NULL; 3562 } 3563 break; 3564 } 3565 in_pcbrehash(inp); 3566 3567 /* 3568 * Give the new socket our cached route reference. 3569 */ 3570 if (src->sa_family == AF_INET) 3571 inp->inp_route = sc->sc_route4; /* struct assignment */ 3572 #ifdef INET6 3573 else 3574 inp->inp_route6 = sc->sc_route6; 3575 #endif 3576 sc->sc_route4.ro_rt = NULL; 3577 3578 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3579 if (am == NULL) 3580 goto resetandabort; 3581 am->m_len = src->sa_len; 3582 memcpy(mtod(am, caddr_t), src, src->sa_len); 3583 if (in_pcbconnect(inp, am)) { 3584 (void) m_free(am); 3585 goto resetandabort; 3586 } 3587 (void) m_free(am); 3588 3589 tp = intotcpcb(inp); 3590 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3591 if (sc->sc_request_r_scale != 15) { 3592 tp->requested_s_scale = sc->sc_requested_s_scale; 3593 tp->request_r_scale = sc->sc_request_r_scale; 3594 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3595 } 3596 if (sc->sc_flags & SCF_TIMESTAMP) 3597 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3598 3599 tp->t_template = tcp_template(tp); 3600 if (tp->t_template == 0) { 3601 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3602 so = NULL; 3603 goto abort; 3604 } 3605 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3606 tp->ts_modulate = sc->sc_modulate; 3607 tp->ts_recent = sc->sc_timestamp; 3608 tp->iss = sc->sc_iss; 3609 tp->irs = sc->sc_irs; 3610 tcp_sendseqinit(tp); 3611 tp->snd_last = tp->snd_una; 3612 #ifdef TCP_ECN 3613 if (sc->sc_flags & SCF_ECN_PERMIT) { 3614 tp->t_flags |= TF_ECN_PERMIT; 3615 tcpstat_inc(tcps_ecn_accepts); 3616 } 3617 #endif 3618 if (sc->sc_flags & SCF_SACK_PERMIT) 3619 tp->t_flags |= TF_SACK_PERMIT; 3620 #ifdef TCP_SIGNATURE 3621 if (sc->sc_flags & SCF_SIGNATURE) 3622 tp->t_flags |= TF_SIGNATURE; 3623 #endif 3624 tcp_rcvseqinit(tp); 3625 tp->t_state = TCPS_SYN_RECEIVED; 3626 tp->t_rcvtime = now; 3627 tp->t_sndtime = now; 3628 tp->t_rcvacktime = now; 3629 tp->t_sndacktime = now; 3630 TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcptv_keep_init)); 3631 tcpstat_inc(tcps_accepts); 3632 3633 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3634 if (sc->sc_peermaxseg) 3635 tcp_mss_update(tp); 3636 /* Reset initial window to 1 segment for retransmit */ 3637 if (sc->sc_rxtshift > 0) 3638 tp->snd_cwnd = tp->t_maxseg; 3639 tp->snd_wl1 = sc->sc_irs; 3640 tp->rcv_up = sc->sc_irs + 1; 3641 3642 /* 3643 * This is what would have happened in tcp_output() when 3644 * the SYN,ACK was sent. 3645 */ 3646 tp->snd_up = tp->snd_una; 3647 tp->snd_max = tp->snd_nxt = tp->iss+1; 3648 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3649 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3650 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3651 tp->last_ack_sent = tp->rcv_nxt; 3652 3653 tcpstat_inc(tcps_sc_completed); 3654 syn_cache_put(sc); 3655 return (so); 3656 3657 resetandabort: 3658 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3659 m->m_pkthdr.ph_rtableid, now); 3660 abort: 3661 m_freem(m); 3662 if (so != NULL) 3663 soabort(so); 3664 syn_cache_put(sc); 3665 tcpstat_inc(tcps_sc_aborted); 3666 return ((struct socket *)(-1)); 3667 } 3668 3669 /* 3670 * This function is called when we get a RST for a 3671 * non-existent connection, so that we can see if the 3672 * connection is in the syn cache. If it is, zap it. 3673 */ 3674 3675 void 3676 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3677 u_int rtableid) 3678 { 3679 struct syn_cache *sc; 3680 struct syn_cache_head *scp; 3681 3682 NET_ASSERT_LOCKED(); 3683 3684 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3685 return; 3686 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3687 SEQ_GT(th->th_seq, sc->sc_irs + 1)) 3688 return; 3689 syn_cache_rm(sc); 3690 tcpstat_inc(tcps_sc_reset); 3691 syn_cache_put(sc); 3692 } 3693 3694 void 3695 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3696 u_int rtableid) 3697 { 3698 struct syn_cache *sc; 3699 struct syn_cache_head *scp; 3700 3701 NET_ASSERT_LOCKED(); 3702 3703 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3704 return; 3705 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3706 if (ntohl (th->th_seq) != sc->sc_iss) { 3707 return; 3708 } 3709 3710 /* 3711 * If we've retransmitted 3 times and this is our second error, 3712 * we remove the entry. Otherwise, we allow it to continue on. 3713 * This prevents us from incorrectly nuking an entry during a 3714 * spurious network outage. 3715 * 3716 * See tcp_notify(). 3717 */ 3718 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3719 sc->sc_flags |= SCF_UNREACH; 3720 return; 3721 } 3722 3723 syn_cache_rm(sc); 3724 tcpstat_inc(tcps_sc_unreach); 3725 syn_cache_put(sc); 3726 } 3727 3728 /* 3729 * Given a LISTEN socket and an inbound SYN request, add 3730 * this to the syn cache, and send back a segment: 3731 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3732 * to the source. 3733 * 3734 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3735 * Doing so would require that we hold onto the data and deliver it 3736 * to the application. However, if we are the target of a SYN-flood 3737 * DoS attack, an attacker could send data which would eventually 3738 * consume all available buffer space if it were ACKed. By not ACKing 3739 * the data, we avoid this DoS scenario. 3740 */ 3741 3742 int 3743 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3744 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3745 struct tcp_opt_info *oi, tcp_seq *issp, uint32_t now) 3746 { 3747 struct tcpcb tb, *tp; 3748 long win; 3749 struct syn_cache *sc; 3750 struct syn_cache_head *scp; 3751 struct mbuf *ipopts; 3752 3753 tp = sototcpcb(so); 3754 3755 /* 3756 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3757 * 3758 * Note this check is performed in tcp_input() very early on. 3759 */ 3760 3761 /* 3762 * Initialize some local state. 3763 */ 3764 win = sbspace(so, &so->so_rcv); 3765 if (win > TCP_MAXWIN) 3766 win = TCP_MAXWIN; 3767 3768 bzero(&tb, sizeof(tb)); 3769 #ifdef TCP_SIGNATURE 3770 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3771 #else 3772 if (optp) { 3773 #endif 3774 tb.pf = tp->pf; 3775 tb.sack_enable = tp->sack_enable; 3776 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3777 #ifdef TCP_SIGNATURE 3778 if (tp->t_flags & TF_SIGNATURE) 3779 tb.t_flags |= TF_SIGNATURE; 3780 #endif 3781 tb.t_state = TCPS_LISTEN; 3782 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3783 sotoinpcb(so)->inp_rtableid, now)) 3784 return (-1); 3785 } 3786 3787 switch (src->sa_family) { 3788 case AF_INET: 3789 /* 3790 * Remember the IP options, if any. 3791 */ 3792 ipopts = ip_srcroute(m); 3793 break; 3794 default: 3795 ipopts = NULL; 3796 } 3797 3798 /* 3799 * See if we already have an entry for this connection. 3800 * If we do, resend the SYN,ACK. We do not count this 3801 * as a retransmission (XXX though maybe we should). 3802 */ 3803 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3804 if (sc != NULL) { 3805 tcpstat_inc(tcps_sc_dupesyn); 3806 if (ipopts) { 3807 /* 3808 * If we were remembering a previous source route, 3809 * forget it and use the new one we've been given. 3810 */ 3811 m_free(sc->sc_ipopts); 3812 sc->sc_ipopts = ipopts; 3813 } 3814 sc->sc_timestamp = tb.ts_recent; 3815 if (syn_cache_respond(sc, m, now) == 0) { 3816 tcpstat_inc(tcps_sndacks); 3817 tcpstat_inc(tcps_sndtotal); 3818 } 3819 return (0); 3820 } 3821 3822 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3823 if (sc == NULL) { 3824 m_free(ipopts); 3825 return (-1); 3826 } 3827 3828 /* 3829 * Fill in the cache, and put the necessary IP and TCP 3830 * options into the reply. 3831 */ 3832 memcpy(&sc->sc_src, src, src->sa_len); 3833 memcpy(&sc->sc_dst, dst, dst->sa_len); 3834 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3835 sc->sc_flags = 0; 3836 sc->sc_ipopts = ipopts; 3837 sc->sc_irs = th->th_seq; 3838 3839 sc->sc_iss = issp ? *issp : arc4random(); 3840 sc->sc_peermaxseg = oi->maxseg; 3841 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3842 sc->sc_win = win; 3843 sc->sc_timestamp = tb.ts_recent; 3844 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3845 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3846 sc->sc_flags |= SCF_TIMESTAMP; 3847 sc->sc_modulate = arc4random(); 3848 } 3849 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3850 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3851 sc->sc_requested_s_scale = tb.requested_s_scale; 3852 sc->sc_request_r_scale = 0; 3853 /* 3854 * Pick the smallest possible scaling factor that 3855 * will still allow us to scale up to sb_max. 3856 * 3857 * We do this because there are broken firewalls that 3858 * will corrupt the window scale option, leading to 3859 * the other endpoint believing that our advertised 3860 * window is unscaled. At scale factors larger than 3861 * 5 the unscaled window will drop below 1500 bytes, 3862 * leading to serious problems when traversing these 3863 * broken firewalls. 3864 * 3865 * With the default sbmax of 256K, a scale factor 3866 * of 3 will be chosen by this algorithm. Those who 3867 * choose a larger sbmax should watch out 3868 * for the compatibility problems mentioned above. 3869 * 3870 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3871 * or <SYN,ACK>) segment itself is never scaled. 3872 */ 3873 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3874 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) 3875 sc->sc_request_r_scale++; 3876 } else { 3877 sc->sc_requested_s_scale = 15; 3878 sc->sc_request_r_scale = 15; 3879 } 3880 #ifdef TCP_ECN 3881 /* 3882 * if both ECE and CWR flag bits are set, peer is ECN capable. 3883 */ 3884 if (tcp_do_ecn && 3885 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) 3886 sc->sc_flags |= SCF_ECN_PERMIT; 3887 #endif 3888 /* 3889 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option 3890 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 3891 */ 3892 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) 3893 sc->sc_flags |= SCF_SACK_PERMIT; 3894 #ifdef TCP_SIGNATURE 3895 if (tb.t_flags & TF_SIGNATURE) 3896 sc->sc_flags |= SCF_SIGNATURE; 3897 #endif 3898 sc->sc_tp = tp; 3899 if (syn_cache_respond(sc, m, now) == 0) { 3900 syn_cache_insert(sc, tp); 3901 tcpstat_inc(tcps_sndacks); 3902 tcpstat_inc(tcps_sndtotal); 3903 } else { 3904 syn_cache_put(sc); 3905 tcpstat_inc(tcps_sc_dropped); 3906 } 3907 3908 return (0); 3909 } 3910 3911 int 3912 syn_cache_respond(struct syn_cache *sc, struct mbuf *m, uint32_t now) 3913 { 3914 u_int8_t *optp; 3915 int optlen, error; 3916 u_int16_t tlen; 3917 struct ip *ip = NULL; 3918 #ifdef INET6 3919 struct ip6_hdr *ip6 = NULL; 3920 #endif 3921 struct tcphdr *th; 3922 u_int hlen; 3923 struct inpcb *inp; 3924 3925 switch (sc->sc_src.sa.sa_family) { 3926 case AF_INET: 3927 hlen = sizeof(struct ip); 3928 break; 3929 #ifdef INET6 3930 case AF_INET6: 3931 hlen = sizeof(struct ip6_hdr); 3932 break; 3933 #endif 3934 default: 3935 m_freem(m); 3936 return (EAFNOSUPPORT); 3937 } 3938 3939 /* Compute the size of the TCP options. */ 3940 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3941 ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) + 3942 #ifdef TCP_SIGNATURE 3943 ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + 3944 #endif 3945 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3946 3947 tlen = hlen + sizeof(struct tcphdr) + optlen; 3948 3949 /* 3950 * Create the IP+TCP header from scratch. 3951 */ 3952 m_freem(m); 3953 #ifdef DIAGNOSTIC 3954 if (max_linkhdr + tlen > MCLBYTES) 3955 return (ENOBUFS); 3956 #endif 3957 MGETHDR(m, M_DONTWAIT, MT_DATA); 3958 if (m && max_linkhdr + tlen > MHLEN) { 3959 MCLGET(m, M_DONTWAIT); 3960 if ((m->m_flags & M_EXT) == 0) { 3961 m_freem(m); 3962 m = NULL; 3963 } 3964 } 3965 if (m == NULL) 3966 return (ENOBUFS); 3967 3968 /* Fixup the mbuf. */ 3969 m->m_data += max_linkhdr; 3970 m->m_len = m->m_pkthdr.len = tlen; 3971 m->m_pkthdr.ph_ifidx = 0; 3972 m->m_pkthdr.ph_rtableid = sc->sc_rtableid; 3973 memset(mtod(m, u_char *), 0, tlen); 3974 3975 switch (sc->sc_src.sa.sa_family) { 3976 case AF_INET: 3977 ip = mtod(m, struct ip *); 3978 ip->ip_dst = sc->sc_src.sin.sin_addr; 3979 ip->ip_src = sc->sc_dst.sin.sin_addr; 3980 ip->ip_p = IPPROTO_TCP; 3981 th = (struct tcphdr *)(ip + 1); 3982 th->th_dport = sc->sc_src.sin.sin_port; 3983 th->th_sport = sc->sc_dst.sin.sin_port; 3984 break; 3985 #ifdef INET6 3986 case AF_INET6: 3987 ip6 = mtod(m, struct ip6_hdr *); 3988 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3989 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3990 ip6->ip6_nxt = IPPROTO_TCP; 3991 /* ip6_plen will be updated in ip6_output() */ 3992 th = (struct tcphdr *)(ip6 + 1); 3993 th->th_dport = sc->sc_src.sin6.sin6_port; 3994 th->th_sport = sc->sc_dst.sin6.sin6_port; 3995 break; 3996 #endif 3997 default: 3998 unhandled_af(sc->sc_src.sa.sa_family); 3999 } 4000 4001 th->th_seq = htonl(sc->sc_iss); 4002 th->th_ack = htonl(sc->sc_irs + 1); 4003 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 4004 th->th_flags = TH_SYN|TH_ACK; 4005 #ifdef TCP_ECN 4006 /* Set ECE for SYN-ACK if peer supports ECN. */ 4007 if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT)) 4008 th->th_flags |= TH_ECE; 4009 #endif 4010 th->th_win = htons(sc->sc_win); 4011 /* th_sum already 0 */ 4012 /* th_urp already 0 */ 4013 4014 /* Tack on the TCP options. */ 4015 optp = (u_int8_t *)(th + 1); 4016 *optp++ = TCPOPT_MAXSEG; 4017 *optp++ = 4; 4018 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 4019 *optp++ = sc->sc_ourmaxseg & 0xff; 4020 4021 /* Include SACK_PERMIT_HDR option if peer has already done so. */ 4022 if (sc->sc_flags & SCF_SACK_PERMIT) { 4023 *((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR); 4024 optp += 4; 4025 } 4026 4027 if (sc->sc_request_r_scale != 15) { 4028 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 4029 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 4030 sc->sc_request_r_scale); 4031 optp += 4; 4032 } 4033 4034 if (sc->sc_flags & SCF_TIMESTAMP) { 4035 u_int32_t *lp = (u_int32_t *)(optp); 4036 /* Form timestamp option as shown in appendix A of RFC 1323. */ 4037 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 4038 *lp++ = htonl(now + sc->sc_modulate); 4039 *lp = htonl(sc->sc_timestamp); 4040 optp += TCPOLEN_TSTAMP_APPA; 4041 } 4042 4043 #ifdef TCP_SIGNATURE 4044 if (sc->sc_flags & SCF_SIGNATURE) { 4045 union sockaddr_union src, dst; 4046 struct tdb *tdb; 4047 4048 bzero(&src, sizeof(union sockaddr_union)); 4049 bzero(&dst, sizeof(union sockaddr_union)); 4050 src.sa.sa_len = sc->sc_src.sa.sa_len; 4051 src.sa.sa_family = sc->sc_src.sa.sa_family; 4052 dst.sa.sa_len = sc->sc_dst.sa.sa_len; 4053 dst.sa.sa_family = sc->sc_dst.sa.sa_family; 4054 4055 switch (sc->sc_src.sa.sa_family) { 4056 case 0: /*default to PF_INET*/ 4057 case AF_INET: 4058 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 4059 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 4060 break; 4061 #ifdef INET6 4062 case AF_INET6: 4063 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 4064 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 4065 break; 4066 #endif /* INET6 */ 4067 } 4068 4069 tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid), 4070 0, &src, &dst, IPPROTO_TCP); 4071 if (tdb == NULL) { 4072 m_freem(m); 4073 return (EPERM); 4074 } 4075 4076 /* Send signature option */ 4077 *(optp++) = TCPOPT_SIGNATURE; 4078 *(optp++) = TCPOLEN_SIGNATURE; 4079 4080 if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th, 4081 hlen, 0, optp) < 0) { 4082 m_freem(m); 4083 tdb_unref(tdb); 4084 return (EINVAL); 4085 } 4086 tdb_unref(tdb); 4087 optp += 16; 4088 4089 /* Pad options list to the next 32 bit boundary and 4090 * terminate it. 4091 */ 4092 *optp++ = TCPOPT_NOP; 4093 *optp++ = TCPOPT_EOL; 4094 } 4095 #endif /* TCP_SIGNATURE */ 4096 4097 /* Compute the packet's checksum. */ 4098 switch (sc->sc_src.sa.sa_family) { 4099 case AF_INET: 4100 ip->ip_len = htons(tlen - hlen); 4101 th->th_sum = 0; 4102 th->th_sum = in_cksum(m, tlen); 4103 break; 4104 #ifdef INET6 4105 case AF_INET6: 4106 ip6->ip6_plen = htons(tlen - hlen); 4107 th->th_sum = 0; 4108 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 4109 break; 4110 #endif 4111 } 4112 4113 /* use IPsec policy and ttl from listening socket, on SYN ACK */ 4114 inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL; 4115 4116 /* 4117 * Fill in some straggling IP bits. Note the stack expects 4118 * ip_len to be in host order, for convenience. 4119 */ 4120 switch (sc->sc_src.sa.sa_family) { 4121 case AF_INET: 4122 ip->ip_len = htons(tlen); 4123 ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl; 4124 if (inp != NULL) 4125 ip->ip_tos = inp->inp_ip.ip_tos; 4126 break; 4127 #ifdef INET6 4128 case AF_INET6: 4129 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 4130 ip6->ip6_vfc |= IPV6_VERSION; 4131 ip6->ip6_plen = htons(tlen - hlen); 4132 /* ip6_hlim will be initialized afterwards */ 4133 /* leave flowlabel = 0, it is legal and require no state mgmt */ 4134 break; 4135 #endif 4136 } 4137 4138 switch (sc->sc_src.sa.sa_family) { 4139 case AF_INET: 4140 error = ip_output(m, sc->sc_ipopts, &sc->sc_route4, 4141 (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0); 4142 break; 4143 #ifdef INET6 4144 case AF_INET6: 4145 ip6->ip6_hlim = in6_selecthlim(inp); 4146 4147 error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0, 4148 NULL, NULL); 4149 break; 4150 #endif 4151 default: 4152 error = EAFNOSUPPORT; 4153 break; 4154 } 4155 return (error); 4156 } 4157