/*	$OpenBSD: tcp_input.c,v 1.384 2022/12/09 00:24:44 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL NRL OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define	M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define	M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define	M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int	tcp_rst_ppslim = 100;		/* 100pps */
int	tcp_rst_ppslim_count = 0;
struct	timeval tcp_rst_ppslim_last;

int	tcp_ackdrop_ppslim = 100;	/* 100pps */
int	tcp_ackdrop_ppslim_count = 0;
struct	timeval tcp_ackdrop_ppslim_last;

#define	TCP_PAWS_IDLE	TCP_TIME(24 * 24 * 60 * 60)

/* for modulo comparisons of timestamps */
#define	TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define	TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define	ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define	ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC 3168.
 * Implementation note:
 *   snd_last is used to track a recovery phase.
 *   When cwnd is reduced, snd_last is set to snd_max.
 *   While snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
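 *   Example: an ECE received while snd_una >= snd_last halves the
 *   effective window (via snd_ssthresh) and advances snd_last to
 *   snd_max; ECEs arriving before snd_una catches up with snd_last
 *   leave cwnd alone.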
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_TIMER_ARM(tp, TCPT_DELACK, tcp_delack_msecs); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *, uint32_t);
void	 syn_cache_timer(void *);
void	 syn_cache_reaper(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *, uint32_t);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *, uint32_t);
struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
		struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
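	 * Example: if the preceding segment covers [100, 300) and this
	 * segment begins at 250, then i == 50 and those 50 duplicate
	 * bytes are trimmed from our front; if this segment also ends
	 * at or before 300, i >= *tlen and the whole segment is dropped
	 * as a duplicate.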
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);
	tp->t_rcvoopack++;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *otp = NULL, *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate;
	caddr_t saveti;
	tcp_seq iss, *reuse = NULL;
	uint32_t now;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;
	now = tcp_now();

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about an unspecified IPv6 source address.
		 * As we use all-zero to indicate an unbound/unconnected pcb,
		 * an unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Locate pcb for segment.
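	 * pf(4) may have the connection cached on the mbuf; otherwise
	 * look for an exact four-tuple match first and, failing that,
	 * for a listening socket bound to the destination.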
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
	}
#ifdef IPSEC
	if (ipsec_in_use) {
		struct m_tag *mtag;
		struct tdb *tdb = NULL;
		int error;

		/* Find most recent IPsec tag */
		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
		if (mtag != NULL) {
			struct tdb_ident *tdbi;

			tdbi = (struct tdb_ident *)(mtag + 1);
			tdb = gettdb(tdbi->rdomain, tdbi->spi,
			    &tdbi->dst, tdbi->proto);
		}
		error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN,
		    tdb, inp, NULL, NULL);
		tdb_unref(tdb);
		if (error) {
			tcpstat_inc(tcps_rcvnosec);
			goto drop;
		}
	}
#endif /* IPSEC */

	if (inp == NULL) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/*
	 * Unscale the window into a 32-bit value.
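	 * The window scale option is never in effect on segments that
	 * carry SYN, so snd_scale is applied only once the connection
	 * is synchronized (RFC 1323).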
	 */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		}

		if (so->so_options & SO_DEBUG) {
			otp = tp;
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				saveti = (caddr_t) &tcp_saveti6;
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				saveti = (caddr_t) &tcp_saveti;
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m, now);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					in_pcbunref(inp);
					inp = in_pcbref(sotoinpcb(so));
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/
				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address
				 * (instead of dropping it).  We compromise
				 * because it is much better for the peer to
				 * receive an RST, and the RST will be the
				 * final packet for the exchange.
				 *
				 * If we do not forbid deprecated addresses,
				 * we accept the SYN packet.
				 * RFC 2462 does not suggest dropping a SYN
				 * in this case.
				 * If we decipher RFC 2462 5.5.4, it says
				 * this:
				 * 1. use of a deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into the (2b) case as we have
				 * no choice in our source address selection -
				 * we must obey the peer.
				 *
				 * The wording in RFC 2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(
						    &ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(
							    tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr ==
						    ip->ip_src.s_addr) {
							tcpstat_inc(
							    tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse, now)
				    == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcp_keepidle));

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th);	/* Delete stale SACK holes */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid, now))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.
	 * If the packet has no control flags, is in-sequence, the window
	 * didn't change and we're not retransmitting, it's a candidate.
	 * If the length is zero and the ack moved forward, we're the
	 * sender side of the xfer.  Just free the data acked & wake any
	 * higher level process that was blocked waiting for space.  If
	 * the length is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order (the
	 * reassembly queue is empty), add the data to the socket buffer
	 * and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				tp->t_rcvacktime = now;
				ND6_HINT(tp);
				sbdrop(so, &so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/* Pull snd_wl2 up to prevent seq wrap. */
				tp->snd_wl2 = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
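				 * Note that the advertised window cannot
				 * have changed on this path (header
				 * prediction requires tiwin == snd_wnd).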
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp,
				    TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT,
					    tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			/* Pull snd_wl1 and rcv_up up to prevent seq wrap. */
			tp->snd_wl1 = th->th_seq;
			/* Packet has most recent segment, no urgent exists. */
			tp->rcv_up = tp->rcv_nxt;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
				    now - tp->rfbuf_ts > (tp->t_srtt >>
				    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT))) {
					tcp_update_rcvspace(tp);
					/* Start over with next RTT. */
					tp->rfbuf_cnt = 0;
					tp->rfbuf_ts = 0;
				} else
					tp->rfbuf_cnt += tlen;
				m_adj(m, iphlen + off);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			in_pcbunref(inp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(so, &so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcp_keepidle));
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEG_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			in_pcbunref(inp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and its value is less than tp->ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
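	 * The ACK is rate limited (see dropafterack_ratelim) so that a
	 * flood of in-window SYNs cannot make us generate unbounded ACK
	 * traffic.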
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcp_keepidle));
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) /
				    tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 *
					    tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)) {
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp,
						    TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(
						    tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd =
						    tp->snd_ssthresh +
						    tp->t_maxseg *
						    tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
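		 * A full ACK (at or beyond snd_last) ends fast recovery:
		 * cwnd returns to ssthresh, clamped to the amount of data
		 * still outstanding.  A partial ACK goes to the SACK or
		 * NewReno partial-ACK handler instead.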
		 */
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);
		tp->t_rcvacktime = now;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = max(incr * incr / cw, 1);
		if (tp->t_dupacks < tcprexmtthresh)
			tp->snd_cwnd = ulmin(cw + incr,
			    TCP_MAXWIN << tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			if (tp->snd_wnd > so->so_snd.sb_cc)
				tp->snd_wnd -= so->so_snd.sb_cc;
			else
				tp->snd_wnd = 0;
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(so, &so->so_snd, acked);
			if (tp->snd_wnd > acked)
				tp->snd_wnd -= acked;
			else
				tp->snd_wnd = 0;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					TCP_TIMER_ARM(tp, TCPT_2MSL,
					    TCP_TIME(tcp_maxidle));
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL,
				    TCP_TIME(2 * TCPTV_MSL));
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(2 * TCPTV_MSL));
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;		/* XXX */
			tiflags &= ~TH_URG;	/* XXX */
			goto dodata;		/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so doesn't get presented to user.
		 * This can happen independent of advancing the URG pointer,
		 * but if two URG's are pending at once, some out-of-band
		 * data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know
	 * that the connection is closing.
Ignore a FIN received before 1981 * the connection is fully established. 1982 */ 1983 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1984 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1985 tp->t_flags |= TF_BLOCKOUTPUT; 1986 socantrcvmore(so); 1987 tp->t_flags &= ~TF_BLOCKOUTPUT; 1988 tp->t_flags |= TF_ACKNOW; 1989 tp->rcv_nxt++; 1990 } 1991 switch (tp->t_state) { 1992 1993 /* 1994 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1995 */ 1996 case TCPS_ESTABLISHED: 1997 tp->t_state = TCPS_CLOSE_WAIT; 1998 break; 1999 2000 /* 2001 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2002 * enter the CLOSING state. 2003 */ 2004 case TCPS_FIN_WAIT_1: 2005 tp->t_state = TCPS_CLOSING; 2006 break; 2007 2008 /* 2009 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2010 * starting the time-wait timer, turning off the other 2011 * standard timers. 2012 */ 2013 case TCPS_FIN_WAIT_2: 2014 tp->t_state = TCPS_TIME_WAIT; 2015 tcp_canceltimers(tp); 2016 TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(2 * TCPTV_MSL)); 2017 tp->t_flags |= TF_BLOCKOUTPUT; 2018 soisdisconnected(so); 2019 tp->t_flags &= ~TF_BLOCKOUTPUT; 2020 break; 2021 2022 /* 2023 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2024 */ 2025 case TCPS_TIME_WAIT: 2026 TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(2 * TCPTV_MSL)); 2027 break; 2028 } 2029 } 2030 if (otp) 2031 tcp_trace(TA_INPUT, ostate, tp, otp, saveti, 0, tlen); 2032 2033 /* 2034 * Return any desired output. 2035 */ 2036 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2037 (void) tcp_output(tp); 2038 in_pcbunref(inp); 2039 return IPPROTO_DONE; 2040 2041 badsyn: 2042 /* 2043 * Received a bad SYN. Increment counters and dropwithreset. 2044 */ 2045 tcpstat_inc(tcps_badsyn); 2046 tp = NULL; 2047 goto dropwithreset; 2048 2049 dropafterack_ratelim: 2050 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2051 tcp_ackdrop_ppslim) == 0) { 2052 /* XXX stat */ 2053 goto drop; 2054 } 2055 /* ...fall into dropafterack... */ 2056 2057 dropafterack: 2058 /* 2059 * Generate an ACK dropping incoming segment if it occupies 2060 * sequence space, where the ACK reflects our state. 2061 */ 2062 if (tiflags & TH_RST) 2063 goto drop; 2064 m_freem(m); 2065 tp->t_flags |= TF_ACKNOW; 2066 (void) tcp_output(tp); 2067 in_pcbunref(inp); 2068 return IPPROTO_DONE; 2069 2070 dropwithreset_ratelim: 2071 /* 2072 * We may want to rate-limit RSTs in certain situations, 2073 * particularly if we are sending an RST in response to 2074 * an attempt to connect to or otherwise communicate with 2075 * a port for which we have no socket. 2076 */ 2077 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2078 tcp_rst_ppslim) == 0) { 2079 /* XXX stat */ 2080 goto drop; 2081 } 2082 /* ...fall into dropwithreset... */ 2083 2084 dropwithreset: 2085 /* 2086 * Generate a RST, dropping incoming segment. 2087 * Make ACK acceptable to originator of segment. 2088 * Don't bother to respond to RST. 2089 */ 2090 if (tiflags & TH_RST) 2091 goto drop; 2092 if (tiflags & TH_ACK) { 2093 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2094 TH_RST, m->m_pkthdr.ph_rtableid, now); 2095 } else { 2096 if (tiflags & TH_SYN) 2097 tlen++; 2098 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2099 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now); 2100 } 2101 m_freem(m); 2102 in_pcbunref(inp); 2103 return IPPROTO_DONE; 2104 2105 drop: 2106 /* 2107 * Drop space held by incoming segment and return. 
2108 */ 2109 if (otp) 2110 tcp_trace(TA_DROP, ostate, tp, otp, saveti, 0, tlen); 2111 2112 m_freem(m); 2113 in_pcbunref(inp); 2114 return IPPROTO_DONE; 2115 } 2116 2117 int 2118 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2119 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2120 u_int rtableid, uint32_t now) 2121 { 2122 u_int16_t mss = 0; 2123 int opt, optlen; 2124 #ifdef TCP_SIGNATURE 2125 caddr_t sigp = NULL; 2126 struct tdb *tdb = NULL; 2127 #endif /* TCP_SIGNATURE */ 2128 2129 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2130 opt = cp[0]; 2131 if (opt == TCPOPT_EOL) 2132 break; 2133 if (opt == TCPOPT_NOP) 2134 optlen = 1; 2135 else { 2136 if (cnt < 2) 2137 break; 2138 optlen = cp[1]; 2139 if (optlen < 2 || optlen > cnt) 2140 break; 2141 } 2142 switch (opt) { 2143 2144 default: 2145 continue; 2146 2147 case TCPOPT_MAXSEG: 2148 if (optlen != TCPOLEN_MAXSEG) 2149 continue; 2150 if (!(th->th_flags & TH_SYN)) 2151 continue; 2152 if (TCPS_HAVERCVDSYN(tp->t_state)) 2153 continue; 2154 memcpy(&mss, cp + 2, sizeof(mss)); 2155 mss = ntohs(mss); 2156 oi->maxseg = mss; 2157 break; 2158 2159 case TCPOPT_WINDOW: 2160 if (optlen != TCPOLEN_WINDOW) 2161 continue; 2162 if (!(th->th_flags & TH_SYN)) 2163 continue; 2164 if (TCPS_HAVERCVDSYN(tp->t_state)) 2165 continue; 2166 tp->t_flags |= TF_RCVD_SCALE; 2167 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2168 break; 2169 2170 case TCPOPT_TIMESTAMP: 2171 if (optlen != TCPOLEN_TIMESTAMP) 2172 continue; 2173 oi->ts_present = 1; 2174 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2175 oi->ts_val = ntohl(oi->ts_val); 2176 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2177 oi->ts_ecr = ntohl(oi->ts_ecr); 2178 2179 if (!(th->th_flags & TH_SYN)) 2180 continue; 2181 if (TCPS_HAVERCVDSYN(tp->t_state)) 2182 continue; 2183 /* 2184 * A timestamp received in a SYN makes 2185 * it ok to send timestamp requests and replies. 
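 * Once both TF_RCVD_TSTMP and TF_REQ_TSTMP are set (and
 * TF_NOOPT is clear), every segment we send is expected to
 * carry a timestamp option; this is the same flag test that
 * tcp_mss() and tcp_hdrsz() apply below when reserving
 * TCPOLEN_TSTAMP_APPA bytes of option space.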
2186 */ 2187 tp->t_flags |= TF_RCVD_TSTMP; 2188 tp->ts_recent = oi->ts_val; 2189 tp->ts_recent_age = now; 2190 break; 2191 2192 case TCPOPT_SACK_PERMITTED: 2193 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2194 continue; 2195 if (!(th->th_flags & TH_SYN)) 2196 continue; 2197 if (TCPS_HAVERCVDSYN(tp->t_state)) 2198 continue; 2199 /* MUST only be set on SYN */ 2200 tp->t_flags |= TF_SACK_PERMIT; 2201 break; 2202 case TCPOPT_SACK: 2203 tcp_sack_option(tp, th, cp, optlen); 2204 break; 2205 #ifdef TCP_SIGNATURE 2206 case TCPOPT_SIGNATURE: 2207 if (optlen != TCPOLEN_SIGNATURE) 2208 continue; 2209 2210 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2211 goto bad; 2212 2213 sigp = cp + 2; 2214 break; 2215 #endif /* TCP_SIGNATURE */ 2216 } 2217 } 2218 2219 #ifdef TCP_SIGNATURE 2220 if (tp->t_flags & TF_SIGNATURE) { 2221 union sockaddr_union src, dst; 2222 2223 memset(&src, 0, sizeof(union sockaddr_union)); 2224 memset(&dst, 0, sizeof(union sockaddr_union)); 2225 2226 switch (tp->pf) { 2227 case 0: 2228 case AF_INET: 2229 src.sa.sa_len = sizeof(struct sockaddr_in); 2230 src.sa.sa_family = AF_INET; 2231 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2232 dst.sa.sa_len = sizeof(struct sockaddr_in); 2233 dst.sa.sa_family = AF_INET; 2234 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2235 break; 2236 #ifdef INET6 2237 case AF_INET6: 2238 src.sa.sa_len = sizeof(struct sockaddr_in6); 2239 src.sa.sa_family = AF_INET6; 2240 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2241 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2242 dst.sa.sa_family = AF_INET6; 2243 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2244 break; 2245 #endif /* INET6 */ 2246 } 2247 2248 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2249 0, &src, &dst, IPPROTO_TCP); 2250 2251 /* 2252 * We don't have an SA for this peer, so we turn off 2253 * TF_SIGNATURE on the listen socket 2254 */ 2255 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2256 tp->t_flags &= ~TF_SIGNATURE; 2257 2258 } 2259 2260 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2261 tcpstat_inc(tcps_rcvbadsig); 2262 goto bad; 2263 } 2264 2265 if (sigp) { 2266 char sig[16]; 2267 2268 if (tdb == NULL) { 2269 tcpstat_inc(tcps_rcvbadsig); 2270 goto bad; 2271 } 2272 2273 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2274 goto bad; 2275 2276 if (timingsafe_bcmp(sig, sigp, 16)) { 2277 tcpstat_inc(tcps_rcvbadsig); 2278 goto bad; 2279 } 2280 2281 tcpstat_inc(tcps_rcvgoodsig); 2282 } 2283 2284 tdb_unref(tdb); 2285 #endif /* TCP_SIGNATURE */ 2286 2287 return (0); 2288 2289 #ifdef TCP_SIGNATURE 2290 bad: 2291 tdb_unref(tdb); 2292 #endif /* TCP_SIGNATURE */ 2293 return (-1); 2294 } 2295 2296 u_long 2297 tcp_seq_subtract(u_long a, u_long b) 2298 { 2299 return ((long)(a - b)); 2300 } 2301 2302 /* 2303 * This function is called upon receipt of new valid data (while not in header 2304 * prediction mode), and it updates the ordered list of sacks. 2305 */ 2306 void 2307 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2308 tcp_seq rcv_lastend) 2309 { 2310 /* 2311 * First reported block MUST be the most recent one. Subsequent 2312 * blocks SHOULD be in the order in which they arrived at the 2313 * receiver. These two conditions make the implementation fully 2314 * compliant with RFC 2018. 
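 * As a worked example: with blocks [5,10) and [15,20) on the
 * scoreboard, new in-window data [12,15) coalesces with
 * [15,20) in the loop below, the merged block [12,20) is
 * reported first, and [5,10) slides into the second slot.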
2315 */ 2316 int i, j = 0, count = 0, lastpos = -1; 2317 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2318 2319 /* First clean up current list of sacks */ 2320 for (i = 0; i < tp->rcv_numsacks; i++) { 2321 sack = tp->sackblks[i]; 2322 if (sack.start == 0 && sack.end == 0) { 2323 count++; /* count = number of blocks to be discarded */ 2324 continue; 2325 } 2326 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2327 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2328 count++; 2329 } else { 2330 temp[j].start = tp->sackblks[i].start; 2331 temp[j++].end = tp->sackblks[i].end; 2332 } 2333 } 2334 tp->rcv_numsacks -= count; 2335 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2336 tcp_clean_sackreport(tp); 2337 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2338 /* ==> need first sack block */ 2339 tp->sackblks[0].start = rcv_laststart; 2340 tp->sackblks[0].end = rcv_lastend; 2341 tp->rcv_numsacks = 1; 2342 } 2343 return; 2344 } 2345 /* Otherwise, sack blocks are already present. */ 2346 for (i = 0; i < tp->rcv_numsacks; i++) 2347 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2348 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2349 return; /* sack list remains unchanged */ 2350 /* 2351 * From here, segment just received should be (part of) the 1st sack. 2352 * Go through list, possibly coalescing sack block entries. 2353 */ 2354 firstsack.start = rcv_laststart; 2355 firstsack.end = rcv_lastend; 2356 for (i = 0; i < tp->rcv_numsacks; i++) { 2357 sack = tp->sackblks[i]; 2358 if (SEQ_LT(sack.end, firstsack.start) || 2359 SEQ_GT(sack.start, firstsack.end)) 2360 continue; /* no overlap */ 2361 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2362 /* 2363 * identical block; delete it here since we will 2364 * move it to the front of the list. 2365 */ 2366 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2367 lastpos = i; /* last posn with a zero entry */ 2368 continue; 2369 } 2370 if (SEQ_LEQ(sack.start, firstsack.start)) 2371 firstsack.start = sack.start; /* merge blocks */ 2372 if (SEQ_GEQ(sack.end, firstsack.end)) 2373 firstsack.end = sack.end; /* merge blocks */ 2374 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2375 lastpos = i; /* last posn with a zero entry */ 2376 } 2377 if (lastpos != -1) { /* at least one merge */ 2378 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2379 sack = tp->sackblks[i]; 2380 if (sack.start == 0 && sack.end == 0) 2381 continue; 2382 temp[j++] = sack; 2383 } 2384 tp->rcv_numsacks = j; /* including first blk (added later) */ 2385 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2386 tp->sackblks[i] = temp[i]; 2387 } else { /* no merges -- shift sacks by 1 */ 2388 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2389 tp->rcv_numsacks++; 2390 for (i = tp->rcv_numsacks-1; i > 0; i--) 2391 tp->sackblks[i] = tp->sackblks[i-1]; 2392 } 2393 tp->sackblks[0] = firstsack; 2394 return; 2395 } 2396 2397 /* 2398 * Process the TCP SACK option. tp->snd_holes is an ordered list 2399 * of holes (oldest to newest, in terms of the sequence space). 2400 */ 2401 void 2402 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2403 { 2404 int tmp_olen; 2405 u_char *tmp_cp; 2406 struct sackhole *cur, *p, *temp; 2407 2408 if (!tp->sack_enable) 2409 return; 2410 /* SACK without ACK doesn't make sense. */ 2411 if ((th->th_flags & TH_ACK) == 0) 2412 return; 2413 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
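 * (All of these tests are modulo-2^32: SEQ_LT() and friends
 * subtract and check the sign of the 32-bit difference, so
 * e.g. SEQ_LT(0xfffffff0, 0x10) is true across a sequence
 * number wrap even though the raw values compare otherwise.)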
*/ 2414 if (SEQ_LT(th->th_ack, tp->snd_una) || 2415 SEQ_GT(th->th_ack, tp->snd_max)) 2416 return; 2417 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2418 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2419 return; 2420 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2421 tmp_cp = cp + 2; 2422 tmp_olen = optlen - 2; 2423 tcpstat_inc(tcps_sack_rcv_opts); 2424 if (tp->snd_numholes < 0) 2425 tp->snd_numholes = 0; 2426 if (tp->t_maxseg == 0) 2427 panic("tcp_sack_option"); /* Should never happen */ 2428 while (tmp_olen > 0) { 2429 struct sackblk sack; 2430 2431 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2432 sack.start = ntohl(sack.start); 2433 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2434 sack.end = ntohl(sack.end); 2435 tmp_olen -= TCPOLEN_SACK; 2436 tmp_cp += TCPOLEN_SACK; 2437 if (SEQ_LEQ(sack.end, sack.start)) 2438 continue; /* bad SACK fields */ 2439 if (SEQ_LEQ(sack.end, tp->snd_una)) 2440 continue; /* old block */ 2441 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2442 if (SEQ_LT(sack.start, th->th_ack)) 2443 continue; 2444 } 2445 if (SEQ_GT(sack.end, tp->snd_max)) 2446 continue; 2447 if (tp->snd_holes == NULL) { /* first hole */ 2448 tp->snd_holes = (struct sackhole *) 2449 pool_get(&sackhl_pool, PR_NOWAIT); 2450 if (tp->snd_holes == NULL) { 2451 /* ENOBUFS, so ignore SACKed block for now */ 2452 goto dropped; 2453 } 2454 cur = tp->snd_holes; 2455 cur->start = th->th_ack; 2456 cur->end = sack.start; 2457 cur->rxmit = cur->start; 2458 cur->next = NULL; 2459 tp->snd_numholes = 1; 2460 tp->rcv_lastsack = sack.end; 2461 /* 2462 * dups is at least one. If more data has been 2463 * SACKed, it can be greater than one. 2464 */ 2465 cur->dups = min(tcprexmtthresh, 2466 ((sack.end - cur->end)/tp->t_maxseg)); 2467 if (cur->dups < 1) 2468 cur->dups = 1; 2469 continue; /* with next sack block */ 2470 } 2471 /* Go thru list of holes: p = previous, cur = current */ 2472 p = cur = tp->snd_holes; 2473 while (cur) { 2474 if (SEQ_LEQ(sack.end, cur->start)) 2475 /* SACKs data before the current hole */ 2476 break; /* no use going through more holes */ 2477 if (SEQ_GEQ(sack.start, cur->end)) { 2478 /* SACKs data beyond the current hole */ 2479 cur->dups++; 2480 if (((sack.end - cur->end)/tp->t_maxseg) >= 2481 tcprexmtthresh) 2482 cur->dups = tcprexmtthresh; 2483 p = cur; 2484 cur = cur->next; 2485 continue; 2486 } 2487 if (SEQ_LEQ(sack.start, cur->start)) { 2488 /* Data acks at least the beginning of hole */ 2489 if (SEQ_GEQ(sack.end, cur->end)) { 2490 /* Acks entire hole, so delete hole */ 2491 if (p != cur) { 2492 p->next = cur->next; 2493 pool_put(&sackhl_pool, cur); 2494 cur = p->next; 2495 } else { 2496 cur = cur->next; 2497 pool_put(&sackhl_pool, p); 2498 p = cur; 2499 tp->snd_holes = p; 2500 } 2501 tp->snd_numholes--; 2502 continue; 2503 } 2504 /* otherwise, move start of hole forward */ 2505 cur->start = sack.end; 2506 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2507 p = cur; 2508 cur = cur->next; 2509 continue; 2510 } 2511 /* move end of hole backward */ 2512 if (SEQ_GEQ(sack.end, cur->end)) { 2513 cur->end = sack.start; 2514 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2515 cur->dups++; 2516 if (((sack.end - cur->end)/tp->t_maxseg) >= 2517 tcprexmtthresh) 2518 cur->dups = tcprexmtthresh; 2519 p = cur; 2520 cur = cur->next; 2521 continue; 2522 } 2523 if (SEQ_LT(cur->start, sack.start) && 2524 SEQ_GT(cur->end, sack.end)) { 2525 /* 2526 * ACKs some data in middle of a hole; need to 2527 * split current hole 2528 */ 2529 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2530 
goto dropped; 2531 temp = (struct sackhole *) 2532 pool_get(&sackhl_pool, PR_NOWAIT); 2533 if (temp == NULL) 2534 goto dropped; /* ENOBUFS */ 2535 temp->next = cur->next; 2536 temp->start = sack.end; 2537 temp->end = cur->end; 2538 temp->dups = cur->dups; 2539 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2540 cur->end = sack.start; 2541 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2542 cur->dups++; 2543 if (((sack.end - cur->end)/tp->t_maxseg) >= 2544 tcprexmtthresh) 2545 cur->dups = tcprexmtthresh; 2546 cur->next = temp; 2547 p = temp; 2548 cur = p->next; 2549 tp->snd_numholes++; 2550 } 2551 } 2552 /* At this point, p points to the last hole on the list */ 2553 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2554 /* 2555 * Need to append new hole at end. 2556 * Last hole is p (and it's not NULL). 2557 */ 2558 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2559 goto dropped; 2560 temp = (struct sackhole *) 2561 pool_get(&sackhl_pool, PR_NOWAIT); 2562 if (temp == NULL) 2563 goto dropped; /* ENOBUFS */ 2564 temp->start = tp->rcv_lastsack; 2565 temp->end = sack.start; 2566 temp->dups = min(tcprexmtthresh, 2567 ((sack.end - sack.start)/tp->t_maxseg)); 2568 if (temp->dups < 1) 2569 temp->dups = 1; 2570 temp->rxmit = temp->start; 2571 temp->next = 0; 2572 p->next = temp; 2573 tp->rcv_lastsack = sack.end; 2574 tp->snd_numholes++; 2575 } 2576 } 2577 return; 2578 dropped: 2579 tcpstat_inc(tcps_sack_drop_opts); 2580 } 2581 2582 /* 2583 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if 2584 * it is completely acked; otherwise, tcp_sack_option(), called from 2585 * tcp_dooptions(), will fix up the hole. 2586 */ 2587 void 2588 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2589 { 2590 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2591 /* max because this could be an older ack just arrived */ 2592 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2593 th->th_ack : tp->snd_una; 2594 struct sackhole *cur = tp->snd_holes; 2595 struct sackhole *prev; 2596 while (cur) 2597 if (SEQ_LEQ(cur->end, lastack)) { 2598 prev = cur; 2599 cur = cur->next; 2600 pool_put(&sackhl_pool, prev); 2601 tp->snd_numholes--; 2602 } else if (SEQ_LT(cur->start, lastack)) { 2603 cur->start = lastack; 2604 if (SEQ_LT(cur->rxmit, cur->start)) 2605 cur->rxmit = cur->start; 2606 break; 2607 } else 2608 break; 2609 tp->snd_holes = cur; 2610 } 2611 } 2612 2613 /* 2614 * Delete all receiver-side SACK information. 2615 */ 2616 void 2617 tcp_clean_sackreport(struct tcpcb *tp) 2618 { 2619 int i; 2620 2621 tp->rcv_numsacks = 0; 2622 for (i = 0; i < MAX_SACK_BLKS; i++) 2623 tp->sackblks[i].start = tp->sackblks[i].end=0; 2624 2625 } 2626 2627 /* 2628 * Partial ack handling within a sack recovery episode. When a partial ack 2629 * arrives, turn off retransmission timer, deflate the window, do not clear 2630 * tp->t_dupacks. 2631 */ 2632 void 2633 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2634 { 2635 /* Turn off retx. timer (will start again next segment) */ 2636 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2637 tp->t_rtttime = 0; 2638 /* 2639 * Partial window deflation. This statement relies on the 2640 * fact that tp->snd_una has not been updated yet. 
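 * Together with the unconditional increment that follows the
 * if/else below, the net effect is roughly
 *	cwnd = cwnd - (newly acked) + 2 * t_maxseg;
 * e.g. a cwnd of 10 segments partially acked by 4 segments
 * continues at about 8 segments.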
2641 */ 2642 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2643 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2644 tp->snd_cwnd += tp->t_maxseg; 2645 } else 2646 tp->snd_cwnd = tp->t_maxseg; 2647 tp->snd_cwnd += tp->t_maxseg; 2648 tp->t_flags |= TF_NEEDOUTPUT; 2649 } 2650 2651 /* 2652 * Pull out of band byte out of a segment so 2653 * it doesn't appear in the user's data queue. 2654 * It is still reflected in the segment length for 2655 * sequencing purposes. 2656 */ 2657 void 2658 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2659 { 2660 int cnt = off + urgent - 1; 2661 2662 while (cnt >= 0) { 2663 if (m->m_len > cnt) { 2664 char *cp = mtod(m, caddr_t) + cnt; 2665 struct tcpcb *tp = sototcpcb(so); 2666 2667 tp->t_iobc = *cp; 2668 tp->t_oobflags |= TCPOOB_HAVEDATA; 2669 memmove(cp, cp + 1, m->m_len - cnt - 1); 2670 m->m_len--; 2671 return; 2672 } 2673 cnt -= m->m_len; 2674 m = m->m_next; 2675 if (m == NULL) 2676 break; 2677 } 2678 panic("tcp_pulloutofband"); 2679 } 2680 2681 /* 2682 * Collect new round-trip time estimate 2683 * and update averages and current timeout. 2684 */ 2685 void 2686 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2687 { 2688 int delta, rttmin; 2689 2690 if (rtt < 0) 2691 rtt = 0; 2692 else if (rtt > TCP_RTT_MAX) 2693 rtt = TCP_RTT_MAX; 2694 2695 tcpstat_inc(tcps_rttupdated); 2696 if (tp->t_srtt != 0) { 2697 /* 2698 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2699 * after the binary point (scaled by 4), whereas 2700 * srtt is stored as fixed point with 5 bits after the 2701 * binary point (i.e., scaled by 32). The following magic 2702 * is equivalent to the smoothing algorithm in rfc793 with 2703 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2704 * point). 2705 */ 2706 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2707 (tp->t_srtt >> TCP_RTT_SHIFT); 2708 if ((tp->t_srtt += delta) <= 0) 2709 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2710 /* 2711 * We accumulate a smoothed rtt variance (actually, a 2712 * smoothed mean difference), then set the retransmit 2713 * timer to smoothed rtt + 4 times the smoothed variance. 2714 * rttvar is stored as fixed point with 4 bits after the 2715 * binary point (scaled by 16). The following is 2716 * equivalent to rfc793 smoothing with an alpha of .75 2717 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2718 * rfc793's wired-in beta. 2719 */ 2720 if (delta < 0) 2721 delta = -delta; 2722 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2723 if ((tp->t_rttvar += delta) <= 0) 2724 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2725 } else { 2726 /* 2727 * No rtt measurement yet - use the unsmoothed rtt. 2728 * Set the variance to half the rtt (so our first 2729 * retransmit happens at 3*rtt). 2730 */ 2731 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2732 tp->t_rttvar = (rtt + 1) << 2733 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2734 } 2735 tp->t_rtttime = 0; 2736 tp->t_rxtshift = 0; 2737 2738 /* 2739 * the retransmit should happen at rtt + 4 * rttvar. 2740 * Because of the way we do the smoothing, srtt and rttvar 2741 * will each average +1/2 tick of bias. When we compute 2742 * the retransmit timer, we want 1/2 tick of rounding and 2743 * 1 extra tick because of +-1/2 tick uncertainty in the 2744 * firing of the timer. The bias will give us exactly the 2745 * 1.5 tick we need. But, because the bias is 2746 * statistical, we have to test that we don't drop below 2747 * the minimum feasible timer (which is 2 ticks). 
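 * For a rough feel: with the scaling above, TCP_REXMTVAL()
 * works out to approximately srtt + 4 * rttvar, so an srtt
 * of 90ms and an rttvar of 20ms yield a timeout near 170ms,
 * clamped between rttmin and TCPTV_REXMTMAX by the
 * TCPT_RANGESET() below.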
2748 */ 2749 rttmin = min(max(tp->t_rttmin, rtt + 2 * (TCP_TIME(1) / hz)), 2750 TCPTV_REXMTMAX); 2751 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2752 2753 /* 2754 * We received an ack for a packet that wasn't retransmitted; 2755 * it is probably safe to discard any error indications we've 2756 * received recently. This isn't quite right, but close enough 2757 * for now (a route might have failed after we sent a segment, 2758 * and the return path might not be symmetrical). 2759 */ 2760 tp->t_softerror = 0; 2761 } 2762 2763 /* 2764 * Determine a reasonable value for maxseg size. 2765 * If the route is known, check route for mtu. 2766 * If none, use an mss that can be handled on the outgoing 2767 * interface without forcing IP to fragment; if bigger than 2768 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2769 * to utilize large mbufs. If no route is found, route has no mtu, 2770 * or the destination isn't local, use a default, hopefully conservative 2771 * size (usually 512 or the default IP max size, but no more than the mtu 2772 * of the interface), as we can't discover anything about intervening 2773 * gateways or networks. We also initialize the congestion/slow start 2774 * window to be a single segment if the destination isn't local. 2775 * While looking at the routing entry, we also initialize other path-dependent 2776 * parameters from pre-set or cached values in the routing entry. 2777 * 2778 * Also take into account the space needed for options that we 2779 * send regularly. Make maxseg shorter by that amount to assure 2780 * that we can send maxseg amount of data even when the options 2781 * are present. Store the upper limit of the length of options plus 2782 * data in maxopd. 2783 * 2784 * NOTE: offer == -1 indicates that the maxseg size changed due to 2785 * Path MTU discovery. 2786 */ 2787 int 2788 tcp_mss(struct tcpcb *tp, int offer) 2789 { 2790 struct rtentry *rt; 2791 struct ifnet *ifp = NULL; 2792 int mss, mssopt; 2793 int iphlen; 2794 struct inpcb *inp; 2795 2796 inp = tp->t_inpcb; 2797 2798 mssopt = mss = tcp_mssdflt; 2799 2800 rt = in_pcbrtentry(inp); 2801 2802 if (rt == NULL) 2803 goto out; 2804 2805 ifp = if_get(rt->rt_ifidx); 2806 if (ifp == NULL) 2807 goto out; 2808 2809 switch (tp->pf) { 2810 #ifdef INET6 2811 case AF_INET6: 2812 iphlen = sizeof(struct ip6_hdr); 2813 break; 2814 #endif 2815 case AF_INET: 2816 iphlen = sizeof(struct ip); 2817 break; 2818 default: 2819 /* the family does not support path MTU discovery */ 2820 goto out; 2821 } 2822 2823 /* 2824 * if there's an mtu associated with the route and we support 2825 * path MTU discovery for the underlying protocol family, use it. 2826 */ 2827 if (rt->rt_mtu) { 2828 /* 2829 * One may wish to lower MSS to take into account options, 2830 * especially security-related options. 2831 */ 2832 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2833 /* 2834 * RFC2460 section 5, last paragraph: if path MTU is 2835 * smaller than 1280, use 1280 as packet size and 2836 * attach fragment header. 
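 * In that case the usable payload works out to
 *	1280 - 40 (ip6 header) - 8 (fragment header) -
 *	20 (tcp header) = 1212 bytes per packet,
 * which is exactly what the expression below computes.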
2837 */ 2838 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2839 sizeof(struct tcphdr); 2840 } else { 2841 mss = rt->rt_mtu - iphlen - 2842 sizeof(struct tcphdr); 2843 } 2844 } else if (ifp->if_flags & IFF_LOOPBACK) { 2845 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2846 } else if (tp->pf == AF_INET) { 2847 if (ip_mtudisc) 2848 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2849 } 2850 #ifdef INET6 2851 else if (tp->pf == AF_INET6) { 2852 /* 2853 * for IPv6, path MTU discovery is always turned on, 2854 * or the node must use packet size <= 1280. 2855 */ 2856 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2857 } 2858 #endif /* INET6 */ 2859 2860 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2861 if (offer != -1) { 2862 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2863 mssopt = max(tcp_mssdflt, mssopt); 2864 } 2865 out: 2866 if_put(ifp); 2867 /* 2868 * The current mss, t_maxseg, is initialized to the default value. 2869 * If we compute a smaller value, reduce the current mss. 2870 * If we compute a larger value, return it for use in sending 2871 * a max seg size option, but don't store it for use 2872 * unless we received an offer at least that large from peer. 2873 * 2874 * However, do not accept offers lower than the minimum of 2875 * the interface MTU and 216. 2876 */ 2877 if (offer > 0) 2878 tp->t_peermss = offer; 2879 if (tp->t_peermss) 2880 mss = min(mss, max(tp->t_peermss, 216)); 2881 2882 /* sanity - at least max opt. space */ 2883 mss = max(mss, 64); 2884 2885 /* 2886 * maxopd stores the maximum length of data AND options 2887 * in a segment; maxseg is the amount of data in a normal 2888 * segment. We need to store this value (maxopd) apart 2889 * from maxseg, because now every segment carries options 2890 * and thus we normally have somewhat less data in segments. 2891 */ 2892 tp->t_maxopd = mss; 2893 2894 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2895 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2896 mss -= TCPOLEN_TSTAMP_APPA; 2897 #ifdef TCP_SIGNATURE 2898 if (tp->t_flags & TF_SIGNATURE) 2899 mss -= TCPOLEN_SIGLEN; 2900 #endif 2901 2902 if (offer == -1) { 2903 /* mss changed due to Path MTU discovery */ 2904 tp->t_flags &= ~TF_PMTUD_PEND; 2905 tp->t_pmtud_mtu_sent = 0; 2906 tp->t_pmtud_mss_acked = 0; 2907 if (mss < tp->t_maxseg) { 2908 /* 2909 * Follow suggestion in RFC 2414 to reduce the 2910 * congestion window by the ratio of the old 2911 * segment size to the new segment size. 2912 */ 2913 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2914 mss, mss); 2915 } 2916 } else if (tcp_do_rfc3390 == 2) { 2917 /* increase initial window */ 2918 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2919 } else if (tcp_do_rfc3390) { 2920 /* increase initial window */ 2921 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2922 } else 2923 tp->snd_cwnd = mss; 2924 2925 tp->t_maxseg = mss; 2926 2927 return (offer != -1 ? 
mssopt : mss); 2928 } 2929 2930 u_int 2931 tcp_hdrsz(struct tcpcb *tp) 2932 { 2933 u_int hlen; 2934 2935 switch (tp->pf) { 2936 #ifdef INET6 2937 case AF_INET6: 2938 hlen = sizeof(struct ip6_hdr); 2939 break; 2940 #endif 2941 case AF_INET: 2942 hlen = sizeof(struct ip); 2943 break; 2944 default: 2945 hlen = 0; 2946 break; 2947 } 2948 hlen += sizeof(struct tcphdr); 2949 2950 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2951 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2952 hlen += TCPOLEN_TSTAMP_APPA; 2953 #ifdef TCP_SIGNATURE 2954 if (tp->t_flags & TF_SIGNATURE) 2955 hlen += TCPOLEN_SIGLEN; 2956 #endif 2957 return (hlen); 2958 } 2959 2960 /* 2961 * Set connection variables based on the effective MSS. 2962 * We are passed the TCPCB for the actual connection. If we 2963 * are the server, we are called by the compressed state engine 2964 * when the 3-way handshake is complete. If we are the client, 2965 * we are called when we receive the SYN,ACK from the server. 2966 * 2967 * NOTE: The t_maxseg value must be initialized in the TCPCB 2968 * before this routine is called! 2969 */ 2970 void 2971 tcp_mss_update(struct tcpcb *tp) 2972 { 2973 int mss; 2974 u_long bufsize; 2975 struct rtentry *rt; 2976 struct socket *so; 2977 2978 so = tp->t_inpcb->inp_socket; 2979 mss = tp->t_maxseg; 2980 2981 rt = in_pcbrtentry(tp->t_inpcb); 2982 2983 if (rt == NULL) 2984 return; 2985 2986 bufsize = so->so_snd.sb_hiwat; 2987 if (bufsize < mss) { 2988 mss = bufsize; 2989 /* Update t_maxseg and t_maxopd */ 2990 tcp_mss(tp, mss); 2991 } else { 2992 bufsize = roundup(bufsize, mss); 2993 if (bufsize > sb_max) 2994 bufsize = sb_max; 2995 (void)sbreserve(so, &so->so_snd, bufsize); 2996 } 2997 2998 bufsize = so->so_rcv.sb_hiwat; 2999 if (bufsize > mss) { 3000 bufsize = roundup(bufsize, mss); 3001 if (bufsize > sb_max) 3002 bufsize = sb_max; 3003 (void)sbreserve(so, &so->so_rcv, bufsize); 3004 } 3005 3006 } 3007 3008 /* 3009 * When a partial ack arrives, force the retransmission of the 3010 * next unacknowledged segment. Do not clear tp->t_dupacks. 3011 * By setting snd_nxt to ti_ack, this forces retransmission timer 3012 * to be started again. 3013 */ 3014 void 3015 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 3016 { 3017 /* 3018 * snd_una has not been updated and the socket send buffer 3019 * not yet drained of the acked data, so we have to leave 3020 * snd_una as it was to get the correct data offset in 3021 * tcp_output(). 3022 */ 3023 tcp_seq onxt = tp->snd_nxt; 3024 u_long ocwnd = tp->snd_cwnd; 3025 3026 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3027 tp->t_rtttime = 0; 3028 tp->snd_nxt = th->th_ack; 3029 /* 3030 * Set snd_cwnd to one segment beyond acknowledged offset 3031 * (tp->snd_una not yet updated when this function is called) 3032 */ 3033 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3034 (void)tcp_output(tp); 3035 tp->snd_cwnd = ocwnd; 3036 if (SEQ_GT(onxt, tp->snd_nxt)) 3037 tp->snd_nxt = onxt; 3038 /* 3039 * Partial window deflation. Relies on fact that tp->snd_una 3040 * not updated yet. 
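 * Unlike tcp_sack_partialack() above, only a single t_maxseg
 * is added back here, so the net effect is roughly
 *	cwnd = cwnd - (newly acked) + t_maxseg;
 * e.g. a cwnd of 10 segments partially acked by 4 segments
 * continues at about 7 segments.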
3041 */ 3042 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3043 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3044 else 3045 tp->snd_cwnd = 0; 3046 tp->snd_cwnd += tp->t_maxseg; 3047 } 3048 3049 int 3050 tcp_mss_adv(struct mbuf *m, int af) 3051 { 3052 int mss = 0; 3053 int iphlen; 3054 struct ifnet *ifp = NULL; 3055 3056 if (m && (m->m_flags & M_PKTHDR)) 3057 ifp = if_get(m->m_pkthdr.ph_ifidx); 3058 3059 switch (af) { 3060 case AF_INET: 3061 if (ifp != NULL) 3062 mss = ifp->if_mtu; 3063 iphlen = sizeof(struct ip); 3064 break; 3065 #ifdef INET6 3066 case AF_INET6: 3067 if (ifp != NULL) 3068 mss = ifp->if_mtu; 3069 iphlen = sizeof(struct ip6_hdr); 3070 break; 3071 #endif 3072 default: 3073 unhandled_af(af); 3074 } 3075 if_put(ifp); 3076 mss = mss - iphlen - sizeof(struct tcphdr); 3077 return (max(mss, tcp_mssdflt)); 3078 } 3079 3080 /* 3081 * TCP compressed state engine. Currently used to hold compressed 3082 * state for SYN_RECEIVED. 3083 */ 3084 3085 /* syn hash parameters */ 3086 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; 3087 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3088 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3089 int tcp_syn_use_limit = 100000; 3090 3091 struct syn_cache_set tcp_syn_cache[2]; 3092 int tcp_syn_cache_active; 3093 3094 #define SYN_HASH(sa, sp, dp, rand) \ 3095 (((sa)->s_addr ^ (rand)[0]) * \ 3096 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3097 #ifndef INET6 3098 #define SYN_HASHALL(hash, src, dst, rand) \ 3099 do { \ 3100 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3101 satosin(src)->sin_port, \ 3102 satosin(dst)->sin_port, (rand)); \ 3103 } while (/*CONSTCOND*/ 0) 3104 #else 3105 #define SYN_HASH6(sa, sp, dp, rand) \ 3106 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3107 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3108 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3109 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3110 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3111 3112 #define SYN_HASHALL(hash, src, dst, rand) \ 3113 do { \ 3114 switch ((src)->sa_family) { \ 3115 case AF_INET: \ 3116 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3117 satosin(src)->sin_port, \ 3118 satosin(dst)->sin_port, (rand)); \ 3119 break; \ 3120 case AF_INET6: \ 3121 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3122 satosin6(src)->sin6_port, \ 3123 satosin6(dst)->sin6_port, (rand)); \ 3124 break; \ 3125 default: \ 3126 hash = 0; \ 3127 } \ 3128 } while (/*CONSTCOND*/0) 3129 #endif /* INET6 */ 3130 3131 void 3132 syn_cache_rm(struct syn_cache *sc) 3133 { 3134 sc->sc_flags |= SCF_DEAD; 3135 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3136 sc->sc_tp = NULL; 3137 LIST_REMOVE(sc, sc_tpq); 3138 sc->sc_buckethead->sch_length--; 3139 timeout_del(&sc->sc_timer); 3140 sc->sc_set->scs_count--; 3141 } 3142 3143 void 3144 syn_cache_put(struct syn_cache *sc) 3145 { 3146 m_free(sc->sc_ipopts); 3147 if (sc->sc_route4.ro_rt != NULL) { 3148 rtfree(sc->sc_route4.ro_rt); 3149 sc->sc_route4.ro_rt = NULL; 3150 } 3151 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3152 timeout_add(&sc->sc_timer, 0); 3153 } 3154 3155 struct pool syn_cache_pool; 3156 3157 /* 3158 * We don't estimate RTT with SYNs, so each packet starts with the default 3159 * RTT and each timer step has a fixed timeout value. 
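 * Concretely, SYN_CACHE_TIMER_ARM() below scales
 * TCPTV_SRTTDFLT by tcp_backoff[sc_rxtshift], so each
 * retransmission waits progressively longer, clamped to
 * [TCPTV_MIN, TCPTV_REXMTMAX]; syn_cache_timer() gives up
 * once TCP_MAXRXTSHIFT retransmissions have been sent or the
 * entry has been queued longer than tcptv_keep_init.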
3160 */ 3161 #define SYN_CACHE_TIMER_ARM(sc) \ 3162 do { \ 3163 TCPT_RANGESET((sc)->sc_rxtcur, \ 3164 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3165 TCPTV_REXMTMAX); \ 3166 if (!timeout_initialized(&(sc)->sc_timer)) \ 3167 timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3168 timeout_add_msec(&(sc)->sc_timer, (sc)->sc_rxtcur); \ 3169 } while (/*CONSTCOND*/0) 3170 3171 void 3172 syn_cache_init(void) 3173 { 3174 int i; 3175 3176 /* Initialize the hash buckets. */ 3177 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3178 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3179 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3180 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3181 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3182 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3183 for (i = 0; i < tcp_syn_hash_size; i++) { 3184 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3185 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3186 } 3187 3188 /* Initialize the syn cache pool. */ 3189 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3190 0, "syncache", NULL); 3191 } 3192 3193 void 3194 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3195 { 3196 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active]; 3197 struct syn_cache_head *scp; 3198 struct syn_cache *sc2; 3199 int i; 3200 3201 NET_ASSERT_LOCKED(); 3202 3203 /* 3204 * If there are no entries in the hash table, reinitialize 3205 * the hash secrets. To avoid useless cache swaps and 3206 * reinitialization, use it until the limit is reached. 3207 * An empty cache is also the opportunity to resize the hash. 3208 */ 3209 if (set->scs_count == 0 && set->scs_use <= 0) { 3210 set->scs_use = tcp_syn_use_limit; 3211 if (set->scs_size != tcp_syn_hash_size) { 3212 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3213 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3214 if (scp == NULL) { 3215 /* Try again next time. */ 3216 set->scs_use = 0; 3217 } else { 3218 free(set->scs_buckethead, M_SYNCACHE, 3219 set->scs_size * 3220 sizeof(struct syn_cache_head)); 3221 set->scs_buckethead = scp; 3222 set->scs_size = tcp_syn_hash_size; 3223 for (i = 0; i < tcp_syn_hash_size; i++) 3224 TAILQ_INIT(&scp[i].sch_bucket); 3225 } 3226 } 3227 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3228 tcpstat_inc(tcps_sc_seedrandom); 3229 } 3230 3231 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3232 set->scs_random); 3233 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3234 sc->sc_buckethead = scp; 3235 3236 /* 3237 * Make sure that we don't overflow the per-bucket 3238 * limit or the total cache size limit. 3239 */ 3240 if (scp->sch_length >= tcp_syn_bucket_limit) { 3241 tcpstat_inc(tcps_sc_bucketoverflow); 3242 /* 3243 * Someone might attack our bucket hash function. Reseed 3244 * with random as soon as the passive syn cache gets empty. 3245 */ 3246 set->scs_use = 0; 3247 /* 3248 * The bucket is full. Toss the oldest element in the 3249 * bucket. This will be the first entry in the bucket. 3250 */ 3251 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3252 #ifdef DIAGNOSTIC 3253 /* 3254 * This should never happen; we should always find an 3255 * entry in our bucket. 
3256 */ 3257 if (sc2 == NULL) 3258 panic("%s: bucketoverflow: impossible", __func__); 3259 #endif 3260 syn_cache_rm(sc2); 3261 syn_cache_put(sc2); 3262 } else if (set->scs_count >= tcp_syn_cache_limit) { 3263 struct syn_cache_head *scp2, *sce; 3264 3265 tcpstat_inc(tcps_sc_overflowed); 3266 /* 3267 * The cache is full. Toss the oldest entry in the 3268 * first non-empty bucket we can find. 3269 * 3270 * XXX We would really like to toss the oldest 3271 * entry in the cache, but we hope that this 3272 * condition doesn't happen very often. 3273 */ 3274 scp2 = scp; 3275 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3276 sce = &set->scs_buckethead[set->scs_size]; 3277 for (++scp2; scp2 != scp; scp2++) { 3278 if (scp2 >= sce) 3279 scp2 = &set->scs_buckethead[0]; 3280 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3281 break; 3282 } 3283 #ifdef DIAGNOSTIC 3284 /* 3285 * This should never happen; we should always find a 3286 * non-empty bucket. 3287 */ 3288 if (scp2 == scp) 3289 panic("%s: cacheoverflow: impossible", 3290 __func__); 3291 #endif 3292 } 3293 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3294 syn_cache_rm(sc2); 3295 syn_cache_put(sc2); 3296 } 3297 3298 /* 3299 * Initialize the entry's timer. 3300 */ 3301 sc->sc_rxttot = 0; 3302 sc->sc_rxtshift = 0; 3303 SYN_CACHE_TIMER_ARM(sc); 3304 3305 /* Link it from tcpcb entry */ 3306 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3307 3308 /* Put it into the bucket. */ 3309 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3310 scp->sch_length++; 3311 sc->sc_set = set; 3312 set->scs_count++; 3313 set->scs_use--; 3314 3315 tcpstat_inc(tcps_sc_added); 3316 3317 /* 3318 * If the active cache has exceeded its use limit and 3319 * the passive syn cache is empty, exchange their roles. 3320 */ 3321 if (set->scs_use <= 0 && 3322 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3323 tcp_syn_cache_active = !tcp_syn_cache_active; 3324 } 3325 3326 /* 3327 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3328 * If we have retransmitted an entry the maximum number of times, expire 3329 * that entry. 3330 */ 3331 void 3332 syn_cache_timer(void *arg) 3333 { 3334 struct syn_cache *sc = arg; 3335 uint32_t now; 3336 3337 NET_LOCK(); 3338 if (sc->sc_flags & SCF_DEAD) 3339 goto out; 3340 3341 now = tcp_now(); 3342 3343 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3344 /* Drop it -- too many retransmissions. */ 3345 goto dropit; 3346 } 3347 3348 /* 3349 * Compute the total amount of time this entry has 3350 * been on a queue. If this entry has been on longer 3351 * than the keep alive timer would allow, expire it. 3352 */ 3353 sc->sc_rxttot += sc->sc_rxtcur; 3354 if (sc->sc_rxttot >= TCP_TIME(tcptv_keep_init)) 3355 goto dropit; 3356 3357 tcpstat_inc(tcps_sc_retransmitted); 3358 (void) syn_cache_respond(sc, NULL, now); 3359 3360 /* Advance the timer back-off. 
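 * Each pass through here bumps sc_rxtshift, so the next
 * SYN_CACHE_TIMER_ARM() picks a longer interval from
 * tcp_backoff[]; the checks at the top of this function
 * bound both the retry count and the total time spent.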
*/
3361 	sc->sc_rxtshift++;
3362 	SYN_CACHE_TIMER_ARM(sc);
3363 
3364  out:
3365 	NET_UNLOCK();
3366 	return;
3367 
3368  dropit:
3369 	tcpstat_inc(tcps_sc_timed_out);
3370 	syn_cache_rm(sc);
3371 	syn_cache_put(sc);
3372 	NET_UNLOCK();
3373 }
3374 
3375 void
3376 syn_cache_reaper(void *arg)
3377 {
3378 	struct syn_cache *sc = arg;
3379 
3380 	pool_put(&syn_cache_pool, sc);
3381 	return;
3382 }
3383 
3384 /*
3385  * Remove the syn cache entries created by the specified tcb entry,
3386  * since it makes no sense to keep them around
3387  * (if there's no tcb entry, a syn cache entry will never be used).
3388  */
3389 void
3390 syn_cache_cleanup(struct tcpcb *tp)
3391 {
3392 	struct syn_cache *sc, *nsc;
3393 
3394 	NET_ASSERT_LOCKED();
3395 
3396 	LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
3397 #ifdef DIAGNOSTIC
3398 		if (sc->sc_tp != tp)
3399 			panic("invalid sc_tp in syn_cache_cleanup");
3400 #endif
3401 		syn_cache_rm(sc);
3402 		syn_cache_put(sc);
3403 	}
3404 	/* just for safety */
3405 	LIST_INIT(&tp->t_sc);
3406 }
3407 
3408 /*
3409  * Find an entry in the syn cache.
3410  */
3411 struct syn_cache *
3412 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3413     struct syn_cache_head **headp, u_int rtableid)
3414 {
3415 	struct syn_cache_set *sets[2];
3416 	struct syn_cache *sc;
3417 	struct syn_cache_head *scp;
3418 	u_int32_t hash;
3419 	int i;
3420 
3421 	NET_ASSERT_LOCKED();
3422 
3423 	/* Check the active cache first, the passive cache is likely empty. */
3424 	sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
3425 	sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
3426 	for (i = 0; i < 2; i++) {
3427 		if (sets[i]->scs_count == 0)
3428 			continue;
3429 		SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
3430 		scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
3431 		*headp = scp;
3432 		TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
3433 			if (sc->sc_hash != hash)
3434 				continue;
3435 			if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3436 			    !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3437 			    rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
3438 				return (sc);
3439 		}
3440 	}
3441 	return (NULL);
3442 }
3443 
3444 /*
3445  * This function gets called when we receive an ACK for a
3446  * socket in the LISTEN state.  We look up the connection
3447  * in the syn cache, and if it's there, we pull it out of
3448  * the cache and turn it into a full-blown connection in
3449  * the SYN-RECEIVED state.
3450  *
3451  * The return values may not be immediately obvious, and their effects
3452  * can be subtle, so here they are:
3453  *
3454  *	NULL	SYN was not found in cache; caller should drop the
3455  *		packet and send an RST.
3456  *
3457  *	-1	We were unable to create the new connection, and are
3458  *		aborting it.  An ACK,RST is being sent to the peer
3459  *		(unless we got screwy sequence numbers; see below),
3460  *		because the 3-way handshake has been completed.  Caller
3461  *		should not free the mbuf, since we may be using it.  If
3462  *		we are not, we will free it.
3463  *
3464  * Otherwise, the return value is a pointer to the new socket
3465  * associated with the connection.
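 * A caller is therefore expected to dispatch on the result
 * roughly as follows (an illustrative sketch only):
 *
 *	so = syn_cache_get(src, dst, th, hlen, tlen, so, m, now);
 *	if (so == NULL)
 *		goto dropwithreset;		(not in cache)
 *	if (so == (struct socket *)(-1))
 *		return IPPROTO_DONE;		(aborted; mbuf consumed)
 *	... proceed with the new, connected socket ...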
3466 */ 3467 struct socket * 3468 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3469 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint32_t now) 3470 { 3471 struct syn_cache *sc; 3472 struct syn_cache_head *scp; 3473 struct inpcb *inp, *oldinp; 3474 struct tcpcb *tp = NULL; 3475 struct mbuf *am; 3476 struct socket *oso; 3477 3478 NET_ASSERT_LOCKED(); 3479 3480 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3481 if (sc == NULL) 3482 return (NULL); 3483 3484 /* 3485 * Verify the sequence and ack numbers. Try getting the correct 3486 * response again. 3487 */ 3488 if ((th->th_ack != sc->sc_iss + 1) || 3489 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3490 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3491 (void) syn_cache_respond(sc, m, now); 3492 return ((struct socket *)(-1)); 3493 } 3494 3495 /* Remove this cache entry */ 3496 syn_cache_rm(sc); 3497 3498 /* 3499 * Ok, create the full blown connection, and set things up 3500 * as they would have been set up if we had created the 3501 * connection when the SYN arrived. If we can't create 3502 * the connection, abort it. 3503 */ 3504 oso = so; 3505 so = sonewconn(so, SS_ISCONNECTED, M_DONTWAIT); 3506 if (so == NULL) 3507 goto resetandabort; 3508 3509 oldinp = sotoinpcb(oso); 3510 inp = sotoinpcb(so); 3511 3512 #ifdef IPSEC 3513 /* 3514 * We need to copy the required security levels 3515 * from the old pcb. Ditto for any other 3516 * IPsec-related information. 3517 */ 3518 memcpy(inp->inp_seclevel, oldinp->inp_seclevel, 3519 sizeof(oldinp->inp_seclevel)); 3520 #endif /* IPSEC */ 3521 #ifdef INET6 3522 /* 3523 * inp still has the OLD in_pcb stuff, set the 3524 * v6-related flags on the new guy, too. 3525 */ 3526 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3527 if (inp->inp_flags & INP_IPV6) { 3528 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3529 inp->inp_hops = oldinp->inp_hops; 3530 } else 3531 #endif /* INET6 */ 3532 { 3533 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3534 } 3535 3536 #if NPF > 0 3537 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3538 struct pf_divert *divert; 3539 3540 divert = pf_find_divert(m); 3541 KASSERT(divert != NULL); 3542 inp->inp_rtableid = divert->rdomain; 3543 } else 3544 #endif 3545 /* inherit rtable from listening socket */ 3546 inp->inp_rtableid = sc->sc_rtableid; 3547 3548 inp->inp_lport = th->th_dport; 3549 switch (src->sa_family) { 3550 #ifdef INET6 3551 case AF_INET6: 3552 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3553 break; 3554 #endif /* INET6 */ 3555 case AF_INET: 3556 inp->inp_laddr = satosin(dst)->sin_addr; 3557 inp->inp_options = ip_srcroute(m); 3558 if (inp->inp_options == NULL) { 3559 inp->inp_options = sc->sc_ipopts; 3560 sc->sc_ipopts = NULL; 3561 } 3562 break; 3563 } 3564 in_pcbrehash(inp); 3565 3566 /* 3567 * Give the new socket our cached route reference. 
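 * sc_route4.ro_rt is cleared just below so that the
 * syn_cache_put() at the end of this function does not
 * rtfree() the route the new socket has taken over.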
3568 */ 3569 if (src->sa_family == AF_INET) 3570 inp->inp_route = sc->sc_route4; /* struct assignment */ 3571 #ifdef INET6 3572 else 3573 inp->inp_route6 = sc->sc_route6; 3574 #endif 3575 sc->sc_route4.ro_rt = NULL; 3576 3577 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3578 if (am == NULL) 3579 goto resetandabort; 3580 am->m_len = src->sa_len; 3581 memcpy(mtod(am, caddr_t), src, src->sa_len); 3582 if (in_pcbconnect(inp, am)) { 3583 (void) m_free(am); 3584 goto resetandabort; 3585 } 3586 (void) m_free(am); 3587 3588 tp = intotcpcb(inp); 3589 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3590 if (sc->sc_request_r_scale != 15) { 3591 tp->requested_s_scale = sc->sc_requested_s_scale; 3592 tp->request_r_scale = sc->sc_request_r_scale; 3593 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3594 } 3595 if (sc->sc_flags & SCF_TIMESTAMP) 3596 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3597 3598 tp->t_template = tcp_template(tp); 3599 if (tp->t_template == 0) { 3600 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3601 so = NULL; 3602 goto abort; 3603 } 3604 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3605 tp->ts_modulate = sc->sc_modulate; 3606 tp->ts_recent = sc->sc_timestamp; 3607 tp->iss = sc->sc_iss; 3608 tp->irs = sc->sc_irs; 3609 tcp_sendseqinit(tp); 3610 tp->snd_last = tp->snd_una; 3611 #ifdef TCP_ECN 3612 if (sc->sc_flags & SCF_ECN_PERMIT) { 3613 tp->t_flags |= TF_ECN_PERMIT; 3614 tcpstat_inc(tcps_ecn_accepts); 3615 } 3616 #endif 3617 if (sc->sc_flags & SCF_SACK_PERMIT) 3618 tp->t_flags |= TF_SACK_PERMIT; 3619 #ifdef TCP_SIGNATURE 3620 if (sc->sc_flags & SCF_SIGNATURE) 3621 tp->t_flags |= TF_SIGNATURE; 3622 #endif 3623 tcp_rcvseqinit(tp); 3624 tp->t_state = TCPS_SYN_RECEIVED; 3625 tp->t_rcvtime = now; 3626 tp->t_sndtime = now; 3627 tp->t_rcvacktime = now; 3628 tp->t_sndacktime = now; 3629 TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcptv_keep_init)); 3630 tcpstat_inc(tcps_accepts); 3631 3632 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3633 if (sc->sc_peermaxseg) 3634 tcp_mss_update(tp); 3635 /* Reset initial window to 1 segment for retransmit */ 3636 if (sc->sc_rxtshift > 0) 3637 tp->snd_cwnd = tp->t_maxseg; 3638 tp->snd_wl1 = sc->sc_irs; 3639 tp->rcv_up = sc->sc_irs + 1; 3640 3641 /* 3642 * This is what would have happened in tcp_output() when 3643 * the SYN,ACK was sent. 3644 */ 3645 tp->snd_up = tp->snd_una; 3646 tp->snd_max = tp->snd_nxt = tp->iss+1; 3647 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3648 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3649 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3650 tp->last_ack_sent = tp->rcv_nxt; 3651 3652 tcpstat_inc(tcps_sc_completed); 3653 syn_cache_put(sc); 3654 return (so); 3655 3656 resetandabort: 3657 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3658 m->m_pkthdr.ph_rtableid, now); 3659 abort: 3660 m_freem(m); 3661 if (so != NULL) 3662 soabort(so); 3663 syn_cache_put(sc); 3664 tcpstat_inc(tcps_sc_aborted); 3665 return ((struct socket *)(-1)); 3666 } 3667 3668 /* 3669 * This function is called when we get a RST for a 3670 * non-existent connection, so that we can see if the 3671 * connection is in the syn cache. If it is, zap it. 
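 * The RST is honoured only if its sequence number is one the
 * peer could legitimately have used, i.e. within
 * [sc_irs, sc_irs + 1]; anything else is ignored below,
 * which helps keep blind RSTs from tearing down cached
 * handshakes.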
3672 */ 3673 3674 void 3675 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3676 u_int rtableid) 3677 { 3678 struct syn_cache *sc; 3679 struct syn_cache_head *scp; 3680 3681 NET_ASSERT_LOCKED(); 3682 3683 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3684 return; 3685 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3686 SEQ_GT(th->th_seq, sc->sc_irs + 1)) 3687 return; 3688 syn_cache_rm(sc); 3689 tcpstat_inc(tcps_sc_reset); 3690 syn_cache_put(sc); 3691 } 3692 3693 void 3694 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3695 u_int rtableid) 3696 { 3697 struct syn_cache *sc; 3698 struct syn_cache_head *scp; 3699 3700 NET_ASSERT_LOCKED(); 3701 3702 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3703 return; 3704 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3705 if (ntohl (th->th_seq) != sc->sc_iss) { 3706 return; 3707 } 3708 3709 /* 3710 * If we've retransmitted 3 times and this is our second error, 3711 * we remove the entry. Otherwise, we allow it to continue on. 3712 * This prevents us from incorrectly nuking an entry during a 3713 * spurious network outage. 3714 * 3715 * See tcp_notify(). 3716 */ 3717 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3718 sc->sc_flags |= SCF_UNREACH; 3719 return; 3720 } 3721 3722 syn_cache_rm(sc); 3723 tcpstat_inc(tcps_sc_unreach); 3724 syn_cache_put(sc); 3725 } 3726 3727 /* 3728 * Given a LISTEN socket and an inbound SYN request, add 3729 * this to the syn cache, and send back a segment: 3730 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3731 * to the source. 3732 * 3733 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3734 * Doing so would require that we hold onto the data and deliver it 3735 * to the application. However, if we are the target of a SYN-flood 3736 * DoS attack, an attacker could send data which would eventually 3737 * consume all available buffer space if it were ACKed. By not ACKing 3738 * the data, we avoid this DoS scenario. 3739 */ 3740 3741 int 3742 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3743 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3744 struct tcp_opt_info *oi, tcp_seq *issp, uint32_t now) 3745 { 3746 struct tcpcb tb, *tp; 3747 long win; 3748 struct syn_cache *sc; 3749 struct syn_cache_head *scp; 3750 struct mbuf *ipopts; 3751 3752 tp = sototcpcb(so); 3753 3754 /* 3755 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3756 * 3757 * Note this check is performed in tcp_input() very early on. 3758 */ 3759 3760 /* 3761 * Initialize some local state. 3762 */ 3763 win = sbspace(so, &so->so_rcv); 3764 if (win > TCP_MAXWIN) 3765 win = TCP_MAXWIN; 3766 3767 bzero(&tb, sizeof(tb)); 3768 #ifdef TCP_SIGNATURE 3769 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3770 #else 3771 if (optp) { 3772 #endif 3773 tb.pf = tp->pf; 3774 tb.sack_enable = tp->sack_enable; 3775 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3776 #ifdef TCP_SIGNATURE 3777 if (tp->t_flags & TF_SIGNATURE) 3778 tb.t_flags |= TF_SIGNATURE; 3779 #endif 3780 tb.t_state = TCPS_LISTEN; 3781 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3782 sotoinpcb(so)->inp_rtableid, now)) 3783 return (-1); 3784 } 3785 3786 switch (src->sa_family) { 3787 case AF_INET: 3788 /* 3789 * Remember the IP options, if any. 
3790 */ 3791 ipopts = ip_srcroute(m); 3792 break; 3793 default: 3794 ipopts = NULL; 3795 } 3796 3797 /* 3798 * See if we already have an entry for this connection. 3799 * If we do, resend the SYN,ACK. We do not count this 3800 * as a retransmission (XXX though maybe we should). 3801 */ 3802 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3803 if (sc != NULL) { 3804 tcpstat_inc(tcps_sc_dupesyn); 3805 if (ipopts) { 3806 /* 3807 * If we were remembering a previous source route, 3808 * forget it and use the new one we've been given. 3809 */ 3810 m_free(sc->sc_ipopts); 3811 sc->sc_ipopts = ipopts; 3812 } 3813 sc->sc_timestamp = tb.ts_recent; 3814 if (syn_cache_respond(sc, m, now) == 0) { 3815 tcpstat_inc(tcps_sndacks); 3816 tcpstat_inc(tcps_sndtotal); 3817 } 3818 return (0); 3819 } 3820 3821 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3822 if (sc == NULL) { 3823 m_free(ipopts); 3824 return (-1); 3825 } 3826 3827 /* 3828 * Fill in the cache, and put the necessary IP and TCP 3829 * options into the reply. 3830 */ 3831 memcpy(&sc->sc_src, src, src->sa_len); 3832 memcpy(&sc->sc_dst, dst, dst->sa_len); 3833 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3834 sc->sc_flags = 0; 3835 sc->sc_ipopts = ipopts; 3836 sc->sc_irs = th->th_seq; 3837 3838 sc->sc_iss = issp ? *issp : arc4random(); 3839 sc->sc_peermaxseg = oi->maxseg; 3840 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3841 sc->sc_win = win; 3842 sc->sc_timestamp = tb.ts_recent; 3843 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3844 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3845 sc->sc_flags |= SCF_TIMESTAMP; 3846 sc->sc_modulate = arc4random(); 3847 } 3848 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3849 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3850 sc->sc_requested_s_scale = tb.requested_s_scale; 3851 sc->sc_request_r_scale = 0; 3852 /* 3853 * Pick the smallest possible scaling factor that 3854 * will still allow us to scale up to sb_max. 3855 * 3856 * We do this because there are broken firewalls that 3857 * will corrupt the window scale option, leading to 3858 * the other endpoint believing that our advertised 3859 * window is unscaled. At scale factors larger than 3860 * 5 the unscaled window will drop below 1500 bytes, 3861 * leading to serious problems when traversing these 3862 * broken firewalls. 3863 * 3864 * With the default sbmax of 256K, a scale factor 3865 * of 3 will be chosen by this algorithm. Those who 3866 * choose a larger sbmax should watch out 3867 * for the compatibility problems mentioned above. 3868 * 3869 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3870 * or <SYN,ACK>) segment itself is never scaled. 3871 */ 3872 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3873 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) 3874 sc->sc_request_r_scale++; 3875 } else { 3876 sc->sc_requested_s_scale = 15; 3877 sc->sc_request_r_scale = 15; 3878 } 3879 #ifdef TCP_ECN 3880 /* 3881 * if both ECE and CWR flag bits are set, peer is ECN capable. 3882 */ 3883 if (tcp_do_ecn && 3884 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) 3885 sc->sc_flags |= SCF_ECN_PERMIT; 3886 #endif 3887 /* 3888 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option 3889 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 
3890 */ 3891 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) 3892 sc->sc_flags |= SCF_SACK_PERMIT; 3893 #ifdef TCP_SIGNATURE 3894 if (tb.t_flags & TF_SIGNATURE) 3895 sc->sc_flags |= SCF_SIGNATURE; 3896 #endif 3897 sc->sc_tp = tp; 3898 if (syn_cache_respond(sc, m, now) == 0) { 3899 syn_cache_insert(sc, tp); 3900 tcpstat_inc(tcps_sndacks); 3901 tcpstat_inc(tcps_sndtotal); 3902 } else { 3903 syn_cache_put(sc); 3904 tcpstat_inc(tcps_sc_dropped); 3905 } 3906 3907 return (0); 3908 } 3909 3910 int 3911 syn_cache_respond(struct syn_cache *sc, struct mbuf *m, uint32_t now) 3912 { 3913 u_int8_t *optp; 3914 int optlen, error; 3915 u_int16_t tlen; 3916 struct ip *ip = NULL; 3917 #ifdef INET6 3918 struct ip6_hdr *ip6 = NULL; 3919 #endif 3920 struct tcphdr *th; 3921 u_int hlen; 3922 struct inpcb *inp; 3923 3924 switch (sc->sc_src.sa.sa_family) { 3925 case AF_INET: 3926 hlen = sizeof(struct ip); 3927 break; 3928 #ifdef INET6 3929 case AF_INET6: 3930 hlen = sizeof(struct ip6_hdr); 3931 break; 3932 #endif 3933 default: 3934 m_freem(m); 3935 return (EAFNOSUPPORT); 3936 } 3937 3938 /* Compute the size of the TCP options. */ 3939 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3940 ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) + 3941 #ifdef TCP_SIGNATURE 3942 ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + 3943 #endif 3944 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3945 3946 tlen = hlen + sizeof(struct tcphdr) + optlen; 3947 3948 /* 3949 * Create the IP+TCP header from scratch. 3950 */ 3951 m_freem(m); 3952 #ifdef DIAGNOSTIC 3953 if (max_linkhdr + tlen > MCLBYTES) 3954 return (ENOBUFS); 3955 #endif 3956 MGETHDR(m, M_DONTWAIT, MT_DATA); 3957 if (m && max_linkhdr + tlen > MHLEN) { 3958 MCLGET(m, M_DONTWAIT); 3959 if ((m->m_flags & M_EXT) == 0) { 3960 m_freem(m); 3961 m = NULL; 3962 } 3963 } 3964 if (m == NULL) 3965 return (ENOBUFS); 3966 3967 /* Fixup the mbuf. */ 3968 m->m_data += max_linkhdr; 3969 m->m_len = m->m_pkthdr.len = tlen; 3970 m->m_pkthdr.ph_ifidx = 0; 3971 m->m_pkthdr.ph_rtableid = sc->sc_rtableid; 3972 memset(mtod(m, u_char *), 0, tlen); 3973 3974 switch (sc->sc_src.sa.sa_family) { 3975 case AF_INET: 3976 ip = mtod(m, struct ip *); 3977 ip->ip_dst = sc->sc_src.sin.sin_addr; 3978 ip->ip_src = sc->sc_dst.sin.sin_addr; 3979 ip->ip_p = IPPROTO_TCP; 3980 th = (struct tcphdr *)(ip + 1); 3981 th->th_dport = sc->sc_src.sin.sin_port; 3982 th->th_sport = sc->sc_dst.sin.sin_port; 3983 break; 3984 #ifdef INET6 3985 case AF_INET6: 3986 ip6 = mtod(m, struct ip6_hdr *); 3987 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3988 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3989 ip6->ip6_nxt = IPPROTO_TCP; 3990 /* ip6_plen will be updated in ip6_output() */ 3991 th = (struct tcphdr *)(ip6 + 1); 3992 th->th_dport = sc->sc_src.sin6.sin6_port; 3993 th->th_sport = sc->sc_dst.sin6.sin6_port; 3994 break; 3995 #endif 3996 default: 3997 unhandled_af(sc->sc_src.sa.sa_family); 3998 } 3999 4000 th->th_seq = htonl(sc->sc_iss); 4001 th->th_ack = htonl(sc->sc_irs + 1); 4002 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 4003 th->th_flags = TH_SYN|TH_ACK; 4004 #ifdef TCP_ECN 4005 /* Set ECE for SYN-ACK if peer supports ECN. */ 4006 if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT)) 4007 th->th_flags |= TH_ECE; 4008 #endif 4009 th->th_win = htons(sc->sc_win); 4010 /* th_sum already 0 */ 4011 /* th_urp already 0 */ 4012 4013 /* Tack on the TCP options. 
	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		unhandled_af(sc->sc_src.sa.sa_family);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for the SYN,ACK if the peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/*
	 * Tack on the TCP options.
	 */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

	/* Include the SACK_PERMIT_HDR option if the peer also sent one. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(now + sc->sc_modulate);
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}
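	/*
	 * On the wire the option built above reads NOP, NOP, kind 8,
	 * length 10, TSval, TSecr: TCPOPT_TSTAMP_HDR expands to the four
	 * bytes 01 01 08 0a, followed by two 32-bit timestamps, for
	 * TCPOLEN_TSTAMP_APPA (12) bytes in total.  TSval is offset by
	 * the per-entry random sc_modulate, which keeps the raw
	 * timestamp clock from being exposed to the peer.
	 */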
#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			tdb_unref(tdb);
			return (EINVAL);
		}
		tdb_unref(tdb);
		/* Skip the 16 byte MD5 digest just written. */
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* Use the IPsec policy and TTL from the listening socket on the SYN,ACK. */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note that ip_len is stored
	 * in network byte order, as the rest of the output path expects.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave the flowlabel at 0; this is legal and requires no state */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(inp);

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
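/*
 * Illustrative sketch, not part of the original sources: the fixed part
 * of the SYN,ACK option block that syn_cache_respond() assembles inline
 * above, written as a standalone builder.  The helper name and the plain
 * byte buffer are hypothetical; the option kinds, lengths, NOP padding
 * and ordering match the code above.  Kept under #if 0 so it does not
 * affect compilation.
 */
#if 0
static int
synack_options(u_int8_t *buf, u_int16_t mss, int wscale, int sack_ok,
    int ts_ok, u_int32_t tsval, u_int32_t tsecr)
{
	u_int8_t *p = buf;
	u_int32_t v;

	/* MSS: kind 2, length 4, 16-bit value in network order. */
	*p++ = TCPOPT_MAXSEG;
	*p++ = 4;
	*p++ = (mss >> 8) & 0xff;
	*p++ = mss & 0xff;

	/* SACK permitted: NOP, NOP, kind 4, length 2. */
	if (sack_ok) {
		*p++ = TCPOPT_NOP;
		*p++ = TCPOPT_NOP;
		*p++ = TCPOPT_SACK_PERMITTED;
		*p++ = TCPOLEN_SACK_PERMITTED;
	}

	/* Window scale: NOP, kind 3, length 3, shift count (15 = none). */
	if (wscale != 15) {
		*p++ = TCPOPT_NOP;
		*p++ = TCPOPT_WINDOW;
		*p++ = TCPOLEN_WINDOW;
		*p++ = wscale;
	}

	/* Timestamps: NOP, NOP, kind 8, length 10, TSval, TSecr. */
	if (ts_ok) {
		*p++ = TCPOPT_NOP;
		*p++ = TCPOPT_NOP;
		*p++ = TCPOPT_TIMESTAMP;
		*p++ = TCPOLEN_TIMESTAMP;
		v = htonl(tsval);
		memcpy(p, &v, sizeof(v));
		p += sizeof(v);
		v = htonl(tsecr);
		memcpy(p, &v, sizeof(v));
		p += sizeof(v);
	}

	return (p - buf);	/* a multiple of 4 by construction */
}
#endif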