/*	$OpenBSD: tcp_input.c,v 1.405 2024/04/17 20:48:51 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int	tcp_rst_ppslim = 100;		/* 100pps */
int	tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int	tcp_ackdrop_ppslim = 100;	/* 100pps */
int	tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	TCP_TIME(24 * 24 * 60 * 60)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int32_t)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int32_t)((a)-(b)) >= 0)
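
/*
 * Illustrative note (not in the original sources): the casts above make
 * the comparison wrap correctly at the 32-bit boundary.  For example,
 * with a = 0x00000002 and b = 0xfffffffe, (a - b) is 4 modulo 2^32, so
 * (int32_t)(a - b) >= 0 and TSTMP_GEQ(a, b) is true: a counter that has
 * wrapped past zero still compares as "later", as long as the two values
 * are less than 2^31 apart.
 */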

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && \
	    ISSET(tp->t_inpcb->inp_flags, INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_TIMER_ARM(tp, TCPT_DELACK, tcp_delack_msecs); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *, uint64_t);
void	 syn_cache_timer(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *, uint64_t);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *, uint64_t);
struct syn_cache *syn_cache_lookup(const struct sockaddr *,
		const struct sockaddr *, struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
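
	/*
	 * Illustrative example (not in the original sources): if the
	 * preceding segment covers [100, 120) (th_seq 100, th_reseqlen 20)
	 * and the incoming segment starts at 110 with *tlen 30, then
	 * i == 10, so the first 10 bytes are trimmed and the segment
	 * becomes [120, 140).  If instead the incoming segment were
	 * [105, 115), then i (15) >= *tlen (10), so the whole segment is
	 * a duplicate and is dropped above.
	 */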
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);
	tp->t_rcvoopack++;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_rcv.sb_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *otp = NULL, *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate;
	union {
		struct tcpiphdr tcpip;
#ifdef INET6
		struct tcpipv6hdr tcpip6;
#endif
		char caddr;
	} saveti;
	tcp_seq iss, *reuse = NULL;
	uint64_t now;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;
	now = tcp_now();

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
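		/*
		 * Illustrative note (not in the original sources): the
		 * appendix A encoding is 12 bytes: two NOPs for alignment,
		 * then kind 8 (timestamp), length 10, the 4-byte TSval and
		 * the 4-byte TSecr.  That is why a single 32-bit compare
		 * against TCPOPT_TSTAMP_HDR (NOP, NOP, 8, 10) is enough to
		 * recognize it below.
		 */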
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	if (th->th_dport == 0) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup(&tcb6table, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcb6table, &ip6->ip6_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
	}
#ifdef IPSEC
	if (ipsec_in_use) {
		struct m_tag *mtag;
		struct tdb *tdb = NULL;
		int error;

		/* Find most recent IPsec tag */
		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
		if (mtag != NULL) {
			struct tdb_ident *tdbi;

			tdbi = (struct tdb_ident *)(mtag + 1);
			tdb = gettdb(tdbi->rdomain, tdbi->spi,
			    &tdbi->dst, tdbi->proto);
		}
		error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN,
		    tdb, inp ? &inp->inp_seclevel : NULL, NULL, NULL);
		tdb_unref(tdb);
		if (error) {
			tcpstat_inc(tcps_rcvnosec);
			goto drop;
		}
	}
#endif /* IPSEC */

	if (inp == NULL) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
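
	/*
	 * Illustrative note (not in the original sources): th_win is a
	 * 16-bit field, so an unscaled window is at most 65535 bytes.
	 * RFC 1323/7323 limits the shift to TCP_MAX_WINSHIFT (14), giving
	 * a maximum scaled window of 65535 << 14, roughly 1 GB.  The
	 * shift never applies to SYN segments, whose window is always
	 * interpreted unscaled.
	 */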

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		}

		if (so->so_options & SO_DEBUG) {
			otp = tp;
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				saveti.tcpip6.ti6_i = *ip6;
				saveti.tcpip6.ti6_t = *th;
				break;
#endif
			case AF_INET:
				memcpy(&saveti.tcpip.ti_i, ip, sizeof(*ip));
				saveti.tcpip.ti_t = *th;
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m, now);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					in_pcbunref(inp);
					inp = in_pcbref(sotoinpcb(so));
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/
				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for peer to send a RST, and
				 * RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says
				 * the following:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple description text for
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse, now)
				    == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th);	/* Delete stale SACK holes */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid, now))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int32_t rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				tp->t_rcvacktime = now;
				ND6_HINT(tp);
				sbdrop(so, &so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/* Pull snd_wl2 up to prevent seq wrap. */
				tp->snd_wl2 = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		}
		else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			/* Pull snd_wl1 and rcv_up up to prevent seq wrap. */
			tp->snd_wl1 = th->th_seq;
			/* Packet has most recent segment, no urgent exists. */
			tp->rcv_up = tp->rcv_nxt;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
				    now - tp->rfbuf_ts > (tp->t_srtt >>
				    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT))) {
					tcp_update_rcvspace(tp);
					/* Start over with next RTT. */
					tp->rfbuf_cnt = 0;
					tp->rfbuf_ts = 0;
				} else
					tp->rfbuf_cnt += tlen;
				m_adj(m, iphlen + off);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			in_pcbunref(inp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{
		int win;

		win = sbspace(so, &so->so_rcv);
		if (win < 0)
			win = 0;
		tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}
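
	/*
	 * Illustrative note (not in the original sources): taking the
	 * maximum with (rcv_adv - rcv_nxt) keeps the computed window from
	 * shrinking below what was already advertised to the peer.  For
	 * example, if only 8 KB of buffer space remains but we previously
	 * advertised 16 KB past rcv_nxt, rcv_wnd stays at 16 KB; RFC 1122
	 * strongly discourages retracting an advertised window.
	 */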

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the
	 * timestamp or the sequence numbers are above the previous
	 * ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			in_pcbunref(inp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if (now - tp->ts_recent_age > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}
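
	/*
	 * Illustrative note (not in the original sources): the 24-day
	 * cutoff matches the PAWS assumptions in RFC 1323/7323.  A 32-bit
	 * timestamp clock ticking once per millisecond wraps its signed
	 * half-range (2^31 ms) in roughly 24.8 days, so once ts_recent is
	 * about 24 days old the modulo comparison above can no longer
	 * distinguish "older" from "wrapped", and ts_recent must be
	 * invalidated rather than used to drop segments.
	 */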

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}
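
	/*
	 * Illustrative note (not in the original sources): accepting an
	 * RST only when its sequence number is last_ack_sent, rcv_nxt or
	 * rcv_nxt + 1, rather than anywhere in the receive window, limits
	 * the odds that a blind attacker can guess an acceptable sequence
	 * number and tear down the connection (the concern later codified
	 * in RFC 5961).
	 */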

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */
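
		/*
		 * Illustrative example (not in the original sources): with
		 * maxseg 1460 and min(snd_wnd, snd_cwnd) of 14600 bytes,
		 * win is 10 segments, so an ECE halves ssthresh and cwnd
		 * to 7300 bytes.  Because snd_last is advanced to snd_max,
		 * a second ECE arriving before all of that data is acked
		 * does not halve the window again.
		 */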

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)) {
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh +
						    tp->t_maxseg * tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
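		/*
		 * Illustrative example (not in the original sources): with
		 * tcprexmtthresh 3, maxseg 1460 and a 14600-byte flight,
		 * the third duplicate ACK above sets ssthresh to
		 * 5 * 1460 = 7300 (half the flight in whole segments),
		 * retransmits the missing segment, then inflates cwnd to
		 * ssthresh + 3 * 1460.  Each further duplicate ACK adds
		 * one maxseg, since it signals that another segment has
		 * left the network.
		 */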
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);
		tp->t_rcvacktime = now;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
			u_int cw = tp->snd_cwnd;
			u_int incr = tp->t_maxseg;

			if (cw > tp->snd_ssthresh)
				incr = max(incr * incr / cw, 1);
			if (tp->t_dupacks < tcprexmtthresh)
				tp->snd_cwnd = ulmin(cw + incr,
				    TCP_MAXWIN << tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			if (tp->snd_wnd > so->so_snd.sb_cc)
				tp->snd_wnd -= so->so_snd.sb_cc;
			else
				tp->snd_wnd = 0;
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(so, &so->so_snd, acked);
			if (tp->snd_wnd > acked)
				tp->snd_wnd -= acked;
			else
				tp->snd_wnd = 0;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}
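
		/*
		 * Illustrative example (not in the original sources) for
		 * the congestion window arithmetic above: in slow start
		 * (cwnd <= ssthresh) each ACK grows cwnd by a full maxseg,
		 * doubling it roughly once per RTT.  In congestion
		 * avoidance with maxseg 1460 and cwnd 14600, each ACK adds
		 * 1460 * 1460 / 14600 = 146 bytes, i.e. about one maxseg
		 * per window of ACKs.
		 */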

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}
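
	/*
	 * Illustrative note (not in the original sources): snd_wl1 and
	 * snd_wl2 remember the seq and ack of the segment that last
	 * updated the window, so the test above only takes a window from
	 * a segment that is newer (first by seq, then by ack), or that
	 * enlarges the window at the same seq/ack.  An old, reordered
	 * segment carrying a stale, smaller window therefore cannot
	 * retract a newer advertisement.
	 */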

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;		/* XXX */
			tiflags &= ~TH_URG;	/* XXX */
			goto dodata;		/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_rcv.sb_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented to
		 * the user.  This can happen independent of advancing the
		 * URG pointer, but if two URG's are pending at once, some
		 * out-of-band data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}
1986 	 * the connection is fully established.
1987 	 */
1988 	if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
1989 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1990 			tp->t_flags |= TF_BLOCKOUTPUT;
1991 			socantrcvmore(so);
1992 			tp->t_flags &= ~TF_BLOCKOUTPUT;
1993 			tp->t_flags |= TF_ACKNOW;
1994 			tp->rcv_nxt++;
1995 		}
1996 		switch (tp->t_state) {
1997 
1998 		/*
1999 		 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2000 		 */
2001 		case TCPS_ESTABLISHED:
2002 			tp->t_state = TCPS_CLOSE_WAIT;
2003 			break;
2004 
2005 		/*
2006 		 * If still in FIN_WAIT_1 STATE, our FIN has not been acked,
2007 		 * so enter the CLOSING state.
2008 		 */
2009 		case TCPS_FIN_WAIT_1:
2010 			tp->t_state = TCPS_CLOSING;
2011 			break;
2012 
2013 		/*
2014 		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2015 		 * starting the time-wait timer, turning off the other
2016 		 * standard timers.
2017 		 */
2018 		case TCPS_FIN_WAIT_2:
2019 			tp->t_state = TCPS_TIME_WAIT;
2020 			tcp_canceltimers(tp);
2021 			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2022 			tp->t_flags |= TF_BLOCKOUTPUT;
2023 			soisdisconnected(so);
2024 			tp->t_flags &= ~TF_BLOCKOUTPUT;
2025 			break;
2026 
2027 		/*
2028 		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2029 		 */
2030 		case TCPS_TIME_WAIT:
2031 			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2032 			break;
2033 		}
2034 	}
2035 	if (otp)
2036 		tcp_trace(TA_INPUT, ostate, tp, otp, &saveti.caddr, 0, tlen);
2037 
2038 	/*
2039 	 * Return any desired output.
2040 	 */
2041 	if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
2042 		(void) tcp_output(tp);
2043 	in_pcbunref(inp);
2044 	return IPPROTO_DONE;
2045 
2046 badsyn:
2047 	/*
2048 	 * Received a bad SYN.  Increment counters and dropwithreset.
2049 	 */
2050 	tcpstat_inc(tcps_badsyn);
2051 	tp = NULL;
2052 	goto dropwithreset;
2053 
2054 dropafterack_ratelim:
2055 	if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
2056 	    tcp_ackdrop_ppslim) == 0) {
2057 		/* XXX stat */
2058 		goto drop;
2059 	}
2060 	/* ...fall into dropafterack... */
2061 
2062 dropafterack:
2063 	/*
2064 	 * Generate an ACK, dropping the incoming segment if it occupies
2065 	 * sequence space; the ACK reflects our state.
2066 	 */
2067 	if (tiflags & TH_RST)
2068 		goto drop;
2069 	m_freem(m);
2070 	tp->t_flags |= TF_ACKNOW;
2071 	(void) tcp_output(tp);
2072 	in_pcbunref(inp);
2073 	return IPPROTO_DONE;
2074 
2075 dropwithreset_ratelim:
2076 	/*
2077 	 * We may want to rate-limit RSTs in certain situations,
2078 	 * particularly if we are sending an RST in response to
2079 	 * an attempt to connect to or otherwise communicate with
2080 	 * a port for which we have no socket.
2081 	 */
2082 	if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
2083 	    tcp_rst_ppslim) == 0) {
2084 		/* XXX stat */
2085 		goto drop;
2086 	}
2087 	/* ...fall into dropwithreset... */
2088 
2089 dropwithreset:
2090 	/*
2091 	 * Generate a RST, dropping the incoming segment.
2092 	 * Make the ACK acceptable to the originator of the segment.
2093 	 * Don't bother to respond to an RST.
2094 	 */
2095 	if (tiflags & TH_RST)
2096 		goto drop;
2097 	if (tiflags & TH_ACK) {
2098 		tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack,
2099 		    TH_RST, m->m_pkthdr.ph_rtableid, now);
2100 	} else {
2101 		if (tiflags & TH_SYN)
2102 			tlen++;
2103 		tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen,
2104 		    (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now);
2105 	}
2106 	m_freem(m);
2107 	in_pcbunref(inp);
2108 	return IPPROTO_DONE;
2109 
2110 drop:
2111 	/*
2112 	 * Drop space held by incoming segment and return.
2113 */ 2114 if (otp) 2115 tcp_trace(TA_DROP, ostate, tp, otp, &saveti.caddr, 0, tlen); 2116 2117 m_freem(m); 2118 in_pcbunref(inp); 2119 return IPPROTO_DONE; 2120 } 2121 2122 int 2123 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2124 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2125 u_int rtableid, uint64_t now) 2126 { 2127 u_int16_t mss = 0; 2128 int opt, optlen; 2129 #ifdef TCP_SIGNATURE 2130 caddr_t sigp = NULL; 2131 struct tdb *tdb = NULL; 2132 #endif /* TCP_SIGNATURE */ 2133 2134 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2135 opt = cp[0]; 2136 if (opt == TCPOPT_EOL) 2137 break; 2138 if (opt == TCPOPT_NOP) 2139 optlen = 1; 2140 else { 2141 if (cnt < 2) 2142 break; 2143 optlen = cp[1]; 2144 if (optlen < 2 || optlen > cnt) 2145 break; 2146 } 2147 switch (opt) { 2148 2149 default: 2150 continue; 2151 2152 case TCPOPT_MAXSEG: 2153 if (optlen != TCPOLEN_MAXSEG) 2154 continue; 2155 if (!(th->th_flags & TH_SYN)) 2156 continue; 2157 if (TCPS_HAVERCVDSYN(tp->t_state)) 2158 continue; 2159 memcpy(&mss, cp + 2, sizeof(mss)); 2160 mss = ntohs(mss); 2161 oi->maxseg = mss; 2162 break; 2163 2164 case TCPOPT_WINDOW: 2165 if (optlen != TCPOLEN_WINDOW) 2166 continue; 2167 if (!(th->th_flags & TH_SYN)) 2168 continue; 2169 if (TCPS_HAVERCVDSYN(tp->t_state)) 2170 continue; 2171 tp->t_flags |= TF_RCVD_SCALE; 2172 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2173 break; 2174 2175 case TCPOPT_TIMESTAMP: 2176 if (optlen != TCPOLEN_TIMESTAMP) 2177 continue; 2178 oi->ts_present = 1; 2179 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2180 oi->ts_val = ntohl(oi->ts_val); 2181 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2182 oi->ts_ecr = ntohl(oi->ts_ecr); 2183 2184 if (!(th->th_flags & TH_SYN)) 2185 continue; 2186 if (TCPS_HAVERCVDSYN(tp->t_state)) 2187 continue; 2188 /* 2189 * A timestamp received in a SYN makes 2190 * it ok to send timestamp requests and replies. 
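		 * For reference, the option parsed above sits on the wire
		 * as <kind=8, len=10, TSval, TSecr>, which is why the two
		 * memcpy calls pull their 32-bit values from offsets 2
		 * and 6.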
2191 */ 2192 tp->t_flags |= TF_RCVD_TSTMP; 2193 tp->ts_recent = oi->ts_val; 2194 tp->ts_recent_age = now; 2195 break; 2196 2197 case TCPOPT_SACK_PERMITTED: 2198 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2199 continue; 2200 if (!(th->th_flags & TH_SYN)) 2201 continue; 2202 if (TCPS_HAVERCVDSYN(tp->t_state)) 2203 continue; 2204 /* MUST only be set on SYN */ 2205 tp->t_flags |= TF_SACK_PERMIT; 2206 break; 2207 case TCPOPT_SACK: 2208 tcp_sack_option(tp, th, cp, optlen); 2209 break; 2210 #ifdef TCP_SIGNATURE 2211 case TCPOPT_SIGNATURE: 2212 if (optlen != TCPOLEN_SIGNATURE) 2213 continue; 2214 2215 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2216 goto bad; 2217 2218 sigp = cp + 2; 2219 break; 2220 #endif /* TCP_SIGNATURE */ 2221 } 2222 } 2223 2224 #ifdef TCP_SIGNATURE 2225 if (tp->t_flags & TF_SIGNATURE) { 2226 union sockaddr_union src, dst; 2227 2228 memset(&src, 0, sizeof(union sockaddr_union)); 2229 memset(&dst, 0, sizeof(union sockaddr_union)); 2230 2231 switch (tp->pf) { 2232 case 0: 2233 case AF_INET: 2234 src.sa.sa_len = sizeof(struct sockaddr_in); 2235 src.sa.sa_family = AF_INET; 2236 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2237 dst.sa.sa_len = sizeof(struct sockaddr_in); 2238 dst.sa.sa_family = AF_INET; 2239 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2240 break; 2241 #ifdef INET6 2242 case AF_INET6: 2243 src.sa.sa_len = sizeof(struct sockaddr_in6); 2244 src.sa.sa_family = AF_INET6; 2245 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2246 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2247 dst.sa.sa_family = AF_INET6; 2248 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2249 break; 2250 #endif /* INET6 */ 2251 } 2252 2253 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2254 0, &src, &dst, IPPROTO_TCP); 2255 2256 /* 2257 * We don't have an SA for this peer, so we turn off 2258 * TF_SIGNATURE on the listen socket 2259 */ 2260 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2261 tp->t_flags &= ~TF_SIGNATURE; 2262 2263 } 2264 2265 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2266 tcpstat_inc(tcps_rcvbadsig); 2267 goto bad; 2268 } 2269 2270 if (sigp) { 2271 char sig[16]; 2272 2273 if (tdb == NULL) { 2274 tcpstat_inc(tcps_rcvbadsig); 2275 goto bad; 2276 } 2277 2278 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2279 goto bad; 2280 2281 if (timingsafe_bcmp(sig, sigp, 16)) { 2282 tcpstat_inc(tcps_rcvbadsig); 2283 goto bad; 2284 } 2285 2286 tcpstat_inc(tcps_rcvgoodsig); 2287 } 2288 2289 tdb_unref(tdb); 2290 #endif /* TCP_SIGNATURE */ 2291 2292 return (0); 2293 2294 #ifdef TCP_SIGNATURE 2295 bad: 2296 tdb_unref(tdb); 2297 #endif /* TCP_SIGNATURE */ 2298 return (-1); 2299 } 2300 2301 u_long 2302 tcp_seq_subtract(u_long a, u_long b) 2303 { 2304 return ((long)(a - b)); 2305 } 2306 2307 /* 2308 * This function is called upon receipt of new valid data (while not in header 2309 * prediction mode), and it updates the ordered list of sacks. 2310 */ 2311 void 2312 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2313 tcp_seq rcv_lastend) 2314 { 2315 /* 2316 * First reported block MUST be the most recent one. Subsequent 2317 * blocks SHOULD be in the order in which they arrived at the 2318 * receiver. These two conditions make the implementation fully 2319 * compliant with RFC 2018. 
2320 */ 2321 int i, j = 0, count = 0, lastpos = -1; 2322 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2323 2324 /* First clean up current list of sacks */ 2325 for (i = 0; i < tp->rcv_numsacks; i++) { 2326 sack = tp->sackblks[i]; 2327 if (sack.start == 0 && sack.end == 0) { 2328 count++; /* count = number of blocks to be discarded */ 2329 continue; 2330 } 2331 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2332 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2333 count++; 2334 } else { 2335 temp[j].start = tp->sackblks[i].start; 2336 temp[j++].end = tp->sackblks[i].end; 2337 } 2338 } 2339 tp->rcv_numsacks -= count; 2340 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2341 tcp_clean_sackreport(tp); 2342 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2343 /* ==> need first sack block */ 2344 tp->sackblks[0].start = rcv_laststart; 2345 tp->sackblks[0].end = rcv_lastend; 2346 tp->rcv_numsacks = 1; 2347 } 2348 return; 2349 } 2350 /* Otherwise, sack blocks are already present. */ 2351 for (i = 0; i < tp->rcv_numsacks; i++) 2352 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2353 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2354 return; /* sack list remains unchanged */ 2355 /* 2356 * From here, segment just received should be (part of) the 1st sack. 2357 * Go through list, possibly coalescing sack block entries. 2358 */ 2359 firstsack.start = rcv_laststart; 2360 firstsack.end = rcv_lastend; 2361 for (i = 0; i < tp->rcv_numsacks; i++) { 2362 sack = tp->sackblks[i]; 2363 if (SEQ_LT(sack.end, firstsack.start) || 2364 SEQ_GT(sack.start, firstsack.end)) 2365 continue; /* no overlap */ 2366 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2367 /* 2368 * identical block; delete it here since we will 2369 * move it to the front of the list. 2370 */ 2371 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2372 lastpos = i; /* last posn with a zero entry */ 2373 continue; 2374 } 2375 if (SEQ_LEQ(sack.start, firstsack.start)) 2376 firstsack.start = sack.start; /* merge blocks */ 2377 if (SEQ_GEQ(sack.end, firstsack.end)) 2378 firstsack.end = sack.end; /* merge blocks */ 2379 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2380 lastpos = i; /* last posn with a zero entry */ 2381 } 2382 if (lastpos != -1) { /* at least one merge */ 2383 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2384 sack = tp->sackblks[i]; 2385 if (sack.start == 0 && sack.end == 0) 2386 continue; 2387 temp[j++] = sack; 2388 } 2389 tp->rcv_numsacks = j; /* including first blk (added later) */ 2390 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2391 tp->sackblks[i] = temp[i]; 2392 } else { /* no merges -- shift sacks by 1 */ 2393 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2394 tp->rcv_numsacks++; 2395 for (i = tp->rcv_numsacks-1; i > 0; i--) 2396 tp->sackblks[i] = tp->sackblks[i-1]; 2397 } 2398 tp->sackblks[0] = firstsack; 2399 return; 2400 } 2401 2402 /* 2403 * Process the TCP SACK option. tp->snd_holes is an ordered list 2404 * of holes (oldest to newest, in terms of the sequence space). 2405 */ 2406 void 2407 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2408 { 2409 int tmp_olen; 2410 u_char *tmp_cp; 2411 struct sackhole *cur, *p, *temp; 2412 2413 if (!tp->sack_enable) 2414 return; 2415 /* SACK without ACK doesn't make sense. */ 2416 if ((th->th_flags & TH_ACK) == 0) 2417 return; 2418 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
*/ 2419 if (SEQ_LT(th->th_ack, tp->snd_una) || 2420 SEQ_GT(th->th_ack, tp->snd_max)) 2421 return; 2422 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2423 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2424 return; 2425 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2426 tmp_cp = cp + 2; 2427 tmp_olen = optlen - 2; 2428 tcpstat_inc(tcps_sack_rcv_opts); 2429 if (tp->snd_numholes < 0) 2430 tp->snd_numholes = 0; 2431 if (tp->t_maxseg == 0) 2432 panic("tcp_sack_option"); /* Should never happen */ 2433 while (tmp_olen > 0) { 2434 struct sackblk sack; 2435 2436 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2437 sack.start = ntohl(sack.start); 2438 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2439 sack.end = ntohl(sack.end); 2440 tmp_olen -= TCPOLEN_SACK; 2441 tmp_cp += TCPOLEN_SACK; 2442 if (SEQ_LEQ(sack.end, sack.start)) 2443 continue; /* bad SACK fields */ 2444 if (SEQ_LEQ(sack.end, tp->snd_una)) 2445 continue; /* old block */ 2446 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2447 if (SEQ_LT(sack.start, th->th_ack)) 2448 continue; 2449 } 2450 if (SEQ_GT(sack.end, tp->snd_max)) 2451 continue; 2452 if (tp->snd_holes == NULL) { /* first hole */ 2453 tp->snd_holes = (struct sackhole *) 2454 pool_get(&sackhl_pool, PR_NOWAIT); 2455 if (tp->snd_holes == NULL) { 2456 /* ENOBUFS, so ignore SACKed block for now */ 2457 goto dropped; 2458 } 2459 cur = tp->snd_holes; 2460 cur->start = th->th_ack; 2461 cur->end = sack.start; 2462 cur->rxmit = cur->start; 2463 cur->next = NULL; 2464 tp->snd_numholes = 1; 2465 tp->rcv_lastsack = sack.end; 2466 /* 2467 * dups is at least one. If more data has been 2468 * SACKed, it can be greater than one. 2469 */ 2470 cur->dups = min(tcprexmtthresh, 2471 ((sack.end - cur->end)/tp->t_maxseg)); 2472 if (cur->dups < 1) 2473 cur->dups = 1; 2474 continue; /* with next sack block */ 2475 } 2476 /* Go thru list of holes: p = previous, cur = current */ 2477 p = cur = tp->snd_holes; 2478 while (cur) { 2479 if (SEQ_LEQ(sack.end, cur->start)) 2480 /* SACKs data before the current hole */ 2481 break; /* no use going through more holes */ 2482 if (SEQ_GEQ(sack.start, cur->end)) { 2483 /* SACKs data beyond the current hole */ 2484 cur->dups++; 2485 if (((sack.end - cur->end)/tp->t_maxseg) >= 2486 tcprexmtthresh) 2487 cur->dups = tcprexmtthresh; 2488 p = cur; 2489 cur = cur->next; 2490 continue; 2491 } 2492 if (SEQ_LEQ(sack.start, cur->start)) { 2493 /* Data acks at least the beginning of hole */ 2494 if (SEQ_GEQ(sack.end, cur->end)) { 2495 /* Acks entire hole, so delete hole */ 2496 if (p != cur) { 2497 p->next = cur->next; 2498 pool_put(&sackhl_pool, cur); 2499 cur = p->next; 2500 } else { 2501 cur = cur->next; 2502 pool_put(&sackhl_pool, p); 2503 p = cur; 2504 tp->snd_holes = p; 2505 } 2506 tp->snd_numholes--; 2507 continue; 2508 } 2509 /* otherwise, move start of hole forward */ 2510 cur->start = sack.end; 2511 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2512 p = cur; 2513 cur = cur->next; 2514 continue; 2515 } 2516 /* move end of hole backward */ 2517 if (SEQ_GEQ(sack.end, cur->end)) { 2518 cur->end = sack.start; 2519 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2520 cur->dups++; 2521 if (((sack.end - cur->end)/tp->t_maxseg) >= 2522 tcprexmtthresh) 2523 cur->dups = tcprexmtthresh; 2524 p = cur; 2525 cur = cur->next; 2526 continue; 2527 } 2528 if (SEQ_LT(cur->start, sack.start) && 2529 SEQ_GT(cur->end, sack.end)) { 2530 /* 2531 * ACKs some data in middle of a hole; need to 2532 * split current hole 2533 */ 2534 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2535 
	goto dropped;
2536 				temp = (struct sackhole *)
2537 				    pool_get(&sackhl_pool, PR_NOWAIT);
2538 				if (temp == NULL)
2539 					goto dropped; /* ENOBUFS */
2540 				temp->next = cur->next;
2541 				temp->start = sack.end;
2542 				temp->end = cur->end;
2543 				temp->dups = cur->dups;
2544 				temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
2545 				cur->end = sack.start;
2546 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
2547 				cur->dups++;
2548 				if (((sack.end - cur->end)/tp->t_maxseg) >=
2549 				    tcprexmtthresh)
2550 					cur->dups = tcprexmtthresh;
2551 				cur->next = temp;
2552 				p = temp;
2553 				cur = p->next;
2554 				tp->snd_numholes++;
2555 			}
2556 		}
2557 		/* At this point, p points to the last hole on the list */
2558 		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
2559 			/*
2560 			 * Need to append new hole at end.
2561 			 * Last hole is p (and it's not NULL).
2562 			 */
2563 			if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT)
2564 				goto dropped;
2565 			temp = (struct sackhole *)
2566 			    pool_get(&sackhl_pool, PR_NOWAIT);
2567 			if (temp == NULL)
2568 				goto dropped; /* ENOBUFS */
2569 			temp->start = tp->rcv_lastsack;
2570 			temp->end = sack.start;
2571 			temp->dups = min(tcprexmtthresh,
2572 			    ((sack.end - sack.start)/tp->t_maxseg));
2573 			if (temp->dups < 1)
2574 				temp->dups = 1;
2575 			temp->rxmit = temp->start;
2576 			temp->next = 0;
2577 			p->next = temp;
2578 			tp->rcv_lastsack = sack.end;
2579 			tp->snd_numholes++;
2580 		}
2581 	}
2582 	return;
2583 dropped:
2584 	tcpstat_inc(tcps_sack_drop_opts);
2585 }
2586 
2587 /*
2588  * Delete stale (i.e., cumulatively ack'd) holes.  A hole is deleted only if
2589  * it is completely acked; otherwise, tcp_sack_option(), called from
2590  * tcp_dooptions(), will fix up the hole.
2591  */
2592 void
2593 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
2594 {
2595 	if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2596 		/* max because this could be an older ack just arrived */
2597 		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2598 		    th->th_ack : tp->snd_una;
2599 		struct sackhole *cur = tp->snd_holes;
2600 		struct sackhole *prev;
2601 		while (cur)
2602 			if (SEQ_LEQ(cur->end, lastack)) {
2603 				prev = cur;
2604 				cur = cur->next;
2605 				pool_put(&sackhl_pool, prev);
2606 				tp->snd_numholes--;
2607 			} else if (SEQ_LT(cur->start, lastack)) {
2608 				cur->start = lastack;
2609 				if (SEQ_LT(cur->rxmit, cur->start))
2610 					cur->rxmit = cur->start;
2611 				break;
2612 			} else
2613 				break;
2614 		tp->snd_holes = cur;
2615 	}
2616 }
2617 
2618 /*
2619  * Delete all receiver-side SACK information.
2620  */
2621 void
2622 tcp_clean_sackreport(struct tcpcb *tp)
2623 {
2624 	int i;
2625 
2626 	tp->rcv_numsacks = 0;
2627 	for (i = 0; i < MAX_SACK_BLKS; i++)
2628 		tp->sackblks[i].start = tp->sackblks[i].end = 0;
2629 
2630 }
2631 
2632 /*
2633  * Partial ack handling within a sack recovery episode.  When a partial ack
2634  * arrives, turn off the retransmission timer, deflate the window, and do
2635  * not clear tp->t_dupacks.
2636  */
2637 void
2638 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
2639 {
2640 	/* Turn off retx. timer (will start again next segment) */
2641 	TCP_TIMER_DISARM(tp, TCPT_REXMT);
2642 	tp->t_rtttime = 0;
2643 	/*
2644 	 * Partial window deflation.  This statement relies on the
2645 	 * fact that tp->snd_una has not been updated yet.
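	 * A rough worked example with t_maxseg = 1460: if snd_cwnd is
	 * 14600 and the partial ack covers 4380 bytes, the lines below
	 * leave snd_cwnd = 14600 - 4380 + 2 * 1460 = 13140, i.e. the
	 * window deflates by the acked amount but is credited two
	 * segments so that something can be sent immediately.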
2646 */ 2647 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2648 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2649 tp->snd_cwnd += tp->t_maxseg; 2650 } else 2651 tp->snd_cwnd = tp->t_maxseg; 2652 tp->snd_cwnd += tp->t_maxseg; 2653 tp->t_flags |= TF_NEEDOUTPUT; 2654 } 2655 2656 /* 2657 * Pull out of band byte out of a segment so 2658 * it doesn't appear in the user's data queue. 2659 * It is still reflected in the segment length for 2660 * sequencing purposes. 2661 */ 2662 void 2663 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2664 { 2665 int cnt = off + urgent - 1; 2666 2667 while (cnt >= 0) { 2668 if (m->m_len > cnt) { 2669 char *cp = mtod(m, caddr_t) + cnt; 2670 struct tcpcb *tp = sototcpcb(so); 2671 2672 tp->t_iobc = *cp; 2673 tp->t_oobflags |= TCPOOB_HAVEDATA; 2674 memmove(cp, cp + 1, m->m_len - cnt - 1); 2675 m->m_len--; 2676 return; 2677 } 2678 cnt -= m->m_len; 2679 m = m->m_next; 2680 if (m == NULL) 2681 break; 2682 } 2683 panic("tcp_pulloutofband"); 2684 } 2685 2686 /* 2687 * Collect new round-trip time estimate 2688 * and update averages and current timeout. 2689 */ 2690 void 2691 tcp_xmit_timer(struct tcpcb *tp, int32_t rtt) 2692 { 2693 int delta, rttmin; 2694 2695 if (rtt < 0) 2696 rtt = 0; 2697 else if (rtt > TCP_RTT_MAX) 2698 rtt = TCP_RTT_MAX; 2699 2700 tcpstat_inc(tcps_rttupdated); 2701 if (tp->t_srtt != 0) { 2702 /* 2703 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2704 * after the binary point (scaled by 4), whereas 2705 * srtt is stored as fixed point with 5 bits after the 2706 * binary point (i.e., scaled by 32). The following magic 2707 * is equivalent to the smoothing algorithm in rfc793 with 2708 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2709 * point). 2710 */ 2711 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2712 (tp->t_srtt >> TCP_RTT_SHIFT); 2713 if ((tp->t_srtt += delta) <= 0) 2714 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2715 /* 2716 * We accumulate a smoothed rtt variance (actually, a 2717 * smoothed mean difference), then set the retransmit 2718 * timer to smoothed rtt + 4 times the smoothed variance. 2719 * rttvar is stored as fixed point with 4 bits after the 2720 * binary point (scaled by 16). The following is 2721 * equivalent to rfc793 smoothing with an alpha of .75 2722 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2723 * rfc793's wired-in beta. 2724 */ 2725 if (delta < 0) 2726 delta = -delta; 2727 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2728 if ((tp->t_rttvar += delta) <= 0) 2729 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2730 } else { 2731 /* 2732 * No rtt measurement yet - use the unsmoothed rtt. 2733 * Set the variance to half the rtt (so our first 2734 * retransmit happens at 3*rtt). 2735 */ 2736 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2737 tp->t_rttvar = (rtt + 1) << 2738 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2739 } 2740 tp->t_rtttime = 0; 2741 tp->t_rxtshift = 0; 2742 2743 /* 2744 * the retransmit should happen at rtt + 4 * rttvar. 2745 * Because of the way we do the smoothing, srtt and rttvar 2746 * will each average +1/2 tick of bias. When we compute 2747 * the retransmit timer, we want 1/2 tick of rounding and 2748 * 1 extra tick because of +-1/2 tick uncertainty in the 2749 * firing of the timer. The bias will give us exactly the 2750 * 1.5 tick we need. But, because the bias is 2751 * statistical, we have to test that we don't drop below 2752 * the minimum feasible timer (which is 2 ticks). 
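	 * To make the fixed-point scaling above concrete: srtt keeps 5
	 * fraction bits and rttvar keeps 4, so a steady measured rtt of
	 * 100ms drives srtt toward 100 << 5 = 3200 while rttvar decays
	 * toward its floor; srtt + 4 * rttvar, with the fraction bits
	 * shifted back out, then sits just above 100ms before the
	 * clamping below is applied.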
2753 */ 2754 rttmin = min(max(tp->t_rttmin, rtt + 2 * (TCP_TIME(1) / hz)), 2755 TCPTV_REXMTMAX); 2756 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2757 2758 /* 2759 * We received an ack for a packet that wasn't retransmitted; 2760 * it is probably safe to discard any error indications we've 2761 * received recently. This isn't quite right, but close enough 2762 * for now (a route might have failed after we sent a segment, 2763 * and the return path might not be symmetrical). 2764 */ 2765 tp->t_softerror = 0; 2766 } 2767 2768 /* 2769 * Determine a reasonable value for maxseg size. 2770 * If the route is known, check route for mtu. 2771 * If none, use an mss that can be handled on the outgoing 2772 * interface without forcing IP to fragment; if bigger than 2773 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2774 * to utilize large mbufs. If no route is found, route has no mtu, 2775 * or the destination isn't local, use a default, hopefully conservative 2776 * size (usually 512 or the default IP max size, but no more than the mtu 2777 * of the interface), as we can't discover anything about intervening 2778 * gateways or networks. We also initialize the congestion/slow start 2779 * window to be a single segment if the destination isn't local. 2780 * While looking at the routing entry, we also initialize other path-dependent 2781 * parameters from pre-set or cached values in the routing entry. 2782 * 2783 * Also take into account the space needed for options that we 2784 * send regularly. Make maxseg shorter by that amount to assure 2785 * that we can send maxseg amount of data even when the options 2786 * are present. Store the upper limit of the length of options plus 2787 * data in maxopd. 2788 * 2789 * NOTE: offer == -1 indicates that the maxseg size changed due to 2790 * Path MTU discovery. 2791 */ 2792 int 2793 tcp_mss(struct tcpcb *tp, int offer) 2794 { 2795 struct rtentry *rt; 2796 struct ifnet *ifp = NULL; 2797 int mss, mssopt; 2798 int iphlen; 2799 struct inpcb *inp; 2800 2801 inp = tp->t_inpcb; 2802 2803 mssopt = mss = tcp_mssdflt; 2804 2805 rt = in_pcbrtentry(inp); 2806 2807 if (rt == NULL) 2808 goto out; 2809 2810 ifp = if_get(rt->rt_ifidx); 2811 if (ifp == NULL) 2812 goto out; 2813 2814 switch (tp->pf) { 2815 #ifdef INET6 2816 case AF_INET6: 2817 iphlen = sizeof(struct ip6_hdr); 2818 break; 2819 #endif 2820 case AF_INET: 2821 iphlen = sizeof(struct ip); 2822 break; 2823 default: 2824 /* the family does not support path MTU discovery */ 2825 goto out; 2826 } 2827 2828 /* 2829 * if there's an mtu associated with the route and we support 2830 * path MTU discovery for the underlying protocol family, use it. 2831 */ 2832 if (rt->rt_mtu) { 2833 /* 2834 * One may wish to lower MSS to take into account options, 2835 * especially security-related options. 2836 */ 2837 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2838 /* 2839 * RFC2460 section 5, last paragraph: if path MTU is 2840 * smaller than 1280, use 1280 as packet size and 2841 * attach fragment header. 
2842 */ 2843 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2844 sizeof(struct tcphdr); 2845 } else { 2846 mss = rt->rt_mtu - iphlen - 2847 sizeof(struct tcphdr); 2848 } 2849 } else if (ifp->if_flags & IFF_LOOPBACK) { 2850 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2851 } else if (tp->pf == AF_INET) { 2852 if (ip_mtudisc) 2853 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2854 } 2855 #ifdef INET6 2856 else if (tp->pf == AF_INET6) { 2857 /* 2858 * for IPv6, path MTU discovery is always turned on, 2859 * or the node must use packet size <= 1280. 2860 */ 2861 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2862 } 2863 #endif /* INET6 */ 2864 2865 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2866 if (offer != -1) { 2867 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2868 mssopt = max(tcp_mssdflt, mssopt); 2869 } 2870 out: 2871 if_put(ifp); 2872 /* 2873 * The current mss, t_maxseg, is initialized to the default value. 2874 * If we compute a smaller value, reduce the current mss. 2875 * If we compute a larger value, return it for use in sending 2876 * a max seg size option, but don't store it for use 2877 * unless we received an offer at least that large from peer. 2878 * 2879 * However, do not accept offers lower than the minimum of 2880 * the interface MTU and 216. 2881 */ 2882 if (offer > 0) 2883 tp->t_peermss = offer; 2884 if (tp->t_peermss) 2885 mss = min(mss, max(tp->t_peermss, 216)); 2886 2887 /* sanity - at least max opt. space */ 2888 mss = max(mss, 64); 2889 2890 /* 2891 * maxopd stores the maximum length of data AND options 2892 * in a segment; maxseg is the amount of data in a normal 2893 * segment. We need to store this value (maxopd) apart 2894 * from maxseg, because now every segment carries options 2895 * and thus we normally have somewhat less data in segments. 2896 */ 2897 tp->t_maxopd = mss; 2898 2899 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2900 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2901 mss -= TCPOLEN_TSTAMP_APPA; 2902 #ifdef TCP_SIGNATURE 2903 if (tp->t_flags & TF_SIGNATURE) 2904 mss -= TCPOLEN_SIGLEN; 2905 #endif 2906 2907 if (offer == -1) { 2908 /* mss changed due to Path MTU discovery */ 2909 tp->t_flags &= ~TF_PMTUD_PEND; 2910 tp->t_pmtud_mtu_sent = 0; 2911 tp->t_pmtud_mss_acked = 0; 2912 if (mss < tp->t_maxseg) { 2913 /* 2914 * Follow suggestion in RFC 2414 to reduce the 2915 * congestion window by the ratio of the old 2916 * segment size to the new segment size. 2917 */ 2918 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2919 mss, mss); 2920 } 2921 } else if (tcp_do_rfc3390 == 2) { 2922 /* increase initial window */ 2923 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2924 } else if (tcp_do_rfc3390) { 2925 /* increase initial window */ 2926 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2927 } else 2928 tp->snd_cwnd = mss; 2929 2930 tp->t_maxseg = mss; 2931 2932 return (offer != -1 ? 
mssopt : mss); 2933 } 2934 2935 u_int 2936 tcp_hdrsz(struct tcpcb *tp) 2937 { 2938 u_int hlen; 2939 2940 switch (tp->pf) { 2941 #ifdef INET6 2942 case AF_INET6: 2943 hlen = sizeof(struct ip6_hdr); 2944 break; 2945 #endif 2946 case AF_INET: 2947 hlen = sizeof(struct ip); 2948 break; 2949 default: 2950 hlen = 0; 2951 break; 2952 } 2953 hlen += sizeof(struct tcphdr); 2954 2955 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2956 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2957 hlen += TCPOLEN_TSTAMP_APPA; 2958 #ifdef TCP_SIGNATURE 2959 if (tp->t_flags & TF_SIGNATURE) 2960 hlen += TCPOLEN_SIGLEN; 2961 #endif 2962 return (hlen); 2963 } 2964 2965 /* 2966 * Set connection variables based on the effective MSS. 2967 * We are passed the TCPCB for the actual connection. If we 2968 * are the server, we are called by the compressed state engine 2969 * when the 3-way handshake is complete. If we are the client, 2970 * we are called when we receive the SYN,ACK from the server. 2971 * 2972 * NOTE: The t_maxseg value must be initialized in the TCPCB 2973 * before this routine is called! 2974 */ 2975 void 2976 tcp_mss_update(struct tcpcb *tp) 2977 { 2978 int mss; 2979 u_long bufsize; 2980 struct rtentry *rt; 2981 struct socket *so; 2982 2983 so = tp->t_inpcb->inp_socket; 2984 mss = tp->t_maxseg; 2985 2986 rt = in_pcbrtentry(tp->t_inpcb); 2987 2988 if (rt == NULL) 2989 return; 2990 2991 bufsize = so->so_snd.sb_hiwat; 2992 if (bufsize < mss) { 2993 mss = bufsize; 2994 /* Update t_maxseg and t_maxopd */ 2995 tcp_mss(tp, mss); 2996 } else { 2997 bufsize = roundup(bufsize, mss); 2998 if (bufsize > sb_max) 2999 bufsize = sb_max; 3000 (void)sbreserve(so, &so->so_snd, bufsize); 3001 } 3002 3003 bufsize = so->so_rcv.sb_hiwat; 3004 if (bufsize > mss) { 3005 bufsize = roundup(bufsize, mss); 3006 if (bufsize > sb_max) 3007 bufsize = sb_max; 3008 (void)sbreserve(so, &so->so_rcv, bufsize); 3009 } 3010 3011 } 3012 3013 /* 3014 * When a partial ack arrives, force the retransmission of the 3015 * next unacknowledged segment. Do not clear tp->t_dupacks. 3016 * By setting snd_nxt to ti_ack, this forces retransmission timer 3017 * to be started again. 3018 */ 3019 void 3020 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 3021 { 3022 /* 3023 * snd_una has not been updated and the socket send buffer 3024 * not yet drained of the acked data, so we have to leave 3025 * snd_una as it was to get the correct data offset in 3026 * tcp_output(). 3027 */ 3028 tcp_seq onxt = tp->snd_nxt; 3029 u_long ocwnd = tp->snd_cwnd; 3030 3031 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3032 tp->t_rtttime = 0; 3033 tp->snd_nxt = th->th_ack; 3034 /* 3035 * Set snd_cwnd to one segment beyond acknowledged offset 3036 * (tp->snd_una not yet updated when this function is called) 3037 */ 3038 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3039 (void)tcp_output(tp); 3040 tp->snd_cwnd = ocwnd; 3041 if (SEQ_GT(onxt, tp->snd_nxt)) 3042 tp->snd_nxt = onxt; 3043 /* 3044 * Partial window deflation. Relies on fact that tp->snd_una 3045 * not updated yet. 
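	 * (Unlike the SACK variant earlier in this file, only one
	 * segment of credit is added back here; the forced retransmit a
	 * few lines up already went out under an artificial one-segment
	 * window.)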
3046 */ 3047 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3048 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3049 else 3050 tp->snd_cwnd = 0; 3051 tp->snd_cwnd += tp->t_maxseg; 3052 } 3053 3054 int 3055 tcp_mss_adv(struct mbuf *m, int af) 3056 { 3057 int mss = 0; 3058 int iphlen; 3059 struct ifnet *ifp = NULL; 3060 3061 if (m && (m->m_flags & M_PKTHDR)) 3062 ifp = if_get(m->m_pkthdr.ph_ifidx); 3063 3064 switch (af) { 3065 case AF_INET: 3066 if (ifp != NULL) 3067 mss = ifp->if_mtu; 3068 iphlen = sizeof(struct ip); 3069 break; 3070 #ifdef INET6 3071 case AF_INET6: 3072 if (ifp != NULL) 3073 mss = ifp->if_mtu; 3074 iphlen = sizeof(struct ip6_hdr); 3075 break; 3076 #endif 3077 default: 3078 unhandled_af(af); 3079 } 3080 if_put(ifp); 3081 mss = mss - iphlen - sizeof(struct tcphdr); 3082 return (max(mss, tcp_mssdflt)); 3083 } 3084 3085 /* 3086 * TCP compressed state engine. Currently used to hold compressed 3087 * state for SYN_RECEIVED. 3088 */ 3089 3090 /* 3091 * Locks used to protect global data and struct members: 3092 * N net lock 3093 * S syn_cache_mtx tcp syn cache global mutex 3094 */ 3095 3096 /* syn hash parameters */ 3097 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; /* [N] size of hash table */ 3098 int tcp_syn_cache_limit = /* [N] global entry limit */ 3099 TCP_SYN_HASH_SIZE * TCP_SYN_BUCKET_SIZE; 3100 int tcp_syn_bucket_limit = /* [N] per bucket limit */ 3101 3 * TCP_SYN_BUCKET_SIZE; 3102 int tcp_syn_use_limit = 100000; /* [N] reseed after uses */ 3103 3104 struct pool syn_cache_pool; 3105 struct syn_cache_set tcp_syn_cache[2]; 3106 int tcp_syn_cache_active; 3107 struct mutex syn_cache_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); 3108 3109 #define SYN_HASH(sa, sp, dp, rand) \ 3110 (((sa)->s_addr ^ (rand)[0]) * \ 3111 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3112 #ifndef INET6 3113 #define SYN_HASHALL(hash, src, dst, rand) \ 3114 do { \ 3115 hash = SYN_HASH(&satosin_const(src)->sin_addr, \ 3116 satosin_const(src)->sin_port, \ 3117 satosin_const(dst)->sin_port, (rand)); \ 3118 } while (/*CONSTCOND*/ 0) 3119 #else 3120 #define SYN_HASH6(sa, sp, dp, rand) \ 3121 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3122 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3123 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3124 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3125 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3126 3127 #define SYN_HASHALL(hash, src, dst, rand) \ 3128 do { \ 3129 switch ((src)->sa_family) { \ 3130 case AF_INET: \ 3131 hash = SYN_HASH(&satosin_const(src)->sin_addr, \ 3132 satosin_const(src)->sin_port, \ 3133 satosin_const(dst)->sin_port, (rand)); \ 3134 break; \ 3135 case AF_INET6: \ 3136 hash = SYN_HASH6(&satosin6_const(src)->sin6_addr, \ 3137 satosin6_const(src)->sin6_port, \ 3138 satosin6_const(dst)->sin6_port, (rand)); \ 3139 break; \ 3140 default: \ 3141 hash = 0; \ 3142 } \ 3143 } while (/*CONSTCOND*/0) 3144 #endif /* INET6 */ 3145 3146 void 3147 syn_cache_rm(struct syn_cache *sc) 3148 { 3149 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3150 3151 KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD)); 3152 SET(sc->sc_dynflags, SCF_DEAD); 3153 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3154 sc->sc_tp = NULL; 3155 LIST_REMOVE(sc, sc_tpq); 3156 refcnt_rele(&sc->sc_refcnt); 3157 sc->sc_buckethead->sch_length--; 3158 if (timeout_del(&sc->sc_timer)) 3159 refcnt_rele(&sc->sc_refcnt); 3160 sc->sc_set->scs_count--; 3161 } 3162 3163 void 3164 syn_cache_put(struct syn_cache *sc) 3165 { 3166 if (refcnt_rele(&sc->sc_refcnt) == 0) 3167 return; 3168 3169 /* Dealing with last reference, no lock 
needed. */ 3170 m_free(sc->sc_ipopts); 3171 rtfree(sc->sc_route.ro_rt); 3172 3173 pool_put(&syn_cache_pool, sc); 3174 } 3175 3176 void 3177 syn_cache_init(void) 3178 { 3179 int i; 3180 3181 /* Initialize the hash buckets. */ 3182 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3183 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3184 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3185 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3186 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3187 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3188 for (i = 0; i < tcp_syn_hash_size; i++) { 3189 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3190 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3191 } 3192 3193 /* Initialize the syn cache pool. */ 3194 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3195 0, "syncache", NULL); 3196 } 3197 3198 void 3199 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3200 { 3201 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active]; 3202 struct syn_cache_head *scp; 3203 struct syn_cache *sc2; 3204 int i; 3205 3206 NET_ASSERT_LOCKED(); 3207 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3208 3209 /* 3210 * If there are no entries in the hash table, reinitialize 3211 * the hash secrets. To avoid useless cache swaps and 3212 * reinitialization, use it until the limit is reached. 3213 * An empty cache is also the opportunity to resize the hash. 3214 */ 3215 if (set->scs_count == 0 && set->scs_use <= 0) { 3216 set->scs_use = tcp_syn_use_limit; 3217 if (set->scs_size != tcp_syn_hash_size) { 3218 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3219 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3220 if (scp == NULL) { 3221 /* Try again next time. */ 3222 set->scs_use = 0; 3223 } else { 3224 free(set->scs_buckethead, M_SYNCACHE, 3225 set->scs_size * 3226 sizeof(struct syn_cache_head)); 3227 set->scs_buckethead = scp; 3228 set->scs_size = tcp_syn_hash_size; 3229 for (i = 0; i < tcp_syn_hash_size; i++) 3230 TAILQ_INIT(&scp[i].sch_bucket); 3231 } 3232 } 3233 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3234 tcpstat_inc(tcps_sc_seedrandom); 3235 } 3236 3237 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3238 set->scs_random); 3239 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3240 sc->sc_buckethead = scp; 3241 3242 /* 3243 * Make sure that we don't overflow the per-bucket 3244 * limit or the total cache size limit. 3245 */ 3246 if (scp->sch_length >= tcp_syn_bucket_limit) { 3247 tcpstat_inc(tcps_sc_bucketoverflow); 3248 /* 3249 * Someone might attack our bucket hash function. Reseed 3250 * with random as soon as the passive syn cache gets empty. 3251 */ 3252 set->scs_use = 0; 3253 /* 3254 * The bucket is full. Toss the oldest element in the 3255 * bucket. This will be the first entry in the bucket. 3256 */ 3257 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3258 #ifdef DIAGNOSTIC 3259 /* 3260 * This should never happen; we should always find an 3261 * entry in our bucket. 3262 */ 3263 if (sc2 == NULL) 3264 panic("%s: bucketoverflow: impossible", __func__); 3265 #endif 3266 syn_cache_rm(sc2); 3267 syn_cache_put(sc2); 3268 } else if (set->scs_count >= tcp_syn_cache_limit) { 3269 struct syn_cache_head *scp2, *sce; 3270 3271 tcpstat_inc(tcps_sc_overflowed); 3272 /* 3273 * The cache is full. Toss the oldest entry in the 3274 * first non-empty bucket we can find. 
3275 * 3276 * XXX We would really like to toss the oldest 3277 * entry in the cache, but we hope that this 3278 * condition doesn't happen very often. 3279 */ 3280 scp2 = scp; 3281 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3282 sce = &set->scs_buckethead[set->scs_size]; 3283 for (++scp2; scp2 != scp; scp2++) { 3284 if (scp2 >= sce) 3285 scp2 = &set->scs_buckethead[0]; 3286 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3287 break; 3288 } 3289 #ifdef DIAGNOSTIC 3290 /* 3291 * This should never happen; we should always find a 3292 * non-empty bucket. 3293 */ 3294 if (scp2 == scp) 3295 panic("%s: cacheoverflow: impossible", 3296 __func__); 3297 #endif 3298 } 3299 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3300 syn_cache_rm(sc2); 3301 syn_cache_put(sc2); 3302 } 3303 3304 /* 3305 * Initialize the entry's timer. We don't estimate RTT 3306 * with SYNs, so each packet starts with the default RTT 3307 * and each timer step has a fixed timeout value. 3308 */ 3309 sc->sc_rxttot = 0; 3310 sc->sc_rxtshift = 0; 3311 TCPT_RANGESET(sc->sc_rxtcur, 3312 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 3313 TCPTV_REXMTMAX); 3314 if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) 3315 refcnt_take(&sc->sc_refcnt); 3316 3317 /* Link it from tcpcb entry */ 3318 refcnt_take(&sc->sc_refcnt); 3319 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3320 3321 /* Put it into the bucket. */ 3322 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3323 scp->sch_length++; 3324 sc->sc_set = set; 3325 set->scs_count++; 3326 set->scs_use--; 3327 3328 tcpstat_inc(tcps_sc_added); 3329 3330 /* 3331 * If the active cache has exceeded its use limit and 3332 * the passive syn cache is empty, exchange their roles. 3333 */ 3334 if (set->scs_use <= 0 && 3335 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3336 tcp_syn_cache_active = !tcp_syn_cache_active; 3337 } 3338 3339 /* 3340 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3341 * If we have retransmitted an entry the maximum number of times, expire 3342 * that entry. 3343 */ 3344 void 3345 syn_cache_timer(void *arg) 3346 { 3347 struct syn_cache *sc = arg; 3348 uint64_t now; 3349 int lastref; 3350 3351 mtx_enter(&syn_cache_mtx); 3352 if (ISSET(sc->sc_dynflags, SCF_DEAD)) 3353 goto freeit; 3354 3355 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3356 /* Drop it -- too many retransmissions. */ 3357 goto dropit; 3358 } 3359 3360 /* 3361 * Compute the total amount of time this entry has 3362 * been on a queue. If this entry has been on longer 3363 * than the keep alive timer would allow, expire it. 3364 */ 3365 sc->sc_rxttot += sc->sc_rxtcur; 3366 if (sc->sc_rxttot >= READ_ONCE(tcptv_keep_init)) 3367 goto dropit; 3368 3369 /* Advance the timer back-off. */ 3370 sc->sc_rxtshift++; 3371 TCPT_RANGESET(sc->sc_rxtcur, 3372 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 3373 TCPTV_REXMTMAX); 3374 if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) 3375 refcnt_take(&sc->sc_refcnt); 3376 mtx_leave(&syn_cache_mtx); 3377 3378 NET_LOCK(); 3379 now = tcp_now(); 3380 (void) syn_cache_respond(sc, NULL, now); 3381 tcpstat_inc(tcps_sc_retransmitted); 3382 NET_UNLOCK(); 3383 3384 syn_cache_put(sc); 3385 return; 3386 3387 dropit: 3388 tcpstat_inc(tcps_sc_timed_out); 3389 syn_cache_rm(sc); 3390 /* Decrement reference of the timer and free object after remove. 
	 */
3391 	lastref = refcnt_rele(&sc->sc_refcnt);
3392 	KASSERT(lastref == 0);
3393 	(void)lastref;
3394 freeit:
3395 	mtx_leave(&syn_cache_mtx);
3396 	syn_cache_put(sc);
3397 }
3398 
3399 /*
3400  * Remove the syn cache entries created by the specified tcb, since it
3401  * makes no sense to keep them around
3402  * (if there's no tcb entry, a syn cache entry can never be used).
3403  */
3404 void
3405 syn_cache_cleanup(struct tcpcb *tp)
3406 {
3407 	struct syn_cache *sc, *nsc;
3408 
3409 	NET_ASSERT_LOCKED();
3410 
3411 	mtx_enter(&syn_cache_mtx);
3412 	LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
3413 #ifdef DIAGNOSTIC
3414 		if (sc->sc_tp != tp)
3415 			panic("invalid sc_tp in syn_cache_cleanup");
3416 #endif
3417 		syn_cache_rm(sc);
3418 		syn_cache_put(sc);
3419 	}
3420 	mtx_leave(&syn_cache_mtx);
3421 
3422 	KASSERT(LIST_EMPTY(&tp->t_sc));
3423 }
3424 
3425 /*
3426  * Find an entry in the syn cache.
3427  */
3428 struct syn_cache *
3429 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
3430     struct syn_cache_head **headp, u_int rtableid)
3431 {
3432 	struct syn_cache_set *sets[2];
3433 	struct syn_cache *sc;
3434 	struct syn_cache_head *scp;
3435 	u_int32_t hash;
3436 	int i;
3437 
3438 	NET_ASSERT_LOCKED();
3439 	MUTEX_ASSERT_LOCKED(&syn_cache_mtx);
3440 
3441 	/* Check the active cache first, the passive cache is likely empty. */
3442 	sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
3443 	sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
3444 	for (i = 0; i < 2; i++) {
3445 		if (sets[i]->scs_count == 0)
3446 			continue;
3447 		SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
3448 		scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
3449 		*headp = scp;
3450 		TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
3451 			if (sc->sc_hash != hash)
3452 				continue;
3453 			if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3454 			    !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3455 			    rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
3456 				return (sc);
3457 		}
3458 	}
3459 	return (NULL);
3460 }
3461 
3462 /*
3463  * This function gets called when we receive an ACK for a
3464  * socket in the LISTEN state.  We look up the connection
3465  * in the syn cache, and if it's there, we pull it out of
3466  * the cache and turn it into a full-blown connection in
3467  * the SYN-RECEIVED state.
3468  *
3469  * The return values may not be immediately obvious, and their effects
3470  * can be subtle, so here they are:
3471  *
3472  *	NULL	SYN was not found in cache; caller should drop the
3473  *		packet and send an RST.
3474  *
3475  *	-1	We were unable to create the new connection, and are
3476  *		aborting it.  An ACK,RST is being sent to the peer
3477  *		(unless we got screwy sequence numbers; see below),
3478  *		because the 3-way handshake has been completed.  Caller
3479  *		should not free the mbuf, since we may be using it.  If
3480  *		we are not, we will free it.
3481  *
3482  *	Otherwise, the return value is a pointer to the new socket
3483  *	associated with the connection.
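 *
 * A simplified sketch of how the LISTEN path reacts to these return
 * values (illustrative only, not the literal caller):
 *
 *	so = syn_cache_get(src, dst, th, hlen, tlen, so, m, now);
 *	if (so == NULL)
 *		goto dropwithreset;	// not in cache: RST the ACK
 *	if (so == (struct socket *)(-1))
 *		return IPPROTO_DONE;	// aborted; mbuf already handled
 *	// otherwise continue input processing on the new socket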
3484 */ 3485 struct socket * 3486 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3487 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint64_t now) 3488 { 3489 struct syn_cache *sc; 3490 struct syn_cache_head *scp; 3491 struct inpcb *inp, *oldinp; 3492 struct tcpcb *tp = NULL; 3493 struct mbuf *am; 3494 struct socket *oso; 3495 u_int rtableid; 3496 3497 NET_ASSERT_LOCKED(); 3498 3499 mtx_enter(&syn_cache_mtx); 3500 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3501 if (sc == NULL) { 3502 mtx_leave(&syn_cache_mtx); 3503 return (NULL); 3504 } 3505 3506 /* 3507 * Verify the sequence and ack numbers. Try getting the correct 3508 * response again. 3509 */ 3510 if ((th->th_ack != sc->sc_iss + 1) || 3511 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3512 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3513 refcnt_take(&sc->sc_refcnt); 3514 mtx_leave(&syn_cache_mtx); 3515 (void) syn_cache_respond(sc, m, now); 3516 syn_cache_put(sc); 3517 return ((struct socket *)(-1)); 3518 } 3519 3520 /* Remove this cache entry */ 3521 syn_cache_rm(sc); 3522 mtx_leave(&syn_cache_mtx); 3523 3524 /* 3525 * Ok, create the full blown connection, and set things up 3526 * as they would have been set up if we had created the 3527 * connection when the SYN arrived. If we can't create 3528 * the connection, abort it. 3529 */ 3530 oso = so; 3531 so = sonewconn(so, SS_ISCONNECTED, M_DONTWAIT); 3532 if (so == NULL) 3533 goto resetandabort; 3534 3535 oldinp = sotoinpcb(oso); 3536 inp = sotoinpcb(so); 3537 3538 #ifdef IPSEC 3539 /* 3540 * We need to copy the required security levels 3541 * from the old pcb. Ditto for any other 3542 * IPsec-related information. 3543 */ 3544 inp->inp_seclevel = oldinp->inp_seclevel; 3545 #endif /* IPSEC */ 3546 #ifdef INET6 3547 if (ISSET(inp->inp_flags, INP_IPV6)) { 3548 KASSERT(ISSET(oldinp->inp_flags, INP_IPV6)); 3549 3550 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3551 inp->inp_hops = oldinp->inp_hops; 3552 } else 3553 #endif 3554 { 3555 KASSERT(!ISSET(oldinp->inp_flags, INP_IPV6)); 3556 3557 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3558 inp->inp_options = ip_srcroute(m); 3559 if (inp->inp_options == NULL) { 3560 inp->inp_options = sc->sc_ipopts; 3561 sc->sc_ipopts = NULL; 3562 } 3563 } 3564 3565 /* inherit rtable from listening socket */ 3566 rtableid = sc->sc_rtableid; 3567 #if NPF > 0 3568 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3569 struct pf_divert *divert; 3570 3571 divert = pf_find_divert(m); 3572 KASSERT(divert != NULL); 3573 rtableid = divert->rdomain; 3574 } 3575 #endif 3576 in_pcbset_laddr(inp, dst, rtableid); 3577 3578 /* 3579 * Give the new socket our cached route reference. 
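	 * Handing over ro_rt and then clearing it in the cache entry
	 * means the eventual syn_cache_put() finds sc_route.ro_rt ==
	 * NULL, its rtfree() becomes a no-op, and the new socket keeps
	 * the route.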
3580 */ 3581 inp->inp_route = sc->sc_route; /* struct assignment */ 3582 sc->sc_route.ro_rt = NULL; 3583 3584 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3585 if (am == NULL) 3586 goto resetandabort; 3587 am->m_len = src->sa_len; 3588 memcpy(mtod(am, caddr_t), src, src->sa_len); 3589 if (in_pcbconnect(inp, am)) { 3590 (void) m_free(am); 3591 goto resetandabort; 3592 } 3593 (void) m_free(am); 3594 3595 tp = intotcpcb(inp); 3596 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3597 if (sc->sc_request_r_scale != 15) { 3598 tp->requested_s_scale = sc->sc_requested_s_scale; 3599 tp->request_r_scale = sc->sc_request_r_scale; 3600 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3601 } 3602 if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) 3603 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3604 3605 tp->t_template = tcp_template(tp); 3606 if (tp->t_template == 0) { 3607 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3608 so = NULL; 3609 goto abort; 3610 } 3611 tp->sack_enable = ISSET(sc->sc_fixflags, SCF_SACK_PERMIT); 3612 tp->ts_modulate = sc->sc_modulate; 3613 tp->ts_recent = sc->sc_timestamp; 3614 tp->iss = sc->sc_iss; 3615 tp->irs = sc->sc_irs; 3616 tcp_sendseqinit(tp); 3617 tp->snd_last = tp->snd_una; 3618 #ifdef TCP_ECN 3619 if (ISSET(sc->sc_fixflags, SCF_ECN_PERMIT)) { 3620 tp->t_flags |= TF_ECN_PERMIT; 3621 tcpstat_inc(tcps_ecn_accepts); 3622 } 3623 #endif 3624 if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) 3625 tp->t_flags |= TF_SACK_PERMIT; 3626 #ifdef TCP_SIGNATURE 3627 if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) 3628 tp->t_flags |= TF_SIGNATURE; 3629 #endif 3630 tcp_rcvseqinit(tp); 3631 tp->t_state = TCPS_SYN_RECEIVED; 3632 tp->t_rcvtime = now; 3633 tp->t_sndtime = now; 3634 tp->t_rcvacktime = now; 3635 tp->t_sndacktime = now; 3636 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3637 tcpstat_inc(tcps_accepts); 3638 3639 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3640 if (sc->sc_peermaxseg) 3641 tcp_mss_update(tp); 3642 /* Reset initial window to 1 segment for retransmit */ 3643 if (READ_ONCE(sc->sc_rxtshift) > 0) 3644 tp->snd_cwnd = tp->t_maxseg; 3645 tp->snd_wl1 = sc->sc_irs; 3646 tp->rcv_up = sc->sc_irs + 1; 3647 3648 /* 3649 * This is what would have happened in tcp_output() when 3650 * the SYN,ACK was sent. 3651 */ 3652 tp->snd_up = tp->snd_una; 3653 tp->snd_max = tp->snd_nxt = tp->iss+1; 3654 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3655 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3656 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3657 tp->last_ack_sent = tp->rcv_nxt; 3658 3659 tcpstat_inc(tcps_sc_completed); 3660 syn_cache_put(sc); 3661 return (so); 3662 3663 resetandabort: 3664 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3665 m->m_pkthdr.ph_rtableid, now); 3666 abort: 3667 m_freem(m); 3668 if (so != NULL) 3669 soabort(so); 3670 syn_cache_put(sc); 3671 tcpstat_inc(tcps_sc_aborted); 3672 return ((struct socket *)(-1)); 3673 } 3674 3675 /* 3676 * This function is called when we get a RST for a 3677 * non-existent connection, so that we can see if the 3678 * connection is in the syn cache. If it is, zap it. 
3679 */ 3680 3681 void 3682 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3683 u_int rtableid) 3684 { 3685 struct syn_cache *sc; 3686 struct syn_cache_head *scp; 3687 3688 NET_ASSERT_LOCKED(); 3689 3690 mtx_enter(&syn_cache_mtx); 3691 sc = syn_cache_lookup(src, dst, &scp, rtableid); 3692 if (sc == NULL) { 3693 mtx_leave(&syn_cache_mtx); 3694 return; 3695 } 3696 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3697 SEQ_GT(th->th_seq, sc->sc_irs + 1)) { 3698 mtx_leave(&syn_cache_mtx); 3699 return; 3700 } 3701 syn_cache_rm(sc); 3702 mtx_leave(&syn_cache_mtx); 3703 tcpstat_inc(tcps_sc_reset); 3704 syn_cache_put(sc); 3705 } 3706 3707 void 3708 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst, 3709 struct tcphdr *th, u_int rtableid) 3710 { 3711 struct syn_cache *sc; 3712 struct syn_cache_head *scp; 3713 3714 NET_ASSERT_LOCKED(); 3715 3716 mtx_enter(&syn_cache_mtx); 3717 sc = syn_cache_lookup(src, dst, &scp, rtableid); 3718 if (sc == NULL) { 3719 mtx_leave(&syn_cache_mtx); 3720 return; 3721 } 3722 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3723 if (ntohl (th->th_seq) != sc->sc_iss) { 3724 mtx_leave(&syn_cache_mtx); 3725 return; 3726 } 3727 3728 /* 3729 * If we've retransmitted 3 times and this is our second error, 3730 * we remove the entry. Otherwise, we allow it to continue on. 3731 * This prevents us from incorrectly nuking an entry during a 3732 * spurious network outage. 3733 * 3734 * See tcp_notify(). 3735 */ 3736 if (!ISSET(sc->sc_dynflags, SCF_UNREACH) || sc->sc_rxtshift < 3) { 3737 SET(sc->sc_dynflags, SCF_UNREACH); 3738 mtx_leave(&syn_cache_mtx); 3739 return; 3740 } 3741 3742 syn_cache_rm(sc); 3743 mtx_leave(&syn_cache_mtx); 3744 tcpstat_inc(tcps_sc_unreach); 3745 syn_cache_put(sc); 3746 } 3747 3748 /* 3749 * Given a LISTEN socket and an inbound SYN request, add 3750 * this to the syn cache, and send back a segment: 3751 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3752 * to the source. 3753 * 3754 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3755 * Doing so would require that we hold onto the data and deliver it 3756 * to the application. However, if we are the target of a SYN-flood 3757 * DoS attack, an attacker could send data which would eventually 3758 * consume all available buffer space if it were ACKed. By not ACKing 3759 * the data, we avoid this DoS scenario. 3760 */ 3761 3762 int 3763 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3764 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3765 struct tcp_opt_info *oi, tcp_seq *issp, uint64_t now) 3766 { 3767 struct tcpcb tb, *tp; 3768 long win; 3769 struct syn_cache *sc; 3770 struct syn_cache_head *scp; 3771 struct mbuf *ipopts; 3772 3773 NET_ASSERT_LOCKED(); 3774 3775 tp = sototcpcb(so); 3776 3777 /* 3778 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3779 * 3780 * Note this check is performed in tcp_input() very early on. 3781 */ 3782 3783 /* 3784 * Initialize some local state. 3785 */ 3786 win = sbspace(so, &so->so_rcv); 3787 if (win > TCP_MAXWIN) 3788 win = TCP_MAXWIN; 3789 3790 bzero(&tb, sizeof(tb)); 3791 #ifdef TCP_SIGNATURE 3792 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3793 #else 3794 if (optp) { 3795 #endif 3796 tb.pf = tp->pf; 3797 tb.sack_enable = tp->sack_enable; 3798 tb.t_flags = tcp_do_rfc1323 ? 
(TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3799 #ifdef TCP_SIGNATURE 3800 if (tp->t_flags & TF_SIGNATURE) 3801 tb.t_flags |= TF_SIGNATURE; 3802 #endif 3803 tb.t_state = TCPS_LISTEN; 3804 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3805 sotoinpcb(so)->inp_rtableid, now)) 3806 return (-1); 3807 } 3808 3809 switch (src->sa_family) { 3810 case AF_INET: 3811 /* 3812 * Remember the IP options, if any. 3813 */ 3814 ipopts = ip_srcroute(m); 3815 break; 3816 default: 3817 ipopts = NULL; 3818 } 3819 3820 /* 3821 * See if we already have an entry for this connection. 3822 * If we do, resend the SYN,ACK. We do not count this 3823 * as a retransmission (XXX though maybe we should). 3824 */ 3825 mtx_enter(&syn_cache_mtx); 3826 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3827 if (sc != NULL) { 3828 refcnt_take(&sc->sc_refcnt); 3829 mtx_leave(&syn_cache_mtx); 3830 tcpstat_inc(tcps_sc_dupesyn); 3831 if (ipopts) { 3832 /* 3833 * If we were remembering a previous source route, 3834 * forget it and use the new one we've been given. 3835 */ 3836 m_free(sc->sc_ipopts); 3837 sc->sc_ipopts = ipopts; 3838 } 3839 sc->sc_timestamp = tb.ts_recent; 3840 if (syn_cache_respond(sc, m, now) == 0) { 3841 tcpstat_inc(tcps_sndacks); 3842 tcpstat_inc(tcps_sndtotal); 3843 } 3844 syn_cache_put(sc); 3845 return (0); 3846 } 3847 mtx_leave(&syn_cache_mtx); 3848 3849 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3850 if (sc == NULL) { 3851 m_free(ipopts); 3852 return (-1); 3853 } 3854 refcnt_init_trace(&sc->sc_refcnt, DT_REFCNT_IDX_SYNCACHE); 3855 timeout_set_flags(&sc->sc_timer, syn_cache_timer, sc, 3856 KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE); 3857 3858 /* 3859 * Fill in the cache, and put the necessary IP and TCP 3860 * options into the reply. 3861 */ 3862 memcpy(&sc->sc_src, src, src->sa_len); 3863 memcpy(&sc->sc_dst, dst, dst->sa_len); 3864 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3865 sc->sc_ipopts = ipopts; 3866 sc->sc_irs = th->th_seq; 3867 3868 sc->sc_iss = issp ? *issp : arc4random(); 3869 sc->sc_peermaxseg = oi->maxseg; 3870 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3871 sc->sc_win = win; 3872 sc->sc_timestamp = tb.ts_recent; 3873 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3874 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3875 SET(sc->sc_fixflags, SCF_TIMESTAMP); 3876 sc->sc_modulate = arc4random(); 3877 } 3878 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3879 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3880 sc->sc_requested_s_scale = tb.requested_s_scale; 3881 sc->sc_request_r_scale = 0; 3882 /* 3883 * Pick the smallest possible scaling factor that 3884 * will still allow us to scale up to sb_max. 3885 * 3886 * We do this because there are broken firewalls that 3887 * will corrupt the window scale option, leading to 3888 * the other endpoint believing that our advertised 3889 * window is unscaled. At scale factors larger than 3890 * 5 the unscaled window will drop below 1500 bytes, 3891 * leading to serious problems when traversing these 3892 * broken firewalls. 3893 * 3894 * With the default sbmax of 256K, a scale factor 3895 * of 3 will be chosen by this algorithm. Those who 3896 * choose a larger sbmax should watch out 3897 * for the compatibility problems mentioned above. 3898 * 3899 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3900 * or <SYN,ACK>) segment itself is never scaled. 
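		 *
		 * Spelling the loop below out for that default: TCP_MAXWIN
		 * is 65535, so 65535 << 2 = 262140 still falls short of a
		 * 256K sb_max and the scale is bumped to 3, where
		 * 65535 << 3 = 524280 finally covers it.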
	} else {
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
#ifdef TCP_ECN
	/*
	 * if both ECE and CWR flag bits are set, peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		SET(sc->sc_fixflags, SCF_ECN_PERMIT);
#endif
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		SET(sc->sc_fixflags, SCF_SACK_PERMIT);
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		SET(sc->sc_fixflags, SCF_SIGNATURE);
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m, now) == 0) {
		mtx_enter(&syn_cache_mtx);
		/*
		 * XXXSMP Currently exclusive netlock prevents another insert
		 * after our syn_cache_lookup() and before syn_cache_insert().
		 * Double insert should be handled and not rely on netlock.
		 */
		syn_cache_insert(sc, tp);
		mtx_leave(&syn_cache_mtx);
		tcpstat_inc(tcps_sndacks);
		tcpstat_inc(tcps_sndtotal);
	} else {
		syn_cache_put(sc);
		tcpstat_inc(tcps_sc_dropped);
	}

	return (0);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m, uint64_t now)
{
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	NET_ASSERT_LOCKED();

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	default:
		m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
	    (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT) ? 4 : 0) +
#ifdef TCP_SIGNATURE
	    (ISSET(sc->sc_fixflags, SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    (ISSET(sc->sc_fixflags, SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
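	/*
	 * e.g. a SYN,ACK offering MSS, window scaling, SACK_PERMITTED
	 * and timestamps (no signature) carries 4 + 4 + 4 + 12 == 24
	 * option bytes; every option is padded to a 4 byte boundary.
	 */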

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && ISSET(sc->sc_fixflags, SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}
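
	/*
	 * A timestamps option occupies 12 bytes (TCPOLEN_TSTAMP_APPA):
	 *	<NOP> <NOP> <kind 8> <len 10> <TSval:4> <TSecr:4>
	 * TSval is our timestamp clock offset by the random per-entry
	 * sc_modulate, TSecr echoes the peer's most recent timestamp.
	 */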
	if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(now + sc->sc_modulate);
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/*default to PF_INET*/
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			tdb_unref(tdb);
			return (EINVAL);
		}
		tdb_unref(tdb);
		optp += 16;

		/* Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	mtx_enter(&syn_cache_mtx);
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
	mtx_leave(&syn_cache_mtx);

	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in host order, for convenience.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;

		error = ip_output(m, sc->sc_ipopts, &sc->sc_route,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL,
		    inp ? &inp->inp_seclevel : NULL, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		/* ip6_plen will be updated in ip6_output() */
		ip6->ip6_hlim = in6_selecthlim(inp);
		/* leave flowlabel = 0, it is legal and requires no state mgmt */

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route, 0,
		    NULL, inp ? &inp->inp_seclevel : NULL);
		break;
#endif
	}
	return (error);
}