/*	$OpenBSD: tcp_input.c,v 1.426 2025/01/26 17:21:26 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#endif /* INET6 */

const int tcprexmtthresh = 3;

int	tcp_rst_ppslim = 100;		/* 100pps */
int	tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int	tcp_ackdrop_ppslim = 100;	/* 100pps */
int	tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	TCP_TIME(24 * 24 * 60 * 60)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int32_t)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int32_t)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && \
	    ISSET(tp->t_inpcb->inp_flags, INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
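 * For example, on an established connection receiving back-to-back
 * in-order segments: the first arms the TCPT_DELACK timer, and the
 * second finds the timer armed (TCP_TIMER_ISARMED) and sets TF_ACKNOW,
 * so at most one data segment in a row goes without an immediate ACK.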
 */
#define TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \
	    (atomic_load_int(&tcp_ack_on_push) && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_TIMER_ARM(tp, TCPT_DELACK, tcp_delack_msecs); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *, uint64_t, int);
void	 syn_cache_timer(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *, uint64_t, int);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *, uint64_t, int);
struct syn_cache *syn_cache_lookup(const struct sockaddr *,
		const struct sockaddr *, struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
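	 * A worked example of the arithmetic below: if the preceding
	 * entry covers sequences 100-149 (th_seq 100, th_reseqlen 50)
	 * and the new segment arrives with th_seq 120 and *tlen 40,
	 * then i = 100 + 50 - 120 = 30, so the stale first 30 bytes
	 * are trimmed (m_adj), *tlen drops to 10 and th_seq advances
	 * to 150.  Had *tlen been <= 30, the segment would have been
	 * a complete duplicate and freed instead.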
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);
	tp->t_rcvoopack++;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_rcv.sb_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else {
			mtx_enter(&so->so_rcv.sb_mtx);
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
			mtx_leave(&so->so_rcv.sb_mtx);
		}
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
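 * (The September 1981 specification is RFC 793; the per-state switch
 * statements below mirror its "SEGMENT ARRIVES" processing.)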
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *otp = NULL, *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate;
	union {
		struct tcpiphdr tcpip;
#ifdef INET6
		struct tcpipv6hdr tcpip6;
#endif
		char caddr;
	} saveti;
	tcp_seq iss, *reuse = NULL;
	uint64_t now;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
	int do_ecn = 0;
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;
	now = tcp_now();
#ifdef TCP_ECN
	do_ecn = atomic_load_int(&tcp_do_ecn);
#endif

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about an unspecified IPv6 source address.
		 * As we use all-zero to indicate an unbound/unconnected pcb,
		 * an unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and do not bother calling
		 * tcp_dooptions(), etc.
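		 * In the appendix A layout the option block is NOP, NOP,
		 * kind 8, length 10, followed by the two 32-bit timestamps,
		 * so the first four option bytes equal the constant
		 * TCPOPT_TSTAMP_HDR (0x0101080a) compared below, and
		 * ts_val/ts_ecr sit at fixed offsets 4 and 8.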
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	if (th->th_dport == 0) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup(&tcb6table, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcb6table, &ip6->ip6_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
	}
#ifdef IPSEC
	if (ipsec_in_use) {
		struct m_tag *mtag;
		struct tdb *tdb = NULL;
		int error;

		/* Find most recent IPsec tag */
		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
		if (mtag != NULL) {
			struct tdb_ident *tdbi;

			tdbi = (struct tdb_ident *)(mtag + 1);
			tdb = gettdb(tdbi->rdomain, tdbi->spi,
			    &tdbi->dst, tdbi->proto);
		}
		error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN,
		    tdb, inp ? &inp->inp_seclevel : NULL, NULL, NULL);
		tdb_unref(tdb);
		if (error) {
			tcpstat_inc(tcps_rcvnosec);
			goto drop;
		}
	}
#endif /* IPSEC */

	if (inp == NULL) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
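	/*
	 * e.g. a raw th_win of 1024 with snd_scale 7 yields an effective
	 * 131072-byte send window.  SYN segments are exempt because
	 * RFC 1323 forbids scaling the window field of any segment that
	 * carries a SYN.
	 */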
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		}

		if (so->so_options & SO_DEBUG) {
			otp = tp;
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				saveti.tcpip6.ti6_i = *ip6;
				saveti.tcpip6.ti6_t = *th;
				break;
#endif
			case AF_INET:
				memcpy(&saveti.tcpip.ti_i, ip, sizeof(*ip));
				saveti.tcpip.ti_t = *th;
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m, now, do_ecn);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					in_pcbunref(inp);
					inp = in_pcbref(sotoinpcb(so));
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/
				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise because it
				 * is much better for the peer to receive an
				 * RST, and the RST will be the final packet
				 * of the exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says
				 * the following:
				 *  1. use of deprecated addr with existing
				 *     communication is okay - "SHOULD continue
				 *     to be used"
				 *  2. use of it with new communication:
				 *    (2a) "SHOULD NOT be used if alternate
				 *         address with sufficient scope is
				 *         available"
				 *    (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse, now,
				    do_ecn) == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepidle));

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th);	/* Delete stale SACK holes */

	/*
	 * Process options.
	 */
	if (optp
#ifdef TCP_SIGNATURE
	    || (tp->t_flags & TF_SIGNATURE)
#endif
	    ) {
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid, now))
			goto drop;
	}

	if (opti.ts_present && opti.ts_ecr) {
		int32_t rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
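	/*
	 * Note that TF_RCVD_CE latches: ECE is echoed on the ACKs we send
	 * from here on until the peer answers with CWR (which clears the
	 * flag below), as RFC 3168 requires.
	 */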
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				tp->t_rcvacktime = now;
				ND6_HINT(tp);

				mtx_enter(&so->so_snd.sb_mtx);
				sbdrop(so, &so->so_snd, acked);
				mtx_leave(&so->so_snd.sb_mtx);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/* Pull snd_wl2 up to prevent seq wrap. */
				tp->snd_wl2 = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
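				 * As an illustration of the RTT update
				 * above: tcp_now() counts milliseconds
				 * here, so with now at 5000 and an echoed
				 * ts_ecr of 4970 (after the ts_modulate
				 * offset was removed), the sample fed to
				 * tcp_xmit_timer() is 30 ms.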
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			/* Pull snd_wl1 and rcv_up up to prevent seq wrap. */
			tp->snd_wl1 = th->th_seq;
			/* Packet has most recent segment, no urgent exists. */
			tp->rcv_up = tp->rcv_nxt;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
				    now - tp->rfbuf_ts > (tp->t_srtt >>
				    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT))) {
					tcp_update_rcvspace(tp);
					/* Start over with next RTT. */
					tp->rfbuf_cnt = 0;
					tp->rfbuf_ts = 0;
				} else
					tp->rfbuf_cnt += tlen;
				m_adj(m, iphlen + off);
				mtx_enter(&so->so_rcv.sb_mtx);
				sbappendstream(so, &so->so_rcv, m);
				mtx_leave(&so->so_rcv.sb_mtx);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			in_pcbunref(inp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{
		int win;

		win = sbspace(so, &so->so_rcv);
		if (win < 0)
			win = 0;
		tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP,
			    atomic_load_int(&tcp_keepidle));
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
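		 * (The SYN itself occupies one sequence number, which is
		 * why th_seq is incremented below before any trimming.)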
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the
	 * timestamp or the sequence numbers are above the previous
	 * ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			in_pcbunref(inp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than tp->ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if (now - tp->ts_recent_age > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
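			 * For instance, with rcv_nxt 1000, a retransmitted
			 * segment starting at 900 that carries only 50
			 * bytes lies entirely left of the window (todrop
			 * 100 > tlen 50): everything is dropped as
			 * duplicate, yet the forced ACK below tells the
			 * peer where to resynchronize.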
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
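	 * That ACK is rate-limited: dropafterack_ratelim only responds
	 * while ppsratecheck() stays under tcp_ackdrop_ppslim (100 packets
	 * per second by default), so ACK storms are not amplified.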
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepidle));
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
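			 * Only once t_dupacks reaches tcprexmtthresh (3)
			 * below does a completely duplicate ACK trigger a
			 * fast retransmit.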
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)) {
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh +
						    tp->t_maxseg * tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
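		 * e.g. with ssthresh at 10 segments but only 4 segments
		 * still outstanding (snd_max - th_ack), the deflation
		 * below sets cwnd to the 4 outstanding segments rather
		 * than the full ssthresh, avoiding a burst on exit from
		 * fast recovery.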
		 */
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);
		tp->t_rcvacktime = now;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
			u_int cw = tp->snd_cwnd;
			u_int incr = tp->t_maxseg;

			if (cw > tp->snd_ssthresh)
				incr = max(incr * incr / cw, 1);
			if (tp->t_dupacks < tcprexmtthresh)
				tp->snd_cwnd = ulmin(cw + incr,
				    TCP_MAXWIN << tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			if (tp->snd_wnd > so->so_snd.sb_cc)
				tp->snd_wnd -= so->so_snd.sb_cc;
			else
				tp->snd_wnd = 0;
			mtx_enter(&so->so_snd.sb_mtx);
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			mtx_leave(&so->so_snd.sb_mtx);
			ourfinisacked = 1;
		} else {
			mtx_enter(&so->so_snd.sb_mtx);
			sbdrop(so, &so->so_snd, acked);
			mtx_leave(&so->so_snd.sb_mtx);
			if (tp->snd_wnd > acked)
				tp->snd_wnd -= acked;
			else
				tp->snd_wnd = 0;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
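		 * (t_pmtud_th_seq records the sequence number quoted by a
		 * pending ICMP need-fragmentation message; once the peer
		 * acknowledges past it, that ICMP can no longer be a valid
		 * PMTU signal for unacked data.)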
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
					int maxidle;

					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					maxidle = TCPTV_KEEPCNT *
					    atomic_load_int(&tcp_keepidle);
					TCP_TIMER_ARM(tp, TCPT_2MSL, maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TACs send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		u_long urgent;

		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		mtx_enter(&so->so_rcv.sb_mtx);
		urgent = th->th_urp + so->so_rcv.sb_cc;
		mtx_leave(&so->so_rcv.sb_mtx);

		if (urgent > sb_max) {
			th->th_urp = 0;		/* XXX */
			tiflags &= ~TH_URG;	/* XXX */
			goto dodata;		/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			mtx_enter(&so->so_rcv.sb_mtx);
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_rcv.sb_state |= SS_RCVATMARK;
			mtx_leave(&so->so_rcv.sb_mtx);
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.
		 * This can happen independent of advancing the URG pointer,
		 * but if two URG's are pending at once, some out-of-band
		 * data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				mtx_enter(&so->so_rcv.sb_mtx);
				sbappendstream(so, &so->so_rcv, m);
				mtx_leave(&so->so_rcv.sb_mtx);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
1995 */ 1996 #if 0 1997 /* 1998 * Note the amount of data that peer has sent into 1999 * our window, in order to estimate the sender's 2000 * buffer size. 2001 */ 2002 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2003 #endif /* 0 */ 2004 } else { 2005 m_freem(m); 2006 tiflags &= ~TH_FIN; 2007 } 2008 2009 /* 2010 * If FIN is received ACK the FIN and let the user know 2011 * that the connection is closing. Ignore a FIN received before 2012 * the connection is fully established. 2013 */ 2014 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2015 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2016 tp->t_flags |= TF_BLOCKOUTPUT; 2017 socantrcvmore(so); 2018 tp->t_flags &= ~TF_BLOCKOUTPUT; 2019 tp->t_flags |= TF_ACKNOW; 2020 tp->rcv_nxt++; 2021 } 2022 switch (tp->t_state) { 2023 2024 /* 2025 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2026 */ 2027 case TCPS_ESTABLISHED: 2028 tp->t_state = TCPS_CLOSE_WAIT; 2029 break; 2030 2031 /* 2032 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2033 * enter the CLOSING state. 2034 */ 2035 case TCPS_FIN_WAIT_1: 2036 tp->t_state = TCPS_CLOSING; 2037 break; 2038 2039 /* 2040 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2041 * starting the time-wait timer, turning off the other 2042 * standard timers. 2043 */ 2044 case TCPS_FIN_WAIT_2: 2045 tp->t_state = TCPS_TIME_WAIT; 2046 tcp_canceltimers(tp); 2047 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2048 tp->t_flags |= TF_BLOCKOUTPUT; 2049 soisdisconnected(so); 2050 tp->t_flags &= ~TF_BLOCKOUTPUT; 2051 break; 2052 2053 /* 2054 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2055 */ 2056 case TCPS_TIME_WAIT: 2057 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2058 break; 2059 } 2060 } 2061 if (otp) 2062 tcp_trace(TA_INPUT, ostate, tp, otp, &saveti.caddr, 0, tlen); 2063 2064 /* 2065 * Return any desired output. 2066 */ 2067 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2068 (void) tcp_output(tp); 2069 in_pcbunref(inp); 2070 return IPPROTO_DONE; 2071 2072 badsyn: 2073 /* 2074 * Received a bad SYN. Increment counters and dropwithreset. 2075 */ 2076 tcpstat_inc(tcps_badsyn); 2077 tp = NULL; 2078 goto dropwithreset; 2079 2080 dropafterack_ratelim: 2081 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2082 tcp_ackdrop_ppslim) == 0) { 2083 /* XXX stat */ 2084 goto drop; 2085 } 2086 /* ...fall into dropafterack... */ 2087 2088 dropafterack: 2089 /* 2090 * Generate an ACK dropping incoming segment if it occupies 2091 * sequence space, where the ACK reflects our state. 2092 */ 2093 if (tiflags & TH_RST) 2094 goto drop; 2095 m_freem(m); 2096 tp->t_flags |= TF_ACKNOW; 2097 (void) tcp_output(tp); 2098 in_pcbunref(inp); 2099 return IPPROTO_DONE; 2100 2101 dropwithreset_ratelim: 2102 /* 2103 * We may want to rate-limit RSTs in certain situations, 2104 * particularly if we are sending an RST in response to 2105 * an attempt to connect to or otherwise communicate with 2106 * a port for which we have no socket. 2107 */ 2108 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2109 atomic_load_int(&tcp_rst_ppslim)) == 0) { 2110 /* XXX stat */ 2111 goto drop; 2112 } 2113 /* ...fall into dropwithreset... */ 2114 2115 dropwithreset: 2116 /* 2117 * Generate a RST, dropping incoming segment. 2118 * Make ACK acceptable to originator of segment. 2119 * Don't bother to respond to RST. 
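 *
 * In outline, in the notation of RFC 793, the code below answers:
 *	segment had ACK set:	<SEQ=th_ack><CTL=RST>
 *	segment had no ACK:	<SEQ=0><ACK=th_seq+tlen><CTL=RST,ACK>
 * where tlen is first grown by one if the offending segment
 * carried a SYN, so that the ACK covers it.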
2120 */ 2121 if (tiflags & TH_RST) 2122 goto drop; 2123 if (tiflags & TH_ACK) { 2124 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2125 TH_RST, m->m_pkthdr.ph_rtableid, now); 2126 } else { 2127 if (tiflags & TH_SYN) 2128 tlen++; 2129 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2130 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now); 2131 } 2132 m_freem(m); 2133 in_pcbunref(inp); 2134 return IPPROTO_DONE; 2135 2136 drop: 2137 /* 2138 * Drop space held by incoming segment and return. 2139 */ 2140 if (otp) 2141 tcp_trace(TA_DROP, ostate, tp, otp, &saveti.caddr, 0, tlen); 2142 2143 m_freem(m); 2144 in_pcbunref(inp); 2145 return IPPROTO_DONE; 2146 } 2147 2148 int 2149 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2150 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2151 u_int rtableid, uint64_t now) 2152 { 2153 u_int16_t mss = 0; 2154 int opt, optlen; 2155 #ifdef TCP_SIGNATURE 2156 caddr_t sigp = NULL; 2157 struct tdb *tdb = NULL; 2158 #endif 2159 2160 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2161 opt = cp[0]; 2162 if (opt == TCPOPT_EOL) 2163 break; 2164 if (opt == TCPOPT_NOP) 2165 optlen = 1; 2166 else { 2167 if (cnt < 2) 2168 break; 2169 optlen = cp[1]; 2170 if (optlen < 2 || optlen > cnt) 2171 break; 2172 } 2173 switch (opt) { 2174 2175 default: 2176 continue; 2177 2178 case TCPOPT_MAXSEG: 2179 if (optlen != TCPOLEN_MAXSEG) 2180 continue; 2181 if (!(th->th_flags & TH_SYN)) 2182 continue; 2183 if (TCPS_HAVERCVDSYN(tp->t_state)) 2184 continue; 2185 memcpy(&mss, cp + 2, sizeof(mss)); 2186 mss = ntohs(mss); 2187 oi->maxseg = mss; 2188 break; 2189 2190 case TCPOPT_WINDOW: 2191 if (optlen != TCPOLEN_WINDOW) 2192 continue; 2193 if (!(th->th_flags & TH_SYN)) 2194 continue; 2195 if (TCPS_HAVERCVDSYN(tp->t_state)) 2196 continue; 2197 tp->t_flags |= TF_RCVD_SCALE; 2198 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2199 break; 2200 2201 case TCPOPT_TIMESTAMP: 2202 if (optlen != TCPOLEN_TIMESTAMP) 2203 continue; 2204 oi->ts_present = 1; 2205 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2206 oi->ts_val = ntohl(oi->ts_val); 2207 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2208 oi->ts_ecr = ntohl(oi->ts_ecr); 2209 2210 if (!(th->th_flags & TH_SYN)) 2211 continue; 2212 if (TCPS_HAVERCVDSYN(tp->t_state)) 2213 continue; 2214 /* 2215 * A timestamp received in a SYN makes 2216 * it ok to send timestamp requests and replies. 
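 * The value is also copied into ts_recent below, so later segments
 * on this connection can be screened against it by the PAWS check.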
2217 */ 2218 tp->t_flags |= TF_RCVD_TSTMP; 2219 tp->ts_recent = oi->ts_val; 2220 tp->ts_recent_age = now; 2221 break; 2222 2223 case TCPOPT_SACK_PERMITTED: 2224 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2225 continue; 2226 if (!(th->th_flags & TH_SYN)) 2227 continue; 2228 if (TCPS_HAVERCVDSYN(tp->t_state)) 2229 continue; 2230 /* MUST only be set on SYN */ 2231 tp->t_flags |= TF_SACK_PERMIT; 2232 break; 2233 case TCPOPT_SACK: 2234 tcp_sack_option(tp, th, cp, optlen); 2235 break; 2236 #ifdef TCP_SIGNATURE 2237 case TCPOPT_SIGNATURE: 2238 if (optlen != TCPOLEN_SIGNATURE) 2239 continue; 2240 2241 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2242 goto bad; 2243 2244 sigp = cp + 2; 2245 break; 2246 #endif /* TCP_SIGNATURE */ 2247 } 2248 } 2249 2250 #ifdef TCP_SIGNATURE 2251 if (tp->t_flags & TF_SIGNATURE) { 2252 union sockaddr_union src, dst; 2253 2254 memset(&src, 0, sizeof(union sockaddr_union)); 2255 memset(&dst, 0, sizeof(union sockaddr_union)); 2256 2257 switch (tp->pf) { 2258 case 0: 2259 case AF_INET: 2260 src.sa.sa_len = sizeof(struct sockaddr_in); 2261 src.sa.sa_family = AF_INET; 2262 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2263 dst.sa.sa_len = sizeof(struct sockaddr_in); 2264 dst.sa.sa_family = AF_INET; 2265 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2266 break; 2267 #ifdef INET6 2268 case AF_INET6: 2269 src.sa.sa_len = sizeof(struct sockaddr_in6); 2270 src.sa.sa_family = AF_INET6; 2271 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2272 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2273 dst.sa.sa_family = AF_INET6; 2274 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2275 break; 2276 #endif /* INET6 */ 2277 } 2278 2279 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2280 0, &src, &dst, IPPROTO_TCP); 2281 2282 /* 2283 * We don't have an SA for this peer, so we turn off 2284 * TF_SIGNATURE on the listen socket 2285 */ 2286 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2287 tp->t_flags &= ~TF_SIGNATURE; 2288 2289 } 2290 2291 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2292 tcpstat_inc(tcps_rcvbadsig); 2293 goto bad; 2294 } 2295 2296 if (sigp) { 2297 char sig[16]; 2298 2299 if (tdb == NULL) { 2300 tcpstat_inc(tcps_rcvbadsig); 2301 goto bad; 2302 } 2303 2304 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2305 goto bad; 2306 2307 if (timingsafe_bcmp(sig, sigp, 16)) { 2308 tcpstat_inc(tcps_rcvbadsig); 2309 goto bad; 2310 } 2311 2312 tcpstat_inc(tcps_rcvgoodsig); 2313 } 2314 2315 tdb_unref(tdb); 2316 #endif /* TCP_SIGNATURE */ 2317 2318 return (0); 2319 2320 #ifdef TCP_SIGNATURE 2321 bad: 2322 tdb_unref(tdb); 2323 #endif 2324 return (-1); 2325 } 2326 2327 u_long 2328 tcp_seq_subtract(u_long a, u_long b) 2329 { 2330 return ((long)(a - b)); 2331 } 2332 2333 /* 2334 * This function is called upon receipt of new valid data (while not in header 2335 * prediction mode), and it updates the ordered list of sacks. 2336 */ 2337 void 2338 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2339 tcp_seq rcv_lastend) 2340 { 2341 /* 2342 * First reported block MUST be the most recent one. Subsequent 2343 * blocks SHOULD be in the order in which they arrived at the 2344 * receiver. These two conditions make the implementation fully 2345 * compliant with RFC 2018. 
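 *
 * For example, if blocks B1 and then B2 were reported earlier and
 * the segment just received extends B2, the next report leads with
 * the merged B2 followed by B1.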
2346 */ 2347 int i, j = 0, count = 0, lastpos = -1; 2348 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2349 2350 /* First clean up current list of sacks */ 2351 for (i = 0; i < tp->rcv_numsacks; i++) { 2352 sack = tp->sackblks[i]; 2353 if (sack.start == 0 && sack.end == 0) { 2354 count++; /* count = number of blocks to be discarded */ 2355 continue; 2356 } 2357 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2358 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2359 count++; 2360 } else { 2361 temp[j].start = tp->sackblks[i].start; 2362 temp[j++].end = tp->sackblks[i].end; 2363 } 2364 } 2365 tp->rcv_numsacks -= count; 2366 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2367 tcp_clean_sackreport(tp); 2368 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2369 /* ==> need first sack block */ 2370 tp->sackblks[0].start = rcv_laststart; 2371 tp->sackblks[0].end = rcv_lastend; 2372 tp->rcv_numsacks = 1; 2373 } 2374 return; 2375 } 2376 /* Otherwise, sack blocks are already present. */ 2377 for (i = 0; i < tp->rcv_numsacks; i++) 2378 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2379 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2380 return; /* sack list remains unchanged */ 2381 /* 2382 * From here, segment just received should be (part of) the 1st sack. 2383 * Go through list, possibly coalescing sack block entries. 2384 */ 2385 firstsack.start = rcv_laststart; 2386 firstsack.end = rcv_lastend; 2387 for (i = 0; i < tp->rcv_numsacks; i++) { 2388 sack = tp->sackblks[i]; 2389 if (SEQ_LT(sack.end, firstsack.start) || 2390 SEQ_GT(sack.start, firstsack.end)) 2391 continue; /* no overlap */ 2392 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2393 /* 2394 * identical block; delete it here since we will 2395 * move it to the front of the list. 2396 */ 2397 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2398 lastpos = i; /* last posn with a zero entry */ 2399 continue; 2400 } 2401 if (SEQ_LEQ(sack.start, firstsack.start)) 2402 firstsack.start = sack.start; /* merge blocks */ 2403 if (SEQ_GEQ(sack.end, firstsack.end)) 2404 firstsack.end = sack.end; /* merge blocks */ 2405 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2406 lastpos = i; /* last posn with a zero entry */ 2407 } 2408 if (lastpos != -1) { /* at least one merge */ 2409 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2410 sack = tp->sackblks[i]; 2411 if (sack.start == 0 && sack.end == 0) 2412 continue; 2413 temp[j++] = sack; 2414 } 2415 tp->rcv_numsacks = j; /* including first blk (added later) */ 2416 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2417 tp->sackblks[i] = temp[i]; 2418 } else { /* no merges -- shift sacks by 1 */ 2419 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2420 tp->rcv_numsacks++; 2421 for (i = tp->rcv_numsacks-1; i > 0; i--) 2422 tp->sackblks[i] = tp->sackblks[i-1]; 2423 } 2424 tp->sackblks[0] = firstsack; 2425 return; 2426 } 2427 2428 /* 2429 * Process the TCP SACK option. tp->snd_holes is an ordered list 2430 * of holes (oldest to newest, in terms of the sequence space). 2431 */ 2432 void 2433 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2434 { 2435 int tmp_olen; 2436 u_char *tmp_cp; 2437 struct sackhole *cur, *p, *temp; 2438 2439 if (!tp->sack_enable) 2440 return; 2441 /* SACK without ACK doesn't make sense. */ 2442 if ((th->th_flags & TH_ACK) == 0) 2443 return; 2444 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
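 * (an ACK outside that range tells us nothing about data we still
 * have outstanding).  The SEQ_* comparisons used here are modular;
 * the stand-alone sketch below, kept under #if 0 and never
 * compiled, is an illustrative restatement of the idea.
 */

#if 0
#include <assert.h>
#include <stdint.h>

/* Same idea as the kernel's SEQ_LT(): compare in sequence space. */
static int
seq_lt(uint32_t a, uint32_t b)
{
	return ((int32_t)(a - b) < 0);
}

static void
seq_example(void)
{
	/* 0xfffffff0 is "before" 10 because the space has wrapped. */
	assert(seq_lt(0xfffffff0U, 10U));
	assert(!seq_lt(10U, 0xfffffff0U));
}
#endif

/*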
*/ 2445 if (SEQ_LT(th->th_ack, tp->snd_una) || 2446 SEQ_GT(th->th_ack, tp->snd_max)) 2447 return; 2448 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2449 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2450 return; 2451 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2452 tmp_cp = cp + 2; 2453 tmp_olen = optlen - 2; 2454 tcpstat_inc(tcps_sack_rcv_opts); 2455 if (tp->snd_numholes < 0) 2456 tp->snd_numholes = 0; 2457 if (tp->t_maxseg == 0) 2458 panic("tcp_sack_option"); /* Should never happen */ 2459 while (tmp_olen > 0) { 2460 struct sackblk sack; 2461 2462 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2463 sack.start = ntohl(sack.start); 2464 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2465 sack.end = ntohl(sack.end); 2466 tmp_olen -= TCPOLEN_SACK; 2467 tmp_cp += TCPOLEN_SACK; 2468 if (SEQ_LEQ(sack.end, sack.start)) 2469 continue; /* bad SACK fields */ 2470 if (SEQ_LEQ(sack.end, tp->snd_una)) 2471 continue; /* old block */ 2472 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2473 if (SEQ_LT(sack.start, th->th_ack)) 2474 continue; 2475 } 2476 if (SEQ_GT(sack.end, tp->snd_max)) 2477 continue; 2478 if (tp->snd_holes == NULL) { /* first hole */ 2479 tp->snd_holes = (struct sackhole *) 2480 pool_get(&sackhl_pool, PR_NOWAIT); 2481 if (tp->snd_holes == NULL) { 2482 /* ENOBUFS, so ignore SACKed block for now */ 2483 goto dropped; 2484 } 2485 cur = tp->snd_holes; 2486 cur->start = th->th_ack; 2487 cur->end = sack.start; 2488 cur->rxmit = cur->start; 2489 cur->next = NULL; 2490 tp->snd_numholes = 1; 2491 tp->rcv_lastsack = sack.end; 2492 /* 2493 * dups is at least one. If more data has been 2494 * SACKed, it can be greater than one. 2495 */ 2496 cur->dups = min(tcprexmtthresh, 2497 ((sack.end - cur->end)/tp->t_maxseg)); 2498 if (cur->dups < 1) 2499 cur->dups = 1; 2500 continue; /* with next sack block */ 2501 } 2502 /* Go thru list of holes: p = previous, cur = current */ 2503 p = cur = tp->snd_holes; 2504 while (cur) { 2505 if (SEQ_LEQ(sack.end, cur->start)) 2506 /* SACKs data before the current hole */ 2507 break; /* no use going through more holes */ 2508 if (SEQ_GEQ(sack.start, cur->end)) { 2509 /* SACKs data beyond the current hole */ 2510 cur->dups++; 2511 if (((sack.end - cur->end)/tp->t_maxseg) >= 2512 tcprexmtthresh) 2513 cur->dups = tcprexmtthresh; 2514 p = cur; 2515 cur = cur->next; 2516 continue; 2517 } 2518 if (SEQ_LEQ(sack.start, cur->start)) { 2519 /* Data acks at least the beginning of hole */ 2520 if (SEQ_GEQ(sack.end, cur->end)) { 2521 /* Acks entire hole, so delete hole */ 2522 if (p != cur) { 2523 p->next = cur->next; 2524 pool_put(&sackhl_pool, cur); 2525 cur = p->next; 2526 } else { 2527 cur = cur->next; 2528 pool_put(&sackhl_pool, p); 2529 p = cur; 2530 tp->snd_holes = p; 2531 } 2532 tp->snd_numholes--; 2533 continue; 2534 } 2535 /* otherwise, move start of hole forward */ 2536 cur->start = sack.end; 2537 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2538 p = cur; 2539 cur = cur->next; 2540 continue; 2541 } 2542 /* move end of hole backward */ 2543 if (SEQ_GEQ(sack.end, cur->end)) { 2544 cur->end = sack.start; 2545 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2546 cur->dups++; 2547 if (((sack.end - cur->end)/tp->t_maxseg) >= 2548 tcprexmtthresh) 2549 cur->dups = tcprexmtthresh; 2550 p = cur; 2551 cur = cur->next; 2552 continue; 2553 } 2554 if (SEQ_LT(cur->start, sack.start) && 2555 SEQ_GT(cur->end, sack.end)) { 2556 /* 2557 * ACKs some data in middle of a hole; need to 2558 * split current hole 2559 */ 2560 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2561 
goto dropped; 2562 temp = (struct sackhole *) 2563 pool_get(&sackhl_pool, PR_NOWAIT); 2564 if (temp == NULL) 2565 goto dropped; /* ENOBUFS */ 2566 temp->next = cur->next; 2567 temp->start = sack.end; 2568 temp->end = cur->end; 2569 temp->dups = cur->dups; 2570 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2571 cur->end = sack.start; 2572 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2573 cur->dups++; 2574 if (((sack.end - cur->end)/tp->t_maxseg) >= 2575 tcprexmtthresh) 2576 cur->dups = tcprexmtthresh; 2577 cur->next = temp; 2578 p = temp; 2579 cur = p->next; 2580 tp->snd_numholes++; 2581 } 2582 } 2583 /* At this point, p points to the last hole on the list */ 2584 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2585 /* 2586 * Need to append new hole at end. 2587 * Last hole is p (and it's not NULL). 2588 */ 2589 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2590 goto dropped; 2591 temp = (struct sackhole *) 2592 pool_get(&sackhl_pool, PR_NOWAIT); 2593 if (temp == NULL) 2594 goto dropped; /* ENOBUFS */ 2595 temp->start = tp->rcv_lastsack; 2596 temp->end = sack.start; 2597 temp->dups = min(tcprexmtthresh, 2598 ((sack.end - sack.start)/tp->t_maxseg)); 2599 if (temp->dups < 1) 2600 temp->dups = 1; 2601 temp->rxmit = temp->start; 2602 temp->next = 0; 2603 p->next = temp; 2604 tp->rcv_lastsack = sack.end; 2605 tp->snd_numholes++; 2606 } 2607 } 2608 return; 2609 dropped: 2610 tcpstat_inc(tcps_sack_drop_opts); 2611 } 2612 2613 /* 2614 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if 2615 * it is completely acked; otherwise, tcp_sack_option(), called from 2616 * tcp_dooptions(), will fix up the hole. 2617 */ 2618 void 2619 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2620 { 2621 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2622 /* max because this could be an older ack just arrived */ 2623 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2624 th->th_ack : tp->snd_una; 2625 struct sackhole *cur = tp->snd_holes; 2626 struct sackhole *prev; 2627 while (cur) 2628 if (SEQ_LEQ(cur->end, lastack)) { 2629 prev = cur; 2630 cur = cur->next; 2631 pool_put(&sackhl_pool, prev); 2632 tp->snd_numholes--; 2633 } else if (SEQ_LT(cur->start, lastack)) { 2634 cur->start = lastack; 2635 if (SEQ_LT(cur->rxmit, cur->start)) 2636 cur->rxmit = cur->start; 2637 break; 2638 } else 2639 break; 2640 tp->snd_holes = cur; 2641 } 2642 } 2643 2644 /* 2645 * Delete all receiver-side SACK information. 2646 */ 2647 void 2648 tcp_clean_sackreport(struct tcpcb *tp) 2649 { 2650 int i; 2651 2652 tp->rcv_numsacks = 0; 2653 for (i = 0; i < MAX_SACK_BLKS; i++) 2654 tp->sackblks[i].start = tp->sackblks[i].end=0; 2655 2656 } 2657 2658 /* 2659 * Partial ack handling within a sack recovery episode. When a partial ack 2660 * arrives, turn off retransmission timer, deflate the window, do not clear 2661 * tp->t_dupacks. 2662 */ 2663 void 2664 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2665 { 2666 /* Turn off retx. timer (will start again next segment) */ 2667 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2668 tp->t_rtttime = 0; 2669 /* 2670 * Partial window deflation. This statement relies on the 2671 * fact that tp->snd_una has not been updated yet. 
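 * Together with the else branch and the final increment, the net
 * effect below is roughly cwnd -= (amount newly acked) followed by
 * cwnd += 2 * t_maxseg: deflate by what just left the network,
 * then admit about two fresh segments.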
2672 */ 2673 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2674 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2675 tp->snd_cwnd += tp->t_maxseg; 2676 } else 2677 tp->snd_cwnd = tp->t_maxseg; 2678 tp->snd_cwnd += tp->t_maxseg; 2679 tp->t_flags |= TF_NEEDOUTPUT; 2680 } 2681 2682 /* 2683 * Pull out of band byte out of a segment so 2684 * it doesn't appear in the user's data queue. 2685 * It is still reflected in the segment length for 2686 * sequencing purposes. 2687 */ 2688 void 2689 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2690 { 2691 int cnt = off + urgent - 1; 2692 2693 while (cnt >= 0) { 2694 if (m->m_len > cnt) { 2695 char *cp = mtod(m, caddr_t) + cnt; 2696 struct tcpcb *tp = sototcpcb(so); 2697 2698 tp->t_iobc = *cp; 2699 tp->t_oobflags |= TCPOOB_HAVEDATA; 2700 memmove(cp, cp + 1, m->m_len - cnt - 1); 2701 m->m_len--; 2702 return; 2703 } 2704 cnt -= m->m_len; 2705 m = m->m_next; 2706 if (m == NULL) 2707 break; 2708 } 2709 panic("tcp_pulloutofband"); 2710 } 2711 2712 /* 2713 * Collect new round-trip time estimate 2714 * and update averages and current timeout. 2715 */ 2716 void 2717 tcp_xmit_timer(struct tcpcb *tp, int32_t rtt) 2718 { 2719 int delta, rttmin; 2720 2721 if (rtt < 0) 2722 rtt = 0; 2723 else if (rtt > TCP_RTT_MAX) 2724 rtt = TCP_RTT_MAX; 2725 2726 tcpstat_inc(tcps_rttupdated); 2727 if (tp->t_srtt != 0) { 2728 /* 2729 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2730 * after the binary point (scaled by 4), whereas 2731 * srtt is stored as fixed point with 5 bits after the 2732 * binary point (i.e., scaled by 32). The following magic 2733 * is equivalent to the smoothing algorithm in rfc793 with 2734 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2735 * point). 2736 */ 2737 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2738 (tp->t_srtt >> TCP_RTT_SHIFT); 2739 if ((tp->t_srtt += delta) <= 0) 2740 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2741 /* 2742 * We accumulate a smoothed rtt variance (actually, a 2743 * smoothed mean difference), then set the retransmit 2744 * timer to smoothed rtt + 4 times the smoothed variance. 2745 * rttvar is stored as fixed point with 4 bits after the 2746 * binary point (scaled by 16). The following is 2747 * equivalent to rfc793 smoothing with an alpha of .75 2748 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2749 * rfc793's wired-in beta. 2750 */ 2751 if (delta < 0) 2752 delta = -delta; 2753 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2754 if ((tp->t_rttvar += delta) <= 0) 2755 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2756 } else { 2757 /* 2758 * No rtt measurement yet - use the unsmoothed rtt. 2759 * Set the variance to half the rtt (so our first 2760 * retransmit happens at 3*rtt). 2761 */ 2762 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2763 tp->t_rttvar = (rtt + 1) << 2764 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2765 } 2766 tp->t_rtttime = 0; 2767 tp->t_rxtshift = 0; 2768 2769 /* 2770 * the retransmit should happen at rtt + 4 * rttvar. 2771 * Because of the way we do the smoothing, srtt and rttvar 2772 * will each average +1/2 tick of bias. When we compute 2773 * the retransmit timer, we want 1/2 tick of rounding and 2774 * 1 extra tick because of +-1/2 tick uncertainty in the 2775 * firing of the timer. The bias will give us exactly the 2776 * 1.5 tick we need. But, because the bias is 2777 * statistical, we have to test that we don't drop below 2778 * the minimum feasible timer (which is 2 ticks). 
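 *
 * As a hypothetical plain-integer restatement of the smoothing
 * above (kept under #if 0, never compiled, and using the classic
 * Jacobson/Karels scaling of srtt by 8 and rttvar by 4 rather
 * than the exact shifts used in this file):
 */

#if 0
static void
rtt_sample(int *srtt, int *rttvar, int rtt)
{
	int delta = rtt - (*srtt >> 3);	/* new sample vs. smoothed rtt */

	*srtt += delta;			/* srtt += (rtt - srtt)/8 */
	if (delta < 0)
		delta = -delta;
	delta -= (*rttvar >> 2);
	*rttvar += delta;		/* rttvar += (|err| - rttvar)/4 */
	/* rto is then (*srtt >> 3) + *rttvar, clamped as done below. */
}
#endif

/*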
2779 */ 2780 rttmin = min(max(tp->t_rttmin, rtt + 2 * (TCP_TIME(1) / hz)), 2781 TCPTV_REXMTMAX); 2782 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2783 2784 /* 2785 * We received an ack for a packet that wasn't retransmitted; 2786 * it is probably safe to discard any error indications we've 2787 * received recently. This isn't quite right, but close enough 2788 * for now (a route might have failed after we sent a segment, 2789 * and the return path might not be symmetrical). 2790 */ 2791 tp->t_softerror = 0; 2792 } 2793 2794 /* 2795 * Determine a reasonable value for maxseg size. 2796 * If the route is known, check route for mtu. 2797 * If none, use an mss that can be handled on the outgoing 2798 * interface without forcing IP to fragment; if bigger than 2799 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2800 * to utilize large mbufs. If no route is found, route has no mtu, 2801 * or the destination isn't local, use a default, hopefully conservative 2802 * size (usually 512 or the default IP max size, but no more than the mtu 2803 * of the interface), as we can't discover anything about intervening 2804 * gateways or networks. We also initialize the congestion/slow start 2805 * window to be a single segment if the destination isn't local. 2806 * While looking at the routing entry, we also initialize other path-dependent 2807 * parameters from pre-set or cached values in the routing entry. 2808 * 2809 * Also take into account the space needed for options that we 2810 * send regularly. Make maxseg shorter by that amount to assure 2811 * that we can send maxseg amount of data even when the options 2812 * are present. Store the upper limit of the length of options plus 2813 * data in maxopd. 2814 * 2815 * NOTE: offer == -1 indicates that the maxseg size changed due to 2816 * Path MTU discovery. 2817 */ 2818 int 2819 tcp_mss(struct tcpcb *tp, int offer) 2820 { 2821 struct rtentry *rt; 2822 struct ifnet *ifp; 2823 int mss, mssopt, mssdflt, iphlen, do_rfc3390; 2824 u_int rtmtu; 2825 2826 mss = mssopt = mssdflt = atomic_load_int(&tcp_mssdflt); 2827 2828 rt = in_pcbrtentry(tp->t_inpcb); 2829 if (rt == NULL) 2830 goto out; 2831 2832 ifp = if_get(rt->rt_ifidx); 2833 if (ifp == NULL) 2834 goto out; 2835 2836 switch (tp->pf) { 2837 case AF_INET: 2838 iphlen = sizeof(struct ip); 2839 break; 2840 #ifdef INET6 2841 case AF_INET6: 2842 iphlen = sizeof(struct ip6_hdr); 2843 break; 2844 #endif 2845 default: 2846 unhandled_af(tp->pf); 2847 } 2848 2849 /* 2850 * if there's an mtu associated with the route and we support 2851 * path MTU discovery for the underlying protocol family, use it. 2852 */ 2853 rtmtu = atomic_load_int(&rt->rt_mtu); 2854 if (rtmtu) { 2855 /* 2856 * One may wish to lower MSS to take into account options, 2857 * especially security-related options. 2858 */ 2859 if (tp->pf == AF_INET6 && rtmtu < IPV6_MMTU) { 2860 /* 2861 * RFC2460 section 5, last paragraph: if path MTU is 2862 * smaller than 1280, use 1280 as packet size and 2863 * attach fragment header. 
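 * With IPV6_MMTU at 1280, a 40 byte IPv6 header, an 8 byte
 * fragment header and a 20 byte TCP header, that works out to
 * an mss of 1212 bytes.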
2864 */ 2865 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2866 sizeof(struct tcphdr); 2867 } else { 2868 mss = rtmtu - iphlen - sizeof(struct tcphdr); 2869 } 2870 } else if (ifp->if_flags & IFF_LOOPBACK) { 2871 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2872 } else if (tp->pf == AF_INET) { 2873 if (ip_mtudisc) 2874 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2875 } 2876 #ifdef INET6 2877 else if (tp->pf == AF_INET6) { 2878 /* 2879 * for IPv6, path MTU discovery is always turned on, 2880 * or the node must use packet size <= 1280. 2881 */ 2882 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2883 } 2884 #endif /* INET6 */ 2885 2886 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2887 if (offer != -1) { 2888 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2889 mssopt = imax(mssopt, mssdflt); 2890 } 2891 if_put(ifp); 2892 out: 2893 /* 2894 * The current mss, t_maxseg, is initialized to the default value. 2895 * If we compute a smaller value, reduce the current mss. 2896 * If we compute a larger value, return it for use in sending 2897 * a max seg size option, but don't store it for use 2898 * unless we received an offer at least that large from peer. 2899 * 2900 * However, do not accept offers lower than the minimum of 2901 * the interface MTU and 216. 2902 */ 2903 if (offer > 0) 2904 tp->t_peermss = offer; 2905 if (tp->t_peermss) 2906 mss = imin(mss, max(tp->t_peermss, 216)); 2907 2908 /* sanity - at least max opt. space */ 2909 mss = imax(mss, 64); 2910 2911 /* 2912 * maxopd stores the maximum length of data AND options 2913 * in a segment; maxseg is the amount of data in a normal 2914 * segment. We need to store this value (maxopd) apart 2915 * from maxseg, because now every segment carries options 2916 * and thus we normally have somewhat less data in segments. 2917 */ 2918 tp->t_maxopd = mss; 2919 2920 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2921 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2922 mss -= TCPOLEN_TSTAMP_APPA; 2923 #ifdef TCP_SIGNATURE 2924 if (tp->t_flags & TF_SIGNATURE) 2925 mss -= TCPOLEN_SIGLEN; 2926 #endif 2927 2928 do_rfc3390 = atomic_load_int(&tcp_do_rfc3390); 2929 if (offer == -1) { 2930 /* mss changed due to Path MTU discovery */ 2931 tp->t_flags &= ~TF_PMTUD_PEND; 2932 tp->t_pmtud_mtu_sent = 0; 2933 tp->t_pmtud_mss_acked = 0; 2934 if (mss < tp->t_maxseg) { 2935 /* 2936 * Follow suggestion in RFC 2414 to reduce the 2937 * congestion window by the ratio of the old 2938 * segment size to the new segment size. 2939 */ 2940 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2941 mss, mss); 2942 } 2943 } else if (do_rfc3390 == 2) { 2944 /* increase initial window */ 2945 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2946 } else if (do_rfc3390) { 2947 /* increase initial window */ 2948 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2949 } else 2950 tp->snd_cwnd = mss; 2951 2952 tp->t_maxseg = mss; 2953 2954 return (offer != -1 ? 
mssopt : mss); 2955 } 2956 2957 u_int 2958 tcp_hdrsz(struct tcpcb *tp) 2959 { 2960 u_int hlen; 2961 2962 switch (tp->pf) { 2963 #ifdef INET6 2964 case AF_INET6: 2965 hlen = sizeof(struct ip6_hdr); 2966 break; 2967 #endif 2968 case AF_INET: 2969 hlen = sizeof(struct ip); 2970 break; 2971 default: 2972 hlen = 0; 2973 break; 2974 } 2975 hlen += sizeof(struct tcphdr); 2976 2977 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2978 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2979 hlen += TCPOLEN_TSTAMP_APPA; 2980 #ifdef TCP_SIGNATURE 2981 if (tp->t_flags & TF_SIGNATURE) 2982 hlen += TCPOLEN_SIGLEN; 2983 #endif 2984 return (hlen); 2985 } 2986 2987 /* 2988 * Set connection variables based on the effective MSS. 2989 * We are passed the TCPCB for the actual connection. If we 2990 * are the server, we are called by the compressed state engine 2991 * when the 3-way handshake is complete. If we are the client, 2992 * we are called when we receive the SYN,ACK from the server. 2993 * 2994 * NOTE: The t_maxseg value must be initialized in the TCPCB 2995 * before this routine is called! 2996 */ 2997 void 2998 tcp_mss_update(struct tcpcb *tp) 2999 { 3000 int mss; 3001 u_long bufsize; 3002 struct rtentry *rt; 3003 struct socket *so; 3004 3005 so = tp->t_inpcb->inp_socket; 3006 mss = tp->t_maxseg; 3007 3008 rt = in_pcbrtentry(tp->t_inpcb); 3009 if (rt == NULL) 3010 return; 3011 3012 mtx_enter(&so->so_snd.sb_mtx); 3013 bufsize = so->so_snd.sb_hiwat; 3014 if (bufsize < mss) { 3015 mtx_leave(&so->so_snd.sb_mtx); 3016 mss = bufsize; 3017 /* Update t_maxseg and t_maxopd */ 3018 tcp_mss(tp, mss); 3019 } else { 3020 bufsize = roundup(bufsize, mss); 3021 if (bufsize > sb_max) 3022 bufsize = sb_max; 3023 (void)sbreserve(so, &so->so_snd, bufsize); 3024 mtx_leave(&so->so_snd.sb_mtx); 3025 } 3026 3027 mtx_enter(&so->so_rcv.sb_mtx); 3028 bufsize = so->so_rcv.sb_hiwat; 3029 if (bufsize > mss) { 3030 bufsize = roundup(bufsize, mss); 3031 if (bufsize > sb_max) 3032 bufsize = sb_max; 3033 (void)sbreserve(so, &so->so_rcv, bufsize); 3034 } 3035 mtx_leave(&so->so_rcv.sb_mtx); 3036 } 3037 3038 /* 3039 * When a partial ack arrives, force the retransmission of the 3040 * next unacknowledged segment. Do not clear tp->t_dupacks. 3041 * By setting snd_nxt to ti_ack, this forces retransmission timer 3042 * to be started again. 3043 */ 3044 void 3045 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 3046 { 3047 /* 3048 * snd_una has not been updated and the socket send buffer 3049 * not yet drained of the acked data, so we have to leave 3050 * snd_una as it was to get the correct data offset in 3051 * tcp_output(). 3052 */ 3053 tcp_seq onxt = tp->snd_nxt; 3054 u_long ocwnd = tp->snd_cwnd; 3055 3056 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3057 tp->t_rtttime = 0; 3058 tp->snd_nxt = th->th_ack; 3059 /* 3060 * Set snd_cwnd to one segment beyond acknowledged offset 3061 * (tp->snd_una not yet updated when this function is called) 3062 */ 3063 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3064 (void)tcp_output(tp); 3065 tp->snd_cwnd = ocwnd; 3066 if (SEQ_GT(onxt, tp->snd_nxt)) 3067 tp->snd_nxt = onxt; 3068 /* 3069 * Partial window deflation. Relies on fact that tp->snd_una 3070 * not updated yet. 
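 * (th_ack - tp->snd_una below is therefore exactly the amount of
 * data this ack newly covers)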
3071 */ 3072 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3073 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3074 else 3075 tp->snd_cwnd = 0; 3076 tp->snd_cwnd += tp->t_maxseg; 3077 } 3078 3079 int 3080 tcp_mss_adv(struct mbuf *m, int af) 3081 { 3082 struct ifnet *ifp; 3083 int iphlen, mss, mssdflt; 3084 3085 mssdflt = atomic_load_int(&tcp_mssdflt); 3086 3087 if (m == NULL || (m->m_flags & M_PKTHDR) == 0) 3088 return mssdflt; 3089 3090 ifp = if_get(m->m_pkthdr.ph_ifidx); 3091 if (ifp == NULL) 3092 return mssdflt; 3093 3094 switch (af) { 3095 case AF_INET: 3096 iphlen = sizeof(struct ip); 3097 break; 3098 #ifdef INET6 3099 case AF_INET6: 3100 iphlen = sizeof(struct ip6_hdr); 3101 break; 3102 #endif 3103 default: 3104 unhandled_af(af); 3105 } 3106 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3107 if_put(ifp); 3108 3109 if (mss < mssdflt) 3110 return mssdflt; 3111 return mss; 3112 } 3113 3114 /* 3115 * TCP compressed state engine. Currently used to hold compressed 3116 * state for SYN_RECEIVED. 3117 */ 3118 3119 /* 3120 * Locks used to protect global data and struct members: 3121 * a atomic operations 3122 * N net lock 3123 * S syn_cache_mtx tcp syn cache global mutex 3124 */ 3125 3126 /* syn hash parameters */ 3127 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; /* [S] size of hash table */ 3128 int tcp_syn_cache_limit = /* [a] global entry limit */ 3129 TCP_SYN_HASH_SIZE * TCP_SYN_BUCKET_SIZE; 3130 int tcp_syn_bucket_limit = /* [a] per bucket limit */ 3131 3 * TCP_SYN_BUCKET_SIZE; 3132 int tcp_syn_use_limit = 100000; /* [S] reseed after uses */ 3133 3134 struct pool syn_cache_pool; 3135 struct syn_cache_set tcp_syn_cache[2]; /* [S] */ 3136 int tcp_syn_cache_active; /* [S] */ 3137 struct mutex syn_cache_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); 3138 3139 #define SYN_HASH(sa, sp, dp, rand) \ 3140 (((sa)->s_addr ^ (rand)[0]) * \ 3141 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3142 #ifndef INET6 3143 #define SYN_HASHALL(hash, src, dst, rand) \ 3144 do { \ 3145 hash = SYN_HASH(&satosin_const(src)->sin_addr, \ 3146 satosin_const(src)->sin_port, \ 3147 satosin_const(dst)->sin_port, (rand)); \ 3148 } while (/*CONSTCOND*/ 0) 3149 #else 3150 #define SYN_HASH6(sa, sp, dp, rand) \ 3151 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3152 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3153 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3154 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3155 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3156 3157 #define SYN_HASHALL(hash, src, dst, rand) \ 3158 do { \ 3159 switch ((src)->sa_family) { \ 3160 case AF_INET: \ 3161 hash = SYN_HASH(&satosin_const(src)->sin_addr, \ 3162 satosin_const(src)->sin_port, \ 3163 satosin_const(dst)->sin_port, (rand)); \ 3164 break; \ 3165 case AF_INET6: \ 3166 hash = SYN_HASH6(&satosin6_const(src)->sin6_addr, \ 3167 satosin6_const(src)->sin6_port, \ 3168 satosin6_const(dst)->sin6_port, (rand)); \ 3169 break; \ 3170 default: \ 3171 hash = 0; \ 3172 } \ 3173 } while (/*CONSTCOND*/0) 3174 #endif /* INET6 */ 3175 3176 void 3177 syn_cache_rm(struct syn_cache *sc) 3178 { 3179 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3180 3181 KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD)); 3182 SET(sc->sc_dynflags, SCF_DEAD); 3183 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3184 in_pcbunref(sc->sc_inplisten); 3185 sc->sc_inplisten = NULL; 3186 LIST_REMOVE(sc, sc_tpq); 3187 refcnt_rele(&sc->sc_refcnt); 3188 sc->sc_buckethead->sch_length--; 3189 if (timeout_del(&sc->sc_timer)) 3190 refcnt_rele(&sc->sc_refcnt); 3191 sc->sc_set->scs_count--; 3192 } 3193 3194 
void 3195 syn_cache_put(struct syn_cache *sc) 3196 { 3197 if (refcnt_rele(&sc->sc_refcnt) == 0) 3198 return; 3199 3200 /* Dealing with last reference, no lock needed. */ 3201 m_free(sc->sc_ipopts); 3202 rtfree(sc->sc_route.ro_rt); 3203 3204 pool_put(&syn_cache_pool, sc); 3205 } 3206 3207 void 3208 syn_cache_init(void) 3209 { 3210 int i; 3211 3212 /* Initialize the hash buckets. */ 3213 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3214 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3215 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3216 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3217 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3218 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3219 for (i = 0; i < tcp_syn_hash_size; i++) { 3220 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3221 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3222 } 3223 3224 /* Initialize the syn cache pool. */ 3225 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3226 0, "syncache", NULL); 3227 } 3228 3229 void 3230 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3231 { 3232 struct syn_cache_set *set; 3233 struct syn_cache_head *scp; 3234 struct syn_cache *sc2; 3235 int i; 3236 3237 NET_ASSERT_LOCKED(); 3238 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3239 3240 set = &tcp_syn_cache[tcp_syn_cache_active]; 3241 3242 /* 3243 * If there are no entries in the hash table, reinitialize 3244 * the hash secrets. To avoid useless cache swaps and 3245 * reinitialization, use it until the limit is reached. 3246 * An empty cache is also the opportunity to resize the hash. 3247 */ 3248 if (set->scs_count == 0 && set->scs_use <= 0) { 3249 set->scs_use = tcp_syn_use_limit; 3250 if (set->scs_size != tcp_syn_hash_size) { 3251 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3252 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3253 if (scp == NULL) { 3254 /* Try again next time. */ 3255 set->scs_use = 0; 3256 } else { 3257 free(set->scs_buckethead, M_SYNCACHE, 3258 set->scs_size * 3259 sizeof(struct syn_cache_head)); 3260 set->scs_buckethead = scp; 3261 set->scs_size = tcp_syn_hash_size; 3262 for (i = 0; i < tcp_syn_hash_size; i++) 3263 TAILQ_INIT(&scp[i].sch_bucket); 3264 } 3265 } 3266 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3267 tcpstat_inc(tcps_sc_seedrandom); 3268 } 3269 3270 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3271 set->scs_random); 3272 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3273 sc->sc_buckethead = scp; 3274 3275 /* 3276 * Make sure that we don't overflow the per-bucket 3277 * limit or the total cache size limit. 3278 */ 3279 if (scp->sch_length >= atomic_load_int(&tcp_syn_bucket_limit)) { 3280 tcpstat_inc(tcps_sc_bucketoverflow); 3281 /* 3282 * Someone might attack our bucket hash function. Reseed 3283 * with random as soon as the passive syn cache gets empty. 3284 */ 3285 set->scs_use = 0; 3286 /* 3287 * The bucket is full. Toss the oldest element in the 3288 * bucket. This will be the first entry in the bucket. 3289 */ 3290 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3291 #ifdef DIAGNOSTIC 3292 /* 3293 * This should never happen; we should always find an 3294 * entry in our bucket. 
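 * The bucket was just found to be over its length limit, so its
 * list cannot be empty.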
3295 */ 3296 if (sc2 == NULL) 3297 panic("%s: bucketoverflow: impossible", __func__); 3298 #endif 3299 syn_cache_rm(sc2); 3300 syn_cache_put(sc2); 3301 } else if (set->scs_count >= atomic_load_int(&tcp_syn_cache_limit)) { 3302 struct syn_cache_head *scp2, *sce; 3303 3304 tcpstat_inc(tcps_sc_overflowed); 3305 /* 3306 * The cache is full. Toss the oldest entry in the 3307 * first non-empty bucket we can find. 3308 * 3309 * XXX We would really like to toss the oldest 3310 * entry in the cache, but we hope that this 3311 * condition doesn't happen very often. 3312 */ 3313 scp2 = scp; 3314 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3315 sce = &set->scs_buckethead[set->scs_size]; 3316 for (++scp2; scp2 != scp; scp2++) { 3317 if (scp2 >= sce) 3318 scp2 = &set->scs_buckethead[0]; 3319 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3320 break; 3321 } 3322 #ifdef DIAGNOSTIC 3323 /* 3324 * This should never happen; we should always find a 3325 * non-empty bucket. 3326 */ 3327 if (scp2 == scp) 3328 panic("%s: cacheoverflow: impossible", 3329 __func__); 3330 #endif 3331 } 3332 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3333 syn_cache_rm(sc2); 3334 syn_cache_put(sc2); 3335 } 3336 3337 /* 3338 * Initialize the entry's timer. We don't estimate RTT 3339 * with SYNs, so each packet starts with the default RTT 3340 * and each timer step has a fixed timeout value. 3341 */ 3342 sc->sc_rxttot = 0; 3343 sc->sc_rxtshift = 0; 3344 TCPT_RANGESET(sc->sc_rxtcur, 3345 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 3346 TCPTV_REXMTMAX); 3347 if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) 3348 refcnt_take(&sc->sc_refcnt); 3349 3350 /* Link it from tcpcb entry */ 3351 refcnt_take(&sc->sc_refcnt); 3352 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3353 3354 /* Put it into the bucket. */ 3355 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3356 scp->sch_length++; 3357 sc->sc_set = set; 3358 set->scs_count++; 3359 set->scs_use--; 3360 3361 tcpstat_inc(tcps_sc_added); 3362 3363 /* 3364 * If the active cache has exceeded its use limit and 3365 * the passive syn cache is empty, exchange their roles. 3366 */ 3367 if (set->scs_use <= 0 && 3368 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3369 tcp_syn_cache_active = !tcp_syn_cache_active; 3370 } 3371 3372 /* 3373 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3374 * If we have retransmitted an entry the maximum number of times, expire 3375 * that entry. 3376 */ 3377 void 3378 syn_cache_timer(void *arg) 3379 { 3380 struct syn_cache *sc = arg; 3381 struct inpcb *inp; 3382 struct socket *so; 3383 uint64_t now; 3384 int lastref, do_ecn = 0; 3385 3386 mtx_enter(&syn_cache_mtx); 3387 if (ISSET(sc->sc_dynflags, SCF_DEAD)) 3388 goto freeit; 3389 3390 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3391 /* Drop it -- too many retransmissions. */ 3392 goto dropit; 3393 } 3394 3395 /* 3396 * Compute the total amount of time this entry has 3397 * been on a queue. If this entry has been on longer 3398 * than the keep alive timer would allow, expire it. 3399 */ 3400 sc->sc_rxttot += sc->sc_rxtcur; 3401 if (sc->sc_rxttot >= atomic_load_int(&tcp_keepinit)) 3402 goto dropit; 3403 3404 /* Advance the timer back-off. 
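 * Each step waits TCPTV_SRTTDFLT * tcp_backoff[sc_rxtshift], so
 * the retransmit interval for the SYN,ACK grows until either
 * TCP_MAXRXTSHIFT attempts or the tcp_keepinit time budget
 * expires the entry.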
*/ 3405 sc->sc_rxtshift++; 3406 TCPT_RANGESET(sc->sc_rxtcur, 3407 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 3408 TCPTV_REXMTMAX); 3409 if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) 3410 refcnt_take(&sc->sc_refcnt); 3411 inp = in_pcbref(sc->sc_inplisten); 3412 if (inp == NULL) 3413 goto freeit; 3414 mtx_leave(&syn_cache_mtx); 3415 3416 NET_LOCK_SHARED(); 3417 so = in_pcbsolock_ref(inp); 3418 if (so != NULL) { 3419 now = tcp_now(); 3420 #ifdef TCP_ECN 3421 do_ecn = atomic_load_int(&tcp_do_ecn); 3422 #endif 3423 (void) syn_cache_respond(sc, NULL, now, do_ecn); 3424 tcpstat_inc(tcps_sc_retransmitted); 3425 } 3426 in_pcbsounlock_rele(inp, so); 3427 NET_UNLOCK_SHARED(); 3428 3429 in_pcbunref(inp); 3430 syn_cache_put(sc); 3431 return; 3432 3433 dropit: 3434 tcpstat_inc(tcps_sc_timed_out); 3435 syn_cache_rm(sc); 3436 /* Decrement reference of the timer and free object after remove. */ 3437 lastref = refcnt_rele(&sc->sc_refcnt); 3438 KASSERT(lastref == 0); 3439 (void)lastref; 3440 freeit: 3441 mtx_leave(&syn_cache_mtx); 3442 syn_cache_put(sc); 3443 } 3444 3445 /* 3446 * Remove syn cache created by the specified tcb entry, 3447 * because this does not make sense to keep them 3448 * (if there's no tcb entry, syn cache entry will never be used) 3449 */ 3450 void 3451 syn_cache_cleanup(struct tcpcb *tp) 3452 { 3453 struct syn_cache *sc, *nsc; 3454 3455 NET_ASSERT_LOCKED(); 3456 3457 mtx_enter(&syn_cache_mtx); 3458 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) { 3459 KASSERT(sc->sc_inplisten == tp->t_inpcb); 3460 syn_cache_rm(sc); 3461 syn_cache_put(sc); 3462 } 3463 mtx_leave(&syn_cache_mtx); 3464 3465 KASSERT(LIST_EMPTY(&tp->t_sc)); 3466 } 3467 3468 /* 3469 * Find an entry in the syn cache. 3470 */ 3471 struct syn_cache * 3472 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst, 3473 struct syn_cache_head **headp, u_int rtableid) 3474 { 3475 struct syn_cache_set *sets[2]; 3476 struct syn_cache *sc; 3477 struct syn_cache_head *scp; 3478 u_int32_t hash; 3479 int i; 3480 3481 NET_ASSERT_LOCKED(); 3482 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3483 3484 /* Check the active cache first, the passive cache is likely empty. */ 3485 sets[0] = &tcp_syn_cache[tcp_syn_cache_active]; 3486 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active]; 3487 for (i = 0; i < 2; i++) { 3488 if (sets[i]->scs_count == 0) 3489 continue; 3490 SYN_HASHALL(hash, src, dst, sets[i]->scs_random); 3491 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size]; 3492 *headp = scp; 3493 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) { 3494 if (sc->sc_hash != hash) 3495 continue; 3496 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3497 !bcmp(&sc->sc_dst, dst, dst->sa_len) && 3498 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) 3499 return (sc); 3500 } 3501 } 3502 return (NULL); 3503 } 3504 3505 /* 3506 * This function gets called when we receive an ACK for a 3507 * socket in the LISTEN state. We look up the connection 3508 * in the syn cache, and if its there, we pull it out of 3509 * the cache and turn it into a full-blown connection in 3510 * the SYN-RECEIVED state. 3511 * 3512 * The return values may not be immediately obvious, and their effects 3513 * can be subtle, so here they are: 3514 * 3515 * NULL SYN was not found in cache; caller should drop the 3516 * packet and send an RST. 3517 * 3518 * -1 We were unable to create the new connection, and are 3519 * aborting it. 
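 * (Typically the failure is in allocating or binding the new
 * socket for the embryonic connection.)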
An ACK,RST is being sent to the peer 3520 * (unless we got screwy sequence numbers; see below), 3521 * because the 3-way handshake has been completed. Caller 3522 * should not free the mbuf, since we may be using it. If 3523 * we are not, we will free it. 3524 * 3525 * Otherwise, the return value is a pointer to the new socket 3526 * associated with the connection. 3527 */ 3528 struct socket * 3529 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3530 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint64_t now, 3531 int do_ecn) 3532 { 3533 struct syn_cache *sc; 3534 struct syn_cache_head *scp; 3535 struct socket *listenso; 3536 struct inpcb *inp, *listeninp; 3537 struct tcpcb *tp = NULL; 3538 struct mbuf *am; 3539 u_int rtableid; 3540 3541 NET_ASSERT_LOCKED(); 3542 3543 inp = sotoinpcb(so); 3544 3545 mtx_enter(&syn_cache_mtx); 3546 sc = syn_cache_lookup(src, dst, &scp, inp->inp_rtableid); 3547 if (sc == NULL) { 3548 mtx_leave(&syn_cache_mtx); 3549 return (NULL); 3550 } 3551 3552 /* 3553 * Verify the sequence and ack numbers. Try getting the correct 3554 * response again. 3555 */ 3556 if ((th->th_ack != sc->sc_iss + 1) || 3557 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3558 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3559 refcnt_take(&sc->sc_refcnt); 3560 mtx_leave(&syn_cache_mtx); 3561 (void) syn_cache_respond(sc, m, now, do_ecn); 3562 syn_cache_put(sc); 3563 return ((struct socket *)(-1)); 3564 } 3565 3566 /* Remove this cache entry */ 3567 syn_cache_rm(sc); 3568 mtx_leave(&syn_cache_mtx); 3569 3570 /* 3571 * Ok, create the full blown connection, and set things up 3572 * as they would have been set up if we had created the 3573 * connection when the SYN arrived. If we can't create 3574 * the connection, abort it. 3575 */ 3576 listenso = so; 3577 listeninp = inp; 3578 so = sonewconn(listenso, SS_ISCONNECTED, M_DONTWAIT); 3579 if (so == NULL) 3580 goto resetandabort; 3581 soassertlocked(so); 3582 soref(so); 3583 inp = sotoinpcb(so); 3584 tp = intotcpcb(inp); 3585 3586 #ifdef IPSEC 3587 /* 3588 * We need to copy the required security levels from the listen pcb. 3589 * Ditto for any other IPsec-related information. 3590 */ 3591 inp->inp_seclevel = listeninp->inp_seclevel; 3592 #endif /* IPSEC */ 3593 #ifdef INET6 3594 if (ISSET(inp->inp_flags, INP_IPV6)) { 3595 KASSERT(ISSET(listeninp->inp_flags, INP_IPV6)); 3596 3597 inp->inp_ipv6.ip6_hlim = listeninp->inp_ipv6.ip6_hlim; 3598 inp->inp_hops = listeninp->inp_hops; 3599 } else 3600 #endif 3601 { 3602 KASSERT(!ISSET(listeninp->inp_flags, INP_IPV6)); 3603 3604 inp->inp_ip.ip_ttl = listeninp->inp_ip.ip_ttl; 3605 inp->inp_options = ip_srcroute(m); 3606 if (inp->inp_options == NULL) { 3607 inp->inp_options = sc->sc_ipopts; 3608 sc->sc_ipopts = NULL; 3609 } 3610 } 3611 3612 /* inherit rtable from listening socket */ 3613 rtableid = sc->sc_rtableid; 3614 #if NPF > 0 3615 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3616 struct pf_divert *divert; 3617 3618 divert = pf_find_divert(m); 3619 KASSERT(divert != NULL); 3620 rtableid = divert->rdomain; 3621 } 3622 #endif 3623 in_pcbset_laddr(inp, dst, rtableid); 3624 3625 /* 3626 * Give the new socket our cached route reference. 
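 * The struct assignment below moves the reference; clearing ro_rt
 * in the cache entry afterwards keeps the route from being freed
 * twice.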
3627 */ 3628 inp->inp_route = sc->sc_route; /* struct assignment */ 3629 sc->sc_route.ro_rt = NULL; 3630 3631 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3632 if (am == NULL) 3633 goto resetandabort; 3634 am->m_len = src->sa_len; 3635 memcpy(mtod(am, caddr_t), src, src->sa_len); 3636 if (in_pcbconnect(inp, am)) { 3637 (void) m_free(am); 3638 goto resetandabort; 3639 } 3640 (void) m_free(am); 3641 3642 tp->t_flags = intotcpcb(listeninp)->t_flags & (TF_NOPUSH|TF_NODELAY); 3643 if (sc->sc_request_r_scale != 15) { 3644 tp->requested_s_scale = sc->sc_requested_s_scale; 3645 tp->request_r_scale = sc->sc_request_r_scale; 3646 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3647 } 3648 if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) 3649 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3650 3651 tp->t_template = tcp_template(tp); 3652 if (tp->t_template == NULL) 3653 goto abort; 3654 tp->sack_enable = ISSET(sc->sc_fixflags, SCF_SACK_PERMIT); 3655 tp->ts_modulate = sc->sc_modulate; 3656 tp->ts_recent = sc->sc_timestamp; 3657 tp->iss = sc->sc_iss; 3658 tp->irs = sc->sc_irs; 3659 tcp_sendseqinit(tp); 3660 tp->snd_last = tp->snd_una; 3661 #ifdef TCP_ECN 3662 if (ISSET(sc->sc_fixflags, SCF_ECN_PERMIT)) { 3663 tp->t_flags |= TF_ECN_PERMIT; 3664 tcpstat_inc(tcps_ecn_accepts); 3665 } 3666 #endif 3667 if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) 3668 tp->t_flags |= TF_SACK_PERMIT; 3669 #ifdef TCP_SIGNATURE 3670 if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) 3671 tp->t_flags |= TF_SIGNATURE; 3672 #endif 3673 tcp_rcvseqinit(tp); 3674 tp->t_state = TCPS_SYN_RECEIVED; 3675 tp->t_rcvtime = now; 3676 tp->t_sndtime = now; 3677 tp->t_rcvacktime = now; 3678 tp->t_sndacktime = now; 3679 TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepinit)); 3680 tcpstat_inc(tcps_accepts); 3681 3682 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3683 if (sc->sc_peermaxseg) 3684 tcp_mss_update(tp); 3685 /* Reset initial window to 1 segment for retransmit */ 3686 if (READ_ONCE(sc->sc_rxtshift) > 0) 3687 tp->snd_cwnd = tp->t_maxseg; 3688 tp->snd_wl1 = sc->sc_irs; 3689 tp->rcv_up = sc->sc_irs + 1; 3690 3691 /* 3692 * This is what would have happened in tcp_output() when 3693 * the SYN,ACK was sent. 3694 */ 3695 tp->snd_up = tp->snd_una; 3696 tp->snd_max = tp->snd_nxt = tp->iss+1; 3697 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3698 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3699 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3700 tp->last_ack_sent = tp->rcv_nxt; 3701 3702 in_pcbsounlock_rele(inp, so); 3703 tcpstat_inc(tcps_sc_completed); 3704 syn_cache_put(sc); 3705 return (so); 3706 3707 resetandabort: 3708 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3709 m->m_pkthdr.ph_rtableid, now); 3710 abort: 3711 if (tp != NULL) 3712 tp = tcp_drop(tp, ECONNABORTED); /* destroys socket */ 3713 m_freem(m); 3714 in_pcbsounlock_rele(inp, so); 3715 syn_cache_put(sc); 3716 tcpstat_inc(tcps_sc_aborted); 3717 return ((struct socket *)(-1)); 3718 } 3719 3720 /* 3721 * This function is called when we get a RST for a 3722 * non-existent connection, so that we can see if the 3723 * connection is in the syn cache. If it is, zap it. 
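 * Only a RST whose sequence number lands on the expected irs or
 * irs+1 is honored, which keeps blindly spoofed resets from
 * tearing down nascent connections.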
3724 */ 3725 3726 void 3727 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3728 u_int rtableid) 3729 { 3730 struct syn_cache *sc; 3731 struct syn_cache_head *scp; 3732 3733 NET_ASSERT_LOCKED(); 3734 3735 mtx_enter(&syn_cache_mtx); 3736 sc = syn_cache_lookup(src, dst, &scp, rtableid); 3737 if (sc == NULL) { 3738 mtx_leave(&syn_cache_mtx); 3739 return; 3740 } 3741 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3742 SEQ_GT(th->th_seq, sc->sc_irs + 1)) { 3743 mtx_leave(&syn_cache_mtx); 3744 return; 3745 } 3746 syn_cache_rm(sc); 3747 mtx_leave(&syn_cache_mtx); 3748 tcpstat_inc(tcps_sc_reset); 3749 syn_cache_put(sc); 3750 } 3751 3752 void 3753 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst, 3754 struct tcphdr *th, u_int rtableid) 3755 { 3756 struct syn_cache *sc; 3757 struct syn_cache_head *scp; 3758 3759 NET_ASSERT_LOCKED(); 3760 3761 mtx_enter(&syn_cache_mtx); 3762 sc = syn_cache_lookup(src, dst, &scp, rtableid); 3763 if (sc == NULL) { 3764 mtx_leave(&syn_cache_mtx); 3765 return; 3766 } 3767 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3768 if (ntohl (th->th_seq) != sc->sc_iss) { 3769 mtx_leave(&syn_cache_mtx); 3770 return; 3771 } 3772 3773 /* 3774 * If we've retransmitted 3 times and this is our second error, 3775 * we remove the entry. Otherwise, we allow it to continue on. 3776 * This prevents us from incorrectly nuking an entry during a 3777 * spurious network outage. 3778 * 3779 * See tcp_notify(). 3780 */ 3781 if (!ISSET(sc->sc_dynflags, SCF_UNREACH) || sc->sc_rxtshift < 3) { 3782 SET(sc->sc_dynflags, SCF_UNREACH); 3783 mtx_leave(&syn_cache_mtx); 3784 return; 3785 } 3786 3787 syn_cache_rm(sc); 3788 mtx_leave(&syn_cache_mtx); 3789 tcpstat_inc(tcps_sc_unreach); 3790 syn_cache_put(sc); 3791 } 3792 3793 /* 3794 * Given a LISTEN socket and an inbound SYN request, add 3795 * this to the syn cache, and send back a segment: 3796 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3797 * to the source. 3798 * 3799 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3800 * Doing so would require that we hold onto the data and deliver it 3801 * to the application. However, if we are the target of a SYN-flood 3802 * DoS attack, an attacker could send data which would eventually 3803 * consume all available buffer space if it were ACKed. By not ACKing 3804 * the data, we avoid this DoS scenario. 3805 */ 3806 3807 int 3808 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3809 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3810 struct tcp_opt_info *oi, tcp_seq *issp, uint64_t now, int do_ecn) 3811 { 3812 struct tcpcb tb, *tp; 3813 long win; 3814 struct syn_cache *sc; 3815 struct syn_cache_head *scp; 3816 struct mbuf *ipopts; 3817 3818 NET_ASSERT_LOCKED(); 3819 3820 tp = sototcpcb(so); 3821 3822 /* 3823 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3824 * 3825 * Note this check is performed in tcp_input() very early on. 3826 */ 3827 3828 /* 3829 * Initialize some local state. 3830 */ 3831 win = sbspace(so, &so->so_rcv); 3832 if (win > TCP_MAXWIN) 3833 win = TCP_MAXWIN; 3834 3835 bzero(&tb, sizeof(tb)); 3836 if (optp 3837 #ifdef TCP_SIGNATURE 3838 || (tp->t_flags & TF_SIGNATURE) 3839 #endif 3840 ) { 3841 tb.pf = tp->pf; 3842 tb.sack_enable = tp->sack_enable; 3843 tb.t_flags = atomic_load_int(&tcp_do_rfc1323) ? 
3844 (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3845 #ifdef TCP_SIGNATURE 3846 if (tp->t_flags & TF_SIGNATURE) 3847 tb.t_flags |= TF_SIGNATURE; 3848 #endif 3849 tb.t_state = TCPS_LISTEN; 3850 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3851 sotoinpcb(so)->inp_rtableid, now)) 3852 return (-1); 3853 } 3854 3855 switch (src->sa_family) { 3856 case AF_INET: 3857 /* 3858 * Remember the IP options, if any. 3859 */ 3860 ipopts = ip_srcroute(m); 3861 break; 3862 default: 3863 ipopts = NULL; 3864 } 3865 3866 /* 3867 * See if we already have an entry for this connection. 3868 * If we do, resend the SYN,ACK. We do not count this 3869 * as a retransmission (XXX though maybe we should). 3870 */ 3871 mtx_enter(&syn_cache_mtx); 3872 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3873 if (sc != NULL) { 3874 refcnt_take(&sc->sc_refcnt); 3875 mtx_leave(&syn_cache_mtx); 3876 tcpstat_inc(tcps_sc_dupesyn); 3877 if (ipopts) { 3878 /* 3879 * If we were remembering a previous source route, 3880 * forget it and use the new one we've been given. 3881 */ 3882 m_free(sc->sc_ipopts); 3883 sc->sc_ipopts = ipopts; 3884 } 3885 sc->sc_timestamp = tb.ts_recent; 3886 if (syn_cache_respond(sc, m, now, do_ecn) == 0) { 3887 tcpstat_inc(tcps_sndacks); 3888 tcpstat_inc(tcps_sndtotal); 3889 } 3890 syn_cache_put(sc); 3891 return (0); 3892 } 3893 mtx_leave(&syn_cache_mtx); 3894 3895 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3896 if (sc == NULL) { 3897 m_free(ipopts); 3898 return (-1); 3899 } 3900 refcnt_init_trace(&sc->sc_refcnt, DT_REFCNT_IDX_SYNCACHE); 3901 timeout_set_flags(&sc->sc_timer, syn_cache_timer, sc, 3902 KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE); 3903 3904 /* 3905 * Fill in the cache, and put the necessary IP and TCP 3906 * options into the reply. 3907 */ 3908 memcpy(&sc->sc_src, src, src->sa_len); 3909 memcpy(&sc->sc_dst, dst, dst->sa_len); 3910 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3911 sc->sc_ipopts = ipopts; 3912 sc->sc_irs = th->th_seq; 3913 3914 sc->sc_iss = issp ? *issp : arc4random(); 3915 sc->sc_peermaxseg = oi->maxseg; 3916 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3917 sc->sc_win = win; 3918 sc->sc_timestamp = tb.ts_recent; 3919 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3920 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3921 SET(sc->sc_fixflags, SCF_TIMESTAMP); 3922 sc->sc_modulate = arc4random(); 3923 } 3924 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3925 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3926 sc->sc_requested_s_scale = tb.requested_s_scale; 3927 sc->sc_request_r_scale = 0; 3928 /* 3929 * Pick the smallest possible scaling factor that 3930 * will still allow us to scale up to sb_max. 3931 * 3932 * We do this because there are broken firewalls that 3933 * will corrupt the window scale option, leading to 3934 * the other endpoint believing that our advertised 3935 * window is unscaled. At scale factors larger than 3936 * 5 the unscaled window will drop below 1500 bytes, 3937 * leading to serious problems when traversing these 3938 * broken firewalls. 3939 * 3940 * With the default sbmax of 256K, a scale factor 3941 * of 3 will be chosen by this algorithm. Those who 3942 * choose a larger sbmax should watch out 3943 * for the compatibility problems mentioned above. 3944 * 3945 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3946 * or <SYN,ACK>) segment itself is never scaled. 
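 *
 * Concretely, with TCP_MAXWIN at 65535 and sb_max at 256K:
 * 65535 << 2 is 262140, just short of 262144, while 65535 << 3
 * covers it, so the loop below settles on a scale factor of 3.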
#ifdef TCP_ECN
	/*
	 * If both ECE and CWR flag bits are set, the peer is ECN capable.
	 */
	if (do_ecn && (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		SET(sc->sc_fixflags, SCF_ECN_PERMIT);
#endif
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		SET(sc->sc_fixflags, SCF_SACK_PERMIT);
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		SET(sc->sc_fixflags, SCF_SIGNATURE);
#endif
	sc->sc_inplisten = in_pcbref(tp->t_inpcb);
	if (syn_cache_respond(sc, m, now, do_ecn) == 0) {
		mtx_enter(&syn_cache_mtx);
		/*
		 * XXXSMP Currently exclusive netlock prevents another insert
		 * after our syn_cache_lookup() and before syn_cache_insert().
		 * Double insert should be handled and not rely on netlock.
		 */
		syn_cache_insert(sc, tp);
		mtx_leave(&syn_cache_mtx);
		tcpstat_inc(tcps_sndacks);
		tcpstat_inc(tcps_sndtotal);
	} else {
		in_pcbunref(sc->sc_inplisten);
		syn_cache_put(sc);
		tcpstat_inc(tcps_sc_dropped);
	}

	return (0);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m, uint64_t now,
    int do_ecn)
{
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	NET_ASSERT_LOCKED();

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	default:
		m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
	    (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT) ? 4 : 0) +
#ifdef TCP_SIGNATURE
	    (ISSET(sc->sc_fixflags, SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    (ISSET(sc->sc_fixflags, SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;
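
	/*
	 * Worked example (illustrative only): for a peer that negotiated
	 * SACK, window scaling and timestamps, but no signature, optlen is
	 * 4 (MSS) + 4 (SACK permitted) + 4 (NOP + window scale) +
	 * TCPOLEN_TSTAMP_APPA (12) = 24 bytes, so th_off below becomes
	 * (20 + 24) >> 2 = 11.
	 */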

	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (do_ecn && ISSET(sc->sc_fixflags, SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(now + sc->sc_modulate);
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}
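
	/*
	 * The twelve timestamp bytes written above are laid out as in
	 * RFC 1323 appendix A: <NOP><NOP><kind=8><len=10><TSval><TSecr>.
	 * TSval is the local clock offset by the per-entry random
	 * sc_modulate; TSecr echoes the peer's most recent timestamp.
	 */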

#ifdef TCP_SIGNATURE
	if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			tdb_unref(tdb);
			return (EINVAL);
		}
		tdb_unref(tdb);
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);

	/* Use IPsec policy and ttl from listening socket, on SYN ACK. */
	mtx_enter(&syn_cache_mtx);
	inp = in_pcbref(sc->sc_inplisten);
	mtx_leave(&syn_cache_mtx);

	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in host order, for convenience.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;

		error = ip_output(m, sc->sc_ipopts, &sc->sc_route,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL,
		    inp ? &inp->inp_seclevel : NULL, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		/* ip6_plen will be updated in ip6_output() */
		ip6->ip6_hlim = in6_selecthlim(inp);
		/* Leave flowlabel = 0; it is legal and requires no state
		 * management. */
		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route, 0,
		    NULL, inp ? &inp->inp_seclevel : NULL);
		break;
#endif
	}
	in_pcbunref(inp);
	return (error);
}
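
/*
 * Layout note (illustrative only): for RFC 2385 signatures,
 * syn_cache_respond() above emits two bytes of kind/length, the
 * 16-byte MD5 digest filled in by tcp_signature(), and a NOP/EOL pad
 * to the next 32 bit boundary, 20 bytes in total.  Schematically
 * (hypothetical array, compiled out):
 */
#if 0
static const u_int8_t sig_opt_example[20] = {
	19, 18,				/* TCPOPT_SIGNATURE, TCPOLEN_SIGNATURE */
	0, 0, 0, 0, 0, 0, 0, 0,		/* MD5 digest, filled in by */
	0, 0, 0, 0, 0, 0, 0, 0,		/* tcp_signature() */
	1, 0,				/* TCPOPT_NOP, TCPOPT_EOL padding */
};
#endif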