/*	$OpenBSD: tcp_input.c,v 1.422 2025/01/10 20:19:03 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#endif /* INET6 */

const int tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int	tcp_rst_ppslim = 100;		/* 100pps */
int	tcp_rst_ppslim_count = 0;
struct	timeval tcp_rst_ppslim_last;

int	tcp_ackdrop_ppslim = 100;	/* 100pps */
int	tcp_ackdrop_ppslim_count = 0;
struct	timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	TCP_TIME(24 * 24 * 60 * 60)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int32_t)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int32_t)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
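/*
 * Example: the casts above make the comparison wraparound-safe.
 * With a = 5 and b = 0xfffffffb (b is about to wrap past 2^32 - 1),
 * (int32_t)(a - b) == 10 > 0, so TSTMP_GEQ(a, b) holds and
 * TSTMP_LT(a, b) does not: 5 is treated as "later" than 0xfffffffb
 * even though it is numerically smaller.  A sketch, not compiled:
 */
#if 0
	uint32_t a = 5, b = 0xfffffffbU;
	KASSERT(TSTMP_GEQ(a, b));	/* modulo-2^32 "a >= b" */
#endif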
/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && \
	    ISSET(tp->t_inpcb->inp_flags, INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC 3168.
 * Implementation note:
 *   snd_last is used to track a recovery phase.
 *   When cwnd is reduced, snd_last is set to snd_max.
 *   While snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \
	    (atomic_load_int(&tcp_ack_on_push) && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_TIMER_ARM(tp, TCPT_DELACK, tcp_delack_msecs); \
	if_put(ifp); \
} while (0)

void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *, uint64_t, int);
void	 syn_cache_timer(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
	    struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
	    unsigned int, struct socket *, struct mbuf *, u_char *, int,
	    struct tcp_opt_info *, tcp_seq *, uint64_t, int);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
	    struct tcphdr *, unsigned int, unsigned int, struct socket *,
	    struct mbuf *, uint64_t, int);
struct syn_cache *syn_cache_lookup(const struct sockaddr *,
	    const struct sockaddr *, struct syn_cache_head **, u_int);
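/*
 * Worked example for the overlap arithmetic in tcp_reass() below
 * (illustrative numbers only).  Suppose the queue holds a segment
 * with th_seq = 100 and th_reseqlen = 50, covering [100, 150), and
 * a new segment arrives with th_seq = 130 and *tlen = 40.  Then
 * i = 100 + 50 - 130 = 20 bytes of the new segment duplicate data
 * we already have, so m_adj(m, 20) trims its head and it becomes
 * [150, 170).  Had i been >= *tlen, the whole segment would be a
 * duplicate and would be dropped.
 */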
/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);
	tp->t_rcvoopack++;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_rcv.sb_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else {
			mtx_enter(&so->so_rcv.sb_mtx);
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
			mtx_leave(&so->so_rcv.sb_mtx);
		}
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}
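/*
 * Example of the delivery loop above (illustrative numbers): with
 * rcv_nxt = 200 and the queue holding segments [200, 300) and
 * [300, 400), both are appended to the socket buffer in one pass and
 * rcv_nxt advances to 400.  If the queue instead held only [300, 400),
 * nothing is delivered until the [200, 300) hole is filled.
 */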
/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *otp = NULL, *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate;
	union {
		struct tcpiphdr tcpip;
#ifdef INET6
		struct tcpipv6hdr tcpip6;
#endif
		char caddr;
	} saveti;
	tcp_seq iss, *reuse = NULL;
	uint64_t now;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
	int do_ecn = 0;
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;
	now = tcp_now();
#ifdef TCP_ECN
	do_ecn = atomic_load_int(&tcp_do_ecn);
#endif

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
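	/*
	 * For reference, the RFC 1323 appendix A layout that the fast
	 * path above matches is the 12-byte sequence
	 *	NOP(0x01) NOP(0x01) TIMESTAMP(0x08) LEN(0x0a) ts_val ts_ecr
	 * i.e. TCPOLEN_TSTAMP_APPA bytes, with the first four bytes
	 * compared in one load against htonl(TCPOPT_TSTAMP_HDR).
	 */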
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	if (th->th_dport == 0) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup(&tcb6table, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcb6table, &ip6->ip6_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
			    th->th_dport, m, m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
	}
#ifdef IPSEC
	if (ipsec_in_use) {
		struct m_tag *mtag;
		struct tdb *tdb = NULL;
		int error;

		/* Find most recent IPsec tag */
		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
		if (mtag != NULL) {
			struct tdb_ident *tdbi;

			tdbi = (struct tdb_ident *)(mtag + 1);
			tdb = gettdb(tdbi->rdomain, tdbi->spi,
			    &tdbi->dst, tdbi->proto);
		}
		error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN,
		    tdb, inp ? &inp->inp_seclevel : NULL, NULL, NULL);
		tdb_unref(tdb);
		if (error) {
			tcpstat_inc(tcps_rcvnosec);
			goto drop;
		}
	}
#endif /* IPSEC */

	if (inp == NULL) {
		tcpstat_inc(tcps_noport);
		goto dropwithreset_ratelim;
	}

	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
	soassertlocked(inp->inp_socket);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
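	/*
	 * Example of the unscaling above: with a negotiated snd_scale
	 * of 7 and th_win = 0x1000, the effective window tiwin is
	 * 0x1000 << 7 = 524288 bytes.  Windows in SYN segments are
	 * never scaled (RFC 7323), hence the TH_SYN test.
	 */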
	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		}

		if (so->so_options & SO_DEBUG) {
			otp = tp;
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				saveti.tcpip6.ti6_i = *ip6;
				saveti.tcpip6.ti6_t = *th;
				break;
#endif
			case AF_INET:
				memcpy(&saveti.tcpip.ti_i, ip, sizeof(*ip));
				saveti.tcpip.ti_t = *th;
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m, now, do_ecn);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					in_pcbunref(inp);
					inp = in_pcbref(sotoinpcb(so));
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;
			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for the peer to send an RST, and
				 * the RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says
				 * the following:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(
						    &ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(
							    tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr ==
						    ip->ip_src.s_addr) {
							tcpstat_inc(
							    tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse, now,
				    do_ecn) == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	if (tp->sack_enable)
		tcp_del_sackholes(tp, th);	/* Delete stale SACK holes */

	/*
	 * Process options.
	 */
	if (optp
#ifdef TCP_SIGNATURE
	    || (tp->t_flags & TF_SIGNATURE)
#endif
	    ) {
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid, now))
			goto drop;
	}

	if (opti.ts_present && opti.ts_ecr) {
		int32_t rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
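	/*
	 * Example of the CE check above: IPTOS_ECN_MASK is 0x03 and
	 * IPTOS_ECN_CE is 0x03, so a TOS byte of, say, 0x2b (DSCP bits
	 * set, ECN bits 11) marks congestion.  TF_RCVD_CE makes us echo
	 * ECE on subsequent segments until the peer acknowledges with CWR.
	 */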
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) ==
	    TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = now;
			tp->ts_recent = opti.ts_val;
		}
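		/*
		 * A note on the RTT sampling just below: when a timestamp
		 * echo is available the sample is simply now - ts_ecr,
		 * e.g. ts_ecr = 120400 echoed back while now = 120475
		 * yields a 75-unit (millisecond, in this implementation)
		 * sample for tcp_xmit_timer().  Without timestamps only
		 * one timed segment per window (t_rtseq) can produce a
		 * sample.
		 */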
		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				tp->t_rcvacktime = now;
				ND6_HINT(tp);

				mtx_enter(&so->so_snd.sb_mtx);
				sbdrop(so, &so->so_snd, acked);
				mtx_leave(&so->so_snd.sb_mtx);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
				/* Pull snd_wl2 up to prevent seq wrap. */
				tp->snd_wl2 = th->th_ack;
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp,
				    TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT,
					    tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				in_pcbunref(inp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			/* Pull snd_wl1 and rcv_up up to prevent seq wrap. */
			tp->snd_wl1 = th->th_seq;
			/* Packet has most recent segment, no urgent exists. */
			tp->rcv_up = tp->rcv_nxt;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
				    now - tp->rfbuf_ts > (tp->t_srtt >>
				    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT))) {
					tcp_update_rcvspace(tp);
					/* Start over with next RTT. */
					tp->rfbuf_cnt = 0;
					tp->rfbuf_ts = 0;
				} else
					tp->rfbuf_cnt += tlen;
				m_adj(m, iphlen + off);
				mtx_enter(&so->so_rcv.sb_mtx);
				sbappendstream(so, &so->so_rcv, m);
				mtx_leave(&so->so_rcv.sb_mtx);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			in_pcbunref(inp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{
		int win;

		win = sbspace(so, &so->so_rcv);
		if (win < 0)
			win = 0;
		tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}
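	/*
	 * Example of the clamp above: if the socket buffer currently has
	 * only 4 kB free but we have already advertised up to
	 * rcv_adv = rcv_nxt + 16384, rcv_wnd stays 16384, so the window
	 * offered to the peer is never retracted (cf. RFC 1122, 4.2.2.16
	 * on not shrinking the window).
	 */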
	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif
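		/*
		 * ECN handshake summary for the switch above (RFC 3168):
		 * our SYN carried ECE|CWR; a SYN-ACK answering with ECE
		 * only (TH_ACK|TH_ECE) or a simultaneous-open SYN carrying
		 * both ECE and CWR means the peer negotiated ECN, so
		 * TF_ECN_PERMIT is set and the bits are stripped before
		 * the generic flag processing below.
		 */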
		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			tp->t_flags |= TF_BLOCKOUTPUT;
			soisconnected(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEG_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			in_pcbunref(inp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if (now - tp->ts_recent_age > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}
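	/*
	 * PAWS example: if ts_recent = 1000000 and a segment arrives
	 * with ts_val = 999900, TSTMP_LT(999900, 1000000) flags it as a
	 * stale duplicate from an earlier window and it is dropped
	 * (after an ACK if it carried data).  TCP_PAWS_IDLE is
	 * 24 * 24 * 60 * 60 seconds, i.e. 24 days, after which ts_recent
	 * is considered too old to justify dropping anything.
	 */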
	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}
	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		tp->t_flags |= TF_BLOCKOUTPUT;
		soisconnected(so);
		tp->t_flags &= ~TF_BLOCKOUTPUT;
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) /
				    tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 *
					    tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

					if (SEQ_LT(th->th_ack, tp->snd_last)) {
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp,
						    TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(
						    tcps_sack_recovery_episode);
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd =
						    tp->snd_ssthresh +
						    tp->t_maxseg *
						    tp->t_dupacks;
						goto drop;
					}
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
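		/*
		 * Fast-retransmit arithmetic above, with example numbers:
		 * snd_wnd = snd_cwnd = 65536 and t_maxseg = 1460 give
		 * win = 65536 / 2 / 1460 = 22, so ssthresh becomes
		 * 22 * 1460 = 32120 bytes.  Without SACK, cwnd is pinned
		 * to one segment so tcp_output() retransmits exactly the
		 * missing segment, then cwnd is re-inflated to
		 * ssthresh + 3 * t_maxseg to account for the segments
		 * that the three dup ACKs say have left the network.
		 */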
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
		if (tp->t_dupacks >= tcprexmtthresh) {
			/* Check for a partial ACK */
			if (SEQ_LT(th->th_ack, tp->snd_last)) {
				if (tp->sack_enable)
					tcp_sack_partialack(tp, th);
				else
					tcp_newreno_partialack(tp, th);
			} else {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		} else {
			/*
			 * Reset the duplicate ACK counter if we
			 * were not in fast recovery.
			 */
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);
		tp->t_rcvacktime = now;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
			u_int cw = tp->snd_cwnd;
			u_int incr = tp->t_maxseg;

			if (cw > tp->snd_ssthresh)
				incr = max(incr * incr / cw, 1);
			if (tp->t_dupacks < tcprexmtthresh)
				tp->snd_cwnd = ulmin(cw + incr,
				    TCP_MAXWIN << tp->snd_scale);
		}
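		/*
		 * Example of the increase above in congestion avoidance:
		 * with t_maxseg = 1460 and cw = 29200 (20 packets), each
		 * ACK adds 1460 * 1460 / 29200 = 73 bytes, so a full
		 * window of 20 ACKs grows cwnd by roughly one segment
		 * per RTT, the classic additive increase.
		 */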
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			if (tp->snd_wnd > so->so_snd.sb_cc)
				tp->snd_wnd -= so->so_snd.sb_cc;
			else
				tp->snd_wnd = 0;
			mtx_enter(&so->so_snd.sb_mtx);
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			mtx_leave(&so->so_snd.sb_mtx);
			ourfinisacked = 1;
		} else {
			mtx_enter(&so->so_snd.sb_mtx);
			sbdrop(so, &so->so_snd, acked);
			mtx_leave(&so->so_snd.sb_mtx);
			if (tp->snd_wnd > acked)
				tp->snd_wnd -= acked;
			else
				tp->snd_wnd = 0;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					soisdisconnected(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
					TCP_TIMER_ARM(tp, TCPT_2MSL,
					    tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				tp->t_flags |= TF_BLOCKOUTPUT;
				soisdisconnected(so);
				tp->t_flags &= ~TF_BLOCKOUTPUT;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}
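	/*
	 * The snd_wl1/snd_wl2 bookkeeping above implements the RFC 793
	 * SND.WL1/SND.WL2 rule: take the window from a segment only if
	 * it is newer (higher seq), or equally new with a newer ack, so
	 * a reordered old segment can never reinstate a stale, larger
	 * window.
	 */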
	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		u_long urgent;

		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		mtx_enter(&so->so_rcv.sb_mtx);
		urgent = th->th_urp + so->so_rcv.sb_cc;
		mtx_leave(&so->so_rcv.sb_mtx);

		if (urgent > sb_max) {
			th->th_urp = 0;		/* XXX */
			tiflags &= ~TH_URG;	/* XXX */
			goto dodata;		/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			mtx_enter(&so->so_rcv.sb_mtx);
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_rcv.sb_state |= SS_RCVATMARK;
			mtx_leave(&so->so_rcv.sb_mtx);
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.  This can happen independent of advancing
		 * the URG pointer, but if two URG's are pending at once,
		 * some out-of-band data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;

		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				mtx_enter(&so->so_rcv.sb_mtx);
				sbappendstream(so, &so->so_rcv, m);
				mtx_leave(&so->so_rcv.sb_mtx);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);
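		/*
		 * Example of the SACK bookkeeping above: with
		 * rcv_nxt = 1000, an out-of-order segment covering
		 * [2000, 2500) goes through tcp_reass() and
		 * tcp_update_sack_list() records the block 2000-2500,
		 * so the immediate ACK (TF_ACKNOW) tells the sender
		 * exactly which bytes arrived and which hole remains.
		 */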
1997 */ 1998 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1999 #endif /* 0 */ 2000 } else { 2001 m_freem(m); 2002 tiflags &= ~TH_FIN; 2003 } 2004 2005 /* 2006 * If FIN is received ACK the FIN and let the user know 2007 * that the connection is closing. Ignore a FIN received before 2008 * the connection is fully established. 2009 */ 2010 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2011 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2012 tp->t_flags |= TF_BLOCKOUTPUT; 2013 socantrcvmore(so); 2014 tp->t_flags &= ~TF_BLOCKOUTPUT; 2015 tp->t_flags |= TF_ACKNOW; 2016 tp->rcv_nxt++; 2017 } 2018 switch (tp->t_state) { 2019 2020 /* 2021 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2022 */ 2023 case TCPS_ESTABLISHED: 2024 tp->t_state = TCPS_CLOSE_WAIT; 2025 break; 2026 2027 /* 2028 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2029 * enter the CLOSING state. 2030 */ 2031 case TCPS_FIN_WAIT_1: 2032 tp->t_state = TCPS_CLOSING; 2033 break; 2034 2035 /* 2036 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2037 * starting the time-wait timer, turning off the other 2038 * standard timers. 2039 */ 2040 case TCPS_FIN_WAIT_2: 2041 tp->t_state = TCPS_TIME_WAIT; 2042 tcp_canceltimers(tp); 2043 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2044 tp->t_flags |= TF_BLOCKOUTPUT; 2045 soisdisconnected(so); 2046 tp->t_flags &= ~TF_BLOCKOUTPUT; 2047 break; 2048 2049 /* 2050 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2051 */ 2052 case TCPS_TIME_WAIT: 2053 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2054 break; 2055 } 2056 } 2057 if (otp) 2058 tcp_trace(TA_INPUT, ostate, tp, otp, &saveti.caddr, 0, tlen); 2059 2060 /* 2061 * Return any desired output. 2062 */ 2063 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2064 (void) tcp_output(tp); 2065 in_pcbunref(inp); 2066 return IPPROTO_DONE; 2067 2068 badsyn: 2069 /* 2070 * Received a bad SYN. Increment counters and dropwithreset. 2071 */ 2072 tcpstat_inc(tcps_badsyn); 2073 tp = NULL; 2074 goto dropwithreset; 2075 2076 dropafterack_ratelim: 2077 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2078 tcp_ackdrop_ppslim) == 0) { 2079 /* XXX stat */ 2080 goto drop; 2081 } 2082 /* ...fall into dropafterack... */ 2083 2084 dropafterack: 2085 /* 2086 * Generate an ACK dropping incoming segment if it occupies 2087 * sequence space, where the ACK reflects our state. 2088 */ 2089 if (tiflags & TH_RST) 2090 goto drop; 2091 m_freem(m); 2092 tp->t_flags |= TF_ACKNOW; 2093 (void) tcp_output(tp); 2094 in_pcbunref(inp); 2095 return IPPROTO_DONE; 2096 2097 dropwithreset_ratelim: 2098 /* 2099 * We may want to rate-limit RSTs in certain situations, 2100 * particularly if we are sending an RST in response to 2101 * an attempt to connect to or otherwise communicate with 2102 * a port for which we have no socket. 2103 */ 2104 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2105 atomic_load_int(&tcp_rst_ppslim)) == 0) { 2106 /* XXX stat */ 2107 goto drop; 2108 } 2109 /* ...fall into dropwithreset... */ 2110 2111 dropwithreset: 2112 /* 2113 * Generate a RST, dropping incoming segment. 2114 * Make ACK acceptable to originator of segment. 2115 * Don't bother to respond to RST. 
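 * Answering a RST with a RST would only invite an endless exchange
 * of resets; RFC 793 explicitly forbids it.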
2116 */ 2117 if (tiflags & TH_RST) 2118 goto drop; 2119 if (tiflags & TH_ACK) { 2120 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2121 TH_RST, m->m_pkthdr.ph_rtableid, now); 2122 } else { 2123 if (tiflags & TH_SYN) 2124 tlen++; 2125 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2126 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now); 2127 } 2128 m_freem(m); 2129 in_pcbunref(inp); 2130 return IPPROTO_DONE; 2131 2132 drop: 2133 /* 2134 * Drop space held by incoming segment and return. 2135 */ 2136 if (otp) 2137 tcp_trace(TA_DROP, ostate, tp, otp, &saveti.caddr, 0, tlen); 2138 2139 m_freem(m); 2140 in_pcbunref(inp); 2141 return IPPROTO_DONE; 2142 } 2143 2144 int 2145 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2146 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2147 u_int rtableid, uint64_t now) 2148 { 2149 u_int16_t mss = 0; 2150 int opt, optlen; 2151 #ifdef TCP_SIGNATURE 2152 caddr_t sigp = NULL; 2153 struct tdb *tdb = NULL; 2154 #endif 2155 2156 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2157 opt = cp[0]; 2158 if (opt == TCPOPT_EOL) 2159 break; 2160 if (opt == TCPOPT_NOP) 2161 optlen = 1; 2162 else { 2163 if (cnt < 2) 2164 break; 2165 optlen = cp[1]; 2166 if (optlen < 2 || optlen > cnt) 2167 break; 2168 } 2169 switch (opt) { 2170 2171 default: 2172 continue; 2173 2174 case TCPOPT_MAXSEG: 2175 if (optlen != TCPOLEN_MAXSEG) 2176 continue; 2177 if (!(th->th_flags & TH_SYN)) 2178 continue; 2179 if (TCPS_HAVERCVDSYN(tp->t_state)) 2180 continue; 2181 memcpy(&mss, cp + 2, sizeof(mss)); 2182 mss = ntohs(mss); 2183 oi->maxseg = mss; 2184 break; 2185 2186 case TCPOPT_WINDOW: 2187 if (optlen != TCPOLEN_WINDOW) 2188 continue; 2189 if (!(th->th_flags & TH_SYN)) 2190 continue; 2191 if (TCPS_HAVERCVDSYN(tp->t_state)) 2192 continue; 2193 tp->t_flags |= TF_RCVD_SCALE; 2194 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2195 break; 2196 2197 case TCPOPT_TIMESTAMP: 2198 if (optlen != TCPOLEN_TIMESTAMP) 2199 continue; 2200 oi->ts_present = 1; 2201 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2202 oi->ts_val = ntohl(oi->ts_val); 2203 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2204 oi->ts_ecr = ntohl(oi->ts_ecr); 2205 2206 if (!(th->th_flags & TH_SYN)) 2207 continue; 2208 if (TCPS_HAVERCVDSYN(tp->t_state)) 2209 continue; 2210 /* 2211 * A timestamp received in a SYN makes 2212 * it ok to send timestamp requests and replies. 
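 * (RFC 1323 allows the option on later segments only if both SYNs
 * carried it.)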
2213 */ 2214 tp->t_flags |= TF_RCVD_TSTMP; 2215 tp->ts_recent = oi->ts_val; 2216 tp->ts_recent_age = now; 2217 break; 2218 2219 case TCPOPT_SACK_PERMITTED: 2220 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2221 continue; 2222 if (!(th->th_flags & TH_SYN)) 2223 continue; 2224 if (TCPS_HAVERCVDSYN(tp->t_state)) 2225 continue; 2226 /* MUST only be set on SYN */ 2227 tp->t_flags |= TF_SACK_PERMIT; 2228 break; 2229 case TCPOPT_SACK: 2230 tcp_sack_option(tp, th, cp, optlen); 2231 break; 2232 #ifdef TCP_SIGNATURE 2233 case TCPOPT_SIGNATURE: 2234 if (optlen != TCPOLEN_SIGNATURE) 2235 continue; 2236 2237 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2238 goto bad; 2239 2240 sigp = cp + 2; 2241 break; 2242 #endif /* TCP_SIGNATURE */ 2243 } 2244 } 2245 2246 #ifdef TCP_SIGNATURE 2247 if (tp->t_flags & TF_SIGNATURE) { 2248 union sockaddr_union src, dst; 2249 2250 memset(&src, 0, sizeof(union sockaddr_union)); 2251 memset(&dst, 0, sizeof(union sockaddr_union)); 2252 2253 switch (tp->pf) { 2254 case 0: 2255 case AF_INET: 2256 src.sa.sa_len = sizeof(struct sockaddr_in); 2257 src.sa.sa_family = AF_INET; 2258 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2259 dst.sa.sa_len = sizeof(struct sockaddr_in); 2260 dst.sa.sa_family = AF_INET; 2261 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2262 break; 2263 #ifdef INET6 2264 case AF_INET6: 2265 src.sa.sa_len = sizeof(struct sockaddr_in6); 2266 src.sa.sa_family = AF_INET6; 2267 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2268 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2269 dst.sa.sa_family = AF_INET6; 2270 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2271 break; 2272 #endif /* INET6 */ 2273 } 2274 2275 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2276 0, &src, &dst, IPPROTO_TCP); 2277 2278 /* 2279 * We don't have an SA for this peer, so we turn off 2280 * TF_SIGNATURE on the listen socket 2281 */ 2282 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2283 tp->t_flags &= ~TF_SIGNATURE; 2284 2285 } 2286 2287 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2288 tcpstat_inc(tcps_rcvbadsig); 2289 goto bad; 2290 } 2291 2292 if (sigp) { 2293 char sig[16]; 2294 2295 if (tdb == NULL) { 2296 tcpstat_inc(tcps_rcvbadsig); 2297 goto bad; 2298 } 2299 2300 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2301 goto bad; 2302 2303 if (timingsafe_bcmp(sig, sigp, 16)) { 2304 tcpstat_inc(tcps_rcvbadsig); 2305 goto bad; 2306 } 2307 2308 tcpstat_inc(tcps_rcvgoodsig); 2309 } 2310 2311 tdb_unref(tdb); 2312 #endif /* TCP_SIGNATURE */ 2313 2314 return (0); 2315 2316 #ifdef TCP_SIGNATURE 2317 bad: 2318 tdb_unref(tdb); 2319 #endif 2320 return (-1); 2321 } 2322 2323 u_long 2324 tcp_seq_subtract(u_long a, u_long b) 2325 { 2326 return ((long)(a - b)); 2327 } 2328 2329 /* 2330 * This function is called upon receipt of new valid data (while not in header 2331 * prediction mode), and it updates the ordered list of sacks. 2332 */ 2333 void 2334 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2335 tcp_seq rcv_lastend) 2336 { 2337 /* 2338 * First reported block MUST be the most recent one. Subsequent 2339 * blocks SHOULD be in the order in which they arrived at the 2340 * receiver. These two conditions make the implementation fully 2341 * compliant with RFC 2018. 
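 * Example: if blocks 10-20 and 30-40 are already queued and data
 * 20-30 arrives, the code below coalesces all three into a single
 * block 10-40, which is then reported first.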
2342 */ 2343 int i, j = 0, count = 0, lastpos = -1; 2344 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2345 2346 /* First clean up current list of sacks */ 2347 for (i = 0; i < tp->rcv_numsacks; i++) { 2348 sack = tp->sackblks[i]; 2349 if (sack.start == 0 && sack.end == 0) { 2350 count++; /* count = number of blocks to be discarded */ 2351 continue; 2352 } 2353 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2354 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2355 count++; 2356 } else { 2357 temp[j].start = tp->sackblks[i].start; 2358 temp[j++].end = tp->sackblks[i].end; 2359 } 2360 } 2361 tp->rcv_numsacks -= count; 2362 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2363 tcp_clean_sackreport(tp); 2364 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2365 /* ==> need first sack block */ 2366 tp->sackblks[0].start = rcv_laststart; 2367 tp->sackblks[0].end = rcv_lastend; 2368 tp->rcv_numsacks = 1; 2369 } 2370 return; 2371 } 2372 /* Otherwise, sack blocks are already present. */ 2373 for (i = 0; i < tp->rcv_numsacks; i++) 2374 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2375 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2376 return; /* sack list remains unchanged */ 2377 /* 2378 * From here, segment just received should be (part of) the 1st sack. 2379 * Go through list, possibly coalescing sack block entries. 2380 */ 2381 firstsack.start = rcv_laststart; 2382 firstsack.end = rcv_lastend; 2383 for (i = 0; i < tp->rcv_numsacks; i++) { 2384 sack = tp->sackblks[i]; 2385 if (SEQ_LT(sack.end, firstsack.start) || 2386 SEQ_GT(sack.start, firstsack.end)) 2387 continue; /* no overlap */ 2388 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2389 /* 2390 * identical block; delete it here since we will 2391 * move it to the front of the list. 2392 */ 2393 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2394 lastpos = i; /* last posn with a zero entry */ 2395 continue; 2396 } 2397 if (SEQ_LEQ(sack.start, firstsack.start)) 2398 firstsack.start = sack.start; /* merge blocks */ 2399 if (SEQ_GEQ(sack.end, firstsack.end)) 2400 firstsack.end = sack.end; /* merge blocks */ 2401 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2402 lastpos = i; /* last posn with a zero entry */ 2403 } 2404 if (lastpos != -1) { /* at least one merge */ 2405 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2406 sack = tp->sackblks[i]; 2407 if (sack.start == 0 && sack.end == 0) 2408 continue; 2409 temp[j++] = sack; 2410 } 2411 tp->rcv_numsacks = j; /* including first blk (added later) */ 2412 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2413 tp->sackblks[i] = temp[i]; 2414 } else { /* no merges -- shift sacks by 1 */ 2415 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2416 tp->rcv_numsacks++; 2417 for (i = tp->rcv_numsacks-1; i > 0; i--) 2418 tp->sackblks[i] = tp->sackblks[i-1]; 2419 } 2420 tp->sackblks[0] = firstsack; 2421 return; 2422 } 2423 2424 /* 2425 * Process the TCP SACK option. tp->snd_holes is an ordered list 2426 * of holes (oldest to newest, in terms of the sequence space). 2427 */ 2428 void 2429 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2430 { 2431 int tmp_olen; 2432 u_char *tmp_cp; 2433 struct sackhole *cur, *p, *temp; 2434 2435 if (!tp->sack_enable) 2436 return; 2437 /* SACK without ACK doesn't make sense. */ 2438 if ((th->th_flags & TH_ACK) == 0) 2439 return; 2440 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
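 * An ACK outside this range is either stale or forged, so any SACK
 * blocks it carries cannot be trusted.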
*/ 2441 if (SEQ_LT(th->th_ack, tp->snd_una) || 2442 SEQ_GT(th->th_ack, tp->snd_max)) 2443 return; 2444 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2445 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2446 return; 2447 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2448 tmp_cp = cp + 2; 2449 tmp_olen = optlen - 2; 2450 tcpstat_inc(tcps_sack_rcv_opts); 2451 if (tp->snd_numholes < 0) 2452 tp->snd_numholes = 0; 2453 if (tp->t_maxseg == 0) 2454 panic("tcp_sack_option"); /* Should never happen */ 2455 while (tmp_olen > 0) { 2456 struct sackblk sack; 2457 2458 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2459 sack.start = ntohl(sack.start); 2460 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2461 sack.end = ntohl(sack.end); 2462 tmp_olen -= TCPOLEN_SACK; 2463 tmp_cp += TCPOLEN_SACK; 2464 if (SEQ_LEQ(sack.end, sack.start)) 2465 continue; /* bad SACK fields */ 2466 if (SEQ_LEQ(sack.end, tp->snd_una)) 2467 continue; /* old block */ 2468 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2469 if (SEQ_LT(sack.start, th->th_ack)) 2470 continue; 2471 } 2472 if (SEQ_GT(sack.end, tp->snd_max)) 2473 continue; 2474 if (tp->snd_holes == NULL) { /* first hole */ 2475 tp->snd_holes = (struct sackhole *) 2476 pool_get(&sackhl_pool, PR_NOWAIT); 2477 if (tp->snd_holes == NULL) { 2478 /* ENOBUFS, so ignore SACKed block for now */ 2479 goto dropped; 2480 } 2481 cur = tp->snd_holes; 2482 cur->start = th->th_ack; 2483 cur->end = sack.start; 2484 cur->rxmit = cur->start; 2485 cur->next = NULL; 2486 tp->snd_numholes = 1; 2487 tp->rcv_lastsack = sack.end; 2488 /* 2489 * dups is at least one. If more data has been 2490 * SACKed, it can be greater than one. 2491 */ 2492 cur->dups = min(tcprexmtthresh, 2493 ((sack.end - cur->end)/tp->t_maxseg)); 2494 if (cur->dups < 1) 2495 cur->dups = 1; 2496 continue; /* with next sack block */ 2497 } 2498 /* Go thru list of holes: p = previous, cur = current */ 2499 p = cur = tp->snd_holes; 2500 while (cur) { 2501 if (SEQ_LEQ(sack.end, cur->start)) 2502 /* SACKs data before the current hole */ 2503 break; /* no use going through more holes */ 2504 if (SEQ_GEQ(sack.start, cur->end)) { 2505 /* SACKs data beyond the current hole */ 2506 cur->dups++; 2507 if (((sack.end - cur->end)/tp->t_maxseg) >= 2508 tcprexmtthresh) 2509 cur->dups = tcprexmtthresh; 2510 p = cur; 2511 cur = cur->next; 2512 continue; 2513 } 2514 if (SEQ_LEQ(sack.start, cur->start)) { 2515 /* Data acks at least the beginning of hole */ 2516 if (SEQ_GEQ(sack.end, cur->end)) { 2517 /* Acks entire hole, so delete hole */ 2518 if (p != cur) { 2519 p->next = cur->next; 2520 pool_put(&sackhl_pool, cur); 2521 cur = p->next; 2522 } else { 2523 cur = cur->next; 2524 pool_put(&sackhl_pool, p); 2525 p = cur; 2526 tp->snd_holes = p; 2527 } 2528 tp->snd_numholes--; 2529 continue; 2530 } 2531 /* otherwise, move start of hole forward */ 2532 cur->start = sack.end; 2533 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2534 p = cur; 2535 cur = cur->next; 2536 continue; 2537 } 2538 /* move end of hole backward */ 2539 if (SEQ_GEQ(sack.end, cur->end)) { 2540 cur->end = sack.start; 2541 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2542 cur->dups++; 2543 if (((sack.end - cur->end)/tp->t_maxseg) >= 2544 tcprexmtthresh) 2545 cur->dups = tcprexmtthresh; 2546 p = cur; 2547 cur = cur->next; 2548 continue; 2549 } 2550 if (SEQ_LT(cur->start, sack.start) && 2551 SEQ_GT(cur->end, sack.end)) { 2552 /* 2553 * ACKs some data in middle of a hole; need to 2554 * split current hole 2555 */ 2556 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2557 
goto dropped; 2558 temp = (struct sackhole *) 2559 pool_get(&sackhl_pool, PR_NOWAIT); 2560 if (temp == NULL) 2561 goto dropped; /* ENOBUFS */ 2562 temp->next = cur->next; 2563 temp->start = sack.end; 2564 temp->end = cur->end; 2565 temp->dups = cur->dups; 2566 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2567 cur->end = sack.start; 2568 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2569 cur->dups++; 2570 if (((sack.end - cur->end)/tp->t_maxseg) >= 2571 tcprexmtthresh) 2572 cur->dups = tcprexmtthresh; 2573 cur->next = temp; 2574 p = temp; 2575 cur = p->next; 2576 tp->snd_numholes++; 2577 } 2578 } 2579 /* At this point, p points to the last hole on the list */ 2580 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2581 /* 2582 * Need to append new hole at end. 2583 * Last hole is p (and it's not NULL). 2584 */ 2585 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2586 goto dropped; 2587 temp = (struct sackhole *) 2588 pool_get(&sackhl_pool, PR_NOWAIT); 2589 if (temp == NULL) 2590 goto dropped; /* ENOBUFS */ 2591 temp->start = tp->rcv_lastsack; 2592 temp->end = sack.start; 2593 temp->dups = min(tcprexmtthresh, 2594 ((sack.end - sack.start)/tp->t_maxseg)); 2595 if (temp->dups < 1) 2596 temp->dups = 1; 2597 temp->rxmit = temp->start; 2598 temp->next = 0; 2599 p->next = temp; 2600 tp->rcv_lastsack = sack.end; 2601 tp->snd_numholes++; 2602 } 2603 } 2604 return; 2605 dropped: 2606 tcpstat_inc(tcps_sack_drop_opts); 2607 } 2608 2609 /* 2610 * Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if 2611 * it is completely acked; otherwise, tcp_sack_option(), called from 2612 * tcp_dooptions(), will fix up the hole. 2613 */ 2614 void 2615 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2616 { 2617 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2618 /* max because this could be an older ack just arrived */ 2619 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2620 th->th_ack : tp->snd_una; 2621 struct sackhole *cur = tp->snd_holes; 2622 struct sackhole *prev; 2623 while (cur) 2624 if (SEQ_LEQ(cur->end, lastack)) { 2625 prev = cur; 2626 cur = cur->next; 2627 pool_put(&sackhl_pool, prev); 2628 tp->snd_numholes--; 2629 } else if (SEQ_LT(cur->start, lastack)) { 2630 cur->start = lastack; 2631 if (SEQ_LT(cur->rxmit, cur->start)) 2632 cur->rxmit = cur->start; 2633 break; 2634 } else 2635 break; 2636 tp->snd_holes = cur; 2637 } 2638 } 2639 2640 /* 2641 * Delete all receiver-side SACK information. 2642 */ 2643 void 2644 tcp_clean_sackreport(struct tcpcb *tp) 2645 { 2646 int i; 2647 2648 tp->rcv_numsacks = 0; 2649 for (i = 0; i < MAX_SACK_BLKS; i++) 2650 tp->sackblks[i].start = tp->sackblks[i].end=0; 2651 2652 } 2653 2654 /* 2655 * Partial ack handling within a sack recovery episode. When a partial ack 2656 * arrives, turn off retransmission timer, deflate the window, do not clear 2657 * tp->t_dupacks. 2658 */ 2659 void 2660 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2661 { 2662 /* Turn off retx. timer (will start again next segment) */ 2663 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2664 tp->t_rtttime = 0; 2665 /* 2666 * Partial window deflation. This statement relies on the 2667 * fact that tp->snd_una has not been updated yet.
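 * th_ack - snd_una is the amount this ack newly covers; the code
 * below deflates cwnd by that amount and then adds back two maxseg,
 * so cwnd never drops under two segments during recovery.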
2668 */ 2669 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2670 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2671 tp->snd_cwnd += tp->t_maxseg; 2672 } else 2673 tp->snd_cwnd = tp->t_maxseg; 2674 tp->snd_cwnd += tp->t_maxseg; 2675 tp->t_flags |= TF_NEEDOUTPUT; 2676 } 2677 2678 /* 2679 * Pull out of band byte out of a segment so 2680 * it doesn't appear in the user's data queue. 2681 * It is still reflected in the segment length for 2682 * sequencing purposes. 2683 */ 2684 void 2685 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2686 { 2687 int cnt = off + urgent - 1; 2688 2689 while (cnt >= 0) { 2690 if (m->m_len > cnt) { 2691 char *cp = mtod(m, caddr_t) + cnt; 2692 struct tcpcb *tp = sototcpcb(so); 2693 2694 tp->t_iobc = *cp; 2695 tp->t_oobflags |= TCPOOB_HAVEDATA; 2696 memmove(cp, cp + 1, m->m_len - cnt - 1); 2697 m->m_len--; 2698 return; 2699 } 2700 cnt -= m->m_len; 2701 m = m->m_next; 2702 if (m == NULL) 2703 break; 2704 } 2705 panic("tcp_pulloutofband"); 2706 } 2707 2708 /* 2709 * Collect new round-trip time estimate 2710 * and update averages and current timeout. 2711 */ 2712 void 2713 tcp_xmit_timer(struct tcpcb *tp, int32_t rtt) 2714 { 2715 int delta, rttmin; 2716 2717 if (rtt < 0) 2718 rtt = 0; 2719 else if (rtt > TCP_RTT_MAX) 2720 rtt = TCP_RTT_MAX; 2721 2722 tcpstat_inc(tcps_rttupdated); 2723 if (tp->t_srtt != 0) { 2724 /* 2725 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2726 * after the binary point (scaled by 4), whereas 2727 * srtt is stored as fixed point with 5 bits after the 2728 * binary point (i.e., scaled by 32). The following magic 2729 * is equivalent to the smoothing algorithm in rfc793 with 2730 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2731 * point). 2732 */ 2733 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2734 (tp->t_srtt >> TCP_RTT_SHIFT); 2735 if ((tp->t_srtt += delta) <= 0) 2736 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2737 /* 2738 * We accumulate a smoothed rtt variance (actually, a 2739 * smoothed mean difference), then set the retransmit 2740 * timer to smoothed rtt + 4 times the smoothed variance. 2741 * rttvar is stored as fixed point with 4 bits after the 2742 * binary point (scaled by 16). The following is 2743 * equivalent to rfc793 smoothing with an alpha of .75 2744 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2745 * rfc793's wired-in beta. 2746 */ 2747 if (delta < 0) 2748 delta = -delta; 2749 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2750 if ((tp->t_rttvar += delta) <= 0) 2751 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2752 } else { 2753 /* 2754 * No rtt measurement yet - use the unsmoothed rtt. 2755 * Set the variance to half the rtt (so our first 2756 * retransmit happens at 3*rtt). 2757 */ 2758 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2759 tp->t_rttvar = (rtt + 1) << 2760 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2761 } 2762 tp->t_rtttime = 0; 2763 tp->t_rxtshift = 0; 2764 2765 /* 2766 * the retransmit should happen at rtt + 4 * rttvar. 2767 * Because of the way we do the smoothing, srtt and rttvar 2768 * will each average +1/2 tick of bias. When we compute 2769 * the retransmit timer, we want 1/2 tick of rounding and 2770 * 1 extra tick because of +-1/2 tick uncertainty in the 2771 * firing of the timer. The bias will give us exactly the 2772 * 1.5 tick we need. But, because the bias is 2773 * statistical, we have to test that we don't drop below 2774 * the minimum feasible timer (which is 2 ticks). 
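 * The timer value itself, TCP_REXMTVAL(), is the smoothed rtt plus
 * four times the smoothed variance, clamped below between rttmin
 * and TCPTV_REXMTMAX.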
2775 */ 2776 rttmin = min(max(tp->t_rttmin, rtt + 2 * (TCP_TIME(1) / hz)), 2777 TCPTV_REXMTMAX); 2778 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2779 2780 /* 2781 * We received an ack for a packet that wasn't retransmitted; 2782 * it is probably safe to discard any error indications we've 2783 * received recently. This isn't quite right, but close enough 2784 * for now (a route might have failed after we sent a segment, 2785 * and the return path might not be symmetrical). 2786 */ 2787 tp->t_softerror = 0; 2788 } 2789 2790 /* 2791 * Determine a reasonable value for maxseg size. 2792 * If the route is known, check route for mtu. 2793 * If none, use an mss that can be handled on the outgoing 2794 * interface without forcing IP to fragment; if bigger than 2795 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2796 * to utilize large mbufs. If no route is found, route has no mtu, 2797 * or the destination isn't local, use a default, hopefully conservative 2798 * size (usually 512 or the default IP max size, but no more than the mtu 2799 * of the interface), as we can't discover anything about intervening 2800 * gateways or networks. We also initialize the congestion/slow start 2801 * window to be a single segment if the destination isn't local. 2802 * While looking at the routing entry, we also initialize other path-dependent 2803 * parameters from pre-set or cached values in the routing entry. 2804 * 2805 * Also take into account the space needed for options that we 2806 * send regularly. Make maxseg shorter by that amount to assure 2807 * that we can send maxseg amount of data even when the options 2808 * are present. Store the upper limit of the length of options plus 2809 * data in maxopd. 2810 * 2811 * NOTE: offer == -1 indicates that the maxseg size changed due to 2812 * Path MTU discovery. 2813 */ 2814 int 2815 tcp_mss(struct tcpcb *tp, int offer) 2816 { 2817 struct rtentry *rt; 2818 struct ifnet *ifp; 2819 int mss, mssopt, mssdflt, iphlen, do_rfc3390; 2820 u_int rtmtu; 2821 2822 mss = mssopt = mssdflt = atomic_load_int(&tcp_mssdflt); 2823 2824 rt = in_pcbrtentry(tp->t_inpcb); 2825 if (rt == NULL) 2826 goto out; 2827 2828 ifp = if_get(rt->rt_ifidx); 2829 if (ifp == NULL) 2830 goto out; 2831 2832 switch (tp->pf) { 2833 case AF_INET: 2834 iphlen = sizeof(struct ip); 2835 break; 2836 #ifdef INET6 2837 case AF_INET6: 2838 iphlen = sizeof(struct ip6_hdr); 2839 break; 2840 #endif 2841 default: 2842 unhandled_af(tp->pf); 2843 } 2844 2845 /* 2846 * if there's an mtu associated with the route and we support 2847 * path MTU discovery for the underlying protocol family, use it. 2848 */ 2849 rtmtu = atomic_load_int(&rt->rt_mtu); 2850 if (rtmtu) { 2851 /* 2852 * One may wish to lower MSS to take into account options, 2853 * especially security-related options. 2854 */ 2855 if (tp->pf == AF_INET6 && rtmtu < IPV6_MMTU) { 2856 /* 2857 * RFC2460 section 5, last paragraph: if path MTU is 2858 * smaller than 1280, use 1280 as packet size and 2859 * attach fragment header. 
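 * The fragment header costs another 8 bytes, hence the extra
 * sizeof(struct ip6_frag) subtracted below.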
2860 */ 2861 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2862 sizeof(struct tcphdr); 2863 } else { 2864 mss = rtmtu - iphlen - sizeof(struct tcphdr); 2865 } 2866 } else if (ifp->if_flags & IFF_LOOPBACK) { 2867 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2868 } else if (tp->pf == AF_INET) { 2869 if (ip_mtudisc) 2870 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2871 } 2872 #ifdef INET6 2873 else if (tp->pf == AF_INET6) { 2874 /* 2875 * for IPv6, path MTU discovery is always turned on, 2876 * or the node must use packet size <= 1280. 2877 */ 2878 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2879 } 2880 #endif /* INET6 */ 2881 2882 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2883 if (offer != -1) { 2884 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2885 mssopt = imax(mssopt, mssdflt); 2886 } 2887 if_put(ifp); 2888 out: 2889 /* 2890 * The current mss, t_maxseg, is initialized to the default value. 2891 * If we compute a smaller value, reduce the current mss. 2892 * If we compute a larger value, return it for use in sending 2893 * a max seg size option, but don't store it for use 2894 * unless we received an offer at least that large from peer. 2895 * 2896 * However, do not accept offers lower than the minimum of 2897 * the interface MTU and 216. 2898 */ 2899 if (offer > 0) 2900 tp->t_peermss = offer; 2901 if (tp->t_peermss) 2902 mss = imin(mss, max(tp->t_peermss, 216)); 2903 2904 /* sanity - at least max opt. space */ 2905 mss = imax(mss, 64); 2906 2907 /* 2908 * maxopd stores the maximum length of data AND options 2909 * in a segment; maxseg is the amount of data in a normal 2910 * segment. We need to store this value (maxopd) apart 2911 * from maxseg, because now every segment carries options 2912 * and thus we normally have somewhat less data in segments. 2913 */ 2914 tp->t_maxopd = mss; 2915 2916 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2917 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2918 mss -= TCPOLEN_TSTAMP_APPA; 2919 #ifdef TCP_SIGNATURE 2920 if (tp->t_flags & TF_SIGNATURE) 2921 mss -= TCPOLEN_SIGLEN; 2922 #endif 2923 2924 do_rfc3390 = atomic_load_int(&tcp_do_rfc3390); 2925 if (offer == -1) { 2926 /* mss changed due to Path MTU discovery */ 2927 tp->t_flags &= ~TF_PMTUD_PEND; 2928 tp->t_pmtud_mtu_sent = 0; 2929 tp->t_pmtud_mss_acked = 0; 2930 if (mss < tp->t_maxseg) { 2931 /* 2932 * Follow suggestion in RFC 2414 to reduce the 2933 * congestion window by the ratio of the old 2934 * segment size to the new segment size. 2935 */ 2936 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2937 mss, mss); 2938 } 2939 } else if (do_rfc3390 == 2) { 2940 /* increase initial window */ 2941 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2942 } else if (do_rfc3390) { 2943 /* increase initial window */ 2944 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2945 } else 2946 tp->snd_cwnd = mss; 2947 2948 tp->t_maxseg = mss; 2949 2950 return (offer != -1 ? 
mssopt : mss); 2951 } 2952 2953 u_int 2954 tcp_hdrsz(struct tcpcb *tp) 2955 { 2956 u_int hlen; 2957 2958 switch (tp->pf) { 2959 #ifdef INET6 2960 case AF_INET6: 2961 hlen = sizeof(struct ip6_hdr); 2962 break; 2963 #endif 2964 case AF_INET: 2965 hlen = sizeof(struct ip); 2966 break; 2967 default: 2968 hlen = 0; 2969 break; 2970 } 2971 hlen += sizeof(struct tcphdr); 2972 2973 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2974 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2975 hlen += TCPOLEN_TSTAMP_APPA; 2976 #ifdef TCP_SIGNATURE 2977 if (tp->t_flags & TF_SIGNATURE) 2978 hlen += TCPOLEN_SIGLEN; 2979 #endif 2980 return (hlen); 2981 } 2982 2983 /* 2984 * Set connection variables based on the effective MSS. 2985 * We are passed the TCPCB for the actual connection. If we 2986 * are the server, we are called by the compressed state engine 2987 * when the 3-way handshake is complete. If we are the client, 2988 * we are called when we receive the SYN,ACK from the server. 2989 * 2990 * NOTE: The t_maxseg value must be initialized in the TCPCB 2991 * before this routine is called! 2992 */ 2993 void 2994 tcp_mss_update(struct tcpcb *tp) 2995 { 2996 int mss; 2997 u_long bufsize; 2998 struct rtentry *rt; 2999 struct socket *so; 3000 3001 so = tp->t_inpcb->inp_socket; 3002 mss = tp->t_maxseg; 3003 3004 rt = in_pcbrtentry(tp->t_inpcb); 3005 if (rt == NULL) 3006 return; 3007 3008 mtx_enter(&so->so_snd.sb_mtx); 3009 bufsize = so->so_snd.sb_hiwat; 3010 if (bufsize < mss) { 3011 mtx_leave(&so->so_snd.sb_mtx); 3012 mss = bufsize; 3013 /* Update t_maxseg and t_maxopd */ 3014 tcp_mss(tp, mss); 3015 } else { 3016 bufsize = roundup(bufsize, mss); 3017 if (bufsize > sb_max) 3018 bufsize = sb_max; 3019 (void)sbreserve(so, &so->so_snd, bufsize); 3020 mtx_leave(&so->so_snd.sb_mtx); 3021 } 3022 3023 mtx_enter(&so->so_rcv.sb_mtx); 3024 bufsize = so->so_rcv.sb_hiwat; 3025 if (bufsize > mss) { 3026 bufsize = roundup(bufsize, mss); 3027 if (bufsize > sb_max) 3028 bufsize = sb_max; 3029 (void)sbreserve(so, &so->so_rcv, bufsize); 3030 } 3031 mtx_leave(&so->so_rcv.sb_mtx); 3032 } 3033 3034 /* 3035 * When a partial ack arrives, force the retransmission of the 3036 * next unacknowledged segment. Do not clear tp->t_dupacks. 3037 * By setting snd_nxt to th_ack, this forces the retransmission timer 3038 * to be started again. 3039 */ 3040 void 3041 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 3042 { 3043 /* 3044 * snd_una has not been updated and the socket send buffer 3045 * not yet drained of the acked data, so we have to leave 3046 * snd_una as it was to get the correct data offset in 3047 * tcp_output(). 3048 */ 3049 tcp_seq onxt = tp->snd_nxt; 3050 u_long ocwnd = tp->snd_cwnd; 3051 3052 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3053 tp->t_rtttime = 0; 3054 tp->snd_nxt = th->th_ack; 3055 /* 3056 * Set snd_cwnd to one segment beyond acknowledged offset 3057 * (tp->snd_una not yet updated when this function is called) 3058 */ 3059 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3060 (void)tcp_output(tp); 3061 tp->snd_cwnd = ocwnd; 3062 if (SEQ_GT(onxt, tp->snd_nxt)) 3063 tp->snd_nxt = onxt; 3064 /* 3065 * Partial window deflation. Relies on fact that tp->snd_una 3066 * not updated yet.
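 * Unlike tcp_sack_partialack() above, cwnd may deflate all the way
 * to zero here before the single maxseg is added back.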
3067 */ 3068 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3069 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3070 else 3071 tp->snd_cwnd = 0; 3072 tp->snd_cwnd += tp->t_maxseg; 3073 } 3074 3075 int 3076 tcp_mss_adv(struct mbuf *m, int af) 3077 { 3078 struct ifnet *ifp; 3079 int iphlen, mss, mssdflt; 3080 3081 mssdflt = atomic_load_int(&tcp_mssdflt); 3082 3083 if (m == NULL || (m->m_flags & M_PKTHDR) == 0) 3084 return mssdflt; 3085 3086 ifp = if_get(m->m_pkthdr.ph_ifidx); 3087 if (ifp == NULL) 3088 return mssdflt; 3089 3090 switch (af) { 3091 case AF_INET: 3092 iphlen = sizeof(struct ip); 3093 break; 3094 #ifdef INET6 3095 case AF_INET6: 3096 iphlen = sizeof(struct ip6_hdr); 3097 break; 3098 #endif 3099 default: 3100 unhandled_af(af); 3101 } 3102 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3103 if_put(ifp); 3104 3105 if (mss < mssdflt) 3106 return mssdflt; 3107 return mss; 3108 } 3109 3110 /* 3111 * TCP compressed state engine. Currently used to hold compressed 3112 * state for SYN_RECEIVED. 3113 */ 3114 3115 /* 3116 * Locks used to protect global data and struct members: 3117 * a atomic operations 3118 * N net lock 3119 * S syn_cache_mtx tcp syn cache global mutex 3120 */ 3121 3122 /* syn hash parameters */ 3123 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; /* [S] size of hash table */ 3124 int tcp_syn_cache_limit = /* [a] global entry limit */ 3125 TCP_SYN_HASH_SIZE * TCP_SYN_BUCKET_SIZE; 3126 int tcp_syn_bucket_limit = /* [a] per bucket limit */ 3127 3 * TCP_SYN_BUCKET_SIZE; 3128 int tcp_syn_use_limit = 100000; /* [S] reseed after uses */ 3129 3130 struct pool syn_cache_pool; 3131 struct syn_cache_set tcp_syn_cache[2]; /* [S] */ 3132 int tcp_syn_cache_active; /* [S] */ 3133 struct mutex syn_cache_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); 3134 3135 #define SYN_HASH(sa, sp, dp, rand) \ 3136 (((sa)->s_addr ^ (rand)[0]) * \ 3137 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3138 #ifndef INET6 3139 #define SYN_HASHALL(hash, src, dst, rand) \ 3140 do { \ 3141 hash = SYN_HASH(&satosin_const(src)->sin_addr, \ 3142 satosin_const(src)->sin_port, \ 3143 satosin_const(dst)->sin_port, (rand)); \ 3144 } while (/*CONSTCOND*/ 0) 3145 #else 3146 #define SYN_HASH6(sa, sp, dp, rand) \ 3147 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3148 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3149 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3150 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3151 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3152 3153 #define SYN_HASHALL(hash, src, dst, rand) \ 3154 do { \ 3155 switch ((src)->sa_family) { \ 3156 case AF_INET: \ 3157 hash = SYN_HASH(&satosin_const(src)->sin_addr, \ 3158 satosin_const(src)->sin_port, \ 3159 satosin_const(dst)->sin_port, (rand)); \ 3160 break; \ 3161 case AF_INET6: \ 3162 hash = SYN_HASH6(&satosin6_const(src)->sin6_addr, \ 3163 satosin6_const(src)->sin6_port, \ 3164 satosin6_const(dst)->sin6_port, (rand)); \ 3165 break; \ 3166 default: \ 3167 hash = 0; \ 3168 } \ 3169 } while (/*CONSTCOND*/0) 3170 #endif /* INET6 */ 3171 3172 void 3173 syn_cache_rm(struct syn_cache *sc) 3174 { 3175 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3176 3177 KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD)); 3178 SET(sc->sc_dynflags, SCF_DEAD); 3179 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3180 in_pcbunref(sc->sc_inplisten); 3181 sc->sc_inplisten = NULL; 3182 LIST_REMOVE(sc, sc_tpq); 3183 refcnt_rele(&sc->sc_refcnt); 3184 sc->sc_buckethead->sch_length--; 3185 if (timeout_del(&sc->sc_timer)) 3186 refcnt_rele(&sc->sc_refcnt); 3187 sc->sc_set->scs_count--; 3188 } 3189 3190 
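/*
 * Release a reference to a syn cache entry. Only the final release
 * frees the entry, together with its cached IP options and route.
 */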
void 3191 syn_cache_put(struct syn_cache *sc) 3192 { 3193 if (refcnt_rele(&sc->sc_refcnt) == 0) 3194 return; 3195 3196 /* Dealing with last reference, no lock needed. */ 3197 m_free(sc->sc_ipopts); 3198 rtfree(sc->sc_route.ro_rt); 3199 3200 pool_put(&syn_cache_pool, sc); 3201 } 3202 3203 void 3204 syn_cache_init(void) 3205 { 3206 int i; 3207 3208 /* Initialize the hash buckets. */ 3209 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3210 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3211 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3212 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3213 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3214 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3215 for (i = 0; i < tcp_syn_hash_size; i++) { 3216 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3217 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3218 } 3219 3220 /* Initialize the syn cache pool. */ 3221 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3222 0, "syncache", NULL); 3223 } 3224 3225 void 3226 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3227 { 3228 struct syn_cache_set *set; 3229 struct syn_cache_head *scp; 3230 struct syn_cache *sc2; 3231 int i; 3232 3233 NET_ASSERT_LOCKED(); 3234 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3235 3236 set = &tcp_syn_cache[tcp_syn_cache_active]; 3237 3238 /* 3239 * If there are no entries in the hash table, reinitialize 3240 * the hash secrets. To avoid useless cache swaps and 3241 * reinitialization, use it until the limit is reached. 3242 * An empty cache is also the opportunity to resize the hash. 3243 */ 3244 if (set->scs_count == 0 && set->scs_use <= 0) { 3245 set->scs_use = tcp_syn_use_limit; 3246 if (set->scs_size != tcp_syn_hash_size) { 3247 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3248 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3249 if (scp == NULL) { 3250 /* Try again next time. */ 3251 set->scs_use = 0; 3252 } else { 3253 free(set->scs_buckethead, M_SYNCACHE, 3254 set->scs_size * 3255 sizeof(struct syn_cache_head)); 3256 set->scs_buckethead = scp; 3257 set->scs_size = tcp_syn_hash_size; 3258 for (i = 0; i < tcp_syn_hash_size; i++) 3259 TAILQ_INIT(&scp[i].sch_bucket); 3260 } 3261 } 3262 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3263 tcpstat_inc(tcps_sc_seedrandom); 3264 } 3265 3266 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3267 set->scs_random); 3268 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3269 sc->sc_buckethead = scp; 3270 3271 /* 3272 * Make sure that we don't overflow the per-bucket 3273 * limit or the total cache size limit. 3274 */ 3275 if (scp->sch_length >= atomic_load_int(&tcp_syn_bucket_limit)) { 3276 tcpstat_inc(tcps_sc_bucketoverflow); 3277 /* 3278 * Someone might attack our bucket hash function. Reseed 3279 * with random as soon as the passive syn cache gets empty. 3280 */ 3281 set->scs_use = 0; 3282 /* 3283 * The bucket is full. Toss the oldest element in the 3284 * bucket. This will be the first entry in the bucket. 3285 */ 3286 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3287 #ifdef DIAGNOSTIC 3288 /* 3289 * This should never happen; we should always find an 3290 * entry in our bucket. 
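 * The bucket length was just found to be at or above the (positive)
 * bucket limit, so the bucket cannot be empty.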
3291 */ 3292 if (sc2 == NULL) 3293 panic("%s: bucketoverflow: impossible", __func__); 3294 #endif 3295 syn_cache_rm(sc2); 3296 syn_cache_put(sc2); 3297 } else if (set->scs_count >= atomic_load_int(&tcp_syn_cache_limit)) { 3298 struct syn_cache_head *scp2, *sce; 3299 3300 tcpstat_inc(tcps_sc_overflowed); 3301 /* 3302 * The cache is full. Toss the oldest entry in the 3303 * first non-empty bucket we can find. 3304 * 3305 * XXX We would really like to toss the oldest 3306 * entry in the cache, but we hope that this 3307 * condition doesn't happen very often. 3308 */ 3309 scp2 = scp; 3310 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3311 sce = &set->scs_buckethead[set->scs_size]; 3312 for (++scp2; scp2 != scp; scp2++) { 3313 if (scp2 >= sce) 3314 scp2 = &set->scs_buckethead[0]; 3315 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3316 break; 3317 } 3318 #ifdef DIAGNOSTIC 3319 /* 3320 * This should never happen; we should always find a 3321 * non-empty bucket. 3322 */ 3323 if (scp2 == scp) 3324 panic("%s: cacheoverflow: impossible", 3325 __func__); 3326 #endif 3327 } 3328 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3329 syn_cache_rm(sc2); 3330 syn_cache_put(sc2); 3331 } 3332 3333 /* 3334 * Initialize the entry's timer. We don't estimate RTT 3335 * with SYNs, so each packet starts with the default RTT 3336 * and each timer step has a fixed timeout value. 3337 */ 3338 sc->sc_rxttot = 0; 3339 sc->sc_rxtshift = 0; 3340 TCPT_RANGESET(sc->sc_rxtcur, 3341 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 3342 TCPTV_REXMTMAX); 3343 if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) 3344 refcnt_take(&sc->sc_refcnt); 3345 3346 /* Link it from tcpcb entry */ 3347 refcnt_take(&sc->sc_refcnt); 3348 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3349 3350 /* Put it into the bucket. */ 3351 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3352 scp->sch_length++; 3353 sc->sc_set = set; 3354 set->scs_count++; 3355 set->scs_use--; 3356 3357 tcpstat_inc(tcps_sc_added); 3358 3359 /* 3360 * If the active cache has exceeded its use limit and 3361 * the passive syn cache is empty, exchange their roles. 3362 */ 3363 if (set->scs_use <= 0 && 3364 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3365 tcp_syn_cache_active = !tcp_syn_cache_active; 3366 } 3367 3368 /* 3369 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3370 * If we have retransmitted an entry the maximum number of times, expire 3371 * that entry. 3372 */ 3373 void 3374 syn_cache_timer(void *arg) 3375 { 3376 struct syn_cache *sc = arg; 3377 struct inpcb *inp; 3378 struct socket *so; 3379 uint64_t now; 3380 int lastref, do_ecn = 0; 3381 3382 mtx_enter(&syn_cache_mtx); 3383 if (ISSET(sc->sc_dynflags, SCF_DEAD)) 3384 goto freeit; 3385 3386 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3387 /* Drop it -- too many retransmissions. */ 3388 goto dropit; 3389 } 3390 3391 /* 3392 * Compute the total amount of time this entry has 3393 * been on a queue. If this entry has been on longer 3394 * than the keep alive timer would allow, expire it. 3395 */ 3396 sc->sc_rxttot += sc->sc_rxtcur; 3397 if (sc->sc_rxttot >= READ_ONCE(tcptv_keep_init)) 3398 goto dropit; 3399 3400 /* Advance the timer back-off. 
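 * sc_rxtshift was checked against TCP_MAXRXTSHIFT above, so the
 * tcp_backoff[] index below stays in range.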
*/ 3401 sc->sc_rxtshift++; 3402 TCPT_RANGESET(sc->sc_rxtcur, 3403 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 3404 TCPTV_REXMTMAX); 3405 if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) 3406 refcnt_take(&sc->sc_refcnt); 3407 inp = in_pcbref(sc->sc_inplisten); 3408 if (inp == NULL) 3409 goto freeit; 3410 mtx_leave(&syn_cache_mtx); 3411 3412 NET_LOCK_SHARED(); 3413 so = in_pcbsolock_ref(inp); 3414 if (so != NULL) { 3415 now = tcp_now(); 3416 #ifdef TCP_ECN 3417 do_ecn = atomic_load_int(&tcp_do_ecn); 3418 #endif 3419 (void) syn_cache_respond(sc, NULL, now, do_ecn); 3420 tcpstat_inc(tcps_sc_retransmitted); 3421 } 3422 in_pcbsounlock_rele(inp, so); 3423 NET_UNLOCK_SHARED(); 3424 3425 in_pcbunref(inp); 3426 syn_cache_put(sc); 3427 return; 3428 3429 dropit: 3430 tcpstat_inc(tcps_sc_timed_out); 3431 syn_cache_rm(sc); 3432 /* Decrement reference of the timer and free object after remove. */ 3433 lastref = refcnt_rele(&sc->sc_refcnt); 3434 KASSERT(lastref == 0); 3435 (void)lastref; 3436 freeit: 3437 mtx_leave(&syn_cache_mtx); 3438 syn_cache_put(sc); 3439 } 3440 3441 /* 3442 * Remove the syn cache entries created by the specified tcb entry; 3443 * it makes no sense to keep them 3444 * (if there's no tcb entry, a syn cache entry will never be used) 3445 */ 3446 void 3447 syn_cache_cleanup(struct tcpcb *tp) 3448 { 3449 struct syn_cache *sc, *nsc; 3450 3451 NET_ASSERT_LOCKED(); 3452 3453 mtx_enter(&syn_cache_mtx); 3454 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) { 3455 KASSERT(sc->sc_inplisten == tp->t_inpcb); 3456 syn_cache_rm(sc); 3457 syn_cache_put(sc); 3458 } 3459 mtx_leave(&syn_cache_mtx); 3460 3461 KASSERT(LIST_EMPTY(&tp->t_sc)); 3462 } 3463 3464 /* 3465 * Find an entry in the syn cache. 3466 */ 3467 struct syn_cache * 3468 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst, 3469 struct syn_cache_head **headp, u_int rtableid) 3470 { 3471 struct syn_cache_set *sets[2]; 3472 struct syn_cache *sc; 3473 struct syn_cache_head *scp; 3474 u_int32_t hash; 3475 int i; 3476 3477 NET_ASSERT_LOCKED(); 3478 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3479 3480 /* Check the active cache first, the passive cache is likely empty. */ 3481 sets[0] = &tcp_syn_cache[tcp_syn_cache_active]; 3482 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active]; 3483 for (i = 0; i < 2; i++) { 3484 if (sets[i]->scs_count == 0) 3485 continue; 3486 SYN_HASHALL(hash, src, dst, sets[i]->scs_random); 3487 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size]; 3488 *headp = scp; 3489 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) { 3490 if (sc->sc_hash != hash) 3491 continue; 3492 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3493 !bcmp(&sc->sc_dst, dst, dst->sa_len) && 3494 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) 3495 return (sc); 3496 } 3497 } 3498 return (NULL); 3499 } 3500 3501 /* 3502 * This function gets called when we receive an ACK for a 3503 * socket in the LISTEN state. We look up the connection 3504 * in the syn cache, and if it's there, we pull it out of 3505 * the cache and turn it into a full-blown connection in 3506 * the SYN-RECEIVED state. 3507 * 3508 * The return values may not be immediately obvious, and their effects 3509 * can be subtle, so here they are: 3510 * 3511 * NULL SYN was not found in cache; caller should drop the 3512 * packet and send an RST. 3513 * 3514 * -1 We were unable to create the new connection, and are 3515 * aborting it.
An ACK,RST is being sent to the peer 3516 * (unless we got screwy sequence numbers; see below), 3517 * because the 3-way handshake has been completed. Caller 3518 * should not free the mbuf, since we may be using it. If 3519 * we are not, we will free it. 3520 * 3521 * Otherwise, the return value is a pointer to the new socket 3522 * associated with the connection. 3523 */ 3524 struct socket * 3525 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3526 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint64_t now, 3527 int do_ecn) 3528 { 3529 struct syn_cache *sc; 3530 struct syn_cache_head *scp; 3531 struct inpcb *inp, *oldinp; 3532 struct tcpcb *tp = NULL; 3533 struct mbuf *am; 3534 struct socket *oso; 3535 u_int rtableid; 3536 3537 NET_ASSERT_LOCKED(); 3538 3539 mtx_enter(&syn_cache_mtx); 3540 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3541 if (sc == NULL) { 3542 mtx_leave(&syn_cache_mtx); 3543 return (NULL); 3544 } 3545 3546 /* 3547 * Verify the sequence and ack numbers. Try getting the correct 3548 * response again. 3549 */ 3550 if ((th->th_ack != sc->sc_iss + 1) || 3551 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3552 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3553 refcnt_take(&sc->sc_refcnt); 3554 mtx_leave(&syn_cache_mtx); 3555 (void) syn_cache_respond(sc, m, now, do_ecn); 3556 syn_cache_put(sc); 3557 return ((struct socket *)(-1)); 3558 } 3559 3560 /* Remove this cache entry */ 3561 syn_cache_rm(sc); 3562 mtx_leave(&syn_cache_mtx); 3563 3564 /* 3565 * Ok, create the full blown connection, and set things up 3566 * as they would have been set up if we had created the 3567 * connection when the SYN arrived. If we can't create 3568 * the connection, abort it. 3569 */ 3570 oso = so; 3571 so = sonewconn(so, SS_ISCONNECTED, M_DONTWAIT); 3572 if (so == NULL) 3573 goto resetandabort; 3574 3575 oldinp = sotoinpcb(oso); 3576 inp = sotoinpcb(so); 3577 3578 #ifdef IPSEC 3579 /* 3580 * We need to copy the required security levels 3581 * from the old pcb. Ditto for any other 3582 * IPsec-related information. 3583 */ 3584 inp->inp_seclevel = oldinp->inp_seclevel; 3585 #endif /* IPSEC */ 3586 #ifdef INET6 3587 if (ISSET(inp->inp_flags, INP_IPV6)) { 3588 KASSERT(ISSET(oldinp->inp_flags, INP_IPV6)); 3589 3590 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3591 inp->inp_hops = oldinp->inp_hops; 3592 } else 3593 #endif 3594 { 3595 KASSERT(!ISSET(oldinp->inp_flags, INP_IPV6)); 3596 3597 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3598 inp->inp_options = ip_srcroute(m); 3599 if (inp->inp_options == NULL) { 3600 inp->inp_options = sc->sc_ipopts; 3601 sc->sc_ipopts = NULL; 3602 } 3603 } 3604 3605 /* inherit rtable from listening socket */ 3606 rtableid = sc->sc_rtableid; 3607 #if NPF > 0 3608 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3609 struct pf_divert *divert; 3610 3611 divert = pf_find_divert(m); 3612 KASSERT(divert != NULL); 3613 rtableid = divert->rdomain; 3614 } 3615 #endif 3616 in_pcbset_laddr(inp, dst, rtableid); 3617 3618 /* 3619 * Give the new socket our cached route reference. 
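 * Clearing sc_route.ro_rt afterwards keeps syn_cache_put() from
 * freeing the route we have just handed over.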
3620 */ 3621 inp->inp_route = sc->sc_route; /* struct assignment */ 3622 sc->sc_route.ro_rt = NULL; 3623 3624 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3625 if (am == NULL) 3626 goto resetandabort; 3627 am->m_len = src->sa_len; 3628 memcpy(mtod(am, caddr_t), src, src->sa_len); 3629 if (in_pcbconnect(inp, am)) { 3630 (void) m_free(am); 3631 goto resetandabort; 3632 } 3633 (void) m_free(am); 3634 3635 tp = intotcpcb(inp); 3636 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3637 if (sc->sc_request_r_scale != 15) { 3638 tp->requested_s_scale = sc->sc_requested_s_scale; 3639 tp->request_r_scale = sc->sc_request_r_scale; 3640 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3641 } 3642 if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) 3643 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3644 3645 tp->t_template = tcp_template(tp); 3646 if (tp->t_template == 0) { 3647 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3648 so = NULL; 3649 goto abort; 3650 } 3651 tp->sack_enable = ISSET(sc->sc_fixflags, SCF_SACK_PERMIT); 3652 tp->ts_modulate = sc->sc_modulate; 3653 tp->ts_recent = sc->sc_timestamp; 3654 tp->iss = sc->sc_iss; 3655 tp->irs = sc->sc_irs; 3656 tcp_sendseqinit(tp); 3657 tp->snd_last = tp->snd_una; 3658 #ifdef TCP_ECN 3659 if (ISSET(sc->sc_fixflags, SCF_ECN_PERMIT)) { 3660 tp->t_flags |= TF_ECN_PERMIT; 3661 tcpstat_inc(tcps_ecn_accepts); 3662 } 3663 #endif 3664 if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) 3665 tp->t_flags |= TF_SACK_PERMIT; 3666 #ifdef TCP_SIGNATURE 3667 if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) 3668 tp->t_flags |= TF_SIGNATURE; 3669 #endif 3670 tcp_rcvseqinit(tp); 3671 tp->t_state = TCPS_SYN_RECEIVED; 3672 tp->t_rcvtime = now; 3673 tp->t_sndtime = now; 3674 tp->t_rcvacktime = now; 3675 tp->t_sndacktime = now; 3676 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3677 tcpstat_inc(tcps_accepts); 3678 3679 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3680 if (sc->sc_peermaxseg) 3681 tcp_mss_update(tp); 3682 /* Reset initial window to 1 segment for retransmit */ 3683 if (READ_ONCE(sc->sc_rxtshift) > 0) 3684 tp->snd_cwnd = tp->t_maxseg; 3685 tp->snd_wl1 = sc->sc_irs; 3686 tp->rcv_up = sc->sc_irs + 1; 3687 3688 /* 3689 * This is what would have happened in tcp_output() when 3690 * the SYN,ACK was sent. 3691 */ 3692 tp->snd_up = tp->snd_una; 3693 tp->snd_max = tp->snd_nxt = tp->iss+1; 3694 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3695 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3696 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3697 tp->last_ack_sent = tp->rcv_nxt; 3698 3699 tcpstat_inc(tcps_sc_completed); 3700 syn_cache_put(sc); 3701 return (so); 3702 3703 resetandabort: 3704 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3705 m->m_pkthdr.ph_rtableid, now); 3706 abort: 3707 m_freem(m); 3708 if (so != NULL) 3709 soabort(so); 3710 syn_cache_put(sc); 3711 tcpstat_inc(tcps_sc_aborted); 3712 return ((struct socket *)(-1)); 3713 } 3714 3715 /* 3716 * This function is called when we get a RST for a 3717 * non-existent connection, so that we can see if the 3718 * connection is in the syn cache. If it is, zap it. 
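 * The RST is honoured only if its sequence number falls within the
 * narrow window expected during the handshake (irs through irs + 1).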
3719 */ 3720 3721 void 3722 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3723 u_int rtableid) 3724 { 3725 struct syn_cache *sc; 3726 struct syn_cache_head *scp; 3727 3728 NET_ASSERT_LOCKED(); 3729 3730 mtx_enter(&syn_cache_mtx); 3731 sc = syn_cache_lookup(src, dst, &scp, rtableid); 3732 if (sc == NULL) { 3733 mtx_leave(&syn_cache_mtx); 3734 return; 3735 } 3736 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3737 SEQ_GT(th->th_seq, sc->sc_irs + 1)) { 3738 mtx_leave(&syn_cache_mtx); 3739 return; 3740 } 3741 syn_cache_rm(sc); 3742 mtx_leave(&syn_cache_mtx); 3743 tcpstat_inc(tcps_sc_reset); 3744 syn_cache_put(sc); 3745 } 3746 3747 void 3748 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst, 3749 struct tcphdr *th, u_int rtableid) 3750 { 3751 struct syn_cache *sc; 3752 struct syn_cache_head *scp; 3753 3754 NET_ASSERT_LOCKED(); 3755 3756 mtx_enter(&syn_cache_mtx); 3757 sc = syn_cache_lookup(src, dst, &scp, rtableid); 3758 if (sc == NULL) { 3759 mtx_leave(&syn_cache_mtx); 3760 return; 3761 } 3762 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3763 if (ntohl (th->th_seq) != sc->sc_iss) { 3764 mtx_leave(&syn_cache_mtx); 3765 return; 3766 } 3767 3768 /* 3769 * If we've retransmitted 3 times and this is our second error, 3770 * we remove the entry. Otherwise, we allow it to continue on. 3771 * This prevents us from incorrectly nuking an entry during a 3772 * spurious network outage. 3773 * 3774 * See tcp_notify(). 3775 */ 3776 if (!ISSET(sc->sc_dynflags, SCF_UNREACH) || sc->sc_rxtshift < 3) { 3777 SET(sc->sc_dynflags, SCF_UNREACH); 3778 mtx_leave(&syn_cache_mtx); 3779 return; 3780 } 3781 3782 syn_cache_rm(sc); 3783 mtx_leave(&syn_cache_mtx); 3784 tcpstat_inc(tcps_sc_unreach); 3785 syn_cache_put(sc); 3786 } 3787 3788 /* 3789 * Given a LISTEN socket and an inbound SYN request, add 3790 * this to the syn cache, and send back a segment: 3791 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3792 * to the source. 3793 * 3794 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3795 * Doing so would require that we hold onto the data and deliver it 3796 * to the application. However, if we are the target of a SYN-flood 3797 * DoS attack, an attacker could send data which would eventually 3798 * consume all available buffer space if it were ACKed. By not ACKing 3799 * the data, we avoid this DoS scenario. 3800 */ 3801 3802 int 3803 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3804 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3805 struct tcp_opt_info *oi, tcp_seq *issp, uint64_t now, int do_ecn) 3806 { 3807 struct tcpcb tb, *tp; 3808 long win; 3809 struct syn_cache *sc; 3810 struct syn_cache_head *scp; 3811 struct mbuf *ipopts; 3812 3813 NET_ASSERT_LOCKED(); 3814 3815 tp = sototcpcb(so); 3816 3817 /* 3818 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3819 * 3820 * Note this check is performed in tcp_input() very early on. 3821 */ 3822 3823 /* 3824 * Initialize some local state. 3825 */ 3826 win = sbspace(so, &so->so_rcv); 3827 if (win > TCP_MAXWIN) 3828 win = TCP_MAXWIN; 3829 3830 bzero(&tb, sizeof(tb)); 3831 if (optp 3832 #ifdef TCP_SIGNATURE 3833 || (tp->t_flags & TF_SIGNATURE) 3834 #endif 3835 ) { 3836 tb.pf = tp->pf; 3837 tb.sack_enable = tp->sack_enable; 3838 tb.t_flags = atomic_load_int(&tcp_do_rfc1323) ? 
3839 (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3840 #ifdef TCP_SIGNATURE 3841 if (tp->t_flags & TF_SIGNATURE) 3842 tb.t_flags |= TF_SIGNATURE; 3843 #endif 3844 tb.t_state = TCPS_LISTEN; 3845 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3846 sotoinpcb(so)->inp_rtableid, now)) 3847 return (-1); 3848 } 3849 3850 switch (src->sa_family) { 3851 case AF_INET: 3852 /* 3853 * Remember the IP options, if any. 3854 */ 3855 ipopts = ip_srcroute(m); 3856 break; 3857 default: 3858 ipopts = NULL; 3859 } 3860 3861 /* 3862 * See if we already have an entry for this connection. 3863 * If we do, resend the SYN,ACK. We do not count this 3864 * as a retransmission (XXX though maybe we should). 3865 */ 3866 mtx_enter(&syn_cache_mtx); 3867 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3868 if (sc != NULL) { 3869 refcnt_take(&sc->sc_refcnt); 3870 mtx_leave(&syn_cache_mtx); 3871 tcpstat_inc(tcps_sc_dupesyn); 3872 if (ipopts) { 3873 /* 3874 * If we were remembering a previous source route, 3875 * forget it and use the new one we've been given. 3876 */ 3877 m_free(sc->sc_ipopts); 3878 sc->sc_ipopts = ipopts; 3879 } 3880 sc->sc_timestamp = tb.ts_recent; 3881 if (syn_cache_respond(sc, m, now, do_ecn) == 0) { 3882 tcpstat_inc(tcps_sndacks); 3883 tcpstat_inc(tcps_sndtotal); 3884 } 3885 syn_cache_put(sc); 3886 return (0); 3887 } 3888 mtx_leave(&syn_cache_mtx); 3889 3890 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3891 if (sc == NULL) { 3892 m_free(ipopts); 3893 return (-1); 3894 } 3895 refcnt_init_trace(&sc->sc_refcnt, DT_REFCNT_IDX_SYNCACHE); 3896 timeout_set_flags(&sc->sc_timer, syn_cache_timer, sc, 3897 KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE); 3898 3899 /* 3900 * Fill in the cache, and put the necessary IP and TCP 3901 * options into the reply. 3902 */ 3903 memcpy(&sc->sc_src, src, src->sa_len); 3904 memcpy(&sc->sc_dst, dst, dst->sa_len); 3905 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3906 sc->sc_ipopts = ipopts; 3907 sc->sc_irs = th->th_seq; 3908 3909 sc->sc_iss = issp ? *issp : arc4random(); 3910 sc->sc_peermaxseg = oi->maxseg; 3911 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3912 sc->sc_win = win; 3913 sc->sc_timestamp = tb.ts_recent; 3914 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3915 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3916 SET(sc->sc_fixflags, SCF_TIMESTAMP); 3917 sc->sc_modulate = arc4random(); 3918 } 3919 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3920 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3921 sc->sc_requested_s_scale = tb.requested_s_scale; 3922 sc->sc_request_r_scale = 0; 3923 /* 3924 * Pick the smallest possible scaling factor that 3925 * will still allow us to scale up to sb_max. 3926 * 3927 * We do this because there are broken firewalls that 3928 * will corrupt the window scale option, leading to 3929 * the other endpoint believing that our advertised 3930 * window is unscaled. At scale factors larger than 3931 * 5 the unscaled window will drop below 1500 bytes, 3932 * leading to serious problems when traversing these 3933 * broken firewalls. 3934 * 3935 * With the default sbmax of 256K, a scale factor 3936 * of 3 will be chosen by this algorithm. Those who 3937 * choose a larger sbmax should watch out 3938 * for the compatibility problems mentioned above. 3939 * 3940 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3941 * or <SYN,ACK>) segment itself is never scaled. 
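 * (With sb_max at 256K, 65535 << 2 falls just short of 256K, so the
 * loop below settles on a shift of 3, matching the note above.)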
#ifdef TCP_ECN
	/*
	 * if both ECE and CWR flag bits are set, peer is ECN capable.
	 */
	if (do_ecn && (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		SET(sc->sc_fixflags, SCF_ECN_PERMIT);
#endif
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		SET(sc->sc_fixflags, SCF_SACK_PERMIT);
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		SET(sc->sc_fixflags, SCF_SIGNATURE);
#endif
	sc->sc_inplisten = in_pcbref(tp->t_inpcb);
	if (syn_cache_respond(sc, m, now, do_ecn) == 0) {
		mtx_enter(&syn_cache_mtx);
		/*
		 * XXXSMP Currently exclusive netlock prevents another insert
		 * after our syn_cache_lookup() and before syn_cache_insert().
		 * Double insert should be handled and not rely on netlock.
		 */
		syn_cache_insert(sc, tp);
		mtx_leave(&syn_cache_mtx);
		tcpstat_inc(tcps_sndacks);
		tcpstat_inc(tcps_sndtotal);
	} else {
		in_pcbunref(sc->sc_inplisten);
		syn_cache_put(sc);
		tcpstat_inc(tcps_sc_dropped);
	}

	return (0);
}
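
/*
 * Reference-count sketch for syn_cache_add() above, derived from the
 * code rather than from the original comments:
 *
 *	duplicate SYN:	refcnt_take() on the existing entry, balanced
 *			by syn_cache_put() once the SYN,ACK is resent
 *	new entry:	pool_get() plus refcnt_init_trace() create the
 *			entry holding one reference; on a respond
 *			failure that reference is dropped via
 *			syn_cache_put() and the listener reference from
 *			in_pcbref() is released with in_pcbunref()
 *
 * On success the entry is handed to syn_cache_insert() under
 * syn_cache_mtx and stays alive until completion, timeout, RST or
 * unreachable processing releases it.
 */
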
int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m, uint64_t now,
    int do_ecn)
{
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	NET_ASSERT_LOCKED();

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	default:
		m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
	    (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT) ? 4 : 0) +
#ifdef TCP_SIGNATURE
	    (ISSET(sc->sc_fixflags, SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    (ISSET(sc->sc_fixflags, SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (do_ecn && ISSET(sc->sc_fixflags, SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(now + sc->sc_modulate);
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			tdb_unref(tdb);
			return (EINVAL);
		}
		tdb_unref(tdb);
		optp += 16;	/* skip over the 16-byte MD5 digest */

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	mtx_enter(&syn_cache_mtx);
	inp = in_pcbref(sc->sc_inplisten);
	mtx_leave(&syn_cache_mtx);

	/*
	 * Fill in some straggling IP bits.  Note that ip_len is stored
	 * in network byte order, as ip_output() expects.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;

		error = ip_output(m, sc->sc_ipopts, &sc->sc_route,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL,
		    inp ? &inp->inp_seclevel : NULL, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		/* ip6_plen will be updated in ip6_output() */
		ip6->ip6_hlim = in6_selecthlim(inp);
		/* leave flowlabel = 0, it is legal and requires no state mgmt */

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route, 0,
		    NULL, inp ? &inp->inp_seclevel : NULL);
		break;
#endif
	}
	in_pcbunref(inp);
	return (error);
}
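
/*
 * Error summary for syn_cache_respond(), derived from the code above
 * for quick reference: EAFNOSUPPORT for an unknown address family,
 * ENOBUFS when no mbuf or cluster can be allocated, EPERM when the
 * required TCP signature TDB is missing, EINVAL when signing fails,
 * otherwise whatever ip_output() or ip6_output() returns.
 */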