/*	$OpenBSD: tcp_input.c,v 1.325 2016/07/20 09:15:28 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
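 *
 * Example of the recovery-phase check: suppose cwnd is cut while
 * snd_max == 1000.  snd_last is then set to 1000, and as long as
 * SEQ_LT(snd_una, 1000) further congestion signals leave cwnd
 * alone.  Once the ACK for 1000 arrives, snd_last resumes tracking
 * snd_una and a new reduction is permitted.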
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
	if_put(ifp); \
} while (0)

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
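	 *
	 * Example: a queued segment q starts at seq 120 with length 50
	 * and this segment covers [100, 160).  Then i = (100 + 60) - 120
	 * = 40: the first 40 bytes of q are duplicates, so q is advanced
	 * to seq 160 and shortened to 10 bytes.  Had i been >= q's
	 * length, q would have been dequeued and freed entirely.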
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about an unspecified IPv6 source address.
		 * As we use all-zero to indicate an unbound/unconnected pcb,
		 * an unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		tcpstat.tcps_inswcsum++;
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.	XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
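		 *
		 * That appendix A layout is the 12-byte block
		 *	<NOP><NOP><kind=8><len=10><TSval:4><TSecr:4>
		 * so the first 32-bit word of the options equals
		 * TCPOPT_TSTAMP_HDR, TSval sits at optp + 4 and TSecr
		 * at optp + 8, which is exactly what is read below.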
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		int	inpl_reverse = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_reverse = 1;
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == NULL) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}
	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
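	/*
	 * A segment with SYN set never carries a scaled window (RFC 1323),
	 * so snd_scale is applied only to non-SYN segments.  With the
	 * maximum shift of 14 the scaled window can approach 1GB, hence
	 * the 32-bit tiwin rather than the 16-bit header field.
	 */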
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send an RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated addresses are forbidden, we do
				 * not accept SYNs to a deprecated interface
				 * address, to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept the SYN, we send a TCP
				 * RST, with the deprecated source address
				 * (instead of dropping it).  We compromise
				 * here because it is much better for the peer
				 * to receive an RST, and the RST will be the
				 * final packet of the exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping a SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says
				 * the following:
				 * 1. use of a deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into the (2b) case as we have
				 * no choice in our source address selection -
				 * we must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1) {
					tcpstat.tcps_dropsyn++;
					goto drop;
				}
				return;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat.tcps_rcvnosec++;
		goto drop;
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
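	 *
	 * In other words, the fast path below handles exactly two shapes
	 * of segment on an ESTABLISHED connection: a pure ACK advancing
	 * snd_una (bulk sender side) and a pure in-order data segment
	 * with an unchanged ACK (bulk receiver side).  Everything else -
	 * control flags, window changes, out-of-order data, old
	 * timestamps - falls through to the full processing path.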
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(&so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

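		/*
		 * The else branch above is the simultaneous-open case:
		 * the peer's SYN arrived before our SYN was ACKed, so we
		 * move to SYN_RECEIVED and complete the handshake through
		 * the TCPS_SYN_RECEIVED ACK processing below.
		 */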
#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEG_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than tp->ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = tlen;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * Cf fix from Braden, see Stevens p. 870
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
		    ((tiflags & (TH_SYN|TH_FIN)) != 0)))
			tp->ts_recent = opti.ts_val;
		else
			tp->ts_recent = 0;
		tp->ts_recent_age = tcp_now;
	}

	/*
	 * If the RST bit is set examine the state:
	 *	SYN_RECEIVED STATE:
	 *	    If passive open, return to LISTEN state.
	 *	    If active open, inform user that connection was refused.
	 *	ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
	 *	    Inform user that connection was reset, and close tcb.
	 *	CLOSING, LAST_ACK, TIME_WAIT STATES:
	 *	    Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat.tcps_drops++;
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
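	 *
	 * Answering with a rate-limited ACK that reflects our state,
	 * rather than an RST, means a blindly spoofed in-window SYN
	 * cannot kill the connection, while a peer that genuinely
	 * restarted is expected to respond to that ACK with an
	 * acceptable RST.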
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat.tcps_connects++;
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat.tcps_cwr_ecn++;
				}
			}
			tcpstat.tcps_ecn_rcvece++;
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat.tcps_ecn_rcvcwr++;
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat.tcps_rcvacktooold++;
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
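			/*
			 * From here on the segment is a candidate duplicate
			 * ACK.  The classic fast-retransmit rule applies:
			 * the tcprexmtthresh'th (3rd) duplicate in a row
			 * triggers a retransmission of the missing segment
			 * and halves the window; each further duplicate
			 * inflates cwnd by one maxseg, since every dup ACK
			 * means another packet has left the network.
			 */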
			if (tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#ifdef TCP_SACK
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat.tcps_cwr_frecovery++;
						tcpstat.tcps_sack_recovery_episode++;
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat.tcps_cwr_frecovery++;
					tcpstat.tcps_sndrexmitfast++;
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						tp->t_flags |= TF_NEEDOUTPUT;
#else
					tp->snd_cwnd += tp->t_maxseg;
					tp->t_flags |= TF_NEEDOUTPUT;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
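		 *
		 * Karn's rule: never take an RTT sample from a segment
		 * that was retransmitted, since the ACK could be for
		 * either transmission.  Timestamps side-step the
		 * ambiguity because the echoed value identifies the
		 * transmission the peer actually saw, so the timestamp
		 * branch may sample on every ACK, while the t_rtttime
		 * branch samples only when the timed sequence number was
		 * not retransmitted.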
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(&so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
			    tp->snd_fack) + tp->retran_data;
		}
#endif

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.
2019 * This can happen independent of advancing the URG pointer, 2020 * but if two URG's are pending at once, some out-of-band 2021 * data may creep in... ick. 2022 */ 2023 if (th->th_urp <= (u_int16_t) tlen && 2024 (so->so_options & SO_OOBINLINE) == 0) 2025 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2026 } else 2027 /* 2028 * If no out of band data is expected, 2029 * pull receive urgent pointer along 2030 * with the receive window. 2031 */ 2032 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2033 tp->rcv_up = tp->rcv_nxt; 2034 dodata: /* XXX */ 2035 2036 /* 2037 * Process the segment text, merging it into the TCP sequencing queue, 2038 * and arranging for acknowledgment of receipt if necessary. 2039 * This process logically involves adjusting tp->rcv_wnd as data 2040 * is presented to the user (this happens in tcp_usrreq.c, 2041 * case PRU_RCVD). If a FIN has already been received on this 2042 * connection then we just ignore the text. 2043 */ 2044 if ((tlen || (tiflags & TH_FIN)) && 2045 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2046 #ifdef TCP_SACK 2047 tcp_seq laststart = th->th_seq; 2048 tcp_seq lastend = th->th_seq + tlen; 2049 #endif 2050 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2051 tp->t_state == TCPS_ESTABLISHED) { 2052 TCP_SETUP_ACK(tp, tiflags, m); 2053 tp->rcv_nxt += tlen; 2054 tiflags = th->th_flags & TH_FIN; 2055 tcpstat.tcps_rcvpack++; 2056 tcpstat.tcps_rcvbyte += tlen; 2057 ND6_HINT(tp); 2058 if (so->so_state & SS_CANTRCVMORE) 2059 m_freem(m); 2060 else { 2061 m_adj(m, hdroptlen); 2062 sbappendstream(&so->so_rcv, m); 2063 } 2064 tp->t_flags |= TF_BLOCKOUTPUT; 2065 sorwakeup(so); 2066 tp->t_flags &= ~TF_BLOCKOUTPUT; 2067 } else { 2068 m_adj(m, hdroptlen); 2069 tiflags = tcp_reass(tp, th, m, &tlen); 2070 tp->t_flags |= TF_ACKNOW; 2071 } 2072 #ifdef TCP_SACK 2073 if (tp->sack_enable) 2074 tcp_update_sack_list(tp, laststart, lastend); 2075 #endif 2076 2077 /* 2078 * variable len never referenced again in modern BSD, 2079 * so why bother computing it ?? 2080 */ 2081 #if 0 2082 /* 2083 * Note the amount of data that peer has sent into 2084 * our window, in order to estimate the sender's 2085 * buffer size. 2086 */ 2087 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2088 #endif /* 0 */ 2089 } else { 2090 m_freem(m); 2091 tiflags &= ~TH_FIN; 2092 } 2093 2094 /* 2095 * If FIN is received ACK the FIN and let the user know 2096 * that the connection is closing. Ignore a FIN received before 2097 * the connection is fully established. 2098 */ 2099 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2100 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2101 socantrcvmore(so); 2102 tp->t_flags |= TF_ACKNOW; 2103 tp->rcv_nxt++; 2104 } 2105 switch (tp->t_state) { 2106 2107 /* 2108 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2109 */ 2110 case TCPS_ESTABLISHED: 2111 tp->t_state = TCPS_CLOSE_WAIT; 2112 break; 2113 2114 /* 2115 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2116 * enter the CLOSING state. 2117 */ 2118 case TCPS_FIN_WAIT_1: 2119 tp->t_state = TCPS_CLOSING; 2120 break; 2121 2122 /* 2123 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2124 * starting the time-wait timer, turning off the other 2125 * standard timers. 2126 */ 2127 case TCPS_FIN_WAIT_2: 2128 tp->t_state = TCPS_TIME_WAIT; 2129 tcp_canceltimers(tp); 2130 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2131 soisdisconnected(so); 2132 break; 2133 2134 /* 2135 * In TIME_WAIT state restart the 2 MSL time_wait timer. 
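 * Each retransmitted FIN from the peer restarts the full 2*MSL wait,
 * so the connection block is only reclaimed once the peer has been
 * silent for that long.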
2136 */ 2137 case TCPS_TIME_WAIT: 2138 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2139 break; 2140 } 2141 } 2142 if (so->so_options & SO_DEBUG) { 2143 switch (tp->pf) { 2144 #ifdef INET6 2145 case PF_INET6: 2146 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2147 0, tlen); 2148 break; 2149 #endif /* INET6 */ 2150 case PF_INET: 2151 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2152 0, tlen); 2153 break; 2154 } 2155 } 2156 2157 /* 2158 * Return any desired output. 2159 */ 2160 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2161 (void) tcp_output(tp); 2162 return; 2163 2164 badsyn: 2165 /* 2166 * Received a bad SYN. Increment counters and dropwithreset. 2167 */ 2168 tcpstat.tcps_badsyn++; 2169 tp = NULL; 2170 goto dropwithreset; 2171 2172 dropafterack_ratelim: 2173 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2174 tcp_ackdrop_ppslim) == 0) { 2175 /* XXX stat */ 2176 goto drop; 2177 } 2178 /* ...fall into dropafterack... */ 2179 2180 dropafterack: 2181 /* 2182 * Generate an ACK dropping incoming segment if it occupies 2183 * sequence space, where the ACK reflects our state. 2184 */ 2185 if (tiflags & TH_RST) 2186 goto drop; 2187 m_freem(m); 2188 tp->t_flags |= TF_ACKNOW; 2189 (void) tcp_output(tp); 2190 return; 2191 2192 dropwithreset_ratelim: 2193 /* 2194 * We may want to rate-limit RSTs in certain situations, 2195 * particularly if we are sending an RST in response to 2196 * an attempt to connect to or otherwise communicate with 2197 * a port for which we have no socket. 2198 */ 2199 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2200 tcp_rst_ppslim) == 0) { 2201 /* XXX stat */ 2202 goto drop; 2203 } 2204 /* ...fall into dropwithreset... */ 2205 2206 dropwithreset: 2207 /* 2208 * Generate a RST, dropping incoming segment. 2209 * Make ACK acceptable to originator of segment. 2210 * Don't bother to respond to RST. 2211 */ 2212 if (tiflags & TH_RST) 2213 goto drop; 2214 if (tiflags & TH_ACK) { 2215 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2216 TH_RST, m->m_pkthdr.ph_rtableid); 2217 } else { 2218 if (tiflags & TH_SYN) 2219 tlen++; 2220 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2221 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2222 } 2223 m_freem(m); 2224 return; 2225 2226 drop: 2227 /* 2228 * Drop space held by incoming segment and return. 
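 * Unlike dropafterack and dropwithreset above, nothing is sent in
 * response here; the segment is freed and, with SO_DEBUG set, traced.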
2229 */ 2230 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2231 switch (tp->pf) { 2232 #ifdef INET6 2233 case PF_INET6: 2234 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2235 0, tlen); 2236 break; 2237 #endif /* INET6 */ 2238 case PF_INET: 2239 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2240 0, tlen); 2241 break; 2242 } 2243 } 2244 2245 m_freem(m); 2246 return; 2247 } 2248 2249 int 2250 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2251 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2252 u_int rtableid) 2253 { 2254 u_int16_t mss = 0; 2255 int opt, optlen; 2256 #ifdef TCP_SIGNATURE 2257 caddr_t sigp = NULL; 2258 struct tdb *tdb = NULL; 2259 #endif /* TCP_SIGNATURE */ 2260 2261 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2262 opt = cp[0]; 2263 if (opt == TCPOPT_EOL) 2264 break; 2265 if (opt == TCPOPT_NOP) 2266 optlen = 1; 2267 else { 2268 if (cnt < 2) 2269 break; 2270 optlen = cp[1]; 2271 if (optlen < 2 || optlen > cnt) 2272 break; 2273 } 2274 switch (opt) { 2275 2276 default: 2277 continue; 2278 2279 case TCPOPT_MAXSEG: 2280 if (optlen != TCPOLEN_MAXSEG) 2281 continue; 2282 if (!(th->th_flags & TH_SYN)) 2283 continue; 2284 if (TCPS_HAVERCVDSYN(tp->t_state)) 2285 continue; 2286 memcpy(&mss, cp + 2, sizeof(mss)); 2287 mss = ntohs(mss); 2288 oi->maxseg = mss; 2289 break; 2290 2291 case TCPOPT_WINDOW: 2292 if (optlen != TCPOLEN_WINDOW) 2293 continue; 2294 if (!(th->th_flags & TH_SYN)) 2295 continue; 2296 if (TCPS_HAVERCVDSYN(tp->t_state)) 2297 continue; 2298 tp->t_flags |= TF_RCVD_SCALE; 2299 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2300 break; 2301 2302 case TCPOPT_TIMESTAMP: 2303 if (optlen != TCPOLEN_TIMESTAMP) 2304 continue; 2305 oi->ts_present = 1; 2306 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2307 oi->ts_val = ntohl(oi->ts_val); 2308 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2309 oi->ts_ecr = ntohl(oi->ts_ecr); 2310 2311 if (!(th->th_flags & TH_SYN)) 2312 continue; 2313 if (TCPS_HAVERCVDSYN(tp->t_state)) 2314 continue; 2315 /* 2316 * A timestamp received in a SYN makes 2317 * it ok to send timestamp requests and replies. 
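 * (RFC 1323: the option is negotiated only in the SYN exchange.
 * TF_RCVD_TSTMP recorded here, together with our own TF_REQ_TSTMP,
 * is what later lets tcp_output() put TCPOPT_TIMESTAMP in every
 * segment; see the same test in tcp_mss() and tcp_hdrsz() below.)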
2318 */ 2319 tp->t_flags |= TF_RCVD_TSTMP; 2320 tp->ts_recent = oi->ts_val; 2321 tp->ts_recent_age = tcp_now; 2322 break; 2323 2324 #ifdef TCP_SACK 2325 case TCPOPT_SACK_PERMITTED: 2326 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2327 continue; 2328 if (!(th->th_flags & TH_SYN)) 2329 continue; 2330 if (TCPS_HAVERCVDSYN(tp->t_state)) 2331 continue; 2332 /* MUST only be set on SYN */ 2333 tp->t_flags |= TF_SACK_PERMIT; 2334 break; 2335 case TCPOPT_SACK: 2336 tcp_sack_option(tp, th, cp, optlen); 2337 break; 2338 #endif 2339 #ifdef TCP_SIGNATURE 2340 case TCPOPT_SIGNATURE: 2341 if (optlen != TCPOLEN_SIGNATURE) 2342 continue; 2343 2344 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2345 return (-1); 2346 2347 sigp = cp + 2; 2348 break; 2349 #endif /* TCP_SIGNATURE */ 2350 } 2351 } 2352 2353 #ifdef TCP_SIGNATURE 2354 if (tp->t_flags & TF_SIGNATURE) { 2355 union sockaddr_union src, dst; 2356 2357 memset(&src, 0, sizeof(union sockaddr_union)); 2358 memset(&dst, 0, sizeof(union sockaddr_union)); 2359 2360 switch (tp->pf) { 2361 case 0: 2362 case AF_INET: 2363 src.sa.sa_len = sizeof(struct sockaddr_in); 2364 src.sa.sa_family = AF_INET; 2365 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2366 dst.sa.sa_len = sizeof(struct sockaddr_in); 2367 dst.sa.sa_family = AF_INET; 2368 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2369 break; 2370 #ifdef INET6 2371 case AF_INET6: 2372 src.sa.sa_len = sizeof(struct sockaddr_in6); 2373 src.sa.sa_family = AF_INET6; 2374 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2375 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2376 dst.sa.sa_family = AF_INET6; 2377 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2378 break; 2379 #endif /* INET6 */ 2380 } 2381 2382 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2383 0, &src, &dst, IPPROTO_TCP); 2384 2385 /* 2386 * We don't have an SA for this peer, so we turn off 2387 * TF_SIGNATURE on the listen socket 2388 */ 2389 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2390 tp->t_flags &= ~TF_SIGNATURE; 2391 2392 } 2393 2394 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2395 tcpstat.tcps_rcvbadsig++; 2396 return (-1); 2397 } 2398 2399 if (sigp) { 2400 char sig[16]; 2401 2402 if (tdb == NULL) { 2403 tcpstat.tcps_rcvbadsig++; 2404 return (-1); 2405 } 2406 2407 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2408 return (-1); 2409 2410 if (timingsafe_bcmp(sig, sigp, 16)) { 2411 tcpstat.tcps_rcvbadsig++; 2412 return (-1); 2413 } 2414 2415 tcpstat.tcps_rcvgoodsig++; 2416 } 2417 #endif /* TCP_SIGNATURE */ 2418 2419 return (0); 2420 } 2421 2422 #if defined(TCP_SACK) 2423 u_long 2424 tcp_seq_subtract(u_long a, u_long b) 2425 { 2426 return ((long)(a - b)); 2427 } 2428 #endif 2429 2430 2431 #ifdef TCP_SACK 2432 /* 2433 * This function is called upon receipt of new valid data (while not in header 2434 * prediction mode), and it updates the ordered list of sacks. 2435 */ 2436 void 2437 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2438 tcp_seq rcv_lastend) 2439 { 2440 /* 2441 * First reported block MUST be the most recent one. Subsequent 2442 * blocks SHOULD be in the order in which they arrived at the 2443 * receiver. These two conditions make the implementation fully 2444 * compliant with RFC 2018. 
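 * A worked example, assuming rcv_nxt = 100: with blocks [150,200)
 * and [250,300) already queued, an out-of-order segment covering
 * [200,250) overlaps both, so the code below coalesces all three
 * and the next ACK reports the single block [150,300) first.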
2445 */ 2446 int i, j = 0, count = 0, lastpos = -1; 2447 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2448 2449 /* First clean up current list of sacks */ 2450 for (i = 0; i < tp->rcv_numsacks; i++) { 2451 sack = tp->sackblks[i]; 2452 if (sack.start == 0 && sack.end == 0) { 2453 count++; /* count = number of blocks to be discarded */ 2454 continue; 2455 } 2456 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2457 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2458 count++; 2459 } else { 2460 temp[j].start = tp->sackblks[i].start; 2461 temp[j++].end = tp->sackblks[i].end; 2462 } 2463 } 2464 tp->rcv_numsacks -= count; 2465 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2466 tcp_clean_sackreport(tp); 2467 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2468 /* ==> need first sack block */ 2469 tp->sackblks[0].start = rcv_laststart; 2470 tp->sackblks[0].end = rcv_lastend; 2471 tp->rcv_numsacks = 1; 2472 } 2473 return; 2474 } 2475 /* Otherwise, sack blocks are already present. */ 2476 for (i = 0; i < tp->rcv_numsacks; i++) 2477 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2478 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2479 return; /* sack list remains unchanged */ 2480 /* 2481 * From here, segment just received should be (part of) the 1st sack. 2482 * Go through list, possibly coalescing sack block entries. 2483 */ 2484 firstsack.start = rcv_laststart; 2485 firstsack.end = rcv_lastend; 2486 for (i = 0; i < tp->rcv_numsacks; i++) { 2487 sack = tp->sackblks[i]; 2488 if (SEQ_LT(sack.end, firstsack.start) || 2489 SEQ_GT(sack.start, firstsack.end)) 2490 continue; /* no overlap */ 2491 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2492 /* 2493 * identical block; delete it here since we will 2494 * move it to the front of the list. 2495 */ 2496 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2497 lastpos = i; /* last posn with a zero entry */ 2498 continue; 2499 } 2500 if (SEQ_LEQ(sack.start, firstsack.start)) 2501 firstsack.start = sack.start; /* merge blocks */ 2502 if (SEQ_GEQ(sack.end, firstsack.end)) 2503 firstsack.end = sack.end; /* merge blocks */ 2504 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2505 lastpos = i; /* last posn with a zero entry */ 2506 } 2507 if (lastpos != -1) { /* at least one merge */ 2508 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2509 sack = tp->sackblks[i]; 2510 if (sack.start == 0 && sack.end == 0) 2511 continue; 2512 temp[j++] = sack; 2513 } 2514 tp->rcv_numsacks = j; /* including first blk (added later) */ 2515 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2516 tp->sackblks[i] = temp[i]; 2517 } else { /* no merges -- shift sacks by 1 */ 2518 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2519 tp->rcv_numsacks++; 2520 for (i = tp->rcv_numsacks-1; i > 0; i--) 2521 tp->sackblks[i] = tp->sackblks[i-1]; 2522 } 2523 tp->sackblks[0] = firstsack; 2524 return; 2525 } 2526 2527 /* 2528 * Process the TCP SACK option. tp->snd_holes is an ordered list 2529 * of holes (oldest to newest, in terms of the sequence space). 2530 */ 2531 void 2532 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2533 { 2534 int tmp_olen; 2535 u_char *tmp_cp; 2536 struct sackhole *cur, *p, *temp; 2537 2538 if (!tp->sack_enable) 2539 return; 2540 /* SACK without ACK doesn't make sense. */ 2541 if ((th->th_flags & TH_ACK) == 0) 2542 return; 2543 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
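 * An ACK below snd_una is stale and may carry SACK blocks for data
 * we have already retired; one above snd_max would ack data we never
 * sent. Either way the option cannot be trusted.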
*/ 2544 if (SEQ_LT(th->th_ack, tp->snd_una) || 2545 SEQ_GT(th->th_ack, tp->snd_max)) 2546 return; 2547 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2548 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2549 return; 2550 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2551 tmp_cp = cp + 2; 2552 tmp_olen = optlen - 2; 2553 tcpstat.tcps_sack_rcv_opts++; 2554 if (tp->snd_numholes < 0) 2555 tp->snd_numholes = 0; 2556 if (tp->t_maxseg == 0) 2557 panic("tcp_sack_option"); /* Should never happen */ 2558 while (tmp_olen > 0) { 2559 struct sackblk sack; 2560 2561 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2562 sack.start = ntohl(sack.start); 2563 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2564 sack.end = ntohl(sack.end); 2565 tmp_olen -= TCPOLEN_SACK; 2566 tmp_cp += TCPOLEN_SACK; 2567 if (SEQ_LEQ(sack.end, sack.start)) 2568 continue; /* bad SACK fields */ 2569 if (SEQ_LEQ(sack.end, tp->snd_una)) 2570 continue; /* old block */ 2571 #if defined(TCP_SACK) && defined(TCP_FACK) 2572 /* Updates snd_fack. */ 2573 if (SEQ_GT(sack.end, tp->snd_fack)) 2574 tp->snd_fack = sack.end; 2575 #endif /* TCP_FACK */ 2576 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2577 if (SEQ_LT(sack.start, th->th_ack)) 2578 continue; 2579 } 2580 if (SEQ_GT(sack.end, tp->snd_max)) 2581 continue; 2582 if (tp->snd_holes == NULL) { /* first hole */ 2583 tp->snd_holes = (struct sackhole *) 2584 pool_get(&sackhl_pool, PR_NOWAIT); 2585 if (tp->snd_holes == NULL) { 2586 /* ENOBUFS, so ignore SACKed block for now*/ 2587 goto done; 2588 } 2589 cur = tp->snd_holes; 2590 cur->start = th->th_ack; 2591 cur->end = sack.start; 2592 cur->rxmit = cur->start; 2593 cur->next = NULL; 2594 tp->snd_numholes = 1; 2595 tp->rcv_lastsack = sack.end; 2596 /* 2597 * dups is at least one. If more data has been 2598 * SACKed, it can be greater than one. 
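 * dups approximates how many duplicate ACKs the SACKed range is
 * worth, (sack.end - cur->end) / t_maxseg clamped to the range
 * [1, tcprexmtthresh], so a single SACK block covering several
 * segments can reach the fast-retransmit threshold at once.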
2599 */ 2600 cur->dups = min(tcprexmtthresh, 2601 ((sack.end - cur->end)/tp->t_maxseg)); 2602 if (cur->dups < 1) 2603 cur->dups = 1; 2604 continue; /* with next sack block */ 2605 } 2606 /* Go thru list of holes: p = previous, cur = current */ 2607 p = cur = tp->snd_holes; 2608 while (cur) { 2609 if (SEQ_LEQ(sack.end, cur->start)) 2610 /* SACKs data before the current hole */ 2611 break; /* no use going through more holes */ 2612 if (SEQ_GEQ(sack.start, cur->end)) { 2613 /* SACKs data beyond the current hole */ 2614 cur->dups++; 2615 if (((sack.end - cur->end)/tp->t_maxseg) >= 2616 tcprexmtthresh) 2617 cur->dups = tcprexmtthresh; 2618 p = cur; 2619 cur = cur->next; 2620 continue; 2621 } 2622 if (SEQ_LEQ(sack.start, cur->start)) { 2623 /* Data acks at least the beginning of hole */ 2624 #if defined(TCP_SACK) && defined(TCP_FACK) 2625 if (SEQ_GT(sack.end, cur->rxmit)) 2626 tp->retran_data -= 2627 tcp_seq_subtract(cur->rxmit, 2628 cur->start); 2629 else 2630 tp->retran_data -= 2631 tcp_seq_subtract(sack.end, 2632 cur->start); 2633 #endif /* TCP_FACK */ 2634 if (SEQ_GEQ(sack.end, cur->end)) { 2635 /* Acks entire hole, so delete hole */ 2636 if (p != cur) { 2637 p->next = cur->next; 2638 pool_put(&sackhl_pool, cur); 2639 cur = p->next; 2640 } else { 2641 cur = cur->next; 2642 pool_put(&sackhl_pool, p); 2643 p = cur; 2644 tp->snd_holes = p; 2645 } 2646 tp->snd_numholes--; 2647 continue; 2648 } 2649 /* otherwise, move start of hole forward */ 2650 cur->start = sack.end; 2651 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2652 p = cur; 2653 cur = cur->next; 2654 continue; 2655 } 2656 /* move end of hole backward */ 2657 if (SEQ_GEQ(sack.end, cur->end)) { 2658 #if defined(TCP_SACK) && defined(TCP_FACK) 2659 if (SEQ_GT(cur->rxmit, sack.start)) 2660 tp->retran_data -= 2661 tcp_seq_subtract(cur->rxmit, 2662 sack.start); 2663 #endif /* TCP_FACK */ 2664 cur->end = sack.start; 2665 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2666 cur->dups++; 2667 if (((sack.end - cur->end)/tp->t_maxseg) >= 2668 tcprexmtthresh) 2669 cur->dups = tcprexmtthresh; 2670 p = cur; 2671 cur = cur->next; 2672 continue; 2673 } 2674 if (SEQ_LT(cur->start, sack.start) && 2675 SEQ_GT(cur->end, sack.end)) { 2676 /* 2677 * ACKs some data in middle of a hole; need to 2678 * split current hole 2679 */ 2680 temp = (struct sackhole *) 2681 pool_get(&sackhl_pool, PR_NOWAIT); 2682 if (temp == NULL) 2683 goto done; /* ENOBUFS */ 2684 #if defined(TCP_SACK) && defined(TCP_FACK) 2685 if (SEQ_GT(cur->rxmit, sack.end)) 2686 tp->retran_data -= 2687 tcp_seq_subtract(sack.end, 2688 sack.start); 2689 else if (SEQ_GT(cur->rxmit, sack.start)) 2690 tp->retran_data -= 2691 tcp_seq_subtract(cur->rxmit, 2692 sack.start); 2693 #endif /* TCP_FACK */ 2694 temp->next = cur->next; 2695 temp->start = sack.end; 2696 temp->end = cur->end; 2697 temp->dups = cur->dups; 2698 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2699 cur->end = sack.start; 2700 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2701 cur->dups++; 2702 if (((sack.end - cur->end)/tp->t_maxseg) >= 2703 tcprexmtthresh) 2704 cur->dups = tcprexmtthresh; 2705 cur->next = temp; 2706 p = temp; 2707 cur = p->next; 2708 tp->snd_numholes++; 2709 } 2710 } 2711 /* At this point, p points to the last hole on the list */ 2712 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2713 /* 2714 * Need to append new hole at end. 2715 * Last hole is p (and it's not NULL). 
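 * For example, with rcv_lastsack = d, a SACK block [e,f) where
 * d < e leaves the gap [d,e) as the new hole at the end of the
 * list and advances rcv_lastsack to f.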
2716 */
2717 temp = (struct sackhole *)
2718 pool_get(&sackhl_pool, PR_NOWAIT);
2719 if (temp == NULL)
2720 goto done; /* ENOBUFS */
2721 temp->start = tp->rcv_lastsack;
2722 temp->end = sack.start;
2723 temp->dups = min(tcprexmtthresh,
2724 ((sack.end - sack.start)/tp->t_maxseg));
2725 if (temp->dups < 1)
2726 temp->dups = 1;
2727 temp->rxmit = temp->start;
2728 temp->next = 0;
2729 p->next = temp;
2730 tp->rcv_lastsack = sack.end;
2731 tp->snd_numholes++;
2732 }
2733 }
2734 done:
2735 #if defined(TCP_SACK) && defined(TCP_FACK)
2736 /*
2737 * Update retran_data and snd_awnd. Go through the list of
2738 * holes. Increment retran_data by (hole->rxmit - hole->start).
2739 */
2740 tp->retran_data = 0;
2741 cur = tp->snd_holes;
2742 while (cur) {
2743 tp->retran_data += cur->rxmit - cur->start;
2744 cur = cur->next;
2745 }
2746 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
2747 tp->retran_data;
2748 #endif /* TCP_FACK */
2749
2750 return;
2751 }
2752
2753 /*
2754 * Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if
2755 * it is completely acked; otherwise, tcp_sack_option(), called from
2756 * tcp_dooptions(), will fix up the hole.
2757 */
2758 void
2759 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
2760 {
2761 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2762 /* max because this could be an older ack just arrived */
2763 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2764 th->th_ack : tp->snd_una;
2765 struct sackhole *cur = tp->snd_holes;
2766 struct sackhole *prev;
2767 while (cur)
2768 if (SEQ_LEQ(cur->end, lastack)) {
2769 prev = cur;
2770 cur = cur->next;
2771 pool_put(&sackhl_pool, prev);
2772 tp->snd_numholes--;
2773 } else if (SEQ_LT(cur->start, lastack)) {
2774 cur->start = lastack;
2775 if (SEQ_LT(cur->rxmit, cur->start))
2776 cur->rxmit = cur->start;
2777 break;
2778 } else
2779 break;
2780 tp->snd_holes = cur;
2781 }
2782 }
2783
2784 /*
2785 * Delete all receiver-side SACK information.
2786 */
2787 void
2788 tcp_clean_sackreport(struct tcpcb *tp)
2789 {
2790 int i;
2791
2792 tp->rcv_numsacks = 0;
2793 for (i = 0; i < MAX_SACK_BLKS; i++)
2794 tp->sackblks[i].start = tp->sackblks[i].end = 0;
2795
2796 }
2797
2798 /*
2799 * Checks for a partial ack. If a partial ack arrives, turn off the
2800 * retransmission timer, deflate the window, do not clear tp->t_dupacks,
2801 * and return 1. If the ack advances at least to tp->snd_last, return 0.
2802 */
2803 int
2804 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
2805 {
2806 if (SEQ_LT(th->th_ack, tp->snd_last)) {
2807 /* Turn off retx. timer (will start again next segment) */
2808 TCP_TIMER_DISARM(tp, TCPT_REXMT);
2809 tp->t_rtttime = 0;
2810 #ifndef TCP_FACK
2811 /*
2812 * Partial window deflation. This statement relies on the
2813 * fact that tp->snd_una has not been updated yet. In FACK
2814 * hold snd_cwnd constant during fast recovery.
2815 */
2816 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
2817 tp->snd_cwnd -= th->th_ack - tp->snd_una;
2818 tp->snd_cwnd += tp->t_maxseg;
2819 } else
2820 tp->snd_cwnd = tp->t_maxseg;
2821 #endif
2822 return (1);
2823 }
2824 return (0);
2825 }
2826 #endif /* TCP_SACK */
2827
2828 /*
2829 * Pull the out-of-band byte out of a segment so
2830 * it doesn't appear in the user's data queue.
2831 * It is still reflected in the segment length for
2832 * sequencing purposes.
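 * The byte is saved in t_iobc, where PRU_RCVOOB picks it up, and
 * the remainder of the mbuf is shifted down one position so the
 * in-band data stays contiguous.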
2833 */ 2834 void 2835 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2836 { 2837 int cnt = off + urgent - 1; 2838 2839 while (cnt >= 0) { 2840 if (m->m_len > cnt) { 2841 char *cp = mtod(m, caddr_t) + cnt; 2842 struct tcpcb *tp = sototcpcb(so); 2843 2844 tp->t_iobc = *cp; 2845 tp->t_oobflags |= TCPOOB_HAVEDATA; 2846 memmove(cp, cp + 1, m->m_len - cnt - 1); 2847 m->m_len--; 2848 return; 2849 } 2850 cnt -= m->m_len; 2851 m = m->m_next; 2852 if (m == NULL) 2853 break; 2854 } 2855 panic("tcp_pulloutofband"); 2856 } 2857 2858 /* 2859 * Collect new round-trip time estimate 2860 * and update averages and current timeout. 2861 */ 2862 void 2863 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2864 { 2865 short delta; 2866 short rttmin; 2867 2868 if (rtt < 0) 2869 rtt = 0; 2870 else if (rtt > TCP_RTT_MAX) 2871 rtt = TCP_RTT_MAX; 2872 2873 tcpstat.tcps_rttupdated++; 2874 if (tp->t_srtt != 0) { 2875 /* 2876 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2877 * after the binary point (scaled by 4), whereas 2878 * srtt is stored as fixed point with 5 bits after the 2879 * binary point (i.e., scaled by 32). The following magic 2880 * is equivalent to the smoothing algorithm in rfc793 with 2881 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2882 * point). 2883 */ 2884 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2885 (tp->t_srtt >> TCP_RTT_SHIFT); 2886 if ((tp->t_srtt += delta) <= 0) 2887 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2888 /* 2889 * We accumulate a smoothed rtt variance (actually, a 2890 * smoothed mean difference), then set the retransmit 2891 * timer to smoothed rtt + 4 times the smoothed variance. 2892 * rttvar is stored as fixed point with 4 bits after the 2893 * binary point (scaled by 16). The following is 2894 * equivalent to rfc793 smoothing with an alpha of .75 2895 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2896 * rfc793's wired-in beta. 2897 */ 2898 if (delta < 0) 2899 delta = -delta; 2900 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2901 if ((tp->t_rttvar += delta) <= 0) 2902 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2903 } else { 2904 /* 2905 * No rtt measurement yet - use the unsmoothed rtt. 2906 * Set the variance to half the rtt (so our first 2907 * retransmit happens at 3*rtt). 2908 */ 2909 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2910 tp->t_rttvar = (rtt + 1) << 2911 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2912 } 2913 tp->t_rtttime = 0; 2914 tp->t_rxtshift = 0; 2915 2916 /* 2917 * the retransmit should happen at rtt + 4 * rttvar. 2918 * Because of the way we do the smoothing, srtt and rttvar 2919 * will each average +1/2 tick of bias. When we compute 2920 * the retransmit timer, we want 1/2 tick of rounding and 2921 * 1 extra tick because of +-1/2 tick uncertainty in the 2922 * firing of the timer. The bias will give us exactly the 2923 * 1.5 tick we need. But, because the bias is 2924 * statistical, we have to test that we don't drop below 2925 * the minimum feasible timer (which is 2 ticks). 2926 */ 2927 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2928 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2929 2930 /* 2931 * We received an ack for a packet that wasn't retransmitted; 2932 * it is probably safe to discard any error indications we've 2933 * received recently. This isn't quite right, but close enough 2934 * for now (a route might have failed after we sent a segment, 2935 * and the return path might not be symmetrical). 
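 * Errors that do persist are dealt with by tcp_notify(), which
 * only drops the connection after repeated retransmission failures.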
2936 */ 2937 tp->t_softerror = 0; 2938 } 2939 2940 /* 2941 * Determine a reasonable value for maxseg size. 2942 * If the route is known, check route for mtu. 2943 * If none, use an mss that can be handled on the outgoing 2944 * interface without forcing IP to fragment; if bigger than 2945 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2946 * to utilize large mbufs. If no route is found, route has no mtu, 2947 * or the destination isn't local, use a default, hopefully conservative 2948 * size (usually 512 or the default IP max size, but no more than the mtu 2949 * of the interface), as we can't discover anything about intervening 2950 * gateways or networks. We also initialize the congestion/slow start 2951 * window to be a single segment if the destination isn't local. 2952 * While looking at the routing entry, we also initialize other path-dependent 2953 * parameters from pre-set or cached values in the routing entry. 2954 * 2955 * Also take into account the space needed for options that we 2956 * send regularly. Make maxseg shorter by that amount to assure 2957 * that we can send maxseg amount of data even when the options 2958 * are present. Store the upper limit of the length of options plus 2959 * data in maxopd. 2960 * 2961 * NOTE: offer == -1 indicates that the maxseg size changed due to 2962 * Path MTU discovery. 2963 */ 2964 int 2965 tcp_mss(struct tcpcb *tp, int offer) 2966 { 2967 struct rtentry *rt; 2968 struct ifnet *ifp = NULL; 2969 int mss, mssopt; 2970 int iphlen; 2971 struct inpcb *inp; 2972 2973 inp = tp->t_inpcb; 2974 2975 mssopt = mss = tcp_mssdflt; 2976 2977 rt = in_pcbrtentry(inp); 2978 2979 if (rt == NULL) 2980 goto out; 2981 2982 ifp = if_get(rt->rt_ifidx); 2983 if (ifp == NULL) 2984 goto out; 2985 2986 switch (tp->pf) { 2987 #ifdef INET6 2988 case AF_INET6: 2989 iphlen = sizeof(struct ip6_hdr); 2990 break; 2991 #endif 2992 case AF_INET: 2993 iphlen = sizeof(struct ip); 2994 break; 2995 default: 2996 /* the family does not support path MTU discovery */ 2997 goto out; 2998 } 2999 3000 /* 3001 * if there's an mtu associated with the route and we support 3002 * path MTU discovery for the underlying protocol family, use it. 3003 */ 3004 if (rt->rt_rmx.rmx_mtu) { 3005 /* 3006 * One may wish to lower MSS to take into account options, 3007 * especially security-related options. 3008 */ 3009 if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) { 3010 /* 3011 * RFC2460 section 5, last paragraph: if path MTU is 3012 * smaller than 1280, use 1280 as packet size and 3013 * attach fragment header. 3014 */ 3015 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 3016 sizeof(struct tcphdr); 3017 } else { 3018 mss = rt->rt_rmx.rmx_mtu - iphlen - 3019 sizeof(struct tcphdr); 3020 } 3021 } else if (ifp->if_flags & IFF_LOOPBACK) { 3022 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3023 } else if (tp->pf == AF_INET) { 3024 if (ip_mtudisc) 3025 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3026 } 3027 #ifdef INET6 3028 else if (tp->pf == AF_INET6) { 3029 /* 3030 * for IPv6, path MTU discovery is always turned on, 3031 * or the node must use packet size <= 1280. 
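 * (RFC 2460 gives a node exactly those two choices: run path MTU
 * discovery, or never send packets larger than the IPv6 minimum
 * MTU of 1280 octets.)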
3032 */ 3033 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3034 } 3035 #endif /* INET6 */ 3036 3037 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3038 if (offer != -1) { 3039 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3040 mssopt = max(tcp_mssdflt, mssopt); 3041 } 3042 out: 3043 if_put(ifp); 3044 /* 3045 * The current mss, t_maxseg, is initialized to the default value. 3046 * If we compute a smaller value, reduce the current mss. 3047 * If we compute a larger value, return it for use in sending 3048 * a max seg size option, but don't store it for use 3049 * unless we received an offer at least that large from peer. 3050 * 3051 * However, do not accept offers lower than the minimum of 3052 * the interface MTU and 216. 3053 */ 3054 if (offer > 0) 3055 tp->t_peermss = offer; 3056 if (tp->t_peermss) 3057 mss = min(mss, max(tp->t_peermss, 216)); 3058 3059 /* sanity - at least max opt. space */ 3060 mss = max(mss, 64); 3061 3062 /* 3063 * maxopd stores the maximum length of data AND options 3064 * in a segment; maxseg is the amount of data in a normal 3065 * segment. We need to store this value (maxopd) apart 3066 * from maxseg, because now every segment carries options 3067 * and thus we normally have somewhat less data in segments. 3068 */ 3069 tp->t_maxopd = mss; 3070 3071 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3072 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3073 mss -= TCPOLEN_TSTAMP_APPA; 3074 #ifdef TCP_SIGNATURE 3075 if (tp->t_flags & TF_SIGNATURE) 3076 mss -= TCPOLEN_SIGLEN; 3077 #endif 3078 3079 if (offer == -1) { 3080 /* mss changed due to Path MTU discovery */ 3081 tp->t_flags &= ~TF_PMTUD_PEND; 3082 tp->t_pmtud_mtu_sent = 0; 3083 tp->t_pmtud_mss_acked = 0; 3084 if (mss < tp->t_maxseg) { 3085 /* 3086 * Follow suggestion in RFC 2414 to reduce the 3087 * congestion window by the ratio of the old 3088 * segment size to the new segment size. 3089 */ 3090 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3091 mss, mss); 3092 } 3093 } else if (tcp_do_rfc3390 == 2) { 3094 /* increase initial window */ 3095 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 3096 } else if (tcp_do_rfc3390) { 3097 /* increase initial window */ 3098 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3099 } else 3100 tp->snd_cwnd = mss; 3101 3102 tp->t_maxseg = mss; 3103 3104 return (offer != -1 ? mssopt : mss); 3105 } 3106 3107 u_int 3108 tcp_hdrsz(struct tcpcb *tp) 3109 { 3110 u_int hlen; 3111 3112 switch (tp->pf) { 3113 #ifdef INET6 3114 case AF_INET6: 3115 hlen = sizeof(struct ip6_hdr); 3116 break; 3117 #endif 3118 case AF_INET: 3119 hlen = sizeof(struct ip); 3120 break; 3121 default: 3122 hlen = 0; 3123 break; 3124 } 3125 hlen += sizeof(struct tcphdr); 3126 3127 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3128 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3129 hlen += TCPOLEN_TSTAMP_APPA; 3130 #ifdef TCP_SIGNATURE 3131 if (tp->t_flags & TF_SIGNATURE) 3132 hlen += TCPOLEN_SIGLEN; 3133 #endif 3134 return (hlen); 3135 } 3136 3137 /* 3138 * Set connection variables based on the effective MSS. 3139 * We are passed the TCPCB for the actual connection. If we 3140 * are the server, we are called by the compressed state engine 3141 * when the 3-way handshake is complete. If we are the client, 3142 * we are called when we receive the SYN,ACK from the server. 3143 * 3144 * NOTE: The t_maxseg value must be initialized in the TCPCB 3145 * before this routine is called! 
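 * (syn_cache_get() below honors this: it calls tcp_mss(), which
 * sets t_maxseg, before calling tcp_mss_update().)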
3146 */
3147 void
3148 tcp_mss_update(struct tcpcb *tp)
3149 {
3150 int mss;
3151 u_long bufsize;
3152 struct rtentry *rt;
3153 struct socket *so;
3154
3155 so = tp->t_inpcb->inp_socket;
3156 mss = tp->t_maxseg;
3157
3158 rt = in_pcbrtentry(tp->t_inpcb);
3159
3160 if (rt == NULL)
3161 return;
3162
3163 bufsize = so->so_snd.sb_hiwat;
3164 if (bufsize < mss) {
3165 mss = bufsize;
3166 /* Update t_maxseg and t_maxopd */
3167 tcp_mss(tp, mss);
3168 } else {
3169 bufsize = roundup(bufsize, mss);
3170 if (bufsize > sb_max)
3171 bufsize = sb_max;
3172 (void)sbreserve(&so->so_snd, bufsize);
3173 }
3174
3175 bufsize = so->so_rcv.sb_hiwat;
3176 if (bufsize > mss) {
3177 bufsize = roundup(bufsize, mss);
3178 if (bufsize > sb_max)
3179 bufsize = sb_max;
3180 (void)sbreserve(&so->so_rcv, bufsize);
3181 }
3182
3183 }
3184
3185 #if defined (TCP_SACK)
3186 /*
3187 * Checks for a partial ack. If a partial ack arrives, force the retransmission
3188 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3189 * 1. By setting snd_nxt to th->th_ack, this forces the retransmission timer
3190 * to be started again. If the ack advances at least to tp->snd_last, return 0.
3191 */
3192 int
3193 tcp_newreno(struct tcpcb *tp, struct tcphdr *th)
3194 {
3195 if (SEQ_LT(th->th_ack, tp->snd_last)) {
3196 /*
3197 * snd_una has not been updated and the socket send buffer
3198 * not yet drained of the acked data, so we have to leave
3199 * snd_una as it was to get the correct data offset in
3200 * tcp_output().
3201 */
3202 tcp_seq onxt = tp->snd_nxt;
3203 u_long ocwnd = tp->snd_cwnd;
3204 TCP_TIMER_DISARM(tp, TCPT_REXMT);
3205 tp->t_rtttime = 0;
3206 tp->snd_nxt = th->th_ack;
3207 /*
3208 * Set snd_cwnd to one segment beyond acknowledged offset
3209 * (tp->snd_una not yet updated when this function is called)
3210 */
3211 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3212 (void) tcp_output(tp);
3213 tp->snd_cwnd = ocwnd;
3214 if (SEQ_GT(onxt, tp->snd_nxt))
3215 tp->snd_nxt = onxt;
3216 /*
3217 * Partial window deflation. Relies on fact that tp->snd_una
3218 * not updated yet.
3219 */
3220 if (tp->snd_cwnd > th->th_ack - tp->snd_una)
3221 tp->snd_cwnd -= th->th_ack - tp->snd_una;
3222 else
3223 tp->snd_cwnd = 0;
3224 tp->snd_cwnd += tp->t_maxseg;
3225
3226 return 1;
3227 }
3228 return 0;
3229 }
3230 #endif /* TCP_SACK */
3231
3232 int
3233 tcp_mss_adv(struct mbuf *m, int af)
3234 {
3235 int mss = 0;
3236 int iphlen;
3237 struct ifnet *ifp = NULL;
3238
3239 if (m && (m->m_flags & M_PKTHDR))
3240 ifp = if_get(m->m_pkthdr.ph_ifidx);
3241
3242 switch (af) {
3243 case AF_INET:
3244 if (ifp != NULL)
3245 mss = ifp->if_mtu;
3246 iphlen = sizeof(struct ip);
3247 break;
3248 #ifdef INET6
3249 case AF_INET6:
3250 if (ifp != NULL)
3251 mss = ifp->if_mtu;
3252 iphlen = sizeof(struct ip6_hdr);
3253 break;
3254 #endif
3255 default:
3256 unhandled_af(af);
3257 }
3258 if_put(ifp);
3259 mss = mss - iphlen - sizeof(struct tcphdr);
3260 return (max(mss, tcp_mssdflt));
3261 }
3262
3263 /*
3264 * TCP compressed state engine. Currently used to hold compressed
3265 * state for SYN_RECEIVED.
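 * Rather than allocating a socket, inpcb and tcpcb for every
 * embryonic connection, a small syn_cache entry records the
 * handshake parameters until the final ACK of the 3-way handshake
 * arrives (see syn_cache_get()); this keeps the per-SYN cost of a
 * SYN flood bounded.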
3266 */ 3267 3268 /* syn hash parameters */ 3269 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; 3270 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3271 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3272 int tcp_syn_use_limit = 100000; 3273 3274 struct syn_cache_set tcp_syn_cache[2]; 3275 int tcp_syn_cache_active; 3276 3277 #define SYN_HASH(sa, sp, dp, rand) \ 3278 (((sa)->s_addr ^ (rand)[0]) * \ 3279 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3280 #ifndef INET6 3281 #define SYN_HASHALL(hash, src, dst, rand) \ 3282 do { \ 3283 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3284 satosin(src)->sin_port, \ 3285 satosin(dst)->sin_port, (rand)); \ 3286 } while (/*CONSTCOND*/ 0) 3287 #else 3288 #define SYN_HASH6(sa, sp, dp, rand) \ 3289 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3290 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3291 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3292 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3293 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3294 3295 #define SYN_HASHALL(hash, src, dst, rand) \ 3296 do { \ 3297 switch ((src)->sa_family) { \ 3298 case AF_INET: \ 3299 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3300 satosin(src)->sin_port, \ 3301 satosin(dst)->sin_port, (rand)); \ 3302 break; \ 3303 case AF_INET6: \ 3304 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3305 satosin6(src)->sin6_port, \ 3306 satosin6(dst)->sin6_port, (rand)); \ 3307 break; \ 3308 default: \ 3309 hash = 0; \ 3310 } \ 3311 } while (/*CONSTCOND*/0) 3312 #endif /* INET6 */ 3313 3314 void 3315 syn_cache_rm(struct syn_cache *sc) 3316 { 3317 sc->sc_flags |= SCF_DEAD; 3318 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3319 sc->sc_tp = NULL; 3320 LIST_REMOVE(sc, sc_tpq); 3321 sc->sc_buckethead->sch_length--; 3322 timeout_del(&sc->sc_timer); 3323 sc->sc_set->scs_count--; 3324 } 3325 3326 void 3327 syn_cache_put(struct syn_cache *sc) 3328 { 3329 if (sc->sc_ipopts) 3330 (void) m_free(sc->sc_ipopts); 3331 if (sc->sc_route4.ro_rt != NULL) { 3332 rtfree(sc->sc_route4.ro_rt); 3333 sc->sc_route4.ro_rt = NULL; 3334 } 3335 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3336 timeout_add(&sc->sc_timer, 0); 3337 } 3338 3339 struct pool syn_cache_pool; 3340 3341 /* 3342 * We don't estimate RTT with SYNs, so each packet starts with the default 3343 * RTT and each timer step has a fixed timeout value. 3344 */ 3345 #define SYN_CACHE_TIMER_ARM(sc) \ 3346 do { \ 3347 TCPT_RANGESET((sc)->sc_rxtcur, \ 3348 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3349 TCPTV_REXMTMAX); \ 3350 if (!timeout_initialized(&(sc)->sc_timer)) \ 3351 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3352 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3353 } while (/*CONSTCOND*/0) 3354 3355 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3356 3357 void 3358 syn_cache_init(void) 3359 { 3360 int i; 3361 3362 /* Initialize the hash buckets. */ 3363 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3364 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3365 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3366 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3367 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3368 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3369 for (i = 0; i < tcp_syn_hash_size; i++) { 3370 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3371 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3372 } 3373 3374 /* Initialize the syn cache pool. 
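 * Entries are allocated in syn_cache_add() and returned through
 * syn_cache_put(), which hands them to syn_cache_reaper() via a
 * zero-tick timeout once syn_cache_rm() has deleted any pending
 * retransmit timer.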
*/
3375 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
3376 "syncache", NULL);
3377 pool_setipl(&syn_cache_pool, IPL_SOFTNET);
3378 }
3379
3380 void
3381 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
3382 {
3383 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active];
3384 struct syn_cache_head *scp;
3385 struct syn_cache *sc2;
3386 int i, s;
3387
3388 s = splsoftnet();
3389
3390 /*
3391 * If there are no entries in the hash table, reinitialize
3392 * the hash secrets. To avoid useless cache swaps and
3393 * reinitialization, keep the same secrets until the use limit is reached.
3394 * An empty cache is also an opportunity to resize the hash.
3395 */
3396 if (set->scs_count == 0 && set->scs_use <= 0) {
3397 set->scs_use = tcp_syn_use_limit;
3398 if (set->scs_size != tcp_syn_hash_size) {
3399 scp = mallocarray(tcp_syn_hash_size, sizeof(struct
3400 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO);
3401 if (scp == NULL) {
3402 /* Try again next time. */
3403 set->scs_use = 0;
3404 } else {
3405 free(set->scs_buckethead, M_SYNCACHE,
3406 set->scs_size *
3407 sizeof(struct syn_cache_head));
3408 set->scs_buckethead = scp;
3409 set->scs_size = tcp_syn_hash_size;
3410 for (i = 0; i < tcp_syn_hash_size; i++)
3411 TAILQ_INIT(&scp[i].sch_bucket);
3412 }
3413 }
3414 arc4random_buf(set->scs_random, sizeof(set->scs_random));
3415 tcpstat.tcps_sc_seedrandom++;
3416 }
3417
3418 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa,
3419 set->scs_random);
3420 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size];
3421 sc->sc_buckethead = scp;
3422
3423 /*
3424 * Make sure that we don't overflow the per-bucket
3425 * limit or the total cache size limit.
3426 */
3427 if (scp->sch_length >= tcp_syn_bucket_limit) {
3428 tcpstat.tcps_sc_bucketoverflow++;
3429 /*
3430 * Someone might attack our bucket hash function. Reseed
3431 * with random as soon as the passive syn cache gets empty.
3432 */
3433 set->scs_use = 0;
3434 /*
3435 * The bucket is full. Toss the oldest element in the
3436 * bucket. This will be the first entry in the bucket.
3437 */
3438 sc2 = TAILQ_FIRST(&scp->sch_bucket);
3439 #ifdef DIAGNOSTIC
3440 /*
3441 * This should never happen; we should always find an
3442 * entry in our bucket.
3443 */
3444 if (sc2 == NULL)
3445 panic("syn_cache_insert: bucketoverflow: impossible");
3446 #endif
3447 syn_cache_rm(sc2);
3448 syn_cache_put(sc2);
3449 } else if (set->scs_count >= tcp_syn_cache_limit) {
3450 struct syn_cache_head *scp2, *sce;
3451
3452 tcpstat.tcps_sc_overflowed++;
3453 /*
3454 * The cache is full. Toss the oldest entry in the
3455 * first non-empty bucket we can find.
3456 *
3457 * XXX We would really like to toss the oldest
3458 * entry in the cache, but we hope that this
3459 * condition doesn't happen very often.
3460 */
3461 scp2 = scp;
3462 if (TAILQ_EMPTY(&scp2->sch_bucket)) {
3463 sce = &set->scs_buckethead[set->scs_size];
3464 for (++scp2; scp2 != scp; scp2++) {
3465 if (scp2 >= sce)
3466 scp2 = &set->scs_buckethead[0];
3467 if (! TAILQ_EMPTY(&scp2->sch_bucket))
3468 break;
3469 }
3470 #ifdef DIAGNOSTIC
3471 /*
3472 * This should never happen; we should always find a
3473 * non-empty bucket.
3474 */
3475 if (scp2 == scp)
3476 panic("syn_cache_insert: cacheoverflow: "
3477 "impossible");
3478 #endif
3479 }
3480 sc2 = TAILQ_FIRST(&scp2->sch_bucket);
3481 syn_cache_rm(sc2);
3482 syn_cache_put(sc2);
3483 }
3484
3485 /*
3486 * Initialize the entry's timer.
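 * SYN_CACHE_TIMER_ARM() gives the SYN,ACK retransmission its
 * exponential backoff: sc_rxtshift indexes tcp_backoff[] and is
 * advanced in syn_cache_timer() until TCP_MAXRXTSHIFT is reached.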
3487 */
3488 sc->sc_rxttot = 0;
3489 sc->sc_rxtshift = 0;
3490 SYN_CACHE_TIMER_ARM(sc);
3491
3492 /* Link it from tcpcb entry */
3493 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
3494
3495 /* Put it into the bucket. */
3496 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
3497 scp->sch_length++;
3498 sc->sc_set = set;
3499 set->scs_count++;
3500 set->scs_use--;
3501
3502 tcpstat.tcps_sc_added++;
3503
3504 /*
3505 * If the active cache has exceeded its use limit and
3506 * the passive syn cache is empty, exchange their roles.
3507 */
3508 if (set->scs_use <= 0 &&
3509 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0)
3510 tcp_syn_cache_active = !tcp_syn_cache_active;
3511
3512 splx(s);
3513 }
3514
3515 /*
3516 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
3517 * If we have retransmitted an entry the maximum number of times, expire
3518 * that entry.
3519 */
3520 void
3521 syn_cache_timer(void *arg)
3522 {
3523 struct syn_cache *sc = arg;
3524 int s;
3525
3526 s = splsoftnet();
3527 if (sc->sc_flags & SCF_DEAD) {
3528 splx(s);
3529 return;
3530 }
3531
3532 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
3533 /* Drop it -- too many retransmissions. */
3534 goto dropit;
3535 }
3536
3537 /*
3538 * Compute the total amount of time this entry has
3539 * been on a queue. If this entry has been on longer
3540 * than the keep alive timer would allow, expire it.
3541 */
3542 sc->sc_rxttot += sc->sc_rxtcur;
3543 if (sc->sc_rxttot >= tcptv_keep_init)
3544 goto dropit;
3545
3546 tcpstat.tcps_sc_retransmitted++;
3547 (void) syn_cache_respond(sc, NULL);
3548
3549 /* Advance the timer back-off. */
3550 sc->sc_rxtshift++;
3551 SYN_CACHE_TIMER_ARM(sc);
3552
3553 splx(s);
3554 return;
3555
3556 dropit:
3557 tcpstat.tcps_sc_timed_out++;
3558 syn_cache_rm(sc);
3559 syn_cache_put(sc);
3560 splx(s);
3561 }
3562
3563 void
3564 syn_cache_reaper(void *arg)
3565 {
3566 struct syn_cache *sc = arg;
3567
3568 pool_put(&syn_cache_pool, (sc));
3569 return;
3570 }
3571
3572 /*
3573 * Remove the syn cache entries created by the specified tcb entry;
3574 * it makes no sense to keep them (if there's no tcb entry,
3575 * the syn cache entries will never be used).
3576 */
3577 void
3578 syn_cache_cleanup(struct tcpcb *tp)
3579 {
3580 struct syn_cache *sc, *nsc;
3581 int s;
3582
3583 s = splsoftnet();
3584
3585 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
3586 #ifdef DIAGNOSTIC
3587 if (sc->sc_tp != tp)
3588 panic("invalid sc_tp in syn_cache_cleanup");
3589 #endif
3590 syn_cache_rm(sc);
3591 syn_cache_put(sc);
3592 }
3593 /* just for safety */
3594 LIST_INIT(&tp->t_sc);
3595
3596 splx(s);
3597 }
3598
3599 /*
3600 * Find an entry in the syn cache.
3601 */
3602 struct syn_cache *
3603 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3604 struct syn_cache_head **headp, u_int rtableid)
3605 {
3606 struct syn_cache_set *sets[2];
3607 struct syn_cache *sc;
3608 struct syn_cache_head *scp;
3609 u_int32_t hash;
3610 int i;
3611
3612 splsoftassert(IPL_SOFTNET);
3613
3614 /* Check the active cache first, the passive cache is likely empty.
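 * Both sets must still be searched: entries created before the last
 * role swap live in what is now the passive set, and each set hashes
 * with its own secrets.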
*/
3615 sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
3616 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
3617 for (i = 0; i < 2; i++) {
3618 if (sets[i]->scs_count == 0)
3619 continue;
3620 SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
3621 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
3622 *headp = scp;
3623 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
3624 if (sc->sc_hash != hash)
3625 continue;
3626 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3627 !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3628 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
3629 return (sc);
3630 }
3631 }
3632 return (NULL);
3633 }
3634
3635 /*
3636 * This function gets called when we receive an ACK for a
3637 * socket in the LISTEN state. We look up the connection
3638 * in the syn cache, and if it's there, we pull it out of
3639 * the cache and turn it into a full-blown connection in
3640 * the SYN-RECEIVED state.
3641 *
3642 * The return values may not be immediately obvious, and their effects
3643 * can be subtle, so here they are:
3644 *
3645 * NULL SYN was not found in cache; caller should drop the
3646 * packet and send an RST.
3647 *
3648 * -1 We were unable to create the new connection, and are
3649 * aborting it. An ACK,RST is being sent to the peer
3650 * (unless we got screwy sequence numbers; see below),
3651 * because the 3-way handshake has been completed. Caller
3652 * should not free the mbuf, since we may be using it. If
3653 * we are not, we will free it.
3654 *
3655 * Otherwise, the return value is a pointer to the new socket
3656 * associated with the connection.
3657 */
3658 struct socket *
3659 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3660 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m)
3661 {
3662 struct syn_cache *sc;
3663 struct syn_cache_head *scp;
3664 struct inpcb *inp, *oldinp;
3665 struct tcpcb *tp = NULL;
3666 struct mbuf *am;
3667 int s;
3668 struct socket *oso;
3669 #if NPF > 0
3670 struct pf_divert *divert = NULL;
3671 #endif
3672
3673 s = splsoftnet();
3674 if ((sc = syn_cache_lookup(src, dst, &scp,
3675 sotoinpcb(so)->inp_rtableid)) == NULL) {
3676 splx(s);
3677 return (NULL);
3678 }
3679
3680 /*
3681 * Verify the sequence and ack numbers. Try getting the correct
3682 * response again.
3683 */
3684 if ((th->th_ack != sc->sc_iss + 1) ||
3685 SEQ_LEQ(th->th_seq, sc->sc_irs) ||
3686 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
3687 (void) syn_cache_respond(sc, m);
3688 splx(s);
3689 return ((struct socket *)(-1));
3690 }
3691
3692 /* Remove this cache entry */
3693 syn_cache_rm(sc);
3694 splx(s);
3695
3696 /*
3697 * Ok, create the full blown connection, and set things up
3698 * as they would have been set up if we had created the
3699 * connection when the SYN arrived. If we can't create
3700 * the connection, abort it.
3701 */
3702 oso = so;
3703 so = sonewconn(so, SS_ISCONNECTED);
3704 if (so == NULL)
3705 goto resetandabort;
3706
3707 oldinp = sotoinpcb(oso);
3708 inp = sotoinpcb(so);
3709
3710 #ifdef IPSEC
3711 /*
3712 * We need to copy the required security levels
3713 * from the old pcb. Ditto for any other
3714 * IPsec-related information.
3715 */
3716 memcpy(inp->inp_seclevel, oldinp->inp_seclevel,
3717 sizeof(oldinp->inp_seclevel));
3718 #endif /* IPSEC */
3719 #ifdef INET6
3720 /*
3721 * inp still has the OLD in_pcb stuff, set the
3722 * v6-related flags on the new guy, too.
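 * The INP_IPV6 flag copied here decides just below whether the IPv6
 * hop limit or the IPv4 TTL is inherited from the listener.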
3723 */ 3724 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3725 if (inp->inp_flags & INP_IPV6) { 3726 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3727 inp->inp_hops = oldinp->inp_hops; 3728 } else 3729 #endif /* INET6 */ 3730 { 3731 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3732 } 3733 3734 #if NPF > 0 3735 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED && 3736 (divert = pf_find_divert(m)) != NULL) 3737 inp->inp_rtableid = divert->rdomain; 3738 else 3739 #endif 3740 /* inherit rtable from listening socket */ 3741 inp->inp_rtableid = sc->sc_rtableid; 3742 3743 inp->inp_lport = th->th_dport; 3744 switch (src->sa_family) { 3745 #ifdef INET6 3746 case AF_INET6: 3747 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3748 break; 3749 #endif /* INET6 */ 3750 case AF_INET: 3751 inp->inp_laddr = satosin(dst)->sin_addr; 3752 inp->inp_options = ip_srcroute(m); 3753 if (inp->inp_options == NULL) { 3754 inp->inp_options = sc->sc_ipopts; 3755 sc->sc_ipopts = NULL; 3756 } 3757 break; 3758 } 3759 in_pcbrehash(inp); 3760 3761 /* 3762 * Give the new socket our cached route reference. 3763 */ 3764 if (src->sa_family == AF_INET) 3765 inp->inp_route = sc->sc_route4; /* struct assignment */ 3766 #ifdef INET6 3767 else 3768 inp->inp_route6 = sc->sc_route6; 3769 #endif 3770 sc->sc_route4.ro_rt = NULL; 3771 3772 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3773 if (am == NULL) 3774 goto resetandabort; 3775 am->m_len = src->sa_len; 3776 memcpy(mtod(am, caddr_t), src, src->sa_len); 3777 3778 switch (src->sa_family) { 3779 case AF_INET: 3780 /* drop IPv4 packet to AF_INET6 socket */ 3781 if (inp->inp_flags & INP_IPV6) { 3782 (void) m_free(am); 3783 goto resetandabort; 3784 } 3785 if (in_pcbconnect(inp, am)) { 3786 (void) m_free(am); 3787 goto resetandabort; 3788 } 3789 break; 3790 #ifdef INET6 3791 case AF_INET6: 3792 if (in6_pcbconnect(inp, am)) { 3793 (void) m_free(am); 3794 goto resetandabort; 3795 } 3796 break; 3797 #endif 3798 } 3799 (void) m_free(am); 3800 3801 tp = intotcpcb(inp); 3802 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; 3803 if (sc->sc_request_r_scale != 15) { 3804 tp->requested_s_scale = sc->sc_requested_s_scale; 3805 tp->request_r_scale = sc->sc_request_r_scale; 3806 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3807 } 3808 if (sc->sc_flags & SCF_TIMESTAMP) 3809 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3810 3811 tp->t_template = tcp_template(tp); 3812 if (tp->t_template == 0) { 3813 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3814 so = NULL; 3815 m_freem(m); 3816 goto abort; 3817 } 3818 #ifdef TCP_SACK 3819 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3820 #endif 3821 3822 tp->ts_modulate = sc->sc_modulate; 3823 tp->ts_recent = sc->sc_timestamp; 3824 tp->iss = sc->sc_iss; 3825 tp->irs = sc->sc_irs; 3826 tcp_sendseqinit(tp); 3827 #if defined (TCP_SACK) || defined(TCP_ECN) 3828 tp->snd_last = tp->snd_una; 3829 #endif /* TCP_SACK */ 3830 #if defined(TCP_SACK) && defined(TCP_FACK) 3831 tp->snd_fack = tp->snd_una; 3832 tp->retran_data = 0; 3833 tp->snd_awnd = 0; 3834 #endif /* TCP_FACK */ 3835 #ifdef TCP_ECN 3836 if (sc->sc_flags & SCF_ECN_PERMIT) { 3837 tp->t_flags |= TF_ECN_PERMIT; 3838 tcpstat.tcps_ecn_accepts++; 3839 } 3840 #endif 3841 #ifdef TCP_SACK 3842 if (sc->sc_flags & SCF_SACK_PERMIT) 3843 tp->t_flags |= TF_SACK_PERMIT; 3844 #endif 3845 #ifdef TCP_SIGNATURE 3846 if (sc->sc_flags & SCF_SIGNATURE) 3847 tp->t_flags |= TF_SIGNATURE; 3848 #endif 3849 tcp_rcvseqinit(tp); 3850 tp->t_state = TCPS_SYN_RECEIVED; 3851 tp->t_rcvtime = tcp_now; 3852 TCP_TIMER_ARM(tp, TCPT_KEEP, 
tcptv_keep_init);
3853 tcpstat.tcps_accepts++;
3854
3855 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
3856 if (sc->sc_peermaxseg)
3857 tcp_mss_update(tp);
3858 /* Reset initial window to 1 segment for retransmit */
3859 if (sc->sc_rxtshift > 0)
3860 tp->snd_cwnd = tp->t_maxseg;
3861 tp->snd_wl1 = sc->sc_irs;
3862 tp->rcv_up = sc->sc_irs + 1;
3863
3864 /*
3865 * This is what would have happened in tcp_output() when
3866 * the SYN,ACK was sent.
3867 */
3868 tp->snd_up = tp->snd_una;
3869 tp->snd_max = tp->snd_nxt = tp->iss+1;
3870 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
3871 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
3872 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
3873 tp->last_ack_sent = tp->rcv_nxt;
3874
3875 tcpstat.tcps_sc_completed++;
3876 syn_cache_put(sc);
3877 return (so);
3878
3879 resetandabort:
3880 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
3881 m->m_pkthdr.ph_rtableid);
3882 m_freem(m);
3883 abort:
3884 if (so != NULL)
3885 (void) soabort(so);
3886 syn_cache_put(sc);
3887 tcpstat.tcps_sc_aborted++;
3888 return ((struct socket *)(-1));
3889 }
3890
3891 /*
3892 * This function is called when we get a RST for a
3893 * non-existent connection, so that we can see if the
3894 * connection is in the syn cache. If it is, zap it.
3895 */
3896
3897 void
3898 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3899 u_int rtableid)
3900 {
3901 struct syn_cache *sc;
3902 struct syn_cache_head *scp;
3903 int s = splsoftnet();
3904
3905 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) {
3906 splx(s);
3907 return;
3908 }
3909 if (SEQ_LT(th->th_seq, sc->sc_irs) ||
3910 SEQ_GT(th->th_seq, sc->sc_irs+1)) {
3911 splx(s);
3912 return;
3913 }
3914 syn_cache_rm(sc);
3915 splx(s);
3916 tcpstat.tcps_sc_reset++;
3917 syn_cache_put(sc);
3918 }
3919
3920 void
3921 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3922 u_int rtableid)
3923 {
3924 struct syn_cache *sc;
3925 struct syn_cache_head *scp;
3926 int s;
3927
3928 s = splsoftnet();
3929 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) {
3930 splx(s);
3931 return;
3932 }
3933 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
3934 if (ntohl (th->th_seq) != sc->sc_iss) {
3935 splx(s);
3936 return;
3937 }
3938
3939 /*
3940 * If we've retransmitted 3 times and this is our second error,
3941 * we remove the entry. Otherwise, we allow it to continue on.
3942 * This prevents us from incorrectly nuking an entry during a
3943 * spurious network outage.
3944 *
3945 * See tcp_notify().
3946 */
3947 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
3948 sc->sc_flags |= SCF_UNREACH;
3949 splx(s);
3950 return;
3951 }
3952
3953 syn_cache_rm(sc);
3954 splx(s);
3955 tcpstat.tcps_sc_unreach++;
3956 syn_cache_put(sc);
3957 }
3958
3959 /*
3960 * Given a LISTEN socket and an inbound SYN request, add
3961 * this to the syn cache, and send back a segment:
3962 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3963 * to the source.
3964 *
3965 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
3966 * Doing so would require that we hold onto the data and deliver it
3967 * to the application. However, if we are the target of a SYN-flood
3968 * DoS attack, an attacker could send data which would eventually
3969 * consume all available buffer space if it were ACKed. By not ACKing
3970 * the data, we avoid this DoS scenario.
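 * (The peer is expected to retransmit such data once the handshake
 * completes, so it is delayed, not lost.)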

	syn_cache_rm(sc);
	splx(s);
	tcpstat.tcps_sc_unreach++;
	syn_cache_put(sc);
}

/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */

int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
    struct tcp_opt_info *oi, tcp_seq *issp)
{
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;

	tp = sototcpcb(so);

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 *
	 * Note this check is performed in tcp_input() very early on.
	 */

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

	bzero(&tb, sizeof(tb));
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE)) {
#else
	if (optp) {
#endif
		tb.pf = tp->pf;
#ifdef TCP_SACK
		tb.sack_enable = tp->sack_enable;
#endif
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		if (tp->t_flags & TF_SIGNATURE)
			tb.t_flags |= TF_SIGNATURE;
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
		    sotoinpcb(so)->inp_rtableid))
			return (-1);
	}

	switch (src->sa_family) {
	case AF_INET:
		/*
		 * Remember the IP options, if any.
		 */
		ipopts = ip_srcroute(m);
		break;
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	if ((sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid))
	    != NULL) {
		tcpstat.tcps_sc_dupesyn++;
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void) m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		sc->sc_timestamp = tb.ts_recent;
		if (syn_cache_respond(sc, m) == 0) {
			tcpstat.tcps_sndacks++;
			tcpstat.tcps_sndtotal++;
		}
		return (0);
	}

	sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
	if (sc == NULL) {
		if (ipopts)
			(void) m_free(ipopts);
		return (-1);
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	memcpy(&sc->sc_src, src, src->sa_len);
	memcpy(&sc->sc_dst, dst, dst->sa_len);
	sc->sc_rtableid = sotoinpcb(so)->inp_rtableid;
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;

	sc->sc_iss = issp ? *issp : arc4random();
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timestamp = tb.ts_recent;
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
		sc->sc_flags |= SCF_TIMESTAMP;
		sc->sc_modulate = arc4random();
	}
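
	/*
	 * Background on sc_modulate (an assumption about a macro
	 * outside this excerpt: SYN_CACHE_TIMESTAMP() is taken to add
	 * the random sc_modulate offset to tcp_now): modulating the
	 * timestamps sent from the syn cache keeps the SYN,ACK from
	 * exposing the raw tcp_now tick counter, and the offset is
	 * later carried into the tcpcb as tp->ts_modulate so the
	 * established connection stays consistent.
	 */
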
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		/*
		 * Pick the smallest possible scaling factor that
		 * will still allow us to scale up to sb_max.
		 *
		 * We do this because there are broken firewalls that
		 * will corrupt the window scale option, leading to
		 * the other endpoint believing that our advertised
		 * window is unscaled.  At scale factors larger than
		 * 5 the unscaled window will drop below 1500 bytes,
		 * leading to serious problems when traversing these
		 * broken firewalls.
		 *
		 * With the default sb_max of 256K, a scale factor
		 * of 3 will be chosen by this algorithm.  Those who
		 * choose a larger sb_max should watch out
		 * for the compatibility problems mentioned above.
		 *
		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
		 * or <SYN,ACK>) segment itself is never scaled.
		 */
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
			sc->sc_request_r_scale++;
	} else {
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
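
	/*
	 * Standalone sketch of the scale selection above -- purely
	 * illustrative and never compiled, with TCP_MAXWIN (65535) and
	 * TCP_MAX_WINSHIFT (14) written out as literals.  With the
	 * default sb_max of 256K it settles on 3, since
	 * 65535 << 2 = 262140 < 262144 while 65535 << 3 = 524280.
	 */
#if 0
	unsigned long demo_sb_max = 262144;	/* default 256K */
	int scale = 0;

	/* smallest shift that lets a 16-bit window cover sb_max */
	while (scale < 14 && (65535UL << scale) < demo_sb_max)
		scale++;
	/* scale == 3 here */
#endif
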
#ifdef TCP_ECN
	/*
	 * If both ECE and CWR flag bits are set, peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		syn_cache_put(sc);
		tcpstat.tcps_sc_dropped++;
	}

	return (0);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);
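
	/*
	 * Note on the allocation above: MGETHDR yields a packet header
	 * mbuf with MHLEN bytes of storage; when max_linkhdr + tlen
	 * does not fit, MCLGET upgrades it to an MCLBYTES cluster, and
	 * a failed upgrade frees the mbuf so the function can fail
	 * cleanly with ENOBUFS (the DIAGNOSTIC check already rejected
	 * anything larger than a cluster).
	 */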

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		unhandled_af(sc->sc_src.sa.sa_family);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/*
	 * Compute the packet's checksum.  In the IPv4 case, ip_len is
	 * temporarily set to the TCP segment length: since every other
	 * IP header field is still zero, summing the whole packet also
	 * folds in exactly the pseudo header (source, destination,
	 * protocol, length).  The real ip_len is filled in below.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* Use the IPsec policy and TTL from the listening socket on the SYN,ACK. */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note that ip_len is
	 * in network byte order here.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0; it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(inp);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
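
/*
 * Worked example (illustrative only; the exact NOP paddings in the
 * TCPOPT_SACK_PERMIT_HDR and TCPOPT_TSTAMP_HDR macros are assumed, not
 * shown in this file): for a peer that negotiated MSS 1460, SACK,
 * window scale 3 and timestamps, the option block built above is
 * 4 + 4 + 4 + 12 = 24 bytes:
 *
 *	02 04 05 b4			MSS 1460
 *	01 01 04 02			NOP, NOP, SACK permitted
 *	01 03 03 03			NOP, window scale 3
 *	01 01 08 0a <tsval> <tsecr>	NOP, NOP, timestamps
 *
 * giving th_off = (20 + 24) >> 2 = 11 32-bit words.
 */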