/*	$OpenBSD: tcp_input.c,v 1.318 2016/03/31 13:11:14 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct tcpiphdr tcp_saveti;

int tcp_mss_adv(struct mbuf *, int);
int tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int tcprexmtthresh = 3;
int tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
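 *
 * Illustration of the invariant above (example numbers only):
 *   with snd_una = 100 and snd_max = 400, an ECE-triggered reduction
 *   sets snd_last = 400; further ECE bits leave cwnd alone until
 *   snd_una has advanced to 400, i.e. until the whole flight that
 *   saw congestion has been acked.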
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
	if_put(ifp); \
} while (0)

void	syn_cache_put(struct syn_cache *);
void	syn_cache_rm(struct syn_cache *);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
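	 *
	 * For example (example values only), an arriving segment
	 * covering [10, 20) and a queued segment at [16, 24) give
	 * i = 20 - 16 = 4 < 8, so the queued entry is trimmed to
	 * [20, 24); a queued segment at [12, 18) would instead be
	 * dequeued entirely, since i = 8 covers its whole length.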
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
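	 *
	 * The version lives in the first nibble of the packet, which sits
	 * at the same offset in the IPv4 and IPv6 headers, so a single
	 * look at ip_v is enough to demultiplex the two.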
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 addresses in the
		 * source.  As we use all-zero to indicate an unbound or
		 * unconnected pcb, an unspecified IPv6 address can be
		 * used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		tcpstat.tcps_inswcsum++;
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
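		 *
		 * The appendix A layout is a single 12-byte block:
		 * NOP, NOP, kind 8 (timestamp), length 10, then the
		 * 4-byte TSval and 4-byte TSecr; the first four bytes
		 * are matched in one shot against TCPOPT_TSTAMP_HDR.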
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		int inpl_reverse = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_reverse = 1;
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == NULL) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}
	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		goto drop;

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
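	/*
	 * (RFC 1323: the shift count, at most 14, was agreed on during
	 * the handshake, so the 16-bit window field can announce up to
	 * roughly 1 GB.  A SYN segment always carries an unscaled window.)
	 */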
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated addresses are forbidden,
				 * we do not accept SYNs to a deprecated
				 * interface address, to prevent any new
				 * inbound connection from getting
				 * established.
				 * When we do not accept the SYN, we send
				 * a TCP RST, with the deprecated source
				 * address (instead of dropping the SYN).
				 * This is a compromise: it is much better
				 * for the peer to get an RST, and the RST
				 * will be the final packet of the exchange.
				 *
				 * If we do not forbid deprecated addresses,
				 * we accept the SYN packet.  RFC2462 does
				 * not suggest dropping a SYN in this case.
				 * Reading RFC2462 5.5.4, it says roughly
				 * this:
				 * 1. use of a deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into the (2b) case as we have
				 * no choice in our source address selection -
				 * we must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1) {
					tcpstat.tcps_dropsyn++;
					goto drop;
				}
				return;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat.tcps_rcvnosec++;
		goto drop;
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(&so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
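			 *
			 * (Growing cwnd by one maxseg per ACKed segment
			 * doubles the window roughly once per RTT while
			 * cwnd <= ssthresh; that is the exponential
			 * phase referred to above.)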
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEG_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than tp->ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = tlen;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * Cf fix from Braden, see Stevens p. 870
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
		    ((tiflags & (TH_SYN|TH_FIN)) != 0)))
			tp->ts_recent = opti.ts_val;
		else
			tp->ts_recent = 0;
		tp->ts_recent_age = tcp_now;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat.tcps_drops++;
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
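	 *
	 * (The ACK is rate limited via dropafterack_ratelim below, so
	 * a flood of forged in-window SYNs cannot be turned into an
	 * equal flood of ACKs; cf. the challenge ACK approach of
	 * RFC 5961.)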
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat.tcps_connects++;
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat.tcps_cwr_ecn++;
				}
			}
			tcpstat.tcps_ecn_rcvece++;
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat.tcps_ecn_rcvcwr++;
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat.tcps_rcvacktooold++;
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)) {
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#ifdef TCP_SACK
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat.tcps_cwr_frecovery++;
						tcpstat.tcps_sack_recovery_episode++;
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh +
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat.tcps_cwr_frecovery++;
					tcpstat.tcps_sndrexmitfast++;
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						tp->t_flags |= TF_NEEDOUTPUT;
#else
					tp->snd_cwnd += tp->t_maxseg;
					tp->t_flags |= TF_NEEDOUTPUT;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
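		 *
		 * (tcp_xmit_timer() applies the usual smoothing, roughly
		 * srtt += (rtt - srtt)/8 and rttvar += (|rtt - srtt| -
		 * rttvar)/4, with RTO = srtt + 4*rttvar as in RFC 6298;
		 * the kernel keeps these values in fixed point.)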
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN << tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(&so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
			    tp->snd_fack) + tp->retran_data;
		}
#endif

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;		/* XXX */
			tiflags &= ~TH_URG;	/* XXX */
			goto dodata;		/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.
2008 * This can happen independent of advancing the URG pointer, 2009 * but if two URG's are pending at once, some out-of-band 2010 * data may creep in... ick. 2011 */ 2012 if (th->th_urp <= (u_int16_t) tlen && 2013 (so->so_options & SO_OOBINLINE) == 0) 2014 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2015 } else 2016 /* 2017 * If no out of band data is expected, 2018 * pull receive urgent pointer along 2019 * with the receive window. 2020 */ 2021 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2022 tp->rcv_up = tp->rcv_nxt; 2023 dodata: /* XXX */ 2024 2025 /* 2026 * Process the segment text, merging it into the TCP sequencing queue, 2027 * and arranging for acknowledgment of receipt if necessary. 2028 * This process logically involves adjusting tp->rcv_wnd as data 2029 * is presented to the user (this happens in tcp_usrreq.c, 2030 * case PRU_RCVD). If a FIN has already been received on this 2031 * connection then we just ignore the text. 2032 */ 2033 if ((tlen || (tiflags & TH_FIN)) && 2034 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2035 #ifdef TCP_SACK 2036 tcp_seq laststart = th->th_seq; 2037 tcp_seq lastend = th->th_seq + tlen; 2038 #endif 2039 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2040 tp->t_state == TCPS_ESTABLISHED) { 2041 TCP_SETUP_ACK(tp, tiflags, m); 2042 tp->rcv_nxt += tlen; 2043 tiflags = th->th_flags & TH_FIN; 2044 tcpstat.tcps_rcvpack++; 2045 tcpstat.tcps_rcvbyte += tlen; 2046 ND6_HINT(tp); 2047 if (so->so_state & SS_CANTRCVMORE) 2048 m_freem(m); 2049 else { 2050 m_adj(m, hdroptlen); 2051 sbappendstream(&so->so_rcv, m); 2052 } 2053 tp->t_flags |= TF_BLOCKOUTPUT; 2054 sorwakeup(so); 2055 tp->t_flags &= ~TF_BLOCKOUTPUT; 2056 } else { 2057 m_adj(m, hdroptlen); 2058 tiflags = tcp_reass(tp, th, m, &tlen); 2059 tp->t_flags |= TF_ACKNOW; 2060 } 2061 #ifdef TCP_SACK 2062 if (tp->sack_enable) 2063 tcp_update_sack_list(tp, laststart, lastend); 2064 #endif 2065 2066 /* 2067 * variable len never referenced again in modern BSD, 2068 * so why bother computing it ?? 2069 */ 2070 #if 0 2071 /* 2072 * Note the amount of data that peer has sent into 2073 * our window, in order to estimate the sender's 2074 * buffer size. 2075 */ 2076 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2077 #endif /* 0 */ 2078 } else { 2079 m_freem(m); 2080 tiflags &= ~TH_FIN; 2081 } 2082 2083 /* 2084 * If FIN is received ACK the FIN and let the user know 2085 * that the connection is closing. Ignore a FIN received before 2086 * the connection is fully established. 2087 */ 2088 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2089 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2090 socantrcvmore(so); 2091 tp->t_flags |= TF_ACKNOW; 2092 tp->rcv_nxt++; 2093 } 2094 switch (tp->t_state) { 2095 2096 /* 2097 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2098 */ 2099 case TCPS_ESTABLISHED: 2100 tp->t_state = TCPS_CLOSE_WAIT; 2101 break; 2102 2103 /* 2104 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2105 * enter the CLOSING state. 2106 */ 2107 case TCPS_FIN_WAIT_1: 2108 tp->t_state = TCPS_CLOSING; 2109 break; 2110 2111 /* 2112 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2113 * starting the time-wait timer, turning off the other 2114 * standard timers. 2115 */ 2116 case TCPS_FIN_WAIT_2: 2117 tp->t_state = TCPS_TIME_WAIT; 2118 tcp_canceltimers(tp); 2119 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2120 soisdisconnected(so); 2121 break; 2122 2123 /* 2124 * In TIME_WAIT state restart the 2 MSL time_wait timer. 
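 * (2 * TCPTV_MSL ticks; with the customary 30 second MSL this keeps
 * the connection around for roughly another minute.)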
2125 */ 2126 case TCPS_TIME_WAIT: 2127 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2128 break; 2129 } 2130 } 2131 if (so->so_options & SO_DEBUG) { 2132 switch (tp->pf) { 2133 #ifdef INET6 2134 case PF_INET6: 2135 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2136 0, tlen); 2137 break; 2138 #endif /* INET6 */ 2139 case PF_INET: 2140 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2141 0, tlen); 2142 break; 2143 } 2144 } 2145 2146 /* 2147 * Return any desired output. 2148 */ 2149 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2150 (void) tcp_output(tp); 2151 return; 2152 2153 badsyn: 2154 /* 2155 * Received a bad SYN. Increment counters and dropwithreset. 2156 */ 2157 tcpstat.tcps_badsyn++; 2158 tp = NULL; 2159 goto dropwithreset; 2160 2161 dropafterack_ratelim: 2162 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2163 tcp_ackdrop_ppslim) == 0) { 2164 /* XXX stat */ 2165 goto drop; 2166 } 2167 /* ...fall into dropafterack... */ 2168 2169 dropafterack: 2170 /* 2171 * Generate an ACK dropping incoming segment if it occupies 2172 * sequence space, where the ACK reflects our state. 2173 */ 2174 if (tiflags & TH_RST) 2175 goto drop; 2176 m_freem(m); 2177 tp->t_flags |= TF_ACKNOW; 2178 (void) tcp_output(tp); 2179 return; 2180 2181 dropwithreset_ratelim: 2182 /* 2183 * We may want to rate-limit RSTs in certain situations, 2184 * particularly if we are sending an RST in response to 2185 * an attempt to connect to or otherwise communicate with 2186 * a port for which we have no socket. 2187 */ 2188 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2189 tcp_rst_ppslim) == 0) { 2190 /* XXX stat */ 2191 goto drop; 2192 } 2193 /* ...fall into dropwithreset... */ 2194 2195 dropwithreset: 2196 /* 2197 * Generate a RST, dropping incoming segment. 2198 * Make ACK acceptable to originator of segment. 2199 * Don't bother to respond to RST. 2200 */ 2201 if (tiflags & TH_RST) 2202 goto drop; 2203 if (tiflags & TH_ACK) { 2204 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2205 TH_RST, m->m_pkthdr.ph_rtableid); 2206 } else { 2207 if (tiflags & TH_SYN) 2208 tlen++; 2209 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2210 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2211 } 2212 m_freem(m); 2213 return; 2214 2215 drop: 2216 /* 2217 * Drop space held by incoming segment and return. 
2218 */ 2219 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2220 switch (tp->pf) { 2221 #ifdef INET6 2222 case PF_INET6: 2223 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2224 0, tlen); 2225 break; 2226 #endif /* INET6 */ 2227 case PF_INET: 2228 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2229 0, tlen); 2230 break; 2231 } 2232 } 2233 2234 m_freem(m); 2235 return; 2236 } 2237 2238 int 2239 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2240 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2241 u_int rtableid) 2242 { 2243 u_int16_t mss = 0; 2244 int opt, optlen; 2245 #ifdef TCP_SIGNATURE 2246 caddr_t sigp = NULL; 2247 struct tdb *tdb = NULL; 2248 #endif /* TCP_SIGNATURE */ 2249 2250 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2251 opt = cp[0]; 2252 if (opt == TCPOPT_EOL) 2253 break; 2254 if (opt == TCPOPT_NOP) 2255 optlen = 1; 2256 else { 2257 if (cnt < 2) 2258 break; 2259 optlen = cp[1]; 2260 if (optlen < 2 || optlen > cnt) 2261 break; 2262 } 2263 switch (opt) { 2264 2265 default: 2266 continue; 2267 2268 case TCPOPT_MAXSEG: 2269 if (optlen != TCPOLEN_MAXSEG) 2270 continue; 2271 if (!(th->th_flags & TH_SYN)) 2272 continue; 2273 if (TCPS_HAVERCVDSYN(tp->t_state)) 2274 continue; 2275 memcpy(&mss, cp + 2, sizeof(mss)); 2276 mss = ntohs(mss); 2277 oi->maxseg = mss; 2278 break; 2279 2280 case TCPOPT_WINDOW: 2281 if (optlen != TCPOLEN_WINDOW) 2282 continue; 2283 if (!(th->th_flags & TH_SYN)) 2284 continue; 2285 if (TCPS_HAVERCVDSYN(tp->t_state)) 2286 continue; 2287 tp->t_flags |= TF_RCVD_SCALE; 2288 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2289 break; 2290 2291 case TCPOPT_TIMESTAMP: 2292 if (optlen != TCPOLEN_TIMESTAMP) 2293 continue; 2294 oi->ts_present = 1; 2295 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2296 oi->ts_val = ntohl(oi->ts_val); 2297 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2298 oi->ts_ecr = ntohl(oi->ts_ecr); 2299 2300 if (!(th->th_flags & TH_SYN)) 2301 continue; 2302 if (TCPS_HAVERCVDSYN(tp->t_state)) 2303 continue; 2304 /* 2305 * A timestamp received in a SYN makes 2306 * it ok to send timestamp requests and replies. 
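 * The ts_recent/ts_recent_age pair recorded here is what later feeds
 * the PAWS (RFC 1323) test against old duplicate segments.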
2307 */ 2308 tp->t_flags |= TF_RCVD_TSTMP; 2309 tp->ts_recent = oi->ts_val; 2310 tp->ts_recent_age = tcp_now; 2311 break; 2312 2313 #ifdef TCP_SACK 2314 case TCPOPT_SACK_PERMITTED: 2315 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2316 continue; 2317 if (!(th->th_flags & TH_SYN)) 2318 continue; 2319 if (TCPS_HAVERCVDSYN(tp->t_state)) 2320 continue; 2321 /* MUST only be set on SYN */ 2322 tp->t_flags |= TF_SACK_PERMIT; 2323 break; 2324 case TCPOPT_SACK: 2325 tcp_sack_option(tp, th, cp, optlen); 2326 break; 2327 #endif 2328 #ifdef TCP_SIGNATURE 2329 case TCPOPT_SIGNATURE: 2330 if (optlen != TCPOLEN_SIGNATURE) 2331 continue; 2332 2333 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2334 return (-1); 2335 2336 sigp = cp + 2; 2337 break; 2338 #endif /* TCP_SIGNATURE */ 2339 } 2340 } 2341 2342 #ifdef TCP_SIGNATURE 2343 if (tp->t_flags & TF_SIGNATURE) { 2344 union sockaddr_union src, dst; 2345 2346 memset(&src, 0, sizeof(union sockaddr_union)); 2347 memset(&dst, 0, sizeof(union sockaddr_union)); 2348 2349 switch (tp->pf) { 2350 case 0: 2351 case AF_INET: 2352 src.sa.sa_len = sizeof(struct sockaddr_in); 2353 src.sa.sa_family = AF_INET; 2354 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2355 dst.sa.sa_len = sizeof(struct sockaddr_in); 2356 dst.sa.sa_family = AF_INET; 2357 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2358 break; 2359 #ifdef INET6 2360 case AF_INET6: 2361 src.sa.sa_len = sizeof(struct sockaddr_in6); 2362 src.sa.sa_family = AF_INET6; 2363 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2364 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2365 dst.sa.sa_family = AF_INET6; 2366 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2367 break; 2368 #endif /* INET6 */ 2369 } 2370 2371 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2372 0, &src, &dst, IPPROTO_TCP); 2373 2374 /* 2375 * We don't have an SA for this peer, so we turn off 2376 * TF_SIGNATURE on the listen socket 2377 */ 2378 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2379 tp->t_flags &= ~TF_SIGNATURE; 2380 2381 } 2382 2383 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2384 tcpstat.tcps_rcvbadsig++; 2385 return (-1); 2386 } 2387 2388 if (sigp) { 2389 char sig[16]; 2390 2391 if (tdb == NULL) { 2392 tcpstat.tcps_rcvbadsig++; 2393 return (-1); 2394 } 2395 2396 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2397 return (-1); 2398 2399 if (timingsafe_bcmp(sig, sigp, 16)) { 2400 tcpstat.tcps_rcvbadsig++; 2401 return (-1); 2402 } 2403 2404 tcpstat.tcps_rcvgoodsig++; 2405 } 2406 #endif /* TCP_SIGNATURE */ 2407 2408 return (0); 2409 } 2410 2411 #if defined(TCP_SACK) 2412 u_long 2413 tcp_seq_subtract(u_long a, u_long b) 2414 { 2415 return ((long)(a - b)); 2416 } 2417 #endif 2418 2419 2420 #ifdef TCP_SACK 2421 /* 2422 * This function is called upon receipt of new valid data (while not in header 2423 * prediction mode), and it updates the ordered list of sacks. 2424 */ 2425 void 2426 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2427 tcp_seq rcv_lastend) 2428 { 2429 /* 2430 * First reported block MUST be the most recent one. Subsequent 2431 * blocks SHOULD be in the order in which they arrived at the 2432 * receiver. These two conditions make the implementation fully 2433 * compliant with RFC 2018. 
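 * For illustration: with rcv_nxt = 100, receiving 300-400 and then
 * 150-200 out of order leaves sackblks[] reporting [150,200) first
 * (the most recent block) followed by [300,400).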
2434 */ 2435 int i, j = 0, count = 0, lastpos = -1; 2436 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2437 2438 /* First clean up current list of sacks */ 2439 for (i = 0; i < tp->rcv_numsacks; i++) { 2440 sack = tp->sackblks[i]; 2441 if (sack.start == 0 && sack.end == 0) { 2442 count++; /* count = number of blocks to be discarded */ 2443 continue; 2444 } 2445 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2446 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2447 count++; 2448 } else { 2449 temp[j].start = tp->sackblks[i].start; 2450 temp[j++].end = tp->sackblks[i].end; 2451 } 2452 } 2453 tp->rcv_numsacks -= count; 2454 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2455 tcp_clean_sackreport(tp); 2456 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2457 /* ==> need first sack block */ 2458 tp->sackblks[0].start = rcv_laststart; 2459 tp->sackblks[0].end = rcv_lastend; 2460 tp->rcv_numsacks = 1; 2461 } 2462 return; 2463 } 2464 /* Otherwise, sack blocks are already present. */ 2465 for (i = 0; i < tp->rcv_numsacks; i++) 2466 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2467 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2468 return; /* sack list remains unchanged */ 2469 /* 2470 * From here, segment just received should be (part of) the 1st sack. 2471 * Go through list, possibly coalescing sack block entries. 2472 */ 2473 firstsack.start = rcv_laststart; 2474 firstsack.end = rcv_lastend; 2475 for (i = 0; i < tp->rcv_numsacks; i++) { 2476 sack = tp->sackblks[i]; 2477 if (SEQ_LT(sack.end, firstsack.start) || 2478 SEQ_GT(sack.start, firstsack.end)) 2479 continue; /* no overlap */ 2480 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2481 /* 2482 * identical block; delete it here since we will 2483 * move it to the front of the list. 2484 */ 2485 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2486 lastpos = i; /* last posn with a zero entry */ 2487 continue; 2488 } 2489 if (SEQ_LEQ(sack.start, firstsack.start)) 2490 firstsack.start = sack.start; /* merge blocks */ 2491 if (SEQ_GEQ(sack.end, firstsack.end)) 2492 firstsack.end = sack.end; /* merge blocks */ 2493 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2494 lastpos = i; /* last posn with a zero entry */ 2495 } 2496 if (lastpos != -1) { /* at least one merge */ 2497 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2498 sack = tp->sackblks[i]; 2499 if (sack.start == 0 && sack.end == 0) 2500 continue; 2501 temp[j++] = sack; 2502 } 2503 tp->rcv_numsacks = j; /* including first blk (added later) */ 2504 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2505 tp->sackblks[i] = temp[i]; 2506 } else { /* no merges -- shift sacks by 1 */ 2507 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2508 tp->rcv_numsacks++; 2509 for (i = tp->rcv_numsacks-1; i > 0; i--) 2510 tp->sackblks[i] = tp->sackblks[i-1]; 2511 } 2512 tp->sackblks[0] = firstsack; 2513 return; 2514 } 2515 2516 /* 2517 * Process the TCP SACK option. tp->snd_holes is an ordered list 2518 * of holes (oldest to newest, in terms of the sequence space). 2519 */ 2520 void 2521 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2522 { 2523 int tmp_olen; 2524 u_char *tmp_cp; 2525 struct sackhole *cur, *p, *temp; 2526 2527 if (!tp->sack_enable) 2528 return; 2529 /* SACK without ACK doesn't make sense. */ 2530 if ((th->th_flags & TH_ACK) == 0) 2531 return; 2532 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
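 * A SACK option riding on an ACK outside that range cannot describe
 * any currently outstanding data, so it is ignored.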
*/
2533 if (SEQ_LT(th->th_ack, tp->snd_una) ||
2534 SEQ_GT(th->th_ack, tp->snd_max))
2535 return;
2536 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
2537 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2538 return;
2540 tmp_cp = cp + 2;
2541 tmp_olen = optlen - 2;
2542 tcpstat.tcps_sack_rcv_opts++;
2543 if (tp->snd_numholes < 0)
2544 tp->snd_numholes = 0;
2545 if (tp->t_maxseg == 0)
2546 panic("tcp_sack_option"); /* Should never happen */
2547 while (tmp_olen > 0) {
2548 struct sackblk sack;
2549
2550 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq));
2551 sack.start = ntohl(sack.start);
2552 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq));
2553 sack.end = ntohl(sack.end);
2554 tmp_olen -= TCPOLEN_SACK;
2555 tmp_cp += TCPOLEN_SACK;
2556 if (SEQ_LEQ(sack.end, sack.start))
2557 continue; /* bad SACK fields */
2558 if (SEQ_LEQ(sack.end, tp->snd_una))
2559 continue; /* old block */
2560 #if defined(TCP_SACK) && defined(TCP_FACK)
2561 /* Updates snd_fack. */
2562 if (SEQ_GT(sack.end, tp->snd_fack))
2563 tp->snd_fack = sack.end;
2564 #endif /* TCP_FACK */
2565 if (SEQ_GT(th->th_ack, tp->snd_una)) {
2566 if (SEQ_LT(sack.start, th->th_ack))
2567 continue;
2568 }
2569 if (SEQ_GT(sack.end, tp->snd_max))
2570 continue;
2571 if (tp->snd_holes == NULL) { /* first hole */
2572 tp->snd_holes = (struct sackhole *)
2573 pool_get(&sackhl_pool, PR_NOWAIT);
2574 if (tp->snd_holes == NULL) {
2575 /* ENOBUFS, so ignore SACKed block for now */
2576 goto done;
2577 }
2578 cur = tp->snd_holes;
2579 cur->start = th->th_ack;
2580 cur->end = sack.start;
2581 cur->rxmit = cur->start;
2582 cur->next = NULL;
2583 tp->snd_numholes = 1;
2584 tp->rcv_lastsack = sack.end;
2585 /*
2586 * dups is at least one. If more data has been
2587 * SACKed, it can be greater than one.
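 * E.g. with t_maxseg = 1460 and tcprexmtthresh = 3, a first SACK
 * block covering 4380 bytes beyond the hole's end gives
 * dups = min(3, 4380 / 1460) = 3.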
2588 */ 2589 cur->dups = min(tcprexmtthresh, 2590 ((sack.end - cur->end)/tp->t_maxseg)); 2591 if (cur->dups < 1) 2592 cur->dups = 1; 2593 continue; /* with next sack block */ 2594 } 2595 /* Go thru list of holes: p = previous, cur = current */ 2596 p = cur = tp->snd_holes; 2597 while (cur) { 2598 if (SEQ_LEQ(sack.end, cur->start)) 2599 /* SACKs data before the current hole */ 2600 break; /* no use going through more holes */ 2601 if (SEQ_GEQ(sack.start, cur->end)) { 2602 /* SACKs data beyond the current hole */ 2603 cur->dups++; 2604 if (((sack.end - cur->end)/tp->t_maxseg) >= 2605 tcprexmtthresh) 2606 cur->dups = tcprexmtthresh; 2607 p = cur; 2608 cur = cur->next; 2609 continue; 2610 } 2611 if (SEQ_LEQ(sack.start, cur->start)) { 2612 /* Data acks at least the beginning of hole */ 2613 #if defined(TCP_SACK) && defined(TCP_FACK) 2614 if (SEQ_GT(sack.end, cur->rxmit)) 2615 tp->retran_data -= 2616 tcp_seq_subtract(cur->rxmit, 2617 cur->start); 2618 else 2619 tp->retran_data -= 2620 tcp_seq_subtract(sack.end, 2621 cur->start); 2622 #endif /* TCP_FACK */ 2623 if (SEQ_GEQ(sack.end, cur->end)) { 2624 /* Acks entire hole, so delete hole */ 2625 if (p != cur) { 2626 p->next = cur->next; 2627 pool_put(&sackhl_pool, cur); 2628 cur = p->next; 2629 } else { 2630 cur = cur->next; 2631 pool_put(&sackhl_pool, p); 2632 p = cur; 2633 tp->snd_holes = p; 2634 } 2635 tp->snd_numholes--; 2636 continue; 2637 } 2638 /* otherwise, move start of hole forward */ 2639 cur->start = sack.end; 2640 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2641 p = cur; 2642 cur = cur->next; 2643 continue; 2644 } 2645 /* move end of hole backward */ 2646 if (SEQ_GEQ(sack.end, cur->end)) { 2647 #if defined(TCP_SACK) && defined(TCP_FACK) 2648 if (SEQ_GT(cur->rxmit, sack.start)) 2649 tp->retran_data -= 2650 tcp_seq_subtract(cur->rxmit, 2651 sack.start); 2652 #endif /* TCP_FACK */ 2653 cur->end = sack.start; 2654 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2655 cur->dups++; 2656 if (((sack.end - cur->end)/tp->t_maxseg) >= 2657 tcprexmtthresh) 2658 cur->dups = tcprexmtthresh; 2659 p = cur; 2660 cur = cur->next; 2661 continue; 2662 } 2663 if (SEQ_LT(cur->start, sack.start) && 2664 SEQ_GT(cur->end, sack.end)) { 2665 /* 2666 * ACKs some data in middle of a hole; need to 2667 * split current hole 2668 */ 2669 temp = (struct sackhole *) 2670 pool_get(&sackhl_pool, PR_NOWAIT); 2671 if (temp == NULL) 2672 goto done; /* ENOBUFS */ 2673 #if defined(TCP_SACK) && defined(TCP_FACK) 2674 if (SEQ_GT(cur->rxmit, sack.end)) 2675 tp->retran_data -= 2676 tcp_seq_subtract(sack.end, 2677 sack.start); 2678 else if (SEQ_GT(cur->rxmit, sack.start)) 2679 tp->retran_data -= 2680 tcp_seq_subtract(cur->rxmit, 2681 sack.start); 2682 #endif /* TCP_FACK */ 2683 temp->next = cur->next; 2684 temp->start = sack.end; 2685 temp->end = cur->end; 2686 temp->dups = cur->dups; 2687 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2688 cur->end = sack.start; 2689 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2690 cur->dups++; 2691 if (((sack.end - cur->end)/tp->t_maxseg) >= 2692 tcprexmtthresh) 2693 cur->dups = tcprexmtthresh; 2694 cur->next = temp; 2695 p = temp; 2696 cur = p->next; 2697 tp->snd_numholes++; 2698 } 2699 } 2700 /* At this point, p points to the last hole on the list */ 2701 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2702 /* 2703 * Need to append new hole at end. 2704 * Last hole is p (and it's not NULL). 
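 * E.g. if rcv_lastsack is 4000 and the new block starts at 6000,
 * the fresh hole covers [4000, 6000).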
2705 */ 2706 temp = (struct sackhole *) 2707 pool_get(&sackhl_pool, PR_NOWAIT); 2708 if (temp == NULL) 2709 goto done; /* ENOBUFS */ 2710 temp->start = tp->rcv_lastsack; 2711 temp->end = sack.start; 2712 temp->dups = min(tcprexmtthresh, 2713 ((sack.end - sack.start)/tp->t_maxseg)); 2714 if (temp->dups < 1) 2715 temp->dups = 1; 2716 temp->rxmit = temp->start; 2717 temp->next = 0; 2718 p->next = temp; 2719 tp->rcv_lastsack = sack.end; 2720 tp->snd_numholes++; 2721 } 2722 } 2723 done: 2724 #if defined(TCP_SACK) && defined(TCP_FACK) 2725 /* 2726 * Update retran_data and snd_awnd. Go through the list of 2727 * holes. Increment retran_data by (hole->rxmit - hole->start). 2728 */ 2729 tp->retran_data = 0; 2730 cur = tp->snd_holes; 2731 while (cur) { 2732 tp->retran_data += cur->rxmit - cur->start; 2733 cur = cur->next; 2734 } 2735 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + 2736 tp->retran_data; 2737 #endif /* TCP_FACK */ 2738 2739 return; 2740 } 2741 2742 /* 2743 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if 2744 * it is completely acked; otherwise, tcp_sack_option(), called from 2745 * tcp_dooptions(), will fix up the hole. 2746 */ 2747 void 2748 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2749 { 2750 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2751 /* max because this could be an older ack just arrived */ 2752 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2753 th->th_ack : tp->snd_una; 2754 struct sackhole *cur = tp->snd_holes; 2755 struct sackhole *prev; 2756 while (cur) 2757 if (SEQ_LEQ(cur->end, lastack)) { 2758 prev = cur; 2759 cur = cur->next; 2760 pool_put(&sackhl_pool, prev); 2761 tp->snd_numholes--; 2762 } else if (SEQ_LT(cur->start, lastack)) { 2763 cur->start = lastack; 2764 if (SEQ_LT(cur->rxmit, cur->start)) 2765 cur->rxmit = cur->start; 2766 break; 2767 } else 2768 break; 2769 tp->snd_holes = cur; 2770 } 2771 } 2772 2773 /* 2774 * Delete all receiver-side SACK information. 2775 */ 2776 void 2777 tcp_clean_sackreport(struct tcpcb *tp) 2778 { 2779 int i; 2780 2781 tp->rcv_numsacks = 0; 2782 for (i = 0; i < MAX_SACK_BLKS; i++) 2783 tp->sackblks[i].start = tp->sackblks[i].end=0; 2784 2785 } 2786 2787 /* 2788 * Checks for partial ack. If partial ack arrives, turn off retransmission 2789 * timer, deflate the window, do not clear tp->t_dupacks, and return 1. 2790 * If the ack advances at least to tp->snd_last, return 0. 2791 */ 2792 int 2793 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2794 { 2795 if (SEQ_LT(th->th_ack, tp->snd_last)) { 2796 /* Turn off retx. timer (will start again next segment) */ 2797 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2798 tp->t_rtttime = 0; 2799 #ifndef TCP_FACK 2800 /* 2801 * Partial window deflation. This statement relies on the 2802 * fact that tp->snd_una has not been updated yet. In FACK 2803 * hold snd_cwnd constant during fast recovery. 2804 */ 2805 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2806 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2807 tp->snd_cwnd += tp->t_maxseg; 2808 } else 2809 tp->snd_cwnd = tp->t_maxseg; 2810 #endif 2811 return (1); 2812 } 2813 return (0); 2814 } 2815 #endif /* TCP_SACK */ 2816 2817 /* 2818 * Pull out of band byte out of a segment so 2819 * it doesn't appear in the user's data queue. 2820 * It is still reflected in the segment length for 2821 * sequencing purposes. 
2822 */ 2823 void 2824 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2825 { 2826 int cnt = off + urgent - 1; 2827 2828 while (cnt >= 0) { 2829 if (m->m_len > cnt) { 2830 char *cp = mtod(m, caddr_t) + cnt; 2831 struct tcpcb *tp = sototcpcb(so); 2832 2833 tp->t_iobc = *cp; 2834 tp->t_oobflags |= TCPOOB_HAVEDATA; 2835 memmove(cp, cp + 1, m->m_len - cnt - 1); 2836 m->m_len--; 2837 return; 2838 } 2839 cnt -= m->m_len; 2840 m = m->m_next; 2841 if (m == NULL) 2842 break; 2843 } 2844 panic("tcp_pulloutofband"); 2845 } 2846 2847 /* 2848 * Collect new round-trip time estimate 2849 * and update averages and current timeout. 2850 */ 2851 void 2852 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2853 { 2854 short delta; 2855 short rttmin; 2856 2857 if (rtt < 0) 2858 rtt = 0; 2859 else if (rtt > TCP_RTT_MAX) 2860 rtt = TCP_RTT_MAX; 2861 2862 tcpstat.tcps_rttupdated++; 2863 if (tp->t_srtt != 0) { 2864 /* 2865 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2866 * after the binary point (scaled by 4), whereas 2867 * srtt is stored as fixed point with 5 bits after the 2868 * binary point (i.e., scaled by 32). The following magic 2869 * is equivalent to the smoothing algorithm in rfc793 with 2870 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2871 * point). 2872 */ 2873 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2874 (tp->t_srtt >> TCP_RTT_SHIFT); 2875 if ((tp->t_srtt += delta) <= 0) 2876 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2877 /* 2878 * We accumulate a smoothed rtt variance (actually, a 2879 * smoothed mean difference), then set the retransmit 2880 * timer to smoothed rtt + 4 times the smoothed variance. 2881 * rttvar is stored as fixed point with 4 bits after the 2882 * binary point (scaled by 16). The following is 2883 * equivalent to rfc793 smoothing with an alpha of .75 2884 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2885 * rfc793's wired-in beta. 2886 */ 2887 if (delta < 0) 2888 delta = -delta; 2889 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2890 if ((tp->t_rttvar += delta) <= 0) 2891 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2892 } else { 2893 /* 2894 * No rtt measurement yet - use the unsmoothed rtt. 2895 * Set the variance to half the rtt (so our first 2896 * retransmit happens at 3*rtt). 2897 */ 2898 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2899 tp->t_rttvar = (rtt + 1) << 2900 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2901 } 2902 tp->t_rtttime = 0; 2903 tp->t_rxtshift = 0; 2904 2905 /* 2906 * the retransmit should happen at rtt + 4 * rttvar. 2907 * Because of the way we do the smoothing, srtt and rttvar 2908 * will each average +1/2 tick of bias. When we compute 2909 * the retransmit timer, we want 1/2 tick of rounding and 2910 * 1 extra tick because of +-1/2 tick uncertainty in the 2911 * firing of the timer. The bias will give us exactly the 2912 * 1.5 tick we need. But, because the bias is 2913 * statistical, we have to test that we don't drop below 2914 * the minimum feasible timer (which is 2 ticks). 2915 */ 2916 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2917 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2918 2919 /* 2920 * We received an ack for a packet that wasn't retransmitted; 2921 * it is probably safe to discard any error indications we've 2922 * received recently. This isn't quite right, but close enough 2923 * for now (a route might have failed after we sent a segment, 2924 * and the return path might not be symmetrical). 
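 * (If t_softerror is still set when the retransmit timer finally
 * gives up, it is reported to the user in place of ETIMEDOUT; see
 * the TCPT_REXMT handler in tcp_timer.c.)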
2925 */ 2926 tp->t_softerror = 0; 2927 } 2928 2929 /* 2930 * Determine a reasonable value for maxseg size. 2931 * If the route is known, check route for mtu. 2932 * If none, use an mss that can be handled on the outgoing 2933 * interface without forcing IP to fragment; if bigger than 2934 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2935 * to utilize large mbufs. If no route is found, route has no mtu, 2936 * or the destination isn't local, use a default, hopefully conservative 2937 * size (usually 512 or the default IP max size, but no more than the mtu 2938 * of the interface), as we can't discover anything about intervening 2939 * gateways or networks. We also initialize the congestion/slow start 2940 * window to be a single segment if the destination isn't local. 2941 * While looking at the routing entry, we also initialize other path-dependent 2942 * parameters from pre-set or cached values in the routing entry. 2943 * 2944 * Also take into account the space needed for options that we 2945 * send regularly. Make maxseg shorter by that amount to assure 2946 * that we can send maxseg amount of data even when the options 2947 * are present. Store the upper limit of the length of options plus 2948 * data in maxopd. 2949 * 2950 * NOTE: offer == -1 indicates that the maxseg size changed due to 2951 * Path MTU discovery. 2952 */ 2953 int 2954 tcp_mss(struct tcpcb *tp, int offer) 2955 { 2956 struct rtentry *rt; 2957 struct ifnet *ifp = NULL; 2958 int mss, mssopt; 2959 int iphlen; 2960 struct inpcb *inp; 2961 2962 inp = tp->t_inpcb; 2963 2964 mssopt = mss = tcp_mssdflt; 2965 2966 rt = in_pcbrtentry(inp); 2967 2968 if (rt == NULL) 2969 goto out; 2970 2971 ifp = if_get(rt->rt_ifidx); 2972 if (ifp == NULL) 2973 goto out; 2974 2975 switch (tp->pf) { 2976 #ifdef INET6 2977 case AF_INET6: 2978 iphlen = sizeof(struct ip6_hdr); 2979 break; 2980 #endif 2981 case AF_INET: 2982 iphlen = sizeof(struct ip); 2983 break; 2984 default: 2985 /* the family does not support path MTU discovery */ 2986 goto out; 2987 } 2988 2989 /* 2990 * if there's an mtu associated with the route and we support 2991 * path MTU discovery for the underlying protocol family, use it. 2992 */ 2993 if (rt->rt_rmx.rmx_mtu) { 2994 /* 2995 * One may wish to lower MSS to take into account options, 2996 * especially security-related options. 2997 */ 2998 if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) { 2999 /* 3000 * RFC2460 section 5, last paragraph: if path MTU is 3001 * smaller than 1280, use 1280 as packet size and 3002 * attach fragment header. 3003 */ 3004 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 3005 sizeof(struct tcphdr); 3006 } else { 3007 mss = rt->rt_rmx.rmx_mtu - iphlen - 3008 sizeof(struct tcphdr); 3009 } 3010 } else if (ifp->if_flags & IFF_LOOPBACK) { 3011 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3012 } else if (tp->pf == AF_INET) { 3013 if (ip_mtudisc) 3014 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3015 } 3016 #ifdef INET6 3017 else if (tp->pf == AF_INET6) { 3018 /* 3019 * for IPv6, path MTU discovery is always turned on, 3020 * or the node must use packet size <= 1280. 
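 * E.g. on a 1500 byte Ethernet MTU this works out to
 * 1500 - 40 - 20 = 1440 bytes of MSS.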
3021 */ 3022 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3023 } 3024 #endif /* INET6 */ 3025 3026 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3027 if (offer != -1) { 3028 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3029 mssopt = max(tcp_mssdflt, mssopt); 3030 } 3031 out: 3032 if_put(ifp); 3033 /* 3034 * The current mss, t_maxseg, is initialized to the default value. 3035 * If we compute a smaller value, reduce the current mss. 3036 * If we compute a larger value, return it for use in sending 3037 * a max seg size option, but don't store it for use 3038 * unless we received an offer at least that large from peer. 3039 * 3040 * However, do not accept offers lower than the minimum of 3041 * the interface MTU and 216. 3042 */ 3043 if (offer > 0) 3044 tp->t_peermss = offer; 3045 if (tp->t_peermss) 3046 mss = min(mss, max(tp->t_peermss, 216)); 3047 3048 /* sanity - at least max opt. space */ 3049 mss = max(mss, 64); 3050 3051 /* 3052 * maxopd stores the maximum length of data AND options 3053 * in a segment; maxseg is the amount of data in a normal 3054 * segment. We need to store this value (maxopd) apart 3055 * from maxseg, because now every segment carries options 3056 * and thus we normally have somewhat less data in segments. 3057 */ 3058 tp->t_maxopd = mss; 3059 3060 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3061 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3062 mss -= TCPOLEN_TSTAMP_APPA; 3063 #ifdef TCP_SIGNATURE 3064 if (tp->t_flags & TF_SIGNATURE) 3065 mss -= TCPOLEN_SIGLEN; 3066 #endif 3067 3068 if (offer == -1) { 3069 /* mss changed due to Path MTU discovery */ 3070 tp->t_flags &= ~TF_PMTUD_PEND; 3071 tp->t_pmtud_mtu_sent = 0; 3072 tp->t_pmtud_mss_acked = 0; 3073 if (mss < tp->t_maxseg) { 3074 /* 3075 * Follow suggestion in RFC 2414 to reduce the 3076 * congestion window by the ratio of the old 3077 * segment size to the new segment size. 3078 */ 3079 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3080 mss, mss); 3081 } 3082 } else if (tcp_do_rfc3390 == 2) { 3083 /* increase initial window */ 3084 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 3085 } else if (tcp_do_rfc3390) { 3086 /* increase initial window */ 3087 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3088 } else 3089 tp->snd_cwnd = mss; 3090 3091 tp->t_maxseg = mss; 3092 3093 return (offer != -1 ? mssopt : mss); 3094 } 3095 3096 u_int 3097 tcp_hdrsz(struct tcpcb *tp) 3098 { 3099 u_int hlen; 3100 3101 switch (tp->pf) { 3102 #ifdef INET6 3103 case AF_INET6: 3104 hlen = sizeof(struct ip6_hdr); 3105 break; 3106 #endif 3107 case AF_INET: 3108 hlen = sizeof(struct ip); 3109 break; 3110 default: 3111 hlen = 0; 3112 break; 3113 } 3114 hlen += sizeof(struct tcphdr); 3115 3116 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3117 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3118 hlen += TCPOLEN_TSTAMP_APPA; 3119 #ifdef TCP_SIGNATURE 3120 if (tp->t_flags & TF_SIGNATURE) 3121 hlen += TCPOLEN_SIGLEN; 3122 #endif 3123 return (hlen); 3124 } 3125 3126 /* 3127 * Set connection variables based on the effective MSS. 3128 * We are passed the TCPCB for the actual connection. If we 3129 * are the server, we are called by the compressed state engine 3130 * when the 3-way handshake is complete. If we are the client, 3131 * we are called when we receive the SYN,ACK from the server. 3132 * 3133 * NOTE: The t_maxseg value must be initialized in the TCPCB 3134 * before this routine is called! 
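 * (It reads t_maxseg right away to round the socket buffer sizes,
 * so an uninitialized value would produce nonsense roundups below.)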
3135 */ 3136 void 3137 tcp_mss_update(struct tcpcb *tp) 3138 { 3139 int mss; 3140 u_long bufsize; 3141 struct rtentry *rt; 3142 struct socket *so; 3143 3144 so = tp->t_inpcb->inp_socket; 3145 mss = tp->t_maxseg; 3146 3147 rt = in_pcbrtentry(tp->t_inpcb); 3148 3149 if (rt == NULL) 3150 return; 3151 3152 bufsize = so->so_snd.sb_hiwat; 3153 if (bufsize < mss) { 3154 mss = bufsize; 3155 /* Update t_maxseg and t_maxopd */ 3156 tcp_mss(tp, mss); 3157 } else { 3158 bufsize = roundup(bufsize, mss); 3159 if (bufsize > sb_max) 3160 bufsize = sb_max; 3161 (void)sbreserve(&so->so_snd, bufsize); 3162 } 3163 3164 bufsize = so->so_rcv.sb_hiwat; 3165 if (bufsize > mss) { 3166 bufsize = roundup(bufsize, mss); 3167 if (bufsize > sb_max) 3168 bufsize = sb_max; 3169 (void)sbreserve(&so->so_rcv, bufsize); 3170 } 3171 3172 } 3173 3174 #if defined (TCP_SACK) 3175 /* 3176 * Checks for partial ack. If partial ack arrives, force the retransmission 3177 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 3178 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to 3179 * be started again. If the ack advances at least to tp->snd_last, return 0. 3180 */ 3181 int 3182 tcp_newreno(struct tcpcb *tp, struct tcphdr *th) 3183 { 3184 if (SEQ_LT(th->th_ack, tp->snd_last)) { 3185 /* 3186 * snd_una has not been updated and the socket send buffer 3187 * not yet drained of the acked data, so we have to leave 3188 * snd_una as it was to get the correct data offset in 3189 * tcp_output(). 3190 */ 3191 tcp_seq onxt = tp->snd_nxt; 3192 u_long ocwnd = tp->snd_cwnd; 3193 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3194 tp->t_rtttime = 0; 3195 tp->snd_nxt = th->th_ack; 3196 /* 3197 * Set snd_cwnd to one segment beyond acknowledged offset 3198 * (tp->snd_una not yet updated when this function is called) 3199 */ 3200 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3201 (void) tcp_output(tp); 3202 tp->snd_cwnd = ocwnd; 3203 if (SEQ_GT(onxt, tp->snd_nxt)) 3204 tp->snd_nxt = onxt; 3205 /* 3206 * Partial window deflation. Relies on fact that tp->snd_una 3207 * not updated yet. 3208 */ 3209 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3210 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3211 else 3212 tp->snd_cwnd = 0; 3213 tp->snd_cwnd += tp->t_maxseg; 3214 3215 return 1; 3216 } 3217 return 0; 3218 } 3219 #endif /* TCP_SACK */ 3220 3221 int 3222 tcp_mss_adv(struct mbuf *m, int af) 3223 { 3224 int mss = 0; 3225 int iphlen; 3226 struct ifnet *ifp = NULL; 3227 3228 if (m && (m->m_flags & M_PKTHDR)) 3229 ifp = if_get(m->m_pkthdr.ph_ifidx); 3230 3231 switch (af) { 3232 case AF_INET: 3233 if (ifp != NULL) 3234 mss = ifp->if_mtu; 3235 iphlen = sizeof(struct ip); 3236 break; 3237 #ifdef INET6 3238 case AF_INET6: 3239 if (ifp != NULL) 3240 mss = ifp->if_mtu; 3241 iphlen = sizeof(struct ip6_hdr); 3242 break; 3243 #endif 3244 default: 3245 unhandled_af(af); 3246 } 3247 if_put(ifp); 3248 mss = mss - iphlen - sizeof(struct tcphdr); 3249 return (max(mss, tcp_mssdflt)); 3250 } 3251 3252 /* 3253 * TCP compressed state engine. Currently used to hold compressed 3254 * state for SYN_RECEIVED. 
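 * Each embryonic connection is represented by one pool-allocated
 * struct syn_cache instead of a full socket/inpcb/tcpcb trio, which
 * keeps half-open connections cheap enough to ride out SYN floods.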
3255 */ 3256 3257 /* syn hash parameters */ 3258 int tcp_syn_cache_size = TCP_SYN_HASH_SIZE; 3259 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3260 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3261 int tcp_syn_use_limit = 100000; 3262 3263 struct syn_cache_set tcp_syn_cache[2]; 3264 int tcp_syn_cache_active; 3265 3266 #define SYN_HASH(sa, sp, dp, rand) \ 3267 (((sa)->s_addr ^ (rand)[0]) * \ 3268 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3269 #ifndef INET6 3270 #define SYN_HASHALL(hash, src, dst, rand) \ 3271 do { \ 3272 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3273 satosin(src)->sin_port, \ 3274 satosin(dst)->sin_port, (rand)); \ 3275 } while (/*CONSTCOND*/ 0) 3276 #else 3277 #define SYN_HASH6(sa, sp, dp, rand) \ 3278 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3279 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3280 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3281 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3282 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3283 3284 #define SYN_HASHALL(hash, src, dst, rand) \ 3285 do { \ 3286 switch ((src)->sa_family) { \ 3287 case AF_INET: \ 3288 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3289 satosin(src)->sin_port, \ 3290 satosin(dst)->sin_port, (rand)); \ 3291 break; \ 3292 case AF_INET6: \ 3293 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3294 satosin6(src)->sin6_port, \ 3295 satosin6(dst)->sin6_port, (rand)); \ 3296 break; \ 3297 default: \ 3298 hash = 0; \ 3299 } \ 3300 } while (/*CONSTCOND*/0) 3301 #endif /* INET6 */ 3302 3303 void 3304 syn_cache_rm(struct syn_cache *sc) 3305 { 3306 sc->sc_flags |= SCF_DEAD; 3307 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3308 sc->sc_tp = NULL; 3309 LIST_REMOVE(sc, sc_tpq); 3310 sc->sc_buckethead->sch_length--; 3311 timeout_del(&sc->sc_timer); 3312 sc->sc_set->scs_count--; 3313 } 3314 3315 void 3316 syn_cache_put(struct syn_cache *sc) 3317 { 3318 if (sc->sc_ipopts) 3319 (void) m_free(sc->sc_ipopts); 3320 if (sc->sc_route4.ro_rt != NULL) { 3321 rtfree(sc->sc_route4.ro_rt); 3322 sc->sc_route4.ro_rt = NULL; 3323 } 3324 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3325 timeout_add(&sc->sc_timer, 0); 3326 } 3327 3328 struct pool syn_cache_pool; 3329 3330 /* 3331 * We don't estimate RTT with SYNs, so each packet starts with the default 3332 * RTT and each timer step has a fixed timeout value. 3333 */ 3334 #define SYN_CACHE_TIMER_ARM(sc) \ 3335 do { \ 3336 TCPT_RANGESET((sc)->sc_rxtcur, \ 3337 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3338 TCPTV_REXMTMAX); \ 3339 if (!timeout_initialized(&(sc)->sc_timer)) \ 3340 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3341 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3342 } while (/*CONSTCOND*/0) 3343 3344 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3345 3346 void 3347 syn_cache_init(void) 3348 { 3349 int i; 3350 3351 /* Initialize the hash buckets. */ 3352 for (i = 0; i < tcp_syn_cache_size; i++) { 3353 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3354 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3355 } 3356 3357 /* Initialize the syn cache pool. 
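 * Entries are allocated from it with PR_NOWAIT|PR_ZERO in
 * syn_cache_add() and returned via syn_cache_reaper().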
*/
3358 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
3359 "syncache", NULL);
3360 pool_setipl(&syn_cache_pool, IPL_SOFTNET);
3361 }
3362
3363 void
3364 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
3365 {
3366 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active];
3367 struct syn_cache_head *scp;
3368 struct syn_cache *sc2;
3369 int s;
3370
3371 s = splsoftnet();
3372
3373 /*
3374 * If there are no entries in the hash table, reinitialize
3375 * the hash secrets. To avoid useless cache swaps and
3376 * reinitialization, use it until the limit is reached.
3377 */
3378 if (set->scs_count == 0 && set->scs_use <= 0) {
3379 arc4random_buf(set->scs_random, sizeof(set->scs_random));
3380 set->scs_use = tcp_syn_use_limit;
3381 tcpstat.tcps_sc_seedrandom++;
3382 }
3383
3384 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa,
3385 set->scs_random);
3386 scp = &set->scs_buckethead[sc->sc_hash % tcp_syn_cache_size];
3387 sc->sc_buckethead = scp;
3388
3389 /*
3390 * Make sure that we don't overflow the per-bucket
3391 * limit or the total cache size limit.
3392 */
3393 if (scp->sch_length >= tcp_syn_bucket_limit) {
3394 tcpstat.tcps_sc_bucketoverflow++;
3395 /*
3396 * Someone might attack our bucket hash function. Reseed
3397 * with random as soon as the passive syn cache gets empty.
3398 */
3399 set->scs_use = 0;
3400 /*
3401 * The bucket is full. Toss the oldest element in the
3402 * bucket. This will be the first entry in the bucket.
3403 */
3404 sc2 = TAILQ_FIRST(&scp->sch_bucket);
3405 #ifdef DIAGNOSTIC
3406 /*
3407 * This should never happen; we should always find an
3408 * entry in our bucket.
3409 */
3410 if (sc2 == NULL)
3411 panic("syn_cache_insert: bucketoverflow: impossible");
3412 #endif
3413 syn_cache_rm(sc2);
3414 syn_cache_put(sc2);
3415 } else if (set->scs_count >= tcp_syn_cache_limit) {
3416 struct syn_cache_head *scp2, *sce;
3417
3418 tcpstat.tcps_sc_overflowed++;
3419 /*
3420 * The cache is full. Toss the oldest entry in the
3421 * first non-empty bucket we can find.
3422 *
3423 * XXX We would really like to toss the oldest
3424 * entry in the cache, but we hope that this
3425 * condition doesn't happen very often.
3426 */
3427 scp2 = scp;
3428 if (TAILQ_EMPTY(&scp2->sch_bucket)) {
3429 sce = &set->scs_buckethead[tcp_syn_cache_size];
3430 for (++scp2; scp2 != scp; scp2++) {
3431 if (scp2 >= sce)
3432 scp2 = &set->scs_buckethead[0];
3433 if (! TAILQ_EMPTY(&scp2->sch_bucket))
3434 break;
3435 }
3436 #ifdef DIAGNOSTIC
3437 /*
3438 * This should never happen; we should always find a
3439 * non-empty bucket.
3440 */
3441 if (scp2 == scp)
3442 panic("syn_cache_insert: cacheoverflow: "
3443 "impossible");
3444 #endif
3445 }
3446 sc2 = TAILQ_FIRST(&scp2->sch_bucket);
3447 syn_cache_rm(sc2);
3448 syn_cache_put(sc2);
3449 }
3450
3451 /*
3452 * Initialize the entry's timer.
3453 */
3454 sc->sc_rxttot = 0;
3455 sc->sc_rxtshift = 0;
3456 SYN_CACHE_TIMER_ARM(sc);
3457
3458 /* Link it from tcpcb entry */
3459 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
3460
3461 /* Put it into the bucket. */
3462 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
3463 scp->sch_length++;
3464 sc->sc_set = set;
3465 set->scs_count++;
3466 set->scs_use--;
3467
3468 tcpstat.tcps_sc_added++;
3469
3470 /*
3471 * If the active cache has exceeded its use limit and
3472 * the passive syn cache is empty, exchange their roles.
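 * With the default tcp_syn_use_limit of 100000, a set handles roughly
 * that many insertions before the roles can swap; its hash secrets
 * (scs_random) are reseeded once it has drained and is reused.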
3473 */
3474 if (set->scs_use <= 0 &&
3475 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0)
3476 tcp_syn_cache_active = !tcp_syn_cache_active;
3477
3478 splx(s);
3479 }
3480
3481 /*
3482 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
3483 * If we have retransmitted an entry the maximum number of times, expire
3484 * that entry.
3485 */
3486 void
3487 syn_cache_timer(void *arg)
3488 {
3489 struct syn_cache *sc = arg;
3490 int s;
3491
3492 s = splsoftnet();
3493 if (sc->sc_flags & SCF_DEAD) {
3494 splx(s);
3495 return;
3496 }
3497
3498 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
3499 /* Drop it -- too many retransmissions. */
3500 goto dropit;
3501 }
3502
3503 /*
3504 * Compute the total amount of time this entry has
3505 * been on a queue. If this entry has been on longer
3506 * than the keep alive timer would allow, expire it.
3507 */
3508 sc->sc_rxttot += sc->sc_rxtcur;
3509 if (sc->sc_rxttot >= tcptv_keep_init)
3510 goto dropit;
3511
3512 tcpstat.tcps_sc_retransmitted++;
3513 (void) syn_cache_respond(sc, NULL);
3514
3515 /* Advance the timer back-off. */
3516 sc->sc_rxtshift++;
3517 SYN_CACHE_TIMER_ARM(sc);
3518
3519 splx(s);
3520 return;
3521
3522 dropit:
3523 tcpstat.tcps_sc_timed_out++;
3524 syn_cache_rm(sc);
3525 syn_cache_put(sc);
3526 splx(s);
3527 }
3528
3529 void
3530 syn_cache_reaper(void *arg)
3531 {
3532 struct syn_cache *sc = arg;
3533
3534 pool_put(&syn_cache_pool, (sc));
3535 return;
3536 }
3537
3538 /*
3539 * Remove the syn cache entries created by the specified tcb entry,
3540 * since it makes no sense to keep them
3541 * (if there's no tcb entry, a syn cache entry will never be used).
3542 */
3543 void
3544 syn_cache_cleanup(struct tcpcb *tp)
3545 {
3546 struct syn_cache *sc, *nsc;
3547 int s;
3548
3549 s = splsoftnet();
3550
3551 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
3552 #ifdef DIAGNOSTIC
3553 if (sc->sc_tp != tp)
3554 panic("invalid sc_tp in syn_cache_cleanup");
3555 #endif
3556 syn_cache_rm(sc);
3557 syn_cache_put(sc);
3558 }
3559 /* just for safety */
3560 LIST_INIT(&tp->t_sc);
3561
3562 splx(s);
3563 }
3564
3565 /*
3566 * Find an entry in the syn cache.
3567 */
3568 struct syn_cache *
3569 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3570 struct syn_cache_head **headp, u_int rtableid)
3571 {
3572 struct syn_cache_set *sets[2];
3573 struct syn_cache *sc;
3574 struct syn_cache_head *scp;
3575 u_int32_t hash;
3576 int i;
3577
3578 splsoftassert(IPL_SOFTNET);
3579
3580 /* Check the active cache first, the passive cache is likely empty. */
3581 sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
3582 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
3583 for (i = 0; i < 2; i++) {
3584 if (sets[i]->scs_count == 0)
3585 continue;
3586 SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
3587 scp = &sets[i]->scs_buckethead[hash % tcp_syn_cache_size];
3588 *headp = scp;
3589 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
3590 if (sc->sc_hash != hash)
3591 continue;
3592 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3593 !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3594 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
3595 return (sc);
3596 }
3597 }
3598 return (NULL);
3599 }
3600
3601 /*
3602 * This function gets called when we receive an ACK for a
3603 * socket in the LISTEN state. We look up the connection
3604 * in the syn cache, and if it's there, we pull it out of
3605 * the cache and turn it into a full-blown connection in
3606 * the SYN-RECEIVED state.
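 * (Socket creation, PCB setup and sequence variable initialization
 * all happen here, rather than when the SYN first arrived.)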
3607 *
3608 * The return values may not be immediately obvious, and their effects
3609 * can be subtle, so here they are:
3610 *
3611 * NULL SYN was not found in cache; caller should drop the
3612 * packet and send an RST.
3613 *
3614 * -1 We were unable to create the new connection, and are
3615 * aborting it. An ACK,RST is being sent to the peer
3616 * (unless we got screwy sequence numbers; see below),
3617 * because the 3-way handshake has been completed. Caller
3618 * should not free the mbuf, since we may be using it. If
3619 * we are not, we will free it.
3620 *
3621 * Otherwise, the return value is a pointer to the new socket
3622 * associated with the connection.
3623 */
3624 struct socket *
3625 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3626 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m)
3627 {
3628 struct syn_cache *sc;
3629 struct syn_cache_head *scp;
3630 struct inpcb *inp = NULL;
3631 struct tcpcb *tp = NULL;
3632 struct mbuf *am;
3633 int s;
3634 struct socket *oso;
3635 #if NPF > 0
3636 struct pf_divert *divert = NULL;
3637 #endif
3638
3639 s = splsoftnet();
3640 if ((sc = syn_cache_lookup(src, dst, &scp,
3641 sotoinpcb(so)->inp_rtableid)) == NULL) {
3642 splx(s);
3643 return (NULL);
3644 }
3645
3646 /*
3647 * Verify the sequence and ack numbers. Try getting the correct
3648 * response again.
3649 */
3650 if ((th->th_ack != sc->sc_iss + 1) ||
3651 SEQ_LEQ(th->th_seq, sc->sc_irs) ||
3652 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
3653 (void) syn_cache_respond(sc, m);
3654 splx(s);
3655 return ((struct socket *)(-1));
3656 }
3657
3658 /* Remove this cache entry */
3659 syn_cache_rm(sc);
3660 splx(s);
3661
3662 /*
3663 * Ok, create the full blown connection, and set things up
3664 * as they would have been set up if we had created the
3665 * connection when the SYN arrived. If we can't create
3666 * the connection, abort it.
3667 */
3668 oso = so;
3669 so = sonewconn(so, SS_ISCONNECTED);
3670 if (so == NULL)
3671 goto resetandabort;
3672
3673 inp = sotoinpcb(oso);
3674
3675 #ifdef IPSEC
3676 /*
3677 * We need to copy the required security levels
3678 * from the old pcb. Ditto for any other
3679 * IPsec-related information.
3680 */
3681 {
3682 struct inpcb *newinp = sotoinpcb(so);
3683 memcpy(newinp->inp_seclevel, inp->inp_seclevel,
3684 sizeof(inp->inp_seclevel));
3685 }
3686 #endif /* IPSEC */
3687 #ifdef INET6
3688 /*
3689 * inp still has the OLD in_pcb stuff, set the
3690 * v6-related flags on the new guy, too.
3691 */ 3692 { 3693 int flags = inp->inp_flags; 3694 struct inpcb *oldinpcb = inp; 3695 3696 inp = sotoinpcb(so); 3697 inp->inp_flags |= (flags & INP_IPV6); 3698 if ((inp->inp_flags & INP_IPV6) != 0) { 3699 inp->inp_ipv6.ip6_hlim = 3700 oldinpcb->inp_ipv6.ip6_hlim; 3701 } 3702 } 3703 #else /* INET6 */ 3704 inp = sotoinpcb(so); 3705 #endif /* INET6 */ 3706 3707 #if NPF > 0 3708 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED && 3709 (divert = pf_find_divert(m)) != NULL) 3710 inp->inp_rtableid = divert->rdomain; 3711 else 3712 #endif 3713 /* inherit rtable from listening socket */ 3714 inp->inp_rtableid = sc->sc_rtableid; 3715 3716 inp->inp_lport = th->th_dport; 3717 switch (src->sa_family) { 3718 #ifdef INET6 3719 case AF_INET6: 3720 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3721 break; 3722 #endif /* INET6 */ 3723 case AF_INET: 3724 inp->inp_laddr = satosin(dst)->sin_addr; 3725 inp->inp_options = ip_srcroute(m); 3726 if (inp->inp_options == NULL) { 3727 inp->inp_options = sc->sc_ipopts; 3728 sc->sc_ipopts = NULL; 3729 } 3730 break; 3731 } 3732 in_pcbrehash(inp); 3733 3734 /* 3735 * Give the new socket our cached route reference. 3736 */ 3737 if (src->sa_family == AF_INET) 3738 inp->inp_route = sc->sc_route4; /* struct assignment */ 3739 #ifdef INET6 3740 else 3741 inp->inp_route6 = sc->sc_route6; 3742 #endif 3743 sc->sc_route4.ro_rt = NULL; 3744 3745 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3746 if (am == NULL) 3747 goto resetandabort; 3748 am->m_len = src->sa_len; 3749 memcpy(mtod(am, caddr_t), src, src->sa_len); 3750 3751 switch (src->sa_family) { 3752 case AF_INET: 3753 /* drop IPv4 packet to AF_INET6 socket */ 3754 if (inp->inp_flags & INP_IPV6) { 3755 (void) m_free(am); 3756 goto resetandabort; 3757 } 3758 if (in_pcbconnect(inp, am)) { 3759 (void) m_free(am); 3760 goto resetandabort; 3761 } 3762 break; 3763 #ifdef INET6 3764 case AF_INET6: 3765 if (in6_pcbconnect(inp, am)) { 3766 (void) m_free(am); 3767 goto resetandabort; 3768 } 3769 break; 3770 #endif 3771 } 3772 (void) m_free(am); 3773 3774 tp = intotcpcb(inp); 3775 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; 3776 if (sc->sc_request_r_scale != 15) { 3777 tp->requested_s_scale = sc->sc_requested_s_scale; 3778 tp->request_r_scale = sc->sc_request_r_scale; 3779 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3780 } 3781 if (sc->sc_flags & SCF_TIMESTAMP) 3782 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3783 3784 tp->t_template = tcp_template(tp); 3785 if (tp->t_template == 0) { 3786 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3787 so = NULL; 3788 m_freem(m); 3789 goto abort; 3790 } 3791 #ifdef TCP_SACK 3792 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3793 #endif 3794 3795 tp->ts_modulate = sc->sc_modulate; 3796 tp->ts_recent = sc->sc_timestamp; 3797 tp->iss = sc->sc_iss; 3798 tp->irs = sc->sc_irs; 3799 tcp_sendseqinit(tp); 3800 #if defined (TCP_SACK) || defined(TCP_ECN) 3801 tp->snd_last = tp->snd_una; 3802 #endif /* TCP_SACK */ 3803 #if defined(TCP_SACK) && defined(TCP_FACK) 3804 tp->snd_fack = tp->snd_una; 3805 tp->retran_data = 0; 3806 tp->snd_awnd = 0; 3807 #endif /* TCP_FACK */ 3808 #ifdef TCP_ECN 3809 if (sc->sc_flags & SCF_ECN_PERMIT) { 3810 tp->t_flags |= TF_ECN_PERMIT; 3811 tcpstat.tcps_ecn_accepts++; 3812 } 3813 #endif 3814 #ifdef TCP_SACK 3815 if (sc->sc_flags & SCF_SACK_PERMIT) 3816 tp->t_flags |= TF_SACK_PERMIT; 3817 #endif 3818 #ifdef TCP_SIGNATURE 3819 if (sc->sc_flags & SCF_SIGNATURE) 3820 tp->t_flags |= TF_SIGNATURE; 3821 #endif 3822 tcp_rcvseqinit(tp); 3823 tp->t_state = TCPS_SYN_RECEIVED; 3824 
tp->t_rcvtime = tcp_now;
3825 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
3826 tcpstat.tcps_accepts++;
3827
3828 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
3829 if (sc->sc_peermaxseg)
3830 tcp_mss_update(tp);
3831 /* Reset initial window to 1 segment for retransmit */
3832 if (sc->sc_rxtshift > 0)
3833 tp->snd_cwnd = tp->t_maxseg;
3834 tp->snd_wl1 = sc->sc_irs;
3835 tp->rcv_up = sc->sc_irs + 1;
3836
3837 /*
3838 * This is what would have happened in tcp_output() when
3839 * the SYN,ACK was sent.
3840 */
3841 tp->snd_up = tp->snd_una;
3842 tp->snd_max = tp->snd_nxt = tp->iss+1;
3843 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
3844 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
3845 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
3846 tp->last_ack_sent = tp->rcv_nxt;
3847
3848 tcpstat.tcps_sc_completed++;
3849 syn_cache_put(sc);
3850 return (so);
3851
3852 resetandabort:
3853 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
3854 m->m_pkthdr.ph_rtableid);
3855 m_freem(m);
3856 abort:
3857 if (so != NULL)
3858 (void) soabort(so);
3859 syn_cache_put(sc);
3860 tcpstat.tcps_sc_aborted++;
3861 return ((struct socket *)(-1));
3862 }
3863
3864 /*
3865 * This function is called when we get a RST for a
3866 * non-existent connection, so that we can see if the
3867 * connection is in the syn cache. If it is, zap it.
3868 */
3869
3870 void
3871 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3872 u_int rtableid)
3873 {
3874 struct syn_cache *sc;
3875 struct syn_cache_head *scp;
3876 int s = splsoftnet();
3877
3878 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) {
3879 splx(s);
3880 return;
3881 }
3882 if (SEQ_LT(th->th_seq, sc->sc_irs) ||
3883 SEQ_GT(th->th_seq, sc->sc_irs+1)) {
3884 splx(s);
3885 return;
3886 }
3887 syn_cache_rm(sc);
3888 splx(s);
3889 tcpstat.tcps_sc_reset++;
3890 syn_cache_put(sc);
3891 }
3892
3893 void
3894 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3895 u_int rtableid)
3896 {
3897 struct syn_cache *sc;
3898 struct syn_cache_head *scp;
3899 int s;
3900
3901 s = splsoftnet();
3902 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) {
3903 splx(s);
3904 return;
3905 }
3906 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
3907 if (ntohl (th->th_seq) != sc->sc_iss) {
3908 splx(s);
3909 return;
3910 }
3911
3912 /*
3913 * If we've retransmitted 3 times and this is our second error,
3914 * we remove the entry. Otherwise, we allow it to continue on.
3915 * This prevents us from incorrectly nuking an entry during a
3916 * spurious network outage.
3917 *
3918 * See tcp_notify().
3919 */
3920 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
3921 sc->sc_flags |= SCF_UNREACH;
3922 splx(s);
3923 return;
3924 }
3925
3926 syn_cache_rm(sc);
3927 splx(s);
3928 tcpstat.tcps_sc_unreach++;
3929 syn_cache_put(sc);
3930 }
3931
3932 /*
3933 * Given a LISTEN socket and an inbound SYN request, add
3934 * this to the syn cache, and send back a segment:
3935 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3936 * to the source.
3937 *
3938 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
3939 * Doing so would require that we hold onto the data and deliver it
3940 * to the application. However, if we are the target of a SYN-flood
3941 * DoS attack, an attacker could send data which would eventually
3942 * consume all available buffer space if it were ACKed. By not ACKing
3943 * the data, we avoid this DoS scenario.
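 * (See RFC 4987 for a survey of SYN flood attacks and common
 * mitigations, including caches like this one.)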
3944 */ 3945 3946 int 3947 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3948 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3949 struct tcp_opt_info *oi, tcp_seq *issp) 3950 { 3951 struct tcpcb tb, *tp; 3952 long win; 3953 struct syn_cache *sc; 3954 struct syn_cache_head *scp; 3955 struct mbuf *ipopts; 3956 3957 tp = sototcpcb(so); 3958 3959 /* 3960 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3961 * 3962 * Note this check is performed in tcp_input() very early on. 3963 */ 3964 3965 /* 3966 * Initialize some local state. 3967 */ 3968 win = sbspace(&so->so_rcv); 3969 if (win > TCP_MAXWIN) 3970 win = TCP_MAXWIN; 3971 3972 bzero(&tb, sizeof(tb)); 3973 #ifdef TCP_SIGNATURE 3974 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3975 #else 3976 if (optp) { 3977 #endif 3978 tb.pf = tp->pf; 3979 #ifdef TCP_SACK 3980 tb.sack_enable = tp->sack_enable; 3981 #endif 3982 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3983 #ifdef TCP_SIGNATURE 3984 if (tp->t_flags & TF_SIGNATURE) 3985 tb.t_flags |= TF_SIGNATURE; 3986 #endif 3987 tb.t_state = TCPS_LISTEN; 3988 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3989 sotoinpcb(so)->inp_rtableid)) 3990 return (-1); 3991 } 3992 3993 switch (src->sa_family) { 3994 case AF_INET: 3995 /* 3996 * Remember the IP options, if any. 3997 */ 3998 ipopts = ip_srcroute(m); 3999 break; 4000 default: 4001 ipopts = NULL; 4002 } 4003 4004 /* 4005 * See if we already have an entry for this connection. 4006 * If we do, resend the SYN,ACK. We do not count this 4007 * as a retransmission (XXX though maybe we should). 4008 */ 4009 if ((sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid)) 4010 != NULL) { 4011 tcpstat.tcps_sc_dupesyn++; 4012 if (ipopts) { 4013 /* 4014 * If we were remembering a previous source route, 4015 * forget it and use the new one we've been given. 4016 */ 4017 if (sc->sc_ipopts) 4018 (void) m_free(sc->sc_ipopts); 4019 sc->sc_ipopts = ipopts; 4020 } 4021 sc->sc_timestamp = tb.ts_recent; 4022 if (syn_cache_respond(sc, m) == 0) { 4023 tcpstat.tcps_sndacks++; 4024 tcpstat.tcps_sndtotal++; 4025 } 4026 return (0); 4027 } 4028 4029 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 4030 if (sc == NULL) { 4031 if (ipopts) 4032 (void) m_free(ipopts); 4033 return (-1); 4034 } 4035 4036 /* 4037 * Fill in the cache, and put the necessary IP and TCP 4038 * options into the reply. 4039 */ 4040 memcpy(&sc->sc_src, src, src->sa_len); 4041 memcpy(&sc->sc_dst, dst, dst->sa_len); 4042 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 4043 sc->sc_flags = 0; 4044 sc->sc_ipopts = ipopts; 4045 sc->sc_irs = th->th_seq; 4046 4047 sc->sc_iss = issp ? *issp : arc4random(); 4048 sc->sc_peermaxseg = oi->maxseg; 4049 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 4050 sc->sc_win = win; 4051 sc->sc_timestamp = tb.ts_recent; 4052 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 4053 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 4054 sc->sc_flags |= SCF_TIMESTAMP; 4055 sc->sc_modulate = arc4random(); 4056 } 4057 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 4058 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 4059 sc->sc_requested_s_scale = tb.requested_s_scale; 4060 sc->sc_request_r_scale = 0; 4061 /* 4062 * Pick the smallest possible scaling factor that 4063 * will still allow us to scale up to sb_max. 4064 * 4065 * We do this because there are broken firewalls that 4066 * will corrupt the window scale option, leading to 4067 * the other endpoint believing that our advertised 4068 * window is unscaled. 
#ifdef TCP_ECN
	/*
	 * if both ECE and CWR flag bits are set, peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		syn_cache_put(sc);
		tcpstat.tcps_sc_dropped++;
	}

	return (0);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);
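	/*
	 * For reference, the option accounting above recomputed in a
	 * standalone sketch (not part of this file's build; TCP
	 * signatures omitted, lengths restated from tcp.h): with MSS,
	 * SACK-permitted, window scale, and timestamps all negotiated,
	 * the option block is 4 + 4 + 4 + 12 = 24 octets, so an IPv4
	 * SYN,ACK totals 64 bytes before link-layer headers.
	 */
#if 0
#include <stdio.h>

#define TCPOLEN_TSTAMP_APPA	12	/* NOP NOP TIMESTAMP, RFC 1323 */

int
main(void)
{
	int wscale_ok = 1, sack_ok = 1, tstamp_ok = 1;
	int optlen, tlen;

	optlen = 4 +			/* MSS option, always present */
	    (wscale_ok ? 4 : 0) +	/* NOP-padded window scale */
	    (sack_ok ? 4 : 0) +		/* NOP-padded SACK permitted */
	    (tstamp_ok ? TCPOLEN_TSTAMP_APPA : 0);
	tlen = 20 + 20 + optlen;	/* struct ip + struct tcphdr */
	printf("optlen=%d tlen=%d\n", optlen, tlen);	/* 24, 64 */
	return (0);
}
#endif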
	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		unhandled_af(sc->sc_src.sa.sa_family);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}
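	/*
	 * TCPOPT_TSTAMP_HDR packs the appendix-A framing (NOP, NOP,
	 * kind 8, length 10) into one 32-bit word, 0x0101080a, followed
	 * by TSval and TSecr.  A standalone sketch of the twelve bytes
	 * written above (not part of this file's build; the timestamp
	 * values are invented):
	 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define TCPOPT_TSTAMP_HDR	0x0101080aU	/* NOP NOP TS(8) len 10 */

int
main(void)
{
	uint8_t opt[12];
	uint32_t w;
	int i;

	w = htonl(TCPOPT_TSTAMP_HDR);
	memcpy(&opt[0], &w, 4);
	w = htonl(0x00c0ffee);			/* TSval: our clock */
	memcpy(&opt[4], &w, 4);
	w = htonl(0x00abcdef);			/* TSecr: peer's echo */
	memcpy(&opt[8], &w, 4);

	for (i = 0; i < 12; i++)
		printf("%02x%s", opt[i], i == 11 ? "\n" : " ");
	/* prints: 01 01 08 0a 00 c0 ff ee 00 ab cd ef */
	return (0);
}
#endif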
#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:				/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note that ip_len is set
	 * in network byte order here, as ip_output() expects.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
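/*
 * in_cksum()/in6_cksum() used above implement the RFC 1071 ones'
 * complement checksum.  A minimal standalone sketch of the core fold
 * (not part of this file's build; it handles only a contiguous buffer,
 * unlike the mbuf-chain-aware kernel versions):
 */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t
cksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	/* Sum 16-bit big-endian words. */
	while (len >= 2) {
		sum += (uint32_t)buf[0] << 8 | buf[1];
		buf += 2;
		len -= 2;
	}
	if (len > 0)			/* odd trailing byte, zero padded */
		sum += (uint32_t)buf[0] << 8;
	/* Fold the carries back in, then complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}

int
main(void)
{
	/* Sample octets from the worked example in RFC 1071 section 3. */
	const uint8_t data[] = { 0x00, 0x01, 0xf2, 0x03,
	    0xf4, 0xf5, 0xf6, 0xf7 };

	printf("checksum = 0x%04x\n", cksum(data, sizeof(data)));
	/* prints 0x220d, matching the RFC's result */
	return (0);
}
#endif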