1 /* $OpenBSD: tcp_input.c,v 1.232 2010/03/11 00:24:58 sthen Exp $ */ 2 /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/protosw.h> 75 #include <sys/socket.h> 76 #include <sys/socketvar.h> 77 #include <sys/kernel.h> 78 #include <sys/pool.h> 79 80 #include <dev/rndvar.h> 81 82 #include <net/if.h> 83 #include <net/route.h> 84 85 #include <netinet/in.h> 86 #include <netinet/in_systm.h> 87 #include <netinet/ip.h> 88 #include <netinet/in_pcb.h> 89 #include <netinet/ip_var.h> 90 #include <netinet/tcp.h> 91 #include <netinet/tcp_fsm.h> 92 #include <netinet/tcp_seq.h> 93 #include <netinet/tcp_timer.h> 94 #include <netinet/tcp_var.h> 95 #include <netinet/tcpip.h> 96 #include <netinet/tcp_debug.h> 97 98 #include "faith.h" 99 #if NFAITH > 0 100 #include <net/if_types.h> 101 #endif 102 103 #include "pf.h" 104 #if NPF > 0 105 #include <net/pfvar.h> 106 #endif 107 108 struct tcpiphdr tcp_saveti; 109 110 int tcp_mss_adv(struct ifnet *, int); 111 112 #ifdef INET6 113 #include <netinet6/in6_var.h> 114 #include <netinet6/nd6.h> 115 116 struct tcpipv6hdr tcp_saveti6; 117 118 /* for the packet header length in the mbuf */ 119 #define M_PH_LEN(m) (((struct mbuf *)(m))->m_pkthdr.len) 120 #define M_V6_LEN(m) (M_PH_LEN(m) - sizeof(struct ip6_hdr)) 121 #define M_V4_LEN(m) (M_PH_LEN(m) - sizeof(struct ip)) 122 #endif /* INET6 */ 123 124 int tcprexmtthresh = 3; 125 int tcptv_keep_init = TCPTV_KEEP_INIT; 126 127 extern u_long sb_max; 128 129 int tcp_rst_ppslim = 100; /* 100pps */ 130 int tcp_rst_ppslim_count = 0; 131 struct timeval tcp_rst_ppslim_last; 132 133 int tcp_ackdrop_ppslim = 100; /* 100pps */ 134 int tcp_ackdrop_ppslim_count = 0; 135 struct timeval tcp_ackdrop_ppslim_last; 136 137 #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) 138 139 /* for 
modulo comparisons of timestamps */
/*
 * Timestamp comparisons.  The cast to (signed) int makes the
 * difference wrap-safe for 32-bit timestamp values (same trick
 * as the SEQ_* sequence-number macros).
 */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 * Tells ND6 that the peer is reachable (we got a segment from it) so
 * NUD need not probe; only meaningful for IPv6 pcbs with a cached route.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define TCP_SETUP_ACK(tp, tiflags) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

/*
 * Reassemble the TCP segment in mbuf "m" (parsed header "th", payload
 * length *tlen) into tp's per-connection reassembly queue, then deliver
 * any now in-order prefix to the socket receive buffer.
 *
 * Returns TH_FIN once the delivered data includes a segment carrying
 * FIN, otherwise 0.  On overlap, *tlen and th->th_seq are adjusted in
 * place to describe the trimmed segment.  Takes ownership of "m": it is
 * either queued or freed before return.
 */
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		/*
		 * Pool exhausted: try to recycle the last queue entry,
		 * but only if the new segment is the one that fills the
		 * hole at rcv_nxt (it is more valuable than queued
		 * out-of-order data).
		 */
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 * "p" trails one entry behind "q", so on exit it is the
	 * predecessor of the insertion point (NULL means head).
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				/* Complete duplicate: account and drop. */
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			/* Partial overlap: trim our leading i bytes. */
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			/* Partial overlap: trim the queued segment's head. */
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		/* Queued segment fully covered by ours: discard it. */
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	/* Nothing deliverable unless the head starts exactly at rcv_nxt. */
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	/* In SYN_RECEIVED, hold data until the handshake completes. */
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		/* flags tracks whether the last delivered segment had FIN. */
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	return (flags);
}

#ifdef INET6
/*
 * IPv6 protocol-switch entry point: reject segments that arrived on a
 * faith(4) translation interface, then hand the packet to the common
 * tcp_input() path.  Always consumes the packet (returns IPPROTO_DONE).
 */
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

#if NFAITH > 0
	if (m->m_pkthdr.rcvif) {
		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
			/* XXX send icmp6 host/port unreach? */
			m_freem(m);
			return IPPROTO_DONE;
		}
	}
#endif

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
359 { 360 struct ip *ip; 361 struct inpcb *inp = NULL; 362 u_int8_t *optp = NULL; 363 int optlen = 0; 364 int tlen, off; 365 struct tcpcb *tp = 0; 366 int tiflags; 367 struct socket *so = NULL; 368 int todrop, acked, ourfinisacked, needoutput = 0; 369 int hdroptlen = 0; 370 short ostate = 0; 371 tcp_seq iss, *reuse = NULL; 372 u_long tiwin; 373 struct tcp_opt_info opti; 374 int iphlen; 375 va_list ap; 376 struct tcphdr *th; 377 #ifdef INET6 378 struct ip6_hdr *ip6 = NULL; 379 #endif /* INET6 */ 380 #ifdef IPSEC 381 struct m_tag *mtag; 382 struct tdb_ident *tdbi; 383 struct tdb *tdb; 384 int error, s; 385 #endif /* IPSEC */ 386 int af; 387 #ifdef TCP_ECN 388 u_char iptos; 389 #endif 390 391 va_start(ap, m); 392 iphlen = va_arg(ap, int); 393 va_end(ap); 394 395 tcpstat.tcps_rcvtotal++; 396 397 opti.ts_present = 0; 398 opti.maxseg = 0; 399 400 /* 401 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 402 * See below for AF specific multicast. 403 */ 404 if (m->m_flags & (M_BCAST|M_MCAST)) 405 goto drop; 406 407 /* 408 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or 409 * TCP/IPv4. 410 */ 411 switch (mtod(m, struct ip *)->ip_v) { 412 #ifdef INET6 413 case 6: 414 af = AF_INET6; 415 break; 416 #endif 417 case 4: 418 af = AF_INET; 419 break; 420 default: 421 m_freem(m); 422 return; /*EAFNOSUPPORT*/ 423 } 424 425 /* 426 * Get IP and TCP header together in first mbuf. 427 * Note: IP leaves IP header in first mbuf. 
428 */ 429 switch (af) { 430 case AF_INET: 431 #ifdef DIAGNOSTIC 432 if (iphlen < sizeof(struct ip)) { 433 m_freem(m); 434 return; 435 } 436 #endif /* DIAGNOSTIC */ 437 break; 438 #ifdef INET6 439 case AF_INET6: 440 #ifdef DIAGNOSTIC 441 if (iphlen < sizeof(struct ip6_hdr)) { 442 m_freem(m); 443 return; 444 } 445 #endif /* DIAGNOSTIC */ 446 break; 447 #endif 448 default: 449 m_freem(m); 450 return; 451 } 452 453 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th)); 454 if (!th) { 455 tcpstat.tcps_rcvshort++; 456 return; 457 } 458 459 tlen = m->m_pkthdr.len - iphlen; 460 ip = NULL; 461 #ifdef INET6 462 ip6 = NULL; 463 #endif 464 switch (af) { 465 case AF_INET: 466 ip = mtod(m, struct ip *); 467 if (IN_MULTICAST(ip->ip_dst.s_addr) || 468 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 469 goto drop; 470 #ifdef TCP_ECN 471 /* save ip_tos before clearing it for checksum */ 472 iptos = ip->ip_tos; 473 #endif 474 /* 475 * Checksum extended TCP header and data. 476 */ 477 if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) { 478 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) { 479 tcpstat.tcps_inhwcsum++; 480 tcpstat.tcps_rcvbadsum++; 481 goto drop; 482 } 483 if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) { 484 tcpstat.tcps_rcvbadsum++; 485 goto drop; 486 } 487 } else { 488 m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK; 489 tcpstat.tcps_inhwcsum++; 490 } 491 break; 492 #ifdef INET6 493 case AF_INET6: 494 ip6 = mtod(m, struct ip6_hdr *); 495 #ifdef TCP_ECN 496 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 497 #endif 498 499 /* Be proactive about malicious use of IPv4 mapped address */ 500 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 501 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 502 /* XXX stat */ 503 goto drop; 504 } 505 506 /* 507 * Be proactive about unspecified IPv6 address in source. 508 * As we use all-zero to indicate unbounded/unconnected pcb, 509 * unspecified IPv6 address can be used to confuse us. 
510 * 511 * Note that packets with unspecified IPv6 destination is 512 * already dropped in ip6_input. 513 */ 514 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 515 /* XXX stat */ 516 goto drop; 517 } 518 519 /* Discard packets to multicast */ 520 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 521 /* XXX stat */ 522 goto drop; 523 } 524 525 /* 526 * Checksum extended TCP header and data. 527 */ 528 if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) { 529 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) { 530 tcpstat.tcps_inhwcsum++; 531 tcpstat.tcps_rcvbadsum++; 532 goto drop; 533 } 534 if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 535 tlen)) { 536 tcpstat.tcps_rcvbadsum++; 537 goto drop; 538 } 539 } else { 540 m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK; 541 tcpstat.tcps_inhwcsum++; 542 } 543 break; 544 #endif 545 } 546 547 /* 548 * Check that TCP offset makes sense, 549 * pull out TCP options and adjust length. XXX 550 */ 551 off = th->th_off << 2; 552 if (off < sizeof(struct tcphdr) || off > tlen) { 553 tcpstat.tcps_rcvbadoff++; 554 goto drop; 555 } 556 tlen -= off; 557 if (off > sizeof(struct tcphdr)) { 558 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off); 559 if (!th) { 560 tcpstat.tcps_rcvshort++; 561 return; 562 } 563 optlen = off - sizeof(struct tcphdr); 564 optp = (u_int8_t *)(th + 1); 565 /* 566 * Do quick retrieval of timestamp options ("options 567 * prediction?"). If timestamp is the only option and it's 568 * formatted as recommended in RFC 1323 appendix A, we 569 * quickly get the values now and not bother calling 570 * tcp_dooptions(), etc. 
571 */ 572 if ((optlen == TCPOLEN_TSTAMP_APPA || 573 (optlen > TCPOLEN_TSTAMP_APPA && 574 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 575 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 576 (th->th_flags & TH_SYN) == 0) { 577 opti.ts_present = 1; 578 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 579 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 580 optp = NULL; /* we've parsed the options */ 581 } 582 } 583 tiflags = th->th_flags; 584 585 /* 586 * Convert TCP protocol specific fields to host format. 587 */ 588 NTOHL(th->th_seq); 589 NTOHL(th->th_ack); 590 NTOHS(th->th_win); 591 NTOHS(th->th_urp); 592 593 /* 594 * Locate pcb for segment. 595 */ 596 #if NPF > 0 597 if (m->m_pkthdr.pf.statekey) 598 inp = ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->inp; 599 #endif 600 findpcb: 601 if (inp == NULL) { 602 switch (af) { 603 #ifdef INET6 604 case AF_INET6: 605 inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, 606 th->th_sport, &ip6->ip6_dst, th->th_dport); 607 break; 608 #endif 609 case AF_INET: 610 inp = in_pcbhashlookup(&tcbtable, ip->ip_src, 611 th->th_sport, ip->ip_dst, th->th_dport, 612 m->m_pkthdr.rdomain); 613 break; 614 } 615 #if NPF > 0 616 if (m->m_pkthdr.pf.statekey && inp) { 617 ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->inp = 618 inp; 619 inp->inp_pf_sk = m->m_pkthdr.pf.statekey; 620 } 621 #endif 622 } 623 if (inp == NULL) { 624 int inpl_flags = 0; 625 if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST) 626 inpl_flags = INPLOOKUP_WILDCARD; 627 ++tcpstat.tcps_pcbhashmiss; 628 switch (af) { 629 #ifdef INET6 630 case AF_INET6: 631 inp = in6_pcblookup_listen(&tcbtable, 632 &ip6->ip6_dst, th->th_dport, inpl_flags, m); 633 break; 634 #endif /* INET6 */ 635 case AF_INET: 636 inp = in_pcblookup_listen(&tcbtable, 637 ip->ip_dst, th->th_dport, inpl_flags, m, 638 m->m_pkthdr.rdomain); 639 break; 640 } 641 /* 642 * If the state is CLOSED (i.e., TCB does not exist) then 643 * all data in the incoming segment is discarded. 
644 * If the TCB exists but is in CLOSED state, it is embryonic, 645 * but should either do a listen or a connect soon. 646 */ 647 if (inp == 0) { 648 ++tcpstat.tcps_noport; 649 goto dropwithreset_ratelim; 650 } 651 } 652 653 /* Check the minimum TTL for socket. */ 654 if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) 655 goto drop; 656 657 tp = intotcpcb(inp); 658 if (tp == 0) 659 goto dropwithreset_ratelim; 660 if (tp->t_state == TCPS_CLOSED) 661 goto drop; 662 663 /* Unscale the window into a 32-bit value. */ 664 if ((tiflags & TH_SYN) == 0) 665 tiwin = th->th_win << tp->snd_scale; 666 else 667 tiwin = th->th_win; 668 669 so = inp->inp_socket; 670 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 671 union syn_cache_sa src; 672 union syn_cache_sa dst; 673 674 bzero(&src, sizeof(src)); 675 bzero(&dst, sizeof(dst)); 676 switch (af) { 677 #ifdef INET 678 case AF_INET: 679 src.sin.sin_len = sizeof(struct sockaddr_in); 680 src.sin.sin_family = AF_INET; 681 src.sin.sin_addr = ip->ip_src; 682 src.sin.sin_port = th->th_sport; 683 684 dst.sin.sin_len = sizeof(struct sockaddr_in); 685 dst.sin.sin_family = AF_INET; 686 dst.sin.sin_addr = ip->ip_dst; 687 dst.sin.sin_port = th->th_dport; 688 break; 689 #endif 690 #ifdef INET6 691 case AF_INET6: 692 src.sin6.sin6_len = sizeof(struct sockaddr_in6); 693 src.sin6.sin6_family = AF_INET6; 694 src.sin6.sin6_addr = ip6->ip6_src; 695 src.sin6.sin6_port = th->th_sport; 696 697 dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 698 dst.sin6.sin6_family = AF_INET6; 699 dst.sin6.sin6_addr = ip6->ip6_dst; 700 dst.sin6.sin6_port = th->th_dport; 701 break; 702 #endif /* INET6 */ 703 default: 704 goto badsyn; /*sanity*/ 705 } 706 707 if (so->so_options & SO_DEBUG) { 708 ostate = tp->t_state; 709 switch (af) { 710 #ifdef INET6 711 case AF_INET6: 712 bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6)); 713 bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th)); 714 break; 715 #endif 716 case AF_INET: 717 bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip)); 718 
bcopy(th, &tcp_saveti.ti_t, sizeof(*th)); 719 break; 720 } 721 } 722 if (so->so_options & SO_ACCEPTCONN) { 723 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 724 if (tiflags & TH_RST) { 725 syn_cache_reset(&src.sa, &dst.sa, th, 726 inp->inp_rdomain); 727 } else if ((tiflags & (TH_ACK|TH_SYN)) == 728 (TH_ACK|TH_SYN)) { 729 /* 730 * Received a SYN,ACK. This should 731 * never happen while we are in 732 * LISTEN. Send an RST. 733 */ 734 goto badsyn; 735 } else if (tiflags & TH_ACK) { 736 so = syn_cache_get(&src.sa, &dst.sa, 737 th, iphlen, tlen, so, m); 738 if (so == NULL) { 739 /* 740 * We don't have a SYN for 741 * this ACK; send an RST. 742 */ 743 goto badsyn; 744 } else if (so == 745 (struct socket *)(-1)) { 746 /* 747 * We were unable to create 748 * the connection. If the 749 * 3-way handshake was 750 * completed, and RST has 751 * been sent to the peer. 752 * Since the mbuf might be 753 * in use for the reply, 754 * do not free it. 755 */ 756 m = NULL; 757 } else { 758 /* 759 * We have created a 760 * full-blown connection. 761 */ 762 tp = NULL; 763 inp = (struct inpcb *)so->so_pcb; 764 tp = intotcpcb(inp); 765 if (tp == NULL) 766 goto badsyn; /*XXX*/ 767 768 /* 769 * Compute proper scaling 770 * value from buffer space 771 */ 772 tcp_rscale(tp, so->so_rcv.sb_hiwat); 773 goto after_listen; 774 } 775 } else { 776 /* 777 * None of RST, SYN or ACK was set. 778 * This is an invalid packet for a 779 * TCB in LISTEN state. Send a RST. 780 */ 781 goto badsyn; 782 } 783 } else { 784 /* 785 * Received a SYN. 786 */ 787 #ifdef INET6 788 /* 789 * If deprecated address is forbidden, we do 790 * not accept SYN to deprecated interface 791 * address to prevent any new inbound 792 * connection from getting established. 793 * When we do not accept SYN, we send a TCP 794 * RST, with deprecated source address (instead 795 * of dropping it). We compromise it as it is 796 * much better for peer to send a RST, and 797 * RST will be the final packet for the 798 * exchange. 
799 * 800 * If we do not forbid deprecated addresses, we 801 * accept the SYN packet. RFC2462 does not 802 * suggest dropping SYN in this case. 803 * If we decipher RFC2462 5.5.4, it says like 804 * this: 805 * 1. use of deprecated addr with existing 806 * communication is okay - "SHOULD continue 807 * to be used" 808 * 2. use of it with new communication: 809 * (2a) "SHOULD NOT be used if alternate 810 * address with sufficient scope is 811 * available" 812 * (2b) nothing mentioned otherwise. 813 * Here we fall into (2b) case as we have no 814 * choice in our source address selection - we 815 * must obey the peer. 816 * 817 * The wording in RFC2462 is confusing, and 818 * there are multiple description text for 819 * deprecated address handling - worse, they 820 * are not exactly the same. I believe 5.5.4 821 * is the best one, so we follow 5.5.4. 822 */ 823 if (ip6 && !ip6_use_deprecated) { 824 struct in6_ifaddr *ia6; 825 826 if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif, 827 &ip6->ip6_dst)) && 828 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 829 tp = NULL; 830 goto dropwithreset; 831 } 832 } 833 #endif 834 835 /* 836 * LISTEN socket received a SYN 837 * from itself? This can't possibly 838 * be valid; drop the packet. 839 */ 840 if (th->th_dport == th->th_sport) { 841 switch (af) { 842 #ifdef INET6 843 case AF_INET6: 844 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, 845 &ip6->ip6_dst)) { 846 tcpstat.tcps_badsyn++; 847 goto drop; 848 } 849 break; 850 #endif /* INET6 */ 851 case AF_INET: 852 if (ip->ip_dst.s_addr == ip->ip_src.s_addr) { 853 tcpstat.tcps_badsyn++; 854 goto drop; 855 } 856 break; 857 } 858 } 859 860 /* 861 * SYN looks ok; create compressed TCP 862 * state for it. 
863 */ 864 if (so->so_qlen <= so->so_qlimit && 865 syn_cache_add(&src.sa, &dst.sa, th, iphlen, 866 so, m, optp, optlen, &opti, reuse)) 867 m = NULL; 868 } 869 goto drop; 870 } 871 } 872 873 after_listen: 874 #ifdef DIAGNOSTIC 875 /* 876 * Should not happen now that all embryonic connections 877 * are handled with compressed state. 878 */ 879 if (tp->t_state == TCPS_LISTEN) 880 panic("tcp_input: TCPS_LISTEN"); 881 #endif 882 883 #if NPF > 0 884 if (m->m_pkthdr.pf.statekey) { 885 ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->inp = inp; 886 inp->inp_pf_sk = m->m_pkthdr.pf.statekey; 887 } 888 #endif 889 890 #ifdef IPSEC 891 /* Find most recent IPsec tag */ 892 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); 893 s = splnet(); 894 if (mtag != NULL) { 895 tdbi = (struct tdb_ident *)(mtag + 1); 896 tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto); 897 } else 898 tdb = NULL; 899 ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN, 900 tdb, inp); 901 if (error) { 902 splx(s); 903 goto drop; 904 } 905 906 /* Latch SA */ 907 if (inp->inp_tdb_in != tdb) { 908 if (tdb) { 909 tdb_add_inp(tdb, inp, 1); 910 if (inp->inp_ipo == NULL) { 911 inp->inp_ipo = ipsec_add_policy(inp, af, 912 IPSP_DIRECTION_OUT); 913 if (inp->inp_ipo == NULL) { 914 splx(s); 915 goto drop; 916 } 917 } 918 if (inp->inp_ipo->ipo_dstid == NULL && 919 tdb->tdb_srcid != NULL) { 920 inp->inp_ipo->ipo_dstid = tdb->tdb_srcid; 921 tdb->tdb_srcid->ref_count++; 922 } 923 if (inp->inp_ipsec_remotecred == NULL && 924 tdb->tdb_remote_cred != NULL) { 925 inp->inp_ipsec_remotecred = 926 tdb->tdb_remote_cred; 927 tdb->tdb_remote_cred->ref_count++; 928 } 929 if (inp->inp_ipsec_remoteauth == NULL && 930 tdb->tdb_remote_auth != NULL) { 931 inp->inp_ipsec_remoteauth = 932 tdb->tdb_remote_auth; 933 tdb->tdb_remote_auth->ref_count++; 934 } 935 } else { /* Just reset */ 936 TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp, 937 inp_tdb_in_next); 938 inp->inp_tdb_in = NULL; 939 } 940 } 941 splx(s); 942 #endif /* IPSEC */ 
943 944 /* 945 * Segment received on connection. 946 * Reset idle time and keep-alive timer. 947 */ 948 tp->t_rcvtime = tcp_now; 949 if (TCPS_HAVEESTABLISHED(tp->t_state)) 950 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 951 952 #ifdef TCP_SACK 953 if (tp->sack_enable) 954 tcp_del_sackholes(tp, th); /* Delete stale SACK holes */ 955 #endif /* TCP_SACK */ 956 957 /* 958 * Process options. 959 */ 960 #ifdef TCP_SIGNATURE 961 if (optp || (tp->t_flags & TF_SIGNATURE)) 962 #else 963 if (optp) 964 #endif 965 if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti)) 966 goto drop; 967 968 if (opti.ts_present && opti.ts_ecr) { 969 int rtt_test; 970 971 /* subtract out the tcp timestamp modulator */ 972 opti.ts_ecr -= tp->ts_modulate; 973 974 /* make sure ts_ecr is sensible */ 975 rtt_test = tcp_now - opti.ts_ecr; 976 if (rtt_test < 0 || rtt_test > TCP_RTT_MAX) 977 opti.ts_ecr = 0; 978 } 979 980 #ifdef TCP_ECN 981 /* if congestion experienced, set ECE bit in subsequent packets. */ 982 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) { 983 tp->t_flags |= TF_RCVD_CE; 984 tcpstat.tcps_ecn_rcvce++; 985 } 986 #endif 987 /* 988 * Header prediction: check for the two common cases 989 * of a uni-directional data xfer. If the packet has 990 * no control flags, is in-sequence, the window didn't 991 * change and we're not retransmitting, it's a 992 * candidate. If the length is zero and the ack moved 993 * forward, we're the sender side of the xfer. Just 994 * free the data acked & wake any higher level process 995 * that was blocked waiting for space. If the length 996 * is non-zero and the ack didn't move, we're the 997 * receiver side. If we're getting packets in-order 998 * (the reassembly queue is empty), add the data to 999 * the socket buffer and note that we need a delayed ack. 
1000 */ 1001 if (tp->t_state == TCPS_ESTABLISHED && 1002 #ifdef TCP_ECN 1003 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK && 1004 #else 1005 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1006 #endif 1007 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1008 th->th_seq == tp->rcv_nxt && 1009 tiwin && tiwin == tp->snd_wnd && 1010 tp->snd_nxt == tp->snd_max) { 1011 1012 /* 1013 * If last ACK falls within this segment's sequence numbers, 1014 * record the timestamp. 1015 * Fix from Braden, see Stevens p. 870 1016 */ 1017 if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1018 tp->ts_recent_age = tcp_now; 1019 tp->ts_recent = opti.ts_val; 1020 } 1021 1022 if (tlen == 0) { 1023 if (SEQ_GT(th->th_ack, tp->snd_una) && 1024 SEQ_LEQ(th->th_ack, tp->snd_max) && 1025 tp->snd_cwnd >= tp->snd_wnd && 1026 tp->t_dupacks == 0) { 1027 /* 1028 * this is a pure ack for outstanding data. 1029 */ 1030 ++tcpstat.tcps_predack; 1031 if (opti.ts_present && opti.ts_ecr) 1032 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr); 1033 else if (tp->t_rtttime && 1034 SEQ_GT(th->th_ack, tp->t_rtseq)) 1035 tcp_xmit_timer(tp, 1036 tcp_now - tp->t_rtttime); 1037 acked = th->th_ack - tp->snd_una; 1038 tcpstat.tcps_rcvackpack++; 1039 tcpstat.tcps_rcvackbyte += acked; 1040 ND6_HINT(tp); 1041 sbdrop(&so->so_snd, acked); 1042 1043 /* 1044 * If we had a pending ICMP message that 1045 * referres to data that have just been 1046 * acknowledged, disregard the recorded ICMP 1047 * message. 
1048 */ 1049 if ((tp->t_flags & TF_PMTUD_PEND) && 1050 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 1051 tp->t_flags &= ~TF_PMTUD_PEND; 1052 1053 /* 1054 * Keep track of the largest chunk of data 1055 * acknowledged since last PMTU update 1056 */ 1057 if (tp->t_pmtud_mss_acked < acked) 1058 tp->t_pmtud_mss_acked = acked; 1059 1060 tp->snd_una = th->th_ack; 1061 #if defined(TCP_SACK) || defined(TCP_ECN) 1062 /* 1063 * We want snd_last to track snd_una so 1064 * as to avoid sequence wraparound problems 1065 * for very large transfers. 1066 */ 1067 #ifdef TCP_ECN 1068 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1069 #endif 1070 tp->snd_last = tp->snd_una; 1071 #endif /* TCP_SACK */ 1072 #if defined(TCP_SACK) && defined(TCP_FACK) 1073 tp->snd_fack = tp->snd_una; 1074 tp->retran_data = 0; 1075 #endif /* TCP_FACK */ 1076 m_freem(m); 1077 1078 /* 1079 * If all outstanding data are acked, stop 1080 * retransmit timer, otherwise restart timer 1081 * using current (possibly backed-off) value. 1082 * If process is waiting for space, 1083 * wakeup/selwakeup/signal. If data 1084 * are ready to send, let tcp_output 1085 * decide between more output or persist. 1086 */ 1087 if (tp->snd_una == tp->snd_max) 1088 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1089 else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1090 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1091 1092 if (sb_notify(&so->so_snd)) 1093 sowwakeup(so); 1094 if (so->so_snd.sb_cc) 1095 (void) tcp_output(tp); 1096 return; 1097 } 1098 } else if (th->th_ack == tp->snd_una && 1099 TAILQ_EMPTY(&tp->t_segq) && 1100 tlen <= sbspace(&so->so_rcv)) { 1101 /* 1102 * This is a pure, in-sequence data packet 1103 * with nothing on the reassembly queue and 1104 * we have enough buffer space to take it. 
1105 */ 1106 #ifdef TCP_SACK 1107 /* Clean receiver SACK report if present */ 1108 if (tp->sack_enable && tp->rcv_numsacks) 1109 tcp_clean_sackreport(tp); 1110 #endif /* TCP_SACK */ 1111 ++tcpstat.tcps_preddat; 1112 tp->rcv_nxt += tlen; 1113 tcpstat.tcps_rcvpack++; 1114 tcpstat.tcps_rcvbyte += tlen; 1115 ND6_HINT(tp); 1116 /* 1117 * Drop TCP, IP headers and TCP options then add data 1118 * to socket buffer. 1119 */ 1120 if (so->so_state & SS_CANTRCVMORE) 1121 m_freem(m); 1122 else { 1123 m_adj(m, iphlen + off); 1124 sbappendstream(&so->so_rcv, m); 1125 } 1126 sorwakeup(so); 1127 TCP_SETUP_ACK(tp, tiflags); 1128 if (tp->t_flags & TF_ACKNOW) 1129 (void) tcp_output(tp); 1130 return; 1131 } 1132 } 1133 1134 /* 1135 * Compute mbuf offset to TCP data segment. 1136 */ 1137 hdroptlen = iphlen + off; 1138 1139 /* 1140 * Calculate amount of space in receive window, 1141 * and then do TCP input processing. 1142 * Receive window is amount of space in rcv queue, 1143 * but not less than advertised window. 1144 */ 1145 { int win; 1146 1147 win = sbspace(&so->so_rcv); 1148 if (win < 0) 1149 win = 0; 1150 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1151 } 1152 1153 switch (tp->t_state) { 1154 1155 /* 1156 * If the state is SYN_RECEIVED: 1157 * if seg contains SYN/ACK, send an RST. 1158 * if seg contains an ACK, but not for our SYN/ACK, send an RST 1159 */ 1160 1161 case TCPS_SYN_RECEIVED: 1162 if (tiflags & TH_ACK) { 1163 if (tiflags & TH_SYN) { 1164 tcpstat.tcps_badsyn++; 1165 goto dropwithreset; 1166 } 1167 if (SEQ_LEQ(th->th_ack, tp->snd_una) || 1168 SEQ_GT(th->th_ack, tp->snd_max)) 1169 goto dropwithreset; 1170 } 1171 break; 1172 1173 /* 1174 * If the state is SYN_SENT: 1175 * if seg contains an ACK, but not for our SYN, drop the input. 1176 * if seg contains a RST, then drop the connection. 1177 * if seg does not contain SYN, then drop it. 
1178 * Otherwise this is an acceptable SYN segment 1179 * initialize tp->rcv_nxt and tp->irs 1180 * if seg contains ack then advance tp->snd_una 1181 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1182 * arrange for segment to be acked (eventually) 1183 * continue processing rest of data/controls, beginning with URG 1184 */ 1185 case TCPS_SYN_SENT: 1186 if ((tiflags & TH_ACK) && 1187 (SEQ_LEQ(th->th_ack, tp->iss) || 1188 SEQ_GT(th->th_ack, tp->snd_max))) 1189 goto dropwithreset; 1190 if (tiflags & TH_RST) { 1191 #ifdef TCP_ECN 1192 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1193 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1194 goto drop; 1195 #endif 1196 if (tiflags & TH_ACK) 1197 tp = tcp_drop(tp, ECONNREFUSED); 1198 goto drop; 1199 } 1200 if ((tiflags & TH_SYN) == 0) 1201 goto drop; 1202 if (tiflags & TH_ACK) { 1203 tp->snd_una = th->th_ack; 1204 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1205 tp->snd_nxt = tp->snd_una; 1206 } 1207 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1208 tp->irs = th->th_seq; 1209 tcp_mss(tp, opti.maxseg); 1210 /* Reset initial window to 1 segment for retransmit */ 1211 if (tp->t_rxtshift > 0) 1212 tp->snd_cwnd = tp->t_maxseg; 1213 tcp_rcvseqinit(tp); 1214 tp->t_flags |= TF_ACKNOW; 1215 #ifdef TCP_SACK 1216 /* 1217 * If we've sent a SACK_PERMITTED option, and the peer 1218 * also replied with one, then TF_SACK_PERMIT should have 1219 * been set in tcp_dooptions(). If it was not, disable SACKs. 1220 */ 1221 if (tp->sack_enable) 1222 tp->sack_enable = tp->t_flags & TF_SACK_PERMIT; 1223 #endif 1224 #ifdef TCP_ECN 1225 /* 1226 * if ECE is set but CWR is not set for SYN-ACK, or 1227 * both ECE and CWR are set for simultaneous open, 1228 * peer is ECN capable. 
1229 */ 1230 if (tcp_do_ecn) { 1231 if ((tiflags & (TH_ACK|TH_ECE|TH_CWR)) 1232 == (TH_ACK|TH_ECE) || 1233 (tiflags & (TH_ACK|TH_ECE|TH_CWR)) 1234 == (TH_ECE|TH_CWR)) { 1235 tp->t_flags |= TF_ECN_PERMIT; 1236 tiflags &= ~(TH_ECE|TH_CWR); 1237 tcpstat.tcps_ecn_accepts++; 1238 } 1239 } 1240 #endif 1241 1242 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { 1243 tcpstat.tcps_connects++; 1244 soisconnected(so); 1245 tp->t_state = TCPS_ESTABLISHED; 1246 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1247 /* Do window scaling on this connection? */ 1248 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1249 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1250 tp->snd_scale = tp->requested_s_scale; 1251 tp->rcv_scale = tp->request_r_scale; 1252 } 1253 (void) tcp_reass(tp, (struct tcphdr *)0, 1254 (struct mbuf *)0, &tlen); 1255 /* 1256 * if we didn't have to retransmit the SYN, 1257 * use its rtt as our initial srtt & rtt var. 1258 */ 1259 if (tp->t_rtttime) 1260 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1261 /* 1262 * Since new data was acked (the SYN), open the 1263 * congestion window by one MSS. We do this 1264 * here, because we won't go through the normal 1265 * ACK processing below. And since this is the 1266 * start of the connection, we know we are in 1267 * the exponential phase of slow-start. 1268 */ 1269 tp->snd_cwnd += tp->t_maxseg; 1270 } else 1271 tp->t_state = TCPS_SYN_RECEIVED; 1272 1273 #if 0 1274 trimthenstep6: 1275 #endif 1276 /* 1277 * Advance th->th_seq to correspond to first data byte. 1278 * If data, trim to stay within window, 1279 * dropping FIN if necessary. 
 1280 */ 1281 th->th_seq++; 1282 if (tlen > tp->rcv_wnd) { 1283 todrop = tlen - tp->rcv_wnd; 1284 m_adj(m, -todrop); 1285 tlen = tp->rcv_wnd; 1286 tiflags &= ~TH_FIN; 1287 tcpstat.tcps_rcvpackafterwin++; 1288 tcpstat.tcps_rcvbyteafterwin += todrop; 1289 } 1290 tp->snd_wl1 = th->th_seq - 1; 1291 tp->rcv_up = th->th_seq; 1292 goto step6; 1293 /* 1294 * If a new connection request is received while in TIME_WAIT, 1295 * drop the old connection and start over if the 1296 * timestamp or the sequence numbers are above the previous 1297 * ones. 1298 */ 1299 case TCPS_TIME_WAIT: 1300 if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) && 1301 ((opti.ts_present && 1302 TSTMP_LT(tp->ts_recent, opti.ts_val)) || 1303 SEQ_GT(th->th_seq, tp->rcv_nxt))) { 1304 /* 1305 * Advance the iss by at least 32768, but 1306 * clear the msb in order to make sure 1307 * that SEQ_LT(snd_nxt, iss). 1308 */ 1309 iss = tp->snd_nxt + 1310 ((arc4random() & 0x7fffffff) | 0x8000); 1311 reuse = &iss; 1312 tp = tcp_close(tp); 1313 inp = NULL; 1314 goto findpcb; 1315 } 1316 } 1317 1318 /* 1319 * States other than LISTEN or SYN_SENT. 1320 * First check timestamp, if present. 1321 * Then check that at least some bytes of segment are within 1322 * receive window. If segment begins before rcv_nxt, 1323 * drop leading data (and SYN); if nothing left, just ack. 1324 * 1325 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1326 * and it's less than opti.ts_recent, drop it. 1327 */ 1328 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1329 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1330 1331 /* Check to see if ts_recent is over 24 days old. */ 1332 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1333 /* 1334 * Invalidate ts_recent. If this segment updates 1335 * ts_recent, the age will be reset later and ts_recent 1336 * will get a valid value. 
If it does not, setting 1337 * ts_recent to zero will at least satisfy the 1338 * requirement that zero be placed in the timestamp 1339 * echo reply when ts_recent isn't valid. The 1340 * age isn't reset until we get a valid ts_recent 1341 * because we don't want out-of-order segments to be 1342 * dropped when ts_recent is old. 1343 */ 1344 tp->ts_recent = 0; 1345 } else { 1346 tcpstat.tcps_rcvduppack++; 1347 tcpstat.tcps_rcvdupbyte += tlen; 1348 tcpstat.tcps_pawsdrop++; 1349 goto dropafterack; 1350 } 1351 } 1352 1353 todrop = tp->rcv_nxt - th->th_seq; 1354 if (todrop > 0) { 1355 if (tiflags & TH_SYN) { 1356 tiflags &= ~TH_SYN; 1357 th->th_seq++; 1358 if (th->th_urp > 1) 1359 th->th_urp--; 1360 else 1361 tiflags &= ~TH_URG; 1362 todrop--; 1363 } 1364 if (todrop > tlen || 1365 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1366 /* 1367 * Any valid FIN must be to the left of the 1368 * window. At this point, FIN must be a 1369 * duplicate or out-of-sequence, so drop it. 1370 */ 1371 tiflags &= ~TH_FIN; 1372 /* 1373 * Send ACK to resynchronize, and drop any data, 1374 * but keep on processing for RST or ACK. 1375 */ 1376 tp->t_flags |= TF_ACKNOW; 1377 tcpstat.tcps_rcvdupbyte += todrop = tlen; 1378 tcpstat.tcps_rcvduppack++; 1379 } else { 1380 tcpstat.tcps_rcvpartduppack++; 1381 tcpstat.tcps_rcvpartdupbyte += todrop; 1382 } 1383 hdroptlen += todrop; /* drop from head afterwards */ 1384 th->th_seq += todrop; 1385 tlen -= todrop; 1386 if (th->th_urp > todrop) 1387 th->th_urp -= todrop; 1388 else { 1389 tiflags &= ~TH_URG; 1390 th->th_urp = 0; 1391 } 1392 } 1393 1394 /* 1395 * If new data are received on a connection after the 1396 * user processes are gone, then RST the other end. 
1397 */ 1398 if ((so->so_state & SS_NOFDREF) && 1399 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1400 tp = tcp_close(tp); 1401 tcpstat.tcps_rcvafterclose++; 1402 goto dropwithreset; 1403 } 1404 1405 /* 1406 * If segment ends after window, drop trailing data 1407 * (and PUSH and FIN); if nothing left, just ACK. 1408 */ 1409 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1410 if (todrop > 0) { 1411 tcpstat.tcps_rcvpackafterwin++; 1412 if (todrop >= tlen) { 1413 tcpstat.tcps_rcvbyteafterwin += tlen; 1414 /* 1415 * If window is closed can only take segments at 1416 * window edge, and have to drop data and PUSH from 1417 * incoming segments. Continue processing, but 1418 * remember to ack. Otherwise, drop segment 1419 * and ack. 1420 */ 1421 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1422 tp->t_flags |= TF_ACKNOW; 1423 tcpstat.tcps_rcvwinprobe++; 1424 } else 1425 goto dropafterack; 1426 } else 1427 tcpstat.tcps_rcvbyteafterwin += todrop; 1428 m_adj(m, -todrop); 1429 tlen -= todrop; 1430 tiflags &= ~(TH_PUSH|TH_FIN); 1431 } 1432 1433 /* 1434 * If last ACK falls within this segment's sequence numbers, 1435 * record its timestamp if it's more recent. 1436 * Cf fix from Braden, see Stevens p. 870 1437 */ 1438 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1439 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1440 if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1441 ((tiflags & (TH_SYN|TH_FIN)) != 0))) 1442 tp->ts_recent = opti.ts_val; 1443 else 1444 tp->ts_recent = 0; 1445 tp->ts_recent_age = tcp_now; 1446 } 1447 1448 /* 1449 * If the RST bit is set examine the state: 1450 * SYN_RECEIVED STATE: 1451 * If passive open, return to LISTEN state. 1452 * If active open, inform user that connection was refused. 1453 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1454 * Inform user that connection was reset, and close tcb. 1455 * CLOSING, LAST_ACK, TIME_WAIT STATES 1456 * Close the tcb. 
1457 */ 1458 if (tiflags & TH_RST) { 1459 if (th->th_seq != tp->last_ack_sent && 1460 th->th_seq != tp->rcv_nxt && 1461 th->th_seq != (tp->rcv_nxt + 1)) 1462 goto drop; 1463 1464 switch (tp->t_state) { 1465 case TCPS_SYN_RECEIVED: 1466 #ifdef TCP_ECN 1467 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1468 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1469 goto drop; 1470 #endif 1471 so->so_error = ECONNREFUSED; 1472 goto close; 1473 1474 case TCPS_ESTABLISHED: 1475 case TCPS_FIN_WAIT_1: 1476 case TCPS_FIN_WAIT_2: 1477 case TCPS_CLOSE_WAIT: 1478 so->so_error = ECONNRESET; 1479 close: 1480 tp->t_state = TCPS_CLOSED; 1481 tcpstat.tcps_drops++; 1482 tp = tcp_close(tp); 1483 goto drop; 1484 case TCPS_CLOSING: 1485 case TCPS_LAST_ACK: 1486 case TCPS_TIME_WAIT: 1487 tp = tcp_close(tp); 1488 goto drop; 1489 } 1490 } 1491 1492 /* 1493 * If a SYN is in the window, then this is an 1494 * error and we ACK and drop the packet. 1495 */ 1496 if (tiflags & TH_SYN) 1497 goto dropafterack_ratelim; 1498 1499 /* 1500 * If the ACK bit is off we drop the segment and return. 1501 */ 1502 if ((tiflags & TH_ACK) == 0) { 1503 if (tp->t_flags & TF_ACKNOW) 1504 goto dropafterack; 1505 else 1506 goto drop; 1507 } 1508 1509 /* 1510 * Ack processing. 1511 */ 1512 switch (tp->t_state) { 1513 1514 /* 1515 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1516 * ESTABLISHED state and continue processing. 1517 * The ACK was checked above. 1518 */ 1519 case TCPS_SYN_RECEIVED: 1520 tcpstat.tcps_connects++; 1521 soisconnected(so); 1522 tp->t_state = TCPS_ESTABLISHED; 1523 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1524 /* Do window scaling? 
*/ 1525 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1526 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1527 tp->snd_scale = tp->requested_s_scale; 1528 tp->rcv_scale = tp->request_r_scale; 1529 tiwin = th->th_win << tp->snd_scale; 1530 } 1531 (void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0, 1532 &tlen); 1533 tp->snd_wl1 = th->th_seq - 1; 1534 /* fall into ... */ 1535 1536 /* 1537 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1538 * ACKs. If the ack is in the range 1539 * tp->snd_una < th->th_ack <= tp->snd_max 1540 * then advance tp->snd_una to th->th_ack and drop 1541 * data from the retransmission queue. If this ACK reflects 1542 * more up to date window information we update our window information. 1543 */ 1544 case TCPS_ESTABLISHED: 1545 case TCPS_FIN_WAIT_1: 1546 case TCPS_FIN_WAIT_2: 1547 case TCPS_CLOSE_WAIT: 1548 case TCPS_CLOSING: 1549 case TCPS_LAST_ACK: 1550 case TCPS_TIME_WAIT: 1551 #ifdef TCP_ECN 1552 /* 1553 * if we receive ECE and are not already in recovery phase, 1554 * reduce cwnd by half but don't slow-start. 1555 * advance snd_last to snd_max not to reduce cwnd again 1556 * until all outstanding packets are acked. 1557 */ 1558 if (tcp_do_ecn && (tiflags & TH_ECE)) { 1559 if ((tp->t_flags & TF_ECN_PERMIT) && 1560 SEQ_GEQ(tp->snd_una, tp->snd_last)) { 1561 u_int win; 1562 1563 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg; 1564 if (win > 1) { 1565 tp->snd_ssthresh = win / 2 * tp->t_maxseg; 1566 tp->snd_cwnd = tp->snd_ssthresh; 1567 tp->snd_last = tp->snd_max; 1568 tp->t_flags |= TF_SEND_CWR; 1569 tcpstat.tcps_cwr_ecn++; 1570 } 1571 } 1572 tcpstat.tcps_ecn_rcvece++; 1573 } 1574 /* 1575 * if we receive CWR, we know that the peer has reduced 1576 * its congestion window. stop sending ecn-echo. 1577 */ 1578 if ((tiflags & TH_CWR)) { 1579 tp->t_flags &= ~TF_RCVD_CE; 1580 tcpstat.tcps_ecn_rcvcwr++; 1581 } 1582 #endif /* TCP_ECN */ 1583 1584 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1585 /* 1586 * Duplicate/old ACK processing. 
1587 * Increments t_dupacks: 1588 * Pure duplicate (same seq/ack/window, no data) 1589 * Doesn't affect t_dupacks: 1590 * Data packets. 1591 * Normal window updates (window opens) 1592 * Resets t_dupacks: 1593 * New data ACKed. 1594 * Window shrinks 1595 * Old ACK 1596 */ 1597 if (tlen) { 1598 /* Drop very old ACKs unless th_seq matches */ 1599 if (th->th_seq != tp->rcv_nxt && 1600 SEQ_LT(th->th_ack, 1601 tp->snd_una - tp->max_sndwnd)) { 1602 tcpstat.tcps_rcvacktooold++; 1603 goto drop; 1604 } 1605 break; 1606 } 1607 /* 1608 * If we get an old ACK, there is probably packet 1609 * reordering going on. Be conservative and reset 1610 * t_dupacks so that we are less aggressive in 1611 * doing a fast retransmit. 1612 */ 1613 if (th->th_ack != tp->snd_una) { 1614 tp->t_dupacks = 0; 1615 break; 1616 } 1617 if (tiwin == tp->snd_wnd) { 1618 tcpstat.tcps_rcvdupack++; 1619 /* 1620 * If we have outstanding data (other than 1621 * a window probe), this is a completely 1622 * duplicate ack (ie, window info didn't 1623 * change), the ack is the biggest we've 1624 * seen and we've seen exactly our rexmt 1625 * threshold of them, assume a packet 1626 * has been dropped and retransmit it. 1627 * Kludge snd_nxt & the congestion 1628 * window so we send only this one 1629 * packet. 1630 * 1631 * We know we're losing at the current 1632 * window size so do congestion avoidance 1633 * (set ssthresh to half the current window 1634 * and pull our congestion window back to 1635 * the new ssthresh). 1636 * 1637 * Dup acks mean that packets have left the 1638 * network (they're now cached at the receiver) 1639 * so bump cwnd by the amount in the receiver 1640 * to keep a constant cwnd packets in the 1641 * network. 1642 */ 1643 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) 1644 tp->t_dupacks = 0; 1645 #if defined(TCP_SACK) && defined(TCP_FACK) 1646 /* 1647 * In FACK, can enter fast rec. if the receiver 1648 * reports a reass. queue longer than 3 segs. 
1649 */ 1650 else if (++tp->t_dupacks == tcprexmtthresh || 1651 ((SEQ_GT(tp->snd_fack, tcprexmtthresh * 1652 tp->t_maxseg + tp->snd_una)) && 1653 SEQ_GT(tp->snd_una, tp->snd_last))) { 1654 #else 1655 else if (++tp->t_dupacks == tcprexmtthresh) { 1656 #endif /* TCP_FACK */ 1657 tcp_seq onxt = tp->snd_nxt; 1658 u_long win = 1659 ulmin(tp->snd_wnd, tp->snd_cwnd) / 1660 2 / tp->t_maxseg; 1661 1662 #if defined(TCP_SACK) || defined(TCP_ECN) 1663 if (SEQ_LT(th->th_ack, tp->snd_last)){ 1664 /* 1665 * False fast retx after 1666 * timeout. Do not cut window. 1667 */ 1668 tp->t_dupacks = 0; 1669 goto drop; 1670 } 1671 #endif 1672 if (win < 2) 1673 win = 2; 1674 tp->snd_ssthresh = win * tp->t_maxseg; 1675 #ifdef TCP_SACK 1676 tp->snd_last = tp->snd_max; 1677 if (tp->sack_enable) { 1678 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1679 tp->t_rtttime = 0; 1680 #ifdef TCP_ECN 1681 tp->t_flags |= TF_SEND_CWR; 1682 #endif 1683 tcpstat.tcps_cwr_frecovery++; 1684 tcpstat.tcps_sack_recovery_episode++; 1685 #if defined(TCP_SACK) && defined(TCP_FACK) 1686 tp->t_dupacks = tcprexmtthresh; 1687 (void) tcp_output(tp); 1688 /* 1689 * During FR, snd_cwnd is held 1690 * constant for FACK. 1691 */ 1692 tp->snd_cwnd = tp->snd_ssthresh; 1693 #else 1694 /* 1695 * tcp_output() will send 1696 * oldest SACK-eligible rtx. 
1697 */ 1698 (void) tcp_output(tp); 1699 tp->snd_cwnd = tp->snd_ssthresh+ 1700 tp->t_maxseg * tp->t_dupacks; 1701 #endif /* TCP_FACK */ 1702 goto drop; 1703 } 1704 #endif /* TCP_SACK */ 1705 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1706 tp->t_rtttime = 0; 1707 tp->snd_nxt = th->th_ack; 1708 tp->snd_cwnd = tp->t_maxseg; 1709 #ifdef TCP_ECN 1710 tp->t_flags |= TF_SEND_CWR; 1711 #endif 1712 tcpstat.tcps_cwr_frecovery++; 1713 tcpstat.tcps_sndrexmitfast++; 1714 (void) tcp_output(tp); 1715 1716 tp->snd_cwnd = tp->snd_ssthresh + 1717 tp->t_maxseg * tp->t_dupacks; 1718 if (SEQ_GT(onxt, tp->snd_nxt)) 1719 tp->snd_nxt = onxt; 1720 goto drop; 1721 } else if (tp->t_dupacks > tcprexmtthresh) { 1722 #if defined(TCP_SACK) && defined(TCP_FACK) 1723 /* 1724 * while (awnd < cwnd) 1725 * sendsomething(); 1726 */ 1727 if (tp->sack_enable) { 1728 if (tp->snd_awnd < tp->snd_cwnd) 1729 tcp_output(tp); 1730 goto drop; 1731 } 1732 #endif /* TCP_FACK */ 1733 tp->snd_cwnd += tp->t_maxseg; 1734 (void) tcp_output(tp); 1735 goto drop; 1736 } 1737 } else if (tiwin < tp->snd_wnd) { 1738 /* 1739 * The window was retracted! Previous dup 1740 * ACKs may have been due to packets arriving 1741 * after the shrunken window, not a missing 1742 * packet, so play it safe and reset t_dupacks 1743 */ 1744 tp->t_dupacks = 0; 1745 } 1746 break; 1747 } 1748 /* 1749 * If the congestion window was inflated to account 1750 * for the other side's cached packets, retract it. 
1751 */ 1752 #if defined(TCP_SACK) 1753 if (tp->sack_enable) { 1754 if (tp->t_dupacks >= tcprexmtthresh) { 1755 /* Check for a partial ACK */ 1756 if (tcp_sack_partialack(tp, th)) { 1757 #if defined(TCP_SACK) && defined(TCP_FACK) 1758 /* Force call to tcp_output */ 1759 if (tp->snd_awnd < tp->snd_cwnd) 1760 needoutput = 1; 1761 #else 1762 tp->snd_cwnd += tp->t_maxseg; 1763 needoutput = 1; 1764 #endif /* TCP_FACK */ 1765 } else { 1766 /* Out of fast recovery */ 1767 tp->snd_cwnd = tp->snd_ssthresh; 1768 if (tcp_seq_subtract(tp->snd_max, 1769 th->th_ack) < tp->snd_ssthresh) 1770 tp->snd_cwnd = 1771 tcp_seq_subtract(tp->snd_max, 1772 th->th_ack); 1773 tp->t_dupacks = 0; 1774 #if defined(TCP_SACK) && defined(TCP_FACK) 1775 if (SEQ_GT(th->th_ack, tp->snd_fack)) 1776 tp->snd_fack = th->th_ack; 1777 #endif /* TCP_FACK */ 1778 } 1779 } 1780 } else { 1781 if (tp->t_dupacks >= tcprexmtthresh && 1782 !tcp_newreno(tp, th)) { 1783 /* Out of fast recovery */ 1784 tp->snd_cwnd = tp->snd_ssthresh; 1785 if (tcp_seq_subtract(tp->snd_max, th->th_ack) < 1786 tp->snd_ssthresh) 1787 tp->snd_cwnd = 1788 tcp_seq_subtract(tp->snd_max, 1789 th->th_ack); 1790 tp->t_dupacks = 0; 1791 } 1792 } 1793 if (tp->t_dupacks < tcprexmtthresh) 1794 tp->t_dupacks = 0; 1795 #else /* else no TCP_SACK */ 1796 if (tp->t_dupacks >= tcprexmtthresh && 1797 tp->snd_cwnd > tp->snd_ssthresh) 1798 tp->snd_cwnd = tp->snd_ssthresh; 1799 tp->t_dupacks = 0; 1800 #endif 1801 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1802 tcpstat.tcps_rcvacktoomuch++; 1803 goto dropafterack_ratelim; 1804 } 1805 acked = th->th_ack - tp->snd_una; 1806 tcpstat.tcps_rcvackpack++; 1807 tcpstat.tcps_rcvackbyte += acked; 1808 1809 /* 1810 * If we have a timestamp reply, update smoothed 1811 * round trip time. If no timestamp is present but 1812 * transmit timer is running and timed sequence 1813 * number was acked, update smoothed round trip time. 
1814 * Since we now have an rtt measurement, cancel the 1815 * timer backoff (cf., Phil Karn's retransmit alg.). 1816 * Recompute the initial retransmit timer. 1817 */ 1818 if (opti.ts_present && opti.ts_ecr) 1819 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr); 1820 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1821 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1822 1823 /* 1824 * If all outstanding data is acked, stop retransmit 1825 * timer and remember to restart (more output or persist). 1826 * If there is more data to be acked, restart retransmit 1827 * timer, using current (possibly backed-off) value. 1828 */ 1829 if (th->th_ack == tp->snd_max) { 1830 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1831 needoutput = 1; 1832 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1833 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1834 /* 1835 * When new data is acked, open the congestion window. 1836 * If the window gives us less than ssthresh packets 1837 * in flight, open exponentially (maxseg per packet). 1838 * Otherwise open linearly: maxseg per window 1839 * (maxseg^2 / cwnd per packet). 1840 */ 1841 { 1842 u_int cw = tp->snd_cwnd; 1843 u_int incr = tp->t_maxseg; 1844 1845 if (cw > tp->snd_ssthresh) 1846 incr = incr * incr / cw; 1847 #if defined (TCP_SACK) 1848 if (tp->t_dupacks < tcprexmtthresh) 1849 #endif 1850 tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale); 1851 } 1852 ND6_HINT(tp); 1853 if (acked > so->so_snd.sb_cc) { 1854 tp->snd_wnd -= so->so_snd.sb_cc; 1855 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1856 ourfinisacked = 1; 1857 } else { 1858 sbdrop(&so->so_snd, acked); 1859 tp->snd_wnd -= acked; 1860 ourfinisacked = 0; 1861 } 1862 if (sb_notify(&so->so_snd)) 1863 sowwakeup(so); 1864 1865 /* 1866 * If we had a pending ICMP message that referred to data 1867 * that have just been acknowledged, disregard the recorded 1868 * ICMP message. 
1869 */ 1870 if ((tp->t_flags & TF_PMTUD_PEND) && 1871 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 1872 tp->t_flags &= ~TF_PMTUD_PEND; 1873 1874 /* 1875 * Keep track of the largest chunk of data acknowledged 1876 * since last PMTU update 1877 */ 1878 if (tp->t_pmtud_mss_acked < acked) 1879 tp->t_pmtud_mss_acked = acked; 1880 1881 tp->snd_una = th->th_ack; 1882 #ifdef TCP_ECN 1883 /* sync snd_last with snd_una */ 1884 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1885 tp->snd_last = tp->snd_una; 1886 #endif 1887 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1888 tp->snd_nxt = tp->snd_una; 1889 #if defined (TCP_SACK) && defined (TCP_FACK) 1890 if (SEQ_GT(tp->snd_una, tp->snd_fack)) { 1891 tp->snd_fack = tp->snd_una; 1892 /* Update snd_awnd for partial ACK 1893 * without any SACK blocks. 1894 */ 1895 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, 1896 tp->snd_fack) + tp->retran_data; 1897 } 1898 #endif 1899 1900 switch (tp->t_state) { 1901 1902 /* 1903 * In FIN_WAIT_1 STATE in addition to the processing 1904 * for the ESTABLISHED state if our FIN is now acknowledged 1905 * then enter FIN_WAIT_2. 1906 */ 1907 case TCPS_FIN_WAIT_1: 1908 if (ourfinisacked) { 1909 /* 1910 * If we can't receive any more 1911 * data, then closing user can proceed. 1912 * Starting the timer is contrary to the 1913 * specification, but if we don't get a FIN 1914 * we'll hang forever. 1915 */ 1916 if (so->so_state & SS_CANTRCVMORE) { 1917 soisdisconnected(so); 1918 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 1919 } 1920 tp->t_state = TCPS_FIN_WAIT_2; 1921 } 1922 break; 1923 1924 /* 1925 * In CLOSING STATE in addition to the processing for 1926 * the ESTABLISHED state if the ACK acknowledges our FIN 1927 * then enter the TIME-WAIT state, otherwise ignore 1928 * the segment. 
1929 */ 1930 case TCPS_CLOSING: 1931 if (ourfinisacked) { 1932 tp->t_state = TCPS_TIME_WAIT; 1933 tcp_canceltimers(tp); 1934 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1935 soisdisconnected(so); 1936 } 1937 break; 1938 1939 /* 1940 * In LAST_ACK, we may still be waiting for data to drain 1941 * and/or to be acked, as well as for the ack of our FIN. 1942 * If our FIN is now acknowledged, delete the TCB, 1943 * enter the closed state and return. 1944 */ 1945 case TCPS_LAST_ACK: 1946 if (ourfinisacked) { 1947 tp = tcp_close(tp); 1948 goto drop; 1949 } 1950 break; 1951 1952 /* 1953 * In TIME_WAIT state the only thing that should arrive 1954 * is a retransmission of the remote FIN. Acknowledge 1955 * it and restart the finack timer. 1956 */ 1957 case TCPS_TIME_WAIT: 1958 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1959 goto dropafterack; 1960 } 1961 } 1962 1963 step6: 1964 /* 1965 * Update window information. 1966 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1967 */ 1968 if ((tiflags & TH_ACK) && 1969 (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && 1970 (SEQ_LT(tp->snd_wl2, th->th_ack) || 1971 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1972 /* keep track of pure window updates */ 1973 if (tlen == 0 && 1974 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1975 tcpstat.tcps_rcvwinupd++; 1976 tp->snd_wnd = tiwin; 1977 tp->snd_wl1 = th->th_seq; 1978 tp->snd_wl2 = th->th_ack; 1979 if (tp->snd_wnd > tp->max_sndwnd) 1980 tp->max_sndwnd = tp->snd_wnd; 1981 needoutput = 1; 1982 } 1983 1984 /* 1985 * Process segments with URG. 1986 */ 1987 if ((tiflags & TH_URG) && th->th_urp && 1988 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1989 /* 1990 * This is a kludge, but if we receive and accept 1991 * random urgent pointers, we'll crash in 1992 * soreceive. It's hard to imagine someone 1993 * actually wanting to send this much urgent data. 
1994 */ 1995 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1996 th->th_urp = 0; /* XXX */ 1997 tiflags &= ~TH_URG; /* XXX */ 1998 goto dodata; /* XXX */ 1999 } 2000 /* 2001 * If this segment advances the known urgent pointer, 2002 * then mark the data stream. This should not happen 2003 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2004 * a FIN has been received from the remote side. 2005 * In these states we ignore the URG. 2006 * 2007 * According to RFC961 (Assigned Protocols), 2008 * the urgent pointer points to the last octet 2009 * of urgent data. We continue, however, 2010 * to consider it to indicate the first octet 2011 * of data past the urgent section as the original 2012 * spec states (in one of two places). 2013 */ 2014 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2015 tp->rcv_up = th->th_seq + th->th_urp; 2016 so->so_oobmark = so->so_rcv.sb_cc + 2017 (tp->rcv_up - tp->rcv_nxt) - 1; 2018 if (so->so_oobmark == 0) 2019 so->so_state |= SS_RCVATMARK; 2020 sohasoutofband(so); 2021 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2022 } 2023 /* 2024 * Remove out of band data so doesn't get presented to user. 2025 * This can happen independent of advancing the URG pointer, 2026 * but if two URG's are pending at once, some out-of-band 2027 * data may creep in... ick. 2028 */ 2029 if (th->th_urp <= (u_int16_t) tlen 2030 #ifdef SO_OOBINLINE 2031 && (so->so_options & SO_OOBINLINE) == 0 2032 #endif 2033 ) 2034 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2035 } else 2036 /* 2037 * If no out of band data is expected, 2038 * pull receive urgent pointer along 2039 * with the receive window. 2040 */ 2041 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2042 tp->rcv_up = tp->rcv_nxt; 2043 dodata: /* XXX */ 2044 2045 /* 2046 * Process the segment text, merging it into the TCP sequencing queue, 2047 * and arranging for acknowledgment of receipt if necessary. 
2048 * This process logically involves adjusting tp->rcv_wnd as data 2049 * is presented to the user (this happens in tcp_usrreq.c, 2050 * case PRU_RCVD). If a FIN has already been received on this 2051 * connection then we just ignore the text. 2052 */ 2053 if ((tlen || (tiflags & TH_FIN)) && 2054 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2055 #ifdef TCP_SACK 2056 tcp_seq laststart = th->th_seq; 2057 tcp_seq lastend = th->th_seq + tlen; 2058 #endif 2059 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2060 tp->t_state == TCPS_ESTABLISHED) { 2061 TCP_SETUP_ACK(tp, tiflags); 2062 tp->rcv_nxt += tlen; 2063 tiflags = th->th_flags & TH_FIN; 2064 tcpstat.tcps_rcvpack++; 2065 tcpstat.tcps_rcvbyte += tlen; 2066 ND6_HINT(tp); 2067 if (so->so_state & SS_CANTRCVMORE) 2068 m_freem(m); 2069 else { 2070 m_adj(m, hdroptlen); 2071 sbappendstream(&so->so_rcv, m); 2072 } 2073 sorwakeup(so); 2074 } else { 2075 m_adj(m, hdroptlen); 2076 tiflags = tcp_reass(tp, th, m, &tlen); 2077 tp->t_flags |= TF_ACKNOW; 2078 } 2079 #ifdef TCP_SACK 2080 if (tp->sack_enable) 2081 tcp_update_sack_list(tp, laststart, lastend); 2082 #endif 2083 2084 /* 2085 * variable len never referenced again in modern BSD, 2086 * so why bother computing it ?? 2087 */ 2088 #if 0 2089 /* 2090 * Note the amount of data that peer has sent into 2091 * our window, in order to estimate the sender's 2092 * buffer size. 2093 */ 2094 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2095 #endif /* 0 */ 2096 } else { 2097 m_freem(m); 2098 tiflags &= ~TH_FIN; 2099 } 2100 2101 /* 2102 * If FIN is received ACK the FIN and let the user know 2103 * that the connection is closing. Ignore a FIN received before 2104 * the connection is fully established. 
2105 */ 2106 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2107 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2108 socantrcvmore(so); 2109 tp->t_flags |= TF_ACKNOW; 2110 tp->rcv_nxt++; 2111 } 2112 switch (tp->t_state) { 2113 2114 /* 2115 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2116 */ 2117 case TCPS_ESTABLISHED: 2118 tp->t_state = TCPS_CLOSE_WAIT; 2119 break; 2120 2121 /* 2122 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2123 * enter the CLOSING state. 2124 */ 2125 case TCPS_FIN_WAIT_1: 2126 tp->t_state = TCPS_CLOSING; 2127 break; 2128 2129 /* 2130 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2131 * starting the time-wait timer, turning off the other 2132 * standard timers. 2133 */ 2134 case TCPS_FIN_WAIT_2: 2135 tp->t_state = TCPS_TIME_WAIT; 2136 tcp_canceltimers(tp); 2137 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2138 soisdisconnected(so); 2139 break; 2140 2141 /* 2142 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2143 */ 2144 case TCPS_TIME_WAIT: 2145 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2146 break; 2147 } 2148 } 2149 if (so->so_options & SO_DEBUG) { 2150 switch (tp->pf) { 2151 #ifdef INET6 2152 case PF_INET6: 2153 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2154 0, tlen); 2155 break; 2156 #endif /* INET6 */ 2157 case PF_INET: 2158 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2159 0, tlen); 2160 break; 2161 } 2162 } 2163 2164 /* 2165 * Return any desired output. 2166 */ 2167 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 2168 (void) tcp_output(tp); 2169 } 2170 return; 2171 2172 badsyn: 2173 /* 2174 * Received a bad SYN. Increment counters and dropwithreset. 2175 */ 2176 tcpstat.tcps_badsyn++; 2177 tp = NULL; 2178 goto dropwithreset; 2179 2180 dropafterack_ratelim: 2181 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2182 tcp_ackdrop_ppslim) == 0) { 2183 /* XXX stat */ 2184 goto drop; 2185 } 2186 /* ...fall into dropafterack... 
 */

dropafterack:
	/*
	 * Generate an ACK dropping incoming segment if it occupies
	 * sequence space, where the ACK reflects our state.
	 */
	if (tiflags & TH_RST)
		goto drop;
	m_freem(m);
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	return;

dropwithreset_ratelim:
	/*
	 * We may want to rate-limit RSTs in certain situations,
	 * particularly if we are sending an RST in response to
	 * an attempt to connect to or otherwise communicate with
	 * a port for which we have no socket.
	 */
	if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
	    tcp_rst_ppslim) == 0) {
		/* XXX stat */
		goto drop;
	}
	/* ...fall into dropwithreset... */

dropwithreset:
	/*
	 * Generate a RST, dropping incoming segment.
	 * Make ACK acceptable to originator of segment.
	 * Don't bother to respond to RST.
	 */
	if (tiflags & TH_RST)
		goto drop;
	if (tiflags & TH_ACK) {
		tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack,
		    TH_RST, 0);
	} else {
		/* A SYN consumes one unit of sequence space. */
		if (tiflags & TH_SYN)
			tlen++;
		tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen,
		    (tcp_seq)0, TH_RST|TH_ACK, 0);
	}
	m_freem(m);
	return;

drop:
	/*
	 * Drop space held by incoming segment and return.
	 */
	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
		switch (tp->pf) {
#ifdef INET6
		case PF_INET6:
			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6,
			    0, tlen);
			break;
#endif /* INET6 */
		case PF_INET:
			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti,
			    0, tlen);
			break;
		}
	}

	m_freem(m);
	return;
}

/*
 * Parse the TCP options in the segment header at cp (cnt bytes of
 * option space) and record their values in *oi and in the tcpcb.
 * SYN-only options (MSS, window scale, SACK-permitted, timestamp
 * negotiation) are ignored once a SYN has been received.
 * Returns 0 on success, -1 when the segment must be dropped
 * (conflicting or failed TCP-MD5 signature).
 */
int
tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
    struct mbuf *m, int iphlen, struct tcp_opt_info *oi)
{
	u_int16_t mss = 0;
	int opt, optlen;
#ifdef TCP_SIGNATURE
	caddr_t sigp = NULL;		/* location of signature in the options */
	struct tdb *tdb = NULL;		/* SA used to verify the signature */
#endif /* TCP_SIGNATURE */

	for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			/* Need at least kind+length; length must fit. */
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/* bcopy: option data may be unaligned in the mbuf */
			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
			NTOHS(mss);
			oi->maxseg = mss;
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			/* clamp peer's shift to the RFC 1323 maximum */
			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			oi->ts_present = 1;
			bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
			NTOHL(oi->ts_val);
			bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
			NTOHL(oi->ts_ecr);

			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = oi->ts_val;
			tp->ts_recent_age = tcp_now;
			break;

#ifdef TCP_SACK
		case TCPOPT_SACK_PERMITTED:
			if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/* MUST only be set on SYN */
			tp->t_flags |= TF_SACK_PERMIT;
			break;
		case TCPOPT_SACK:
			tcp_sack_option(tp, th, cp, optlen);
			break;
#endif
#ifdef TCP_SIGNATURE
		case TCPOPT_SIGNATURE:
			if (optlen != TCPOLEN_SIGNATURE)
				continue;

			/* two signature options that differ -> drop */
			if (sigp && bcmp(sigp, cp + 2, 16))
				return (-1);

			sigp = cp + 2;
			break;
#endif /* TCP_SIGNATURE */
		}
	}

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		union sockaddr_union src, dst;

		memset(&src, 0, sizeof(union sockaddr_union));
		memset(&dst, 0, sizeof(union sockaddr_union));

		switch (tp->pf) {
		case 0:			/* pf not yet set: assume IPv4 */
#ifdef INET
		case AF_INET:
			src.sa.sa_len = sizeof(struct sockaddr_in);
			src.sa.sa_family = AF_INET;
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sa.sa_len = sizeof(struct sockaddr_in);
			dst.sa.sa_family = AF_INET;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sa.sa_len = sizeof(struct sockaddr_in6);
			src.sa.sa_family = AF_INET6;
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sa.sa_len = sizeof(struct sockaddr_in6);
			dst.sa.sa_family = AF_INET6;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);

		/*
		 * We don't have an SA for this peer, so we turn off
		 * TF_SIGNATURE on the listen socket
		 */
		if (tdb == NULL && tp->t_state == TCPS_LISTEN)
			tp->t_flags &= ~TF_SIGNATURE;

	}

	/* signature present XOR signature required -> drop */
	if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
		tcpstat.tcps_rcvbadsig++;
		return (-1);
	}

	if (sigp) {
		char sig[16];

		if (tdb == NULL) {
			tcpstat.tcps_rcvbadsig++;
			return (-1);
		}

		if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0)
			return (-1);

		if (bcmp(sig, sigp, 16)) {
			tcpstat.tcps_rcvbadsig++;
			return (-1);
		}

		tcpstat.tcps_rcvgoodsig++;
	}
#endif /* TCP_SIGNATURE */

	return (0);
}

#if defined(TCP_SACK)
/*
 * Sequence-space subtraction: returns a - b as an unsigned value,
 * going through long so modular wrap-around is handled.
 */
u_long
tcp_seq_subtract(u_long a, u_long b)
{
	return ((long)(a - b));
}
#endif


#ifdef TCP_SACK
/*
 * This function is called upon receipt of new valid data (while not in header
 * prediction mode), and it updates the ordered list of sacks.
 * [rcv_laststart, rcv_lastend) is the block just received; a zeroed
 * sackblks[] entry marks an unused slot.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart,
    tcp_seq rcv_lastend)
{
	/*
	 * First reported block MUST be the most recent one.  Subsequent
	 * blocks SHOULD be in the order in which they arrived at the
	 * receiver.  These two conditions make the implementation fully
	 * compliant with RFC 2018.
	 */
	int i, j = 0, count = 0, lastpos = -1;
	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];

	/* First clean up current list of sacks */
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (sack.start == 0 && sack.end == 0) {
			count++; /* count = number of blocks to be discarded */
			continue;
		}
		/* blocks at or below rcv_nxt are cumulatively acked */
		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			count++;
		} else {
			temp[j].start = tp->sackblks[i].start;
			temp[j++].end = tp->sackblks[i].end;
		}
	}
	tp->rcv_numsacks -= count;
	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
		tcp_clean_sackreport(tp);
		if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) {
			/* ==> need first sack block */
			tp->sackblks[0].start = rcv_laststart;
			tp->sackblks[0].end = rcv_lastend;
			tp->rcv_numsacks = 1;
		}
		return;
	}
	/* Otherwise, sack blocks are already present. */
	for (i = 0; i < tp->rcv_numsacks; i++)
		tp->sackblks[i] = temp[i]; /* first copy back sack list */
	if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend))
		return; /* sack list remains unchanged */
	/*
	 * From here, segment just received should be (part of) the 1st sack.
	 * Go through list, possibly coalescing sack block entries.
	 */
	firstsack.start = rcv_laststart;
	firstsack.end = rcv_lastend;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (SEQ_LT(sack.end, firstsack.start) ||
		    SEQ_GT(sack.start, firstsack.end))
			continue; /* no overlap */
		if (sack.start == firstsack.start && sack.end == firstsack.end){
			/*
			 * identical block; delete it here since we will
			 * move it to the front of the list.
			 */
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			lastpos = i; /* last posn with a zero entry */
			continue;
		}
		if (SEQ_LEQ(sack.start, firstsack.start))
			firstsack.start = sack.start; /* merge blocks */
		if (SEQ_GEQ(sack.end, firstsack.end))
			firstsack.end = sack.end; /* merge blocks */
		tp->sackblks[i].start = tp->sackblks[i].end = 0;
		lastpos = i; /* last posn with a zero entry */
	}
	if (lastpos != -1) { /* at least one merge */
		/* compact non-zero entries into temp[1..], slot 0 reserved */
		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
			sack = tp->sackblks[i];
			if (sack.start == 0 && sack.end == 0)
				continue;
			temp[j++] = sack;
		}
		tp->rcv_numsacks = j; /* including first blk (added later) */
		for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
			tp->sackblks[i] = temp[i];
	} else { /* no merges -- shift sacks by 1 */
		if (tp->rcv_numsacks < MAX_SACK_BLKS)
			tp->rcv_numsacks++;
		for (i = tp->rcv_numsacks-1; i > 0; i--)
			tp->sackblks[i] = tp->sackblks[i-1];
	}
	tp->sackblks[0] = firstsack;
	return;
}

/*
 * Process the TCP SACK option.  tp->snd_holes is an ordered list
 * of holes (oldest to newest, in terms of the sequence space).
 */
void
tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
{
	int tmp_olen;
	u_char *tmp_cp;
	struct sackhole *cur, *p, *temp;

	if (!tp->sack_enable)
		return;
	/* SACK without ACK doesn't make sense. */
	if ((th->th_flags & TH_ACK) == 0)
		return;
	/* Make sure the ACK on this segment is in [snd_una, snd_max]. */
	if (SEQ_LT(th->th_ack, tp->snd_una) ||
	    SEQ_GT(th->th_ack, tp->snd_max))
		return;
	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
		return;
	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
	tmp_cp = cp + 2;
	tmp_olen = optlen - 2;
	tcpstat.tcps_sack_rcv_opts++;
	if (tp->snd_numholes < 0)
		tp->snd_numholes = 0;
	if (tp->t_maxseg == 0)
		panic("tcp_sack_option"); /* Should never happen */
	/* Walk each SACK block in the option. */
	while (tmp_olen > 0) {
		struct sackblk sack;

		/* bcopy: option data may be unaligned */
		bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
		NTOHL(sack.start);
		bcopy(tmp_cp + sizeof(tcp_seq),
		    (char *) &(sack.end), sizeof(tcp_seq));
		NTOHL(sack.end);
		tmp_olen -= TCPOLEN_SACK;
		tmp_cp += TCPOLEN_SACK;
		if (SEQ_LEQ(sack.end, sack.start))
			continue; /* bad SACK fields */
		if (SEQ_LEQ(sack.end, tp->snd_una))
			continue; /* old block */
#if defined(TCP_SACK) && defined(TCP_FACK)
		/* Updates snd_fack. */
		if (SEQ_GT(sack.end, tp->snd_fack))
			tp->snd_fack = sack.end;
#endif /* TCP_FACK */
		if (SEQ_GT(th->th_ack, tp->snd_una)) {
			if (SEQ_LT(sack.start, th->th_ack))
				continue;
		}
		if (SEQ_GT(sack.end, tp->snd_max))
			continue;
		if (tp->snd_holes == NULL) { /* first hole */
			tp->snd_holes = (struct sackhole *)
			    pool_get(&sackhl_pool, PR_NOWAIT);
			if (tp->snd_holes == NULL) {
				/* ENOBUFS, so ignore SACKed block for now*/
				goto done;
			}
			cur = tp->snd_holes;
			cur->start = th->th_ack;
			cur->end = sack.start;
			cur->rxmit = cur->start;
			cur->next = NULL;
			tp->snd_numholes = 1;
			tp->rcv_lastsack = sack.end;
			/*
			 * dups is at least one.  If more data has been
			 * SACKed, it can be greater than one.
			 */
			cur->dups = min(tcprexmtthresh,
			    ((sack.end - cur->end)/tp->t_maxseg));
			if (cur->dups < 1)
				cur->dups = 1;
			continue; /* with next sack block */
		}
		/* Go thru list of holes:  p = previous,  cur = current */
		p = cur = tp->snd_holes;
		while (cur) {
			if (SEQ_LEQ(sack.end, cur->start))
				/* SACKs data before the current hole */
				break; /* no use going through more holes */
			if (SEQ_GEQ(sack.start, cur->end)) {
				/* SACKs data beyond the current hole */
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LEQ(sack.start, cur->start)) {
				/* Data acks at least the beginning of hole */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(sack.end, cur->rxmit))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    cur->start);
				else
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    cur->start);
#endif /* TCP_FACK */
				if (SEQ_GEQ(sack.end, cur->end)) {
					/* Acks entire hole, so delete hole */
					if (p != cur) {
						p->next = cur->next;
						pool_put(&sackhl_pool, cur);
						cur = p->next;
					} else {
						cur = cur->next;
						pool_put(&sackhl_pool, p);
						p = cur;
						tp->snd_holes = p;
					}
					tp->snd_numholes--;
					continue;
				}
				/* otherwise, move start of hole forward */
				cur->start = sack.end;
				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
				p = cur;
				cur = cur->next;
				continue;
			}
			/* move end of hole backward */
			if (SEQ_GEQ(sack.end, cur->end)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				cur->end = sack.start;
				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LT(cur->start, sack.start) &&
			    SEQ_GT(cur->end, sack.end)) {
				/*
				 * ACKs some data in middle of a hole; need to
				 * split current hole
				 */
				temp = (struct sackhole *)
				    pool_get(&sackhl_pool, PR_NOWAIT);
				if (temp == NULL)
					goto done; /* ENOBUFS */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.end))
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    sack.start);
				else if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				temp->next = cur->next;
				temp->start = sack.end;
				temp->end = cur->end;
				temp->dups = cur->dups;
				temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
				cur->end = sack.start;
				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				cur->next = temp;
				p = temp;
				cur = p->next;
				tp->snd_numholes++;
			}
		}
		/* At this point, p points to the last hole on the list */
		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
			/*
			 * Need to append new hole at end.
			 * Last hole is p (and it's not NULL).
			 */
			temp = (struct sackhole *)
			    pool_get(&sackhl_pool, PR_NOWAIT);
			if (temp == NULL)
				goto done; /* ENOBUFS */
			temp->start = tp->rcv_lastsack;
			temp->end = sack.start;
			temp->dups = min(tcprexmtthresh,
			    ((sack.end - sack.start)/tp->t_maxseg));
			if (temp->dups < 1)
				temp->dups = 1;
			temp->rxmit = temp->start;
			temp->next = 0;
			p->next = temp;
			tp->rcv_lastsack = sack.end;
			tp->snd_numholes++;
		}
	}
done:
#if defined(TCP_SACK) && defined(TCP_FACK)
	/*
	 * Update retran_data and snd_awnd.  Go through the list of
	 * holes.  Increment retran_data by (hole->rxmit - hole->start).
	 */
	tp->retran_data = 0;
	cur = tp->snd_holes;
	while (cur) {
		tp->retran_data += cur->rxmit - cur->start;
		cur = cur->next;
	}
	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
	    tp->retran_data;
#endif /* TCP_FACK */

	return;
}

/*
 * Delete stale (i.e, cumulatively ack'd) holes.  Hole is deleted only if
 * it is completely acked; otherwise, tcp_sack_option(), called from
 * tcp_dooptions(), will fix up the hole.
 */
void
tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
{
	if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
		/* max because this could be an older ack just arrived */
		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
			th->th_ack : tp->snd_una;
		struct sackhole *cur = tp->snd_holes;
		struct sackhole *prev;
		while (cur)
			if (SEQ_LEQ(cur->end, lastack)) {
				/* hole fully acked: free it */
				prev = cur;
				cur = cur->next;
				pool_put(&sackhl_pool, prev);
				tp->snd_numholes--;
			} else if (SEQ_LT(cur->start, lastack)) {
				/* partially acked: trim its front */
				cur->start = lastack;
				if (SEQ_LT(cur->rxmit, cur->start))
					cur->rxmit = cur->start;
				break;
			} else
				break;
		tp->snd_holes = cur;
	}
}

/*
 * Delete all receiver-side SACK information.
 */
void
tcp_clean_sackreport(struct tcpcb *tp)
{
	int i;

	tp->rcv_numsacks = 0;
	for (i = 0; i < MAX_SACK_BLKS; i++)
		tp->sackblks[i].start = tp->sackblks[i].end=0;

}

/*
 * Checks for partial ack.  If partial ack arrives, turn off retransmission
 * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
 * If the ack advances at least to tp->snd_last, return 0.
2811 */ 2812 int 2813 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2814 { 2815 if (SEQ_LT(th->th_ack, tp->snd_last)) { 2816 /* Turn off retx. timer (will start again next segment) */ 2817 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2818 tp->t_rtttime = 0; 2819 #ifndef TCP_FACK 2820 /* 2821 * Partial window deflation. This statement relies on the 2822 * fact that tp->snd_una has not been updated yet. In FACK 2823 * hold snd_cwnd constant during fast recovery. 2824 */ 2825 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2826 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2827 tp->snd_cwnd += tp->t_maxseg; 2828 } else 2829 tp->snd_cwnd = tp->t_maxseg; 2830 #endif 2831 return (1); 2832 } 2833 return (0); 2834 } 2835 #endif /* TCP_SACK */ 2836 2837 /* 2838 * Pull out of band byte out of a segment so 2839 * it doesn't appear in the user's data queue. 2840 * It is still reflected in the segment length for 2841 * sequencing purposes. 2842 */ 2843 void 2844 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2845 { 2846 int cnt = off + urgent - 1; 2847 2848 while (cnt >= 0) { 2849 if (m->m_len > cnt) { 2850 char *cp = mtod(m, caddr_t) + cnt; 2851 struct tcpcb *tp = sototcpcb(so); 2852 2853 tp->t_iobc = *cp; 2854 tp->t_oobflags |= TCPOOB_HAVEDATA; 2855 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2856 m->m_len--; 2857 return; 2858 } 2859 cnt -= m->m_len; 2860 m = m->m_next; 2861 if (m == 0) 2862 break; 2863 } 2864 panic("tcp_pulloutofband"); 2865 } 2866 2867 /* 2868 * Collect new round-trip time estimate 2869 * and update averages and current timeout. 
 */
void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
	short delta;
	short rttmin;

	/* clamp the sample into [0, TCP_RTT_MAX] */
	if (rtt < 0)
		rtt = 0;
	else if (rtt > TCP_RTT_MAX)
		rtt = TCP_RTT_MAX;

	tcpstat.tcps_rttupdated++;
	if (tp->t_srtt != 0) {
		/*
		 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits
		 * after the binary point (scaled by 4), whereas
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 32).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).
		 */
		delta = (rtt << TCP_RTT_BASE_SHIFT) -
		    (tp->t_srtt >> TCP_RTT_SHIFT);
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT;
		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
		tp->t_rttvar = (rtt + 1) <<
		    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1);
	}
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX);
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing
 * interface without forcing IP to fragment; if bigger than
 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 * to utilize large mbufs.  If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * Also take into account the space needed for options that we
 * send regularly.  Make maxseg shorter by that amount to assure
 * that we can send maxseg amount of data even when the options
 * are present.  Store the upper limit of the length of options plus
 * data in maxopd.
 *
 * NOTE: offer == -1 indicates that the maxseg size changed due to
 * Path MTU discovery.
 */
int
tcp_mss(struct tcpcb *tp, int offer)
{
	struct rtentry *rt;
	struct ifnet *ifp;
	int mss, mssopt;
	int iphlen;
	struct inpcb *inp;

	inp = tp->t_inpcb;

	/* start from the conservative default for both values */
	mssopt = mss = tcp_mssdflt;

	rt = in_pcbrtentry(inp);

	if (rt == NULL)
		goto out;

	ifp = rt->rt_ifp;

	switch (tp->pf) {
#ifdef INET6
	case AF_INET6:
		iphlen = sizeof(struct ip6_hdr);
		break;
#endif
	case AF_INET:
		iphlen = sizeof(struct ip);
		break;
	default:
		/* the family does not support path MTU discovery */
		goto out;
	}

#ifdef RTV_MTU
	/*
	 * if there's an mtu associated with the route and we support
	 * path MTU discovery for the underlying protocol family, use it.
	 */
	if (rt->rt_rmx.rmx_mtu) {
		/*
		 * One may wish to lower MSS to take into account options,
		 * especially security-related options.
		 */
		if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
			/*
			 * RFC2460 section 5, last paragraph: if path MTU is
			 * smaller than 1280, use 1280 as packet size and
			 * attach fragment header.
			 */
			mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
			    sizeof(struct tcphdr);
		} else
			mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
	} else
#endif /* RTV_MTU */
	if (!ifp)
		/*
		 * ifp may be null and rmx_mtu may be zero in certain
		 * v6 cases (e.g., if ND wasn't able to resolve the
		 * destination host.
		 */
		goto out;
	else if (ifp->if_flags & IFF_LOOPBACK)
		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	else if (tp->pf == AF_INET) {
		if (ip_mtudisc)
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
		else if (inp && in_localaddr(inp->inp_faddr, inp->inp_rdomain))
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	}
#ifdef INET6
	else if (tp->pf == AF_INET6) {
		/*
		 * for IPv6, path MTU discovery is always turned on,
		 * or the node must use packet size <= 1280.
		 */
		mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr);
	}
#endif /* INET6 */

	/* Calculate the value that we offer in TCPOPT_MAXSEG */
	if (offer != -1) {
#ifndef INET6
		mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
#else
		if (tp->pf == AF_INET6)
			mssopt = IN6_LINKMTU(ifp) - iphlen -
			    sizeof(struct tcphdr);
		else
			mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
#endif

		mssopt = max(tcp_mssdflt, mssopt);
	}

 out:
	/*
	 * The current mss, t_maxseg, is initialized to the default value.
	 * If we compute a smaller value, reduce the current mss.
	 * If we compute a larger value, return it for use in sending
	 * a max seg size option, but don't store it for use
	 * unless we received an offer at least that large from peer.
	 *
	 * However, do not accept offers lower than the minimum of
	 * the interface MTU and 216.
	 */
	if (offer > 0)
		tp->t_peermss = offer;
	if (tp->t_peermss)
		mss = min(mss, max(tp->t_peermss, 216));

	/* sanity - at least max opt. space */
	mss = max(mss, 64);

	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	/* subtract per-segment option overhead we expect to send */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
		mss -= TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE)
		mss -= TCPOLEN_SIGLEN;
#endif

	if (offer == -1) {
		/* mss changed due to Path MTU discovery */
		tp->t_flags &= ~TF_PMTUD_PEND;
		tp->t_pmtud_mtu_sent = 0;
		tp->t_pmtud_mss_acked = 0;
		if (mss < tp->t_maxseg) {
			/*
			 * Follow suggestion in RFC 2414 to reduce the
			 * congestion window by the ratio of the old
			 * segment size to the new segment size.
			 */
			tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
			    mss, mss);
		}
	} else if (tcp_do_rfc3390) {
		/* increase initial window */
		tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
	} else
		tp->snd_cwnd = mss;

	tp->t_maxseg = mss;

	return (offer != -1 ? mssopt : mss);
}

/*
 * Size of the fixed headers plus the TCP options we currently expect
 * to send on this connection (timestamps, MD5 signature).
 */
u_int
tcp_hdrsz(struct tcpcb *tp)
{
	u_int hlen;

	switch (tp->pf) {
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
	default:
		hlen = 0;
		break;
	}
	hlen += sizeof(struct tcphdr);

	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
		hlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE)
		hlen += TCPOLEN_SIGLEN;
#endif
	return (hlen);
}

/*
 * Set connection variables based on the effective MSS.
 * We are passed the TCPCB for the actual connection.
If we 3163 * are the server, we are called by the compressed state engine 3164 * when the 3-way handshake is complete. If we are the client, 3165 * we are called when we receive the SYN,ACK from the server. 3166 * 3167 * NOTE: The t_maxseg value must be initialized in the TCPCB 3168 * before this routine is called! 3169 */ 3170 void 3171 tcp_mss_update(struct tcpcb *tp) 3172 { 3173 int mss; 3174 u_long bufsize; 3175 struct rtentry *rt; 3176 struct socket *so; 3177 3178 so = tp->t_inpcb->inp_socket; 3179 mss = tp->t_maxseg; 3180 3181 rt = in_pcbrtentry(tp->t_inpcb); 3182 3183 if (rt == NULL) 3184 return; 3185 3186 bufsize = so->so_snd.sb_hiwat; 3187 if (bufsize < mss) { 3188 mss = bufsize; 3189 /* Update t_maxseg and t_maxopd */ 3190 tcp_mss(tp, mss); 3191 } else { 3192 bufsize = roundup(bufsize, mss); 3193 if (bufsize > sb_max) 3194 bufsize = sb_max; 3195 (void)sbreserve(&so->so_snd, bufsize); 3196 } 3197 3198 bufsize = so->so_rcv.sb_hiwat; 3199 if (bufsize > mss) { 3200 bufsize = roundup(bufsize, mss); 3201 if (bufsize > sb_max) 3202 bufsize = sb_max; 3203 (void)sbreserve(&so->so_rcv, bufsize); 3204 } 3205 3206 } 3207 3208 #if defined (TCP_SACK) 3209 /* 3210 * Checks for partial ack. If partial ack arrives, force the retransmission 3211 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 3212 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to 3213 * be started again. If the ack advances at least to tp->snd_last, return 0. 3214 */ 3215 int 3216 tcp_newreno(struct tcpcb *tp, struct tcphdr *th) 3217 { 3218 if (SEQ_LT(th->th_ack, tp->snd_last)) { 3219 /* 3220 * snd_una has not been updated and the socket send buffer 3221 * not yet drained of the acked data, so we have to leave 3222 * snd_una as it was to get the correct data offset in 3223 * tcp_output(). 
3224 */ 3225 tcp_seq onxt = tp->snd_nxt; 3226 u_long ocwnd = tp->snd_cwnd; 3227 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3228 tp->t_rtttime = 0; 3229 tp->snd_nxt = th->th_ack; 3230 /* 3231 * Set snd_cwnd to one segment beyond acknowledged offset 3232 * (tp->snd_una not yet updated when this function is called) 3233 */ 3234 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3235 (void) tcp_output(tp); 3236 tp->snd_cwnd = ocwnd; 3237 if (SEQ_GT(onxt, tp->snd_nxt)) 3238 tp->snd_nxt = onxt; 3239 /* 3240 * Partial window deflation. Relies on fact that tp->snd_una 3241 * not updated yet. 3242 */ 3243 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3244 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3245 else 3246 tp->snd_cwnd = 0; 3247 tp->snd_cwnd += tp->t_maxseg; 3248 3249 return 1; 3250 } 3251 return 0; 3252 } 3253 #endif /* TCP_SACK */ 3254 3255 int 3256 tcp_mss_adv(struct ifnet *ifp, int af) 3257 { 3258 int mss = 0; 3259 int iphlen; 3260 3261 switch (af) { 3262 case AF_INET: 3263 if (ifp != NULL) 3264 mss = ifp->if_mtu; 3265 iphlen = sizeof(struct ip); 3266 break; 3267 #ifdef INET6 3268 case AF_INET6: 3269 if (ifp != NULL) 3270 mss = IN6_LINKMTU(ifp); 3271 iphlen = sizeof(struct ip6_hdr); 3272 break; 3273 #endif 3274 } 3275 mss = mss - iphlen - sizeof(struct tcphdr); 3276 return (max(mss, tcp_mssdflt)); 3277 } 3278 3279 /* 3280 * TCP compressed state engine. Currently used to hold compressed 3281 * state for SYN_RECEIVED. 
3282 */ 3283 3284 u_long syn_cache_count; 3285 u_int32_t syn_hash1, syn_hash2; 3286 3287 #define SYN_HASH(sa, sp, dp) \ 3288 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3289 ((u_int32_t)(sp)))^syn_hash2))) 3290 #ifndef INET6 3291 #define SYN_HASHALL(hash, src, dst) \ 3292 do { \ 3293 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3294 ((struct sockaddr_in *)(src))->sin_port, \ 3295 ((struct sockaddr_in *)(dst))->sin_port); \ 3296 } while (/*CONSTCOND*/ 0) 3297 #else 3298 #define SYN_HASH6(sa, sp, dp) \ 3299 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3300 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3301 & 0x7fffffff) 3302 3303 #define SYN_HASHALL(hash, src, dst) \ 3304 do { \ 3305 switch ((src)->sa_family) { \ 3306 case AF_INET: \ 3307 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3308 ((struct sockaddr_in *)(src))->sin_port, \ 3309 ((struct sockaddr_in *)(dst))->sin_port); \ 3310 break; \ 3311 case AF_INET6: \ 3312 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 3313 ((struct sockaddr_in6 *)(src))->sin6_port, \ 3314 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 3315 break; \ 3316 default: \ 3317 hash = 0; \ 3318 } \ 3319 } while (/*CONSTCOND*/0) 3320 #endif /* INET6 */ 3321 3322 #define SYN_CACHE_RM(sc) \ 3323 do { \ 3324 (sc)->sc_flags |= SCF_DEAD; \ 3325 TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \ 3326 (sc), sc_bucketq); \ 3327 (sc)->sc_tp = NULL; \ 3328 LIST_REMOVE((sc), sc_tpq); \ 3329 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 3330 timeout_del(&(sc)->sc_timer); \ 3331 syn_cache_count--; \ 3332 } while (/*CONSTCOND*/0) 3333 3334 #define SYN_CACHE_PUT(sc) \ 3335 do { \ 3336 if ((sc)->sc_ipopts) \ 3337 (void) m_free((sc)->sc_ipopts); \ 3338 if ((sc)->sc_route4.ro_rt != NULL) \ 3339 RTFREE((sc)->sc_route4.ro_rt); \ 3340 timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc)); \ 3341 timeout_add(&(sc)->sc_timer, 0); \ 3342 } while (/*CONSTCOND*/0) 3343 
struct pool syn_cache_pool;

/*
 * We don't estimate RTT with SYNs, so each packet starts with the default
 * RTT and each timer step has a fixed timeout value.
 */
#define	SYN_CACHE_TIMER_ARM(sc)						\
do {									\
	TCPT_RANGESET((sc)->sc_rxtcur,					\
	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN,	\
	    TCPTV_REXMTMAX);						\
	if (!timeout_initialized(&(sc)->sc_timer))			\
		timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc));	\
	timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
} while (/*CONSTCOND*/0)

/* Per-entry timestamp base, offset by a per-entry random modulator. */
#define	SYN_CACHE_TIMESTAMP(sc)	tcp_now + (sc)->sc_modulate

/*
 * Initialize the syn cache hash buckets and the backing pool.
 * Called once at TCP attach time.
 */
void
syn_cache_init()
{
	int i;

	/* Initialize the hash buckets. */
	for (i = 0; i < tcp_syn_cache_size; i++)
		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);

	/* Initialize the syn cache pool. */
	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
	    "synpl", NULL);
}

/*
 * Insert a new entry into the syn cache, evicting old entries when
 * either the per-bucket or the global limit would be exceeded, and
 * arm its retransmission timer.
 */
void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
	struct syn_cache_head *scp;
	struct syn_cache *sc2;
	int s;

	/*
	 * If there are no entries in the hash table, reinitialize
	 * the hash secrets.
	 */
	if (syn_cache_count == 0) {
		syn_hash1 = arc4random();
		syn_hash2 = arc4random();
	}

	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
	scp = &tcp_syn_cache[sc->sc_bucketidx];

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	s = splsoftnet();
	if (scp->sch_length >= tcp_syn_bucket_limit) {
		tcpstat.tcps_sc_bucketoverflow++;
		/*
		 * The bucket is full.  Toss the oldest element in the
		 * bucket.  This will be the first entry in the bucket.
		 */
		sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
		/*
		 * This should never happen; we should always find an
		 * entry in our bucket.
		 */
		if (sc2 == NULL)
			panic("syn_cache_insert: bucketoverflow: impossible");
#endif
		SYN_CACHE_RM(sc2);
		SYN_CACHE_PUT(sc2);
	} else if (syn_cache_count >= tcp_syn_cache_limit) {
		struct syn_cache_head *scp2, *sce;

		tcpstat.tcps_sc_overflowed++;
		/*
		 * The cache is full.  Toss the oldest entry in the
		 * first non-empty bucket we can find.
		 *
		 * XXX We would really like to toss the oldest
		 * entry in the cache, but we hope that this
		 * condition doesn't happen very often.
		 */
		scp2 = scp;
		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
			/* scan the bucket array circularly from scp */
			sce = &tcp_syn_cache[tcp_syn_cache_size];
			for (++scp2; scp2 != scp; scp2++) {
				if (scp2 >= sce)
					scp2 = &tcp_syn_cache[0];
				if (! TAILQ_EMPTY(&scp2->sch_bucket))
					break;
			}
#ifdef DIAGNOSTIC
			/*
			 * This should never happen; we should always find a
			 * non-empty bucket.
			 */
			if (scp2 == scp)
				panic("syn_cache_insert: cacheoverflow: "
				    "impossible");
#endif
		}
		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
		SYN_CACHE_RM(sc2);
		SYN_CACHE_PUT(sc2);
	}

	/*
	 * Initialize the entry's timer.
	 */
	sc->sc_rxttot = 0;
	sc->sc_rxtshift = 0;
	SYN_CACHE_TIMER_ARM(sc);

	/* Link it from tcpcb entry */
	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);

	/* Put it into the bucket. */
	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
	scp->sch_length++;
	syn_cache_count++;

	tcpstat.tcps_sc_added++;
	splx(s);
}

/*
 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 * If we have retransmitted an entry the maximum number of times, expire
 * that entry.
3477 */ 3478 void 3479 syn_cache_timer(void *arg) 3480 { 3481 struct syn_cache *sc = arg; 3482 int s; 3483 3484 s = splsoftnet(); 3485 if (sc->sc_flags & SCF_DEAD) { 3486 splx(s); 3487 return; 3488 } 3489 3490 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3491 /* Drop it -- too many retransmissions. */ 3492 goto dropit; 3493 } 3494 3495 /* 3496 * Compute the total amount of time this entry has 3497 * been on a queue. If this entry has been on longer 3498 * than the keep alive timer would allow, expire it. 3499 */ 3500 sc->sc_rxttot += sc->sc_rxtcur; 3501 if (sc->sc_rxttot >= tcptv_keep_init) 3502 goto dropit; 3503 3504 tcpstat.tcps_sc_retransmitted++; 3505 (void) syn_cache_respond(sc, NULL); 3506 3507 /* Advance the timer back-off. */ 3508 sc->sc_rxtshift++; 3509 SYN_CACHE_TIMER_ARM(sc); 3510 3511 splx(s); 3512 return; 3513 3514 dropit: 3515 tcpstat.tcps_sc_timed_out++; 3516 SYN_CACHE_RM(sc); 3517 SYN_CACHE_PUT(sc); 3518 splx(s); 3519 } 3520 3521 void 3522 syn_cache_reaper(void *arg) 3523 { 3524 struct syn_cache *sc = arg; 3525 int s; 3526 3527 s = splsoftnet(); 3528 pool_put(&syn_cache_pool, (sc)); 3529 splx(s); 3530 return; 3531 } 3532 3533 /* 3534 * Remove syn cache created by the specified tcb entry, 3535 * because this does not make sense to keep them 3536 * (if there's no tcb entry, syn cache entry will never be used) 3537 */ 3538 void 3539 syn_cache_cleanup(struct tcpcb *tp) 3540 { 3541 struct syn_cache *sc, *nsc; 3542 int s; 3543 3544 s = splsoftnet(); 3545 3546 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 3547 nsc = LIST_NEXT(sc, sc_tpq); 3548 3549 #ifdef DIAGNOSTIC 3550 if (sc->sc_tp != tp) 3551 panic("invalid sc_tp in syn_cache_cleanup"); 3552 #endif 3553 SYN_CACHE_RM(sc); 3554 SYN_CACHE_PUT(sc); 3555 } 3556 /* just for safety */ 3557 LIST_INIT(&tp->t_sc); 3558 3559 splx(s); 3560 } 3561 3562 /* 3563 * Find an entry in the syn cache. 
3564 */ 3565 struct syn_cache * 3566 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst, 3567 struct syn_cache_head **headp, u_int rdomain) 3568 { 3569 struct syn_cache *sc; 3570 struct syn_cache_head *scp; 3571 u_int32_t hash; 3572 int s; 3573 3574 SYN_HASHALL(hash, src, dst); 3575 3576 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 3577 *headp = scp; 3578 s = splsoftnet(); 3579 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; 3580 sc = TAILQ_NEXT(sc, sc_bucketq)) { 3581 if (sc->sc_hash != hash) 3582 continue; 3583 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3584 !bcmp(&sc->sc_dst, dst, dst->sa_len) && 3585 rtable_l2(rdomain) == rtable_l2(sc->sc_rdomain)) { 3586 splx(s); 3587 return (sc); 3588 } 3589 } 3590 splx(s); 3591 return (NULL); 3592 } 3593 3594 /* 3595 * This function gets called when we receive an ACK for a 3596 * socket in the LISTEN state. We look up the connection 3597 * in the syn cache, and if its there, we pull it out of 3598 * the cache and turn it into a full-blown connection in 3599 * the SYN-RECEIVED state. 3600 * 3601 * The return values may not be immediately obvious, and their effects 3602 * can be subtle, so here they are: 3603 * 3604 * NULL SYN was not found in cache; caller should drop the 3605 * packet and send an RST. 3606 * 3607 * -1 We were unable to create the new connection, and are 3608 * aborting it. An ACK,RST is being sent to the peer 3609 * (unless we got screwey sequence numbners; see below), 3610 * because the 3-way handshake has been completed. Caller 3611 * should not free the mbuf, since we may be using it. If 3612 * we are not, we will free it. 3613 * 3614 * Otherwise, the return value is a pointer to the new socket 3615 * associated with the connection. 
 */
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int hlen, u_int tlen, struct socket *so, struct mbuf *m)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = 0;
	struct mbuf *am;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp,
	    sotoinpcb(so)->inp_rdomain)) == NULL) {
		splx(s);
		return (NULL);
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.  The ACK must acknowledge exactly our ISS+1 and
	 * the segment must fall inside the window we advertised.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		(void) syn_cache_respond(sc, m);
		splx(s);
		return ((struct socket *)(-1));
	}

	/* Remove this cache entry */
	SYN_CACHE_RM(sc);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	oso = so;
	so = sonewconn(so, SS_ISCONNECTED);
	if (so == NULL)
		goto resetandabort;

	inp = sotoinpcb(oso);

#ifdef IPSEC
	/*
	 * We need to copy the required security levels
	 * from the old pcb. Ditto for any other
	 * IPsec-related information.
	 */
	{
		struct inpcb *newinp = (struct inpcb *)so->so_pcb;
		bcopy(inp->inp_seclevel, newinp->inp_seclevel,
		    sizeof(inp->inp_seclevel));
		newinp->inp_secrequire = inp->inp_secrequire;
		/* Shared policy objects are reference counted, not copied. */
		if (inp->inp_ipo != NULL) {
			newinp->inp_ipo = inp->inp_ipo;
			inp->inp_ipo->ipo_ref_count++;
		}
		if (inp->inp_ipsec_remotecred != NULL) {
			newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred;
			inp->inp_ipsec_remotecred->ref_count++;
		}
		if (inp->inp_ipsec_remoteauth != NULL) {
			newinp->inp_ipsec_remoteauth
			    = inp->inp_ipsec_remoteauth;
			inp->inp_ipsec_remoteauth->ref_count++;
		}
	}
#endif /* IPSEC */
#ifdef INET6
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.
	 */
	{
		int flags = inp->inp_flags;
		struct inpcb *oldinpcb = inp;

		inp = (struct inpcb *)so->so_pcb;
		inp->inp_flags |= (flags & INP_IPV6);
		if ((inp->inp_flags & INP_IPV6) != 0) {
			inp->inp_ipv6.ip6_hlim =
			    oldinpcb->inp_ipv6.ip6_hlim;
		}
	}
#else /* INET6 */
	inp = (struct inpcb *)so->so_pcb;
#endif /* INET6 */

	/* inherit rdomain from listening socket */
	inp->inp_rdomain = sc->sc_rdomain;

	inp->inp_lport = th->th_dport;
	switch (src->sa_family) {
#ifdef INET6
	case AF_INET6:
		inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr;
		break;
#endif /* INET6 */
	case AF_INET:
		inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
		/*
		 * Prefer a source route recorded from the just-received
		 * segment; fall back to the one saved with the SYN.
		 */
		inp->inp_options = ip_srcroute();
		if (inp->inp_options == NULL) {
			inp->inp_options = sc->sc_ipopts;
			sc->sc_ipopts = NULL;
		}
		break;
	}
	in_pcbrehash(inp);

	/*
	 * Give the new socket our cached route reference.
	 */
	if (src->sa_family == AF_INET)
		inp->inp_route = sc->sc_route4;		/* struct assignment */
#ifdef INET6
	else
		inp->inp_route6 = sc->sc_route6;
#endif
	/* Ownership of the route moved above; don't free it in PUT. */
	sc->sc_route4.ro_rt = NULL;

	am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
	if (am == NULL)
		goto resetandabort;
	am->m_len = src->sa_len;
	bcopy(src, mtod(am, caddr_t), src->sa_len);

	switch (src->sa_family) {
	case AF_INET:
		/* drop IPv4 packet to AF_INET6 socket */
		if (inp->inp_flags & INP_IPV6) {
			(void) m_free(am);
			goto resetandabort;
		}
		if (in_pcbconnect(inp, am)) {
			(void) m_free(am);
			goto resetandabort;
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (in6_pcbconnect(inp, am)) {
			(void) m_free(am);
			goto resetandabort;
		}
		break;
#endif
	}
	(void) m_free(am);

	/* Transfer the negotiated options cached with the SYN. */
	tp = intotcpcb(inp);
	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
	/* r_scale == 15 is the sentinel for "no window scaling agreed". */
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		so = NULL;
		m_freem(m);
		goto abort;
	}
#ifdef TCP_SACK
	tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
#endif

	tp->ts_modulate = sc->sc_modulate;
	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
#if defined (TCP_SACK) || defined(TCP_ECN)
	tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
	tp->snd_fack = tp->snd_una;
	tp->retran_data = 0;
	tp->snd_awnd = 0;
#endif /* TCP_FACK */
#ifdef TCP_ECN
	if (sc->sc_flags & SCF_ECN_PERMIT) {
		tp->t_flags |= TF_ECN_PERMIT;
		tcpstat.tcps_ecn_accepts++;
	}
#endif
#ifdef TCP_SACK
	if (sc->sc_flags & SCF_SACK_PERMIT)
		tp->t_flags |= TF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE)
		tp->t_flags |= TF_SIGNATURE;
#endif
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	tp->t_rcvtime = tcp_now;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
	tcpstat.tcps_accepts++;

	tcp_mss(tp, sc->sc_peermaxseg);	 /* sets t_maxseg */
	if (sc->sc_peermaxseg)
		tcp_mss_update(tp);
	/* Reset initial window to 1 segment for retransmit */
	if (sc->sc_rxtshift > 0)
		tp->snd_cwnd = tp->t_maxseg;
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what would have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;

	tcpstat.tcps_sc_completed++;
	SYN_CACHE_PUT(sc);
	return (so);

resetandabort:
	tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
	    m->m_pkthdr.rdomain);
	m_freem(m);
abort:
	if (so != NULL)
		(void) soabort(so);
	SYN_CACHE_PUT(sc);
	tcpstat.tcps_sc_aborted++;
	return ((struct socket *)(-1));
}

/*
 * This function is called when we get a RST for a
 * non-existent connection, so that we can see if the
 * connection is in the syn cache.  If it is, zap it.
3866 */ 3867 3868 void 3869 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3870 u_int rdomain) 3871 { 3872 struct syn_cache *sc; 3873 struct syn_cache_head *scp; 3874 int s = splsoftnet(); 3875 3876 if ((sc = syn_cache_lookup(src, dst, &scp, rdomain)) == NULL) { 3877 splx(s); 3878 return; 3879 } 3880 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3881 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3882 splx(s); 3883 return; 3884 } 3885 SYN_CACHE_RM(sc); 3886 splx(s); 3887 tcpstat.tcps_sc_reset++; 3888 SYN_CACHE_PUT(sc); 3889 } 3890 3891 void 3892 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3893 u_int rdomain) 3894 { 3895 struct syn_cache *sc; 3896 struct syn_cache_head *scp; 3897 int s; 3898 3899 s = splsoftnet(); 3900 if ((sc = syn_cache_lookup(src, dst, &scp, rdomain)) == NULL) { 3901 splx(s); 3902 return; 3903 } 3904 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3905 if (ntohl (th->th_seq) != sc->sc_iss) { 3906 splx(s); 3907 return; 3908 } 3909 3910 /* 3911 * If we've retransmitted 3 times and this is our second error, 3912 * we remove the entry. Otherwise, we allow it to continue on. 3913 * This prevents us from incorrectly nuking an entry during a 3914 * spurious network outage. 3915 * 3916 * See tcp_notify(). 3917 */ 3918 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3919 sc->sc_flags |= SCF_UNREACH; 3920 splx(s); 3921 return; 3922 } 3923 3924 SYN_CACHE_RM(sc); 3925 splx(s); 3926 tcpstat.tcps_sc_unreach++; 3927 SYN_CACHE_PUT(sc); 3928 } 3929 3930 /* 3931 * Given a LISTEN socket and an inbound SYN request, add 3932 * this to the syn cache, and send back a segment: 3933 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3934 * to the source. 3935 * 3936 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3937 * Doing so would require that we hold onto the data and deliver it 3938 * to the application. 
 * However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */

int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
    struct tcp_opt_info *oi, tcp_seq *issp)
{
	/*
	 * tb is a throw-away tcpcb used only so tcp_dooptions() has
	 * somewhere to record the parsed options; only the fields set
	 * below are initialized.
	 */
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;

	tp = sototcpcb(so);

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 *
	 * Note this check is performed in tcp_input() very early on.
	 */

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE)) {
#else
	if (optp) {
#endif
		tb.pf = tp->pf;
#ifdef TCP_SACK
		tb.sack_enable = tp->sack_enable;
#endif
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		if (tp->t_flags & TF_SIGNATURE)
			tb.t_flags |= TF_SIGNATURE;
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi))
			return (0);
	} else
		tb.t_flags = 0;

	switch (src->sa_family) {
#ifdef INET
	case AF_INET:
		/*
		 * Remember the IP options, if any.
		 */
		ipopts = ip_srcroute();
		break;
#endif
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	if ((sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rdomain))
	    != NULL) {
		tcpstat.tcps_sc_dupesyn++;
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void) m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		sc->sc_timestamp = tb.ts_recent;
		if (syn_cache_respond(sc, m) == 0) {
			tcpstat.tcps_sndacks++;
			tcpstat.tcps_sndtotal++;
		}
		return (1);
	}

	sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
	if (sc == NULL) {
		if (ipopts)
			(void) m_free(ipopts);
		return (0);
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	bcopy(src, &sc->sc_src, src->sa_len);
	bcopy(dst, &sc->sc_dst, dst->sa_len);
	sc->sc_rdomain = sotoinpcb(so)->inp_rdomain;
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;

	sc->sc_iss = issp ? *issp : arc4random();
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
	    m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timestamp = tb.ts_recent;
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
		sc->sc_flags |= SCF_TIMESTAMP;
		/* Random offset added to the timestamps we send. */
		sc->sc_modulate = arc4random();
	}
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		/*
		 * Pick the smallest shift that can still represent our
		 * receive buffer high-water mark.
		 */
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    TCP_MAXWIN << sc->sc_request_r_scale <
		    so->so_rcv.sb_hiwat)
			sc->sc_request_r_scale++;
	} else {
		/* 15 is the sentinel for "no window scaling". */
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
#ifdef TCP_ECN
	/*
	 * if both ECE and CWR flag bits are set, peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		SYN_CACHE_PUT(sc);
		tcpstat.tcps_sc_dropped++;
	}
	return (1);
}

/*
 * Build and transmit the SYN,ACK for the given cache entry.  Called
 * both for the initial response and for retransmits; m (the received
 * segment, may be NULL on retransmit) is always consumed.  Returns 0
 * on success or an errno.
 */
int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options: MSS is always sent. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.  The received mbuf is
	 * not reused, only freed.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.rdomain = sc->sc_rdomain;
	memset(mtod(m, u_char *), 0, tlen);

	/*
	 * The reply goes back to the SYN's originator, so source and
	 * destination are swapped relative to the cache entry.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		/* unreachable: handled by the first switch above */
		th = NULL;
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/*default to PF_INET*/
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/* Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/*
	 * Compute the packet's checksum.  ip_len temporarily holds the
	 * TCP segment length for the pseudo-header sum; it is set to
	 * the real datagram length further below.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in host order, for convenience.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		/* XXX tos? */
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and require no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    (struct ip_moptions *)NULL, inp);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    (struct ip6_moptions *)0, NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}