/*	$OpenBSD: tcp_input.c,v 1.212 2008/02/20 11:24:02 markus Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <dev/rndvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct ifnet *, int);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

extern u_long sb_max;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif
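/*
 * Note on TSTMP_LT/TSTMP_GEQ above: casting the 32-bit difference to a
 * signed int makes the comparison robust across timestamp wraparound.
 * Illustrative example (numbers are not from this file): with
 * a = 0x00000002 and b = 0xfffffffe, (int)(a - b) == 4 > 0, so
 * TSTMP_GEQ(a, b) holds even though a < b as unsigned values, provided
 * the two timestamps are less than 2^31 ticks apart.
 */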
/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define	TCP_SETUP_ACK(tp, tiflags) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(tp, th, m, tlen)
	struct tcpcb *tp;
	struct tcphdr *th;
	struct mbuf *m;
	int *tlen;
{
	struct tcpqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after becoming established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}
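	/*
	 * Worked example of the overlap trimming above (illustrative
	 * numbers only): suppose a queued segment covers seq 100..299
	 * (th_seq 100, len 200) and the incoming segment covers seq
	 * 150..449 (th_seq 150, *tlen 300).  The preceding-segment pass
	 * computes i = 100 + 200 - 150 = 150 and trims 150 bytes from
	 * the front, leaving seq 300..449; the succeeding pass then
	 * trims or dequeues any queued segments that range covers.
	 */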
	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	return (flags);
}

#ifdef INET6
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	struct mbuf *m = *mp;

#if defined(NFAITH) && 0 < NFAITH
	if (m->m_pkthdr.rcvif) {
		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
			/* XXX send icmp6 host/port unreach? */
			m_freem(m);
			return IPPROTO_DONE;
		}
	}
#endif

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * is there a better place to put this?
	 */
	if (m->m_flags & M_ANYCAST6) {
		if (m->m_len >= sizeof(struct ip6_hdr)) {
			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		} else
			m_freem(m);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = 0;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error, s;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 * See below for AF specific multicast.
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}
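	/*
	 * Illustrative note on the dispatch above: both IPv4 and IPv6
	 * carry the version number in the high nibble of the first
	 * header byte, so a first byte of 0x45 (version 4, header
	 * length 5 * 4 = 20 bytes) selects AF_INET, while 0x60 selects
	 * AF_INET6.  The example bytes are for illustration only.
	 */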
	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		if (IN_MULTICAST(ip->ip_dst.s_addr) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
			goto drop;
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		/*
		 * Checksum extended TCP header and data.
		 */
		if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
			if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
				tcpstat.tcps_inhwcsum++;
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
			if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
		} else {
			m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
			tcpstat.tcps_inhwcsum++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbound/unconnected pcb,
		 * an unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Checksum extended TCP header and data.
		 */
		if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
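		/*
		 * For illustration: th_off counts 32-bit words, so a
		 * bare header has th_off 5 (20 bytes) and carries no
		 * options.  A segment with th_off 8 gives off = 32 and
		 * optlen = 12, which matches TCPOLEN_TSTAMP_APPA, the
		 * RFC 1323 appendix A layout (NOP, NOP, timestamp).
		 * Example values only.
		 */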
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
findpcb:
	switch (af) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, th->th_sport,
		    &ip6->ip6_dst, th->th_dport);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport);
		break;
	}
	if (inp == 0) {
		int inpl_flags = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_flags = INPLOOKUP_WILDCARD;
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_flags);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_flags);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		goto drop;

	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;
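	/*
	 * Illustrative note on the window unscaling below (example
	 * numbers only): with a negotiated snd_scale of 3, an
	 * advertised th_win of 8192 yields tiwin = 8192 << 3 = 65536
	 * bytes.  The window field of a SYN is never scaled (RFC 1323),
	 * hence the check on TH_SYN.
	 */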
	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				if (tiflags & TH_RST) {
					syn_cache_reset(&src.sa, &dst.sa, th);
				} else if ((tiflags & (TH_ACK|TH_SYN)) ==
				    (TH_ACK|TH_SYN)) {
					/*
					 * Received a SYN,ACK.  This should
					 * never happen while we are in
					 * LISTEN.  Send an RST.
					 */
					goto badsyn;
				} else if (tiflags & TH_ACK) {
					so = syn_cache_get(&src.sa, &dst.sa,
					    th, iphlen, tlen, so, m);
					if (so == NULL) {
						/*
						 * We don't have a SYN for
						 * this ACK; send an RST.
						 */
						goto badsyn;
					} else if (so ==
					    (struct socket *)(-1)) {
						/*
						 * We were unable to create
						 * the connection.  If the
						 * 3-way handshake was
						 * completed, an RST has
						 * been sent to the peer.
						 * Since the mbuf might be
						 * in use for the reply,
						 * do not free it.
						 */
						m = NULL;
					} else {
						/*
						 * We have created a
						 * full-blown connection.
						 */
						tp = NULL;
						inp = (struct inpcb *)so->so_pcb;
						tp = intotcpcb(inp);
						if (tp == NULL)
							goto badsyn;	/*XXX*/

						/*
						 * Compute proper scaling
						 * value from buffer space
						 */
						tcp_rscale(tp, so->so_rcv.sb_hiwat);
						goto after_listen;
					}
				} else {
					/*
					 * None of RST, SYN or ACK was set.
					 * This is an invalid packet for a
					 * TCB in LISTEN state.  Send a RST.
					 */
					goto badsyn;
				}
			} else {
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise because it
				 * is much better for the peer to receive an
				 * RST, and the RST will be the final packet
				 * of the exchange.
				 *
				 * If we do not forbid deprecated addresses,
				 * we accept the SYN packet.
				 * RFC 2462 does not suggest dropping a SYN
				 * in this case.
				 * Reading RFC 2462 5.5.4 closely, it says:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into the (2b) case as we have
				 * no choice in our source address selection -
				 * we must obey the peer.
				 *
				 * The wording in RFC 2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;

					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
						tp = NULL;
						goto dropwithreset;
					}
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen <= so->so_qlimit &&
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse))
					m = NULL;
			}
			goto drop;
		}
	}

after_listen:
#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	s = splnet();
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp);
	if (error) {
		splx(s);
		goto drop;
	}

	/* Latch SA */
	if (inp->inp_tdb_in != tdb) {
		if (tdb) {
			tdb_add_inp(tdb, inp, 1);
			if (inp->inp_ipo == NULL) {
				inp->inp_ipo = ipsec_add_policy(inp, af,
				    IPSP_DIRECTION_OUT);
				if (inp->inp_ipo == NULL) {
					splx(s);
					goto drop;
				}
			}
			if (inp->inp_ipo->ipo_dstid == NULL &&
			    tdb->tdb_srcid != NULL) {
				inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
				tdb->tdb_srcid->ref_count++;
			}
			if (inp->inp_ipsec_remotecred == NULL &&
			    tdb->tdb_remote_cred != NULL) {
				inp->inp_ipsec_remotecred =
				    tdb->tdb_remote_cred;
				tdb->tdb_remote_cred->ref_count++;
			}
			if (inp->inp_ipsec_remoteauth == NULL &&
			    tdb->tdb_remote_auth != NULL) {
				inp->inp_ipsec_remoteauth =
				    tdb->tdb_remote_auth;
				tdb->tdb_remote_auth->ref_count++;
			}
		} else { /* Just reset */
			TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
			    inp_tdb_in_next);
			inp->inp_tdb_in = NULL;
		}
	}
	splx(s);
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
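				/*
				 * Illustrative note on the RTT samples
				 * above: they are measured in tcp_now
				 * ticks, which advance PR_SLOWHZ times
				 * per second (cf. the TCP_PAWS_IDLE
				 * definition).  With the traditional
				 * PR_SLOWHZ of 2, an echoed timestamp
				 * 4 ticks old corresponds to an RTT of
				 * roughly 2 seconds.  Example numbers
				 * only.
				 */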
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded
				 * ICMP message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				if (sb_notify(&so->so_snd))
					sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			sorwakeup(so);
			TCP_SETUP_ACK(tp, tiflags);
			if (tp->t_flags & TF_ACKNOW)
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}
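	/*
	 * Illustrative note on the computation above (example numbers
	 * only): if sbspace() reports just 4096 bytes free but we have
	 * already advertised up to rcv_adv - rcv_nxt = 8192 bytes,
	 * rcv_wnd stays 8192 so the window never appears to shrink
	 * below what the peer was told.
	 */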
	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains an RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment:
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ACK|TH_ECE) ||
			    (tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ECE|TH_CWR)) {
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_reass_lock(tp);
			(void) tcp_reass(tp, (struct tcphdr *)0,
			    (struct mbuf *)0, &tlen);
			tcp_reass_unlock(tp);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
1259 */ 1260 th->th_seq++; 1261 if (tlen > tp->rcv_wnd) { 1262 todrop = tlen - tp->rcv_wnd; 1263 m_adj(m, -todrop); 1264 tlen = tp->rcv_wnd; 1265 tiflags &= ~TH_FIN; 1266 tcpstat.tcps_rcvpackafterwin++; 1267 tcpstat.tcps_rcvbyteafterwin += todrop; 1268 } 1269 tp->snd_wl1 = th->th_seq - 1; 1270 tp->rcv_up = th->th_seq; 1271 goto step6; 1272 /* 1273 * If a new connection request is received while in TIME_WAIT, 1274 * drop the old connection and start over if the if the 1275 * timestamp or the sequence numbers are above the previous 1276 * ones. 1277 */ 1278 case TCPS_TIME_WAIT: 1279 if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) && 1280 ((opti.ts_present && 1281 TSTMP_LT(tp->ts_recent, opti.ts_val)) || 1282 SEQ_GT(th->th_seq, tp->rcv_nxt))) { 1283 /* 1284 * Advance the iss by at least 32768, but 1285 * clear the msb in order to make sure 1286 * that SEG_LT(snd_nxt, iss). 1287 */ 1288 iss = tp->snd_nxt + 1289 ((arc4random() & 0x7fffffff) | 0x8000); 1290 reuse = &iss; 1291 tp = tcp_close(tp); 1292 goto findpcb; 1293 } 1294 } 1295 1296 /* 1297 * States other than LISTEN or SYN_SENT. 1298 * First check timestamp, if present. 1299 * Then check that at least some bytes of segment are within 1300 * receive window. If segment begins before rcv_nxt, 1301 * drop leading data (and SYN); if nothing left, just ack. 1302 * 1303 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1304 * and it's less than opti.ts_recent, drop it. 1305 */ 1306 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1307 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1308 1309 /* Check to see if ts_recent is over 24 days old. */ 1310 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1311 /* 1312 * Invalidate ts_recent. If this segment updates 1313 * ts_recent, the age will be reset later and ts_recent 1314 * will get a valid value. If it does not, setting 1315 * ts_recent to zero will at least satisfy the 1316 * requirement that zero be placed in the timestamp 1317 * echo reply when ts_recent isn't valid. The 1318 * age isn't reset until we get a valid ts_recent 1319 * because we don't want out-of-order segments to be 1320 * dropped when ts_recent is old. 1321 */ 1322 tp->ts_recent = 0; 1323 } else { 1324 tcpstat.tcps_rcvduppack++; 1325 tcpstat.tcps_rcvdupbyte += tlen; 1326 tcpstat.tcps_pawsdrop++; 1327 goto dropafterack; 1328 } 1329 } 1330 1331 todrop = tp->rcv_nxt - th->th_seq; 1332 if (todrop > 0) { 1333 if (tiflags & TH_SYN) { 1334 tiflags &= ~TH_SYN; 1335 th->th_seq++; 1336 if (th->th_urp > 1) 1337 th->th_urp--; 1338 else 1339 tiflags &= ~TH_URG; 1340 todrop--; 1341 } 1342 if (todrop > tlen || 1343 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1344 /* 1345 * Any valid FIN must be to the left of the 1346 * window. At this point, FIN must be a 1347 * duplicate or out-of-sequence, so drop it. 1348 */ 1349 tiflags &= ~TH_FIN; 1350 /* 1351 * Send ACK to resynchronize, and drop any data, 1352 * but keep on processing for RST or ACK. 1353 */ 1354 tp->t_flags |= TF_ACKNOW; 1355 tcpstat.tcps_rcvdupbyte += todrop = tlen; 1356 tcpstat.tcps_rcvduppack++; 1357 } else { 1358 tcpstat.tcps_rcvpartduppack++; 1359 tcpstat.tcps_rcvpartdupbyte += todrop; 1360 } 1361 hdroptlen += todrop; /* drop from head afterwards */ 1362 th->th_seq += todrop; 1363 tlen -= todrop; 1364 if (th->th_urp > todrop) 1365 th->th_urp -= todrop; 1366 else { 1367 tiflags &= ~TH_URG; 1368 th->th_urp = 0; 1369 } 1370 } 1371 1372 /* 1373 * If new data are received on a connection after the 1374 * user processes are gone, then RST the other end. 
1375 */ 1376 if ((so->so_state & SS_NOFDREF) && 1377 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1378 tp = tcp_close(tp); 1379 tcpstat.tcps_rcvafterclose++; 1380 goto dropwithreset; 1381 } 1382 1383 /* 1384 * If segment ends after window, drop trailing data 1385 * (and PUSH and FIN); if nothing left, just ACK. 1386 */ 1387 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1388 if (todrop > 0) { 1389 tcpstat.tcps_rcvpackafterwin++; 1390 if (todrop >= tlen) { 1391 tcpstat.tcps_rcvbyteafterwin += tlen; 1392 /* 1393 * If window is closed can only take segments at 1394 * window edge, and have to drop data and PUSH from 1395 * incoming segments. Continue processing, but 1396 * remember to ack. Otherwise, drop segment 1397 * and ack. 1398 */ 1399 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1400 tp->t_flags |= TF_ACKNOW; 1401 tcpstat.tcps_rcvwinprobe++; 1402 } else 1403 goto dropafterack; 1404 } else 1405 tcpstat.tcps_rcvbyteafterwin += todrop; 1406 m_adj(m, -todrop); 1407 tlen -= todrop; 1408 tiflags &= ~(TH_PUSH|TH_FIN); 1409 } 1410 1411 /* 1412 * If last ACK falls within this segment's sequence numbers, 1413 * record its timestamp if it's more recent. 1414 * Cf fix from Braden, see Stevens p. 870 1415 */ 1416 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1417 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1418 if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1419 ((tiflags & (TH_SYN|TH_FIN)) != 0))) 1420 tp->ts_recent = opti.ts_val; 1421 else 1422 tp->ts_recent = 0; 1423 tp->ts_recent_age = tcp_now; 1424 } 1425 1426 /* 1427 * If the RST bit is set examine the state: 1428 * SYN_RECEIVED STATE: 1429 * If passive open, return to LISTEN state. 1430 * If active open, inform user that connection was refused. 1431 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1432 * Inform user that connection was reset, and close tcb. 1433 * CLOSING, LAST_ACK, TIME_WAIT STATES 1434 * Close the tcb. 1435 */ 1436 if (tiflags & TH_RST) { 1437 if (th->th_seq != tp->last_ack_sent && 1438 th->th_seq != tp->rcv_nxt && 1439 th->th_seq != (tp->rcv_nxt + 1)) 1440 goto drop; 1441 1442 switch (tp->t_state) { 1443 case TCPS_SYN_RECEIVED: 1444 #ifdef TCP_ECN 1445 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1446 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1447 goto drop; 1448 #endif 1449 so->so_error = ECONNREFUSED; 1450 goto close; 1451 1452 case TCPS_ESTABLISHED: 1453 case TCPS_FIN_WAIT_1: 1454 case TCPS_FIN_WAIT_2: 1455 case TCPS_CLOSE_WAIT: 1456 so->so_error = ECONNRESET; 1457 close: 1458 tp->t_state = TCPS_CLOSED; 1459 tcpstat.tcps_drops++; 1460 tp = tcp_close(tp); 1461 goto drop; 1462 case TCPS_CLOSING: 1463 case TCPS_LAST_ACK: 1464 case TCPS_TIME_WAIT: 1465 tp = tcp_close(tp); 1466 goto drop; 1467 } 1468 } 1469 1470 /* 1471 * If a SYN is in the window, then this is an 1472 * error and we ACK and drop the packet. 1473 */ 1474 if (tiflags & TH_SYN) 1475 goto dropafterack_ratelim; 1476 1477 /* 1478 * If the ACK bit is off we drop the segment and return. 1479 */ 1480 if ((tiflags & TH_ACK) == 0) { 1481 if (tp->t_flags & TF_ACKNOW) 1482 goto dropafterack; 1483 else 1484 goto drop; 1485 } 1486 1487 /* 1488 * Ack processing. 1489 */ 1490 switch (tp->t_state) { 1491 1492 /* 1493 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1494 * ESTABLISHED state and continue processing. 1495 * The ACK was checked above. 
1496 */ 1497 case TCPS_SYN_RECEIVED: 1498 tcpstat.tcps_connects++; 1499 soisconnected(so); 1500 tp->t_state = TCPS_ESTABLISHED; 1501 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1502 /* Do window scaling? */ 1503 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1504 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1505 tp->snd_scale = tp->requested_s_scale; 1506 tp->rcv_scale = tp->request_r_scale; 1507 tiwin = th->th_win << tp->snd_scale; 1508 } 1509 tcp_reass_lock(tp); 1510 (void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0, 1511 &tlen); 1512 tcp_reass_unlock(tp); 1513 tp->snd_wl1 = th->th_seq - 1; 1514 /* fall into ... */ 1515 1516 /* 1517 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1518 * ACKs. If the ack is in the range 1519 * tp->snd_una < th->th_ack <= tp->snd_max 1520 * then advance tp->snd_una to th->th_ack and drop 1521 * data from the retransmission queue. If this ACK reflects 1522 * more up to date window information we update our window information. 1523 */ 1524 case TCPS_ESTABLISHED: 1525 case TCPS_FIN_WAIT_1: 1526 case TCPS_FIN_WAIT_2: 1527 case TCPS_CLOSE_WAIT: 1528 case TCPS_CLOSING: 1529 case TCPS_LAST_ACK: 1530 case TCPS_TIME_WAIT: 1531 #ifdef TCP_ECN 1532 /* 1533 * if we receive ECE and are not already in recovery phase, 1534 * reduce cwnd by half but don't slow-start. 1535 * advance snd_last to snd_max not to reduce cwnd again 1536 * until all outstanding packets are acked. 1537 */ 1538 if (tcp_do_ecn && (tiflags & TH_ECE)) { 1539 if ((tp->t_flags & TF_ECN_PERMIT) && 1540 SEQ_GEQ(tp->snd_una, tp->snd_last)) { 1541 u_int win; 1542 1543 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg; 1544 if (win > 1) { 1545 tp->snd_ssthresh = win / 2 * tp->t_maxseg; 1546 tp->snd_cwnd = tp->snd_ssthresh; 1547 tp->snd_last = tp->snd_max; 1548 tp->t_flags |= TF_SEND_CWR; 1549 tcpstat.tcps_cwr_ecn++; 1550 } 1551 } 1552 tcpstat.tcps_ecn_rcvece++; 1553 } 1554 /* 1555 * if we receive CWR, we know that the peer has reduced 1556 * its congestion window. stop sending ecn-echo. 1557 */ 1558 if ((tiflags & TH_CWR)) { 1559 tp->t_flags &= ~TF_RCVD_CE; 1560 tcpstat.tcps_ecn_rcvcwr++; 1561 } 1562 #endif /* TCP_ECN */ 1563 1564 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1565 /* 1566 * Duplicate/old ACK processing. 1567 * Increments t_dupacks: 1568 * Pure duplicate (same seq/ack/window, no data) 1569 * Doesn't affect t_dupacks: 1570 * Data packets. 1571 * Normal window updates (window opens) 1572 * Resets t_dupacks: 1573 * New data ACKed. 1574 * Window shrinks 1575 * Old ACK 1576 */ 1577 if (tlen) { 1578 /* Drop very old ACKs unless th_seq matches */ 1579 if (th->th_seq != tp->rcv_nxt && 1580 SEQ_LT(th->th_ack, 1581 tp->snd_una - tp->max_sndwnd)) { 1582 tcpstat.tcps_rcvacktooold++; 1583 goto drop; 1584 } 1585 break; 1586 } 1587 /* 1588 * If we get an old ACK, there is probably packet 1589 * reordering going on. Be conservative and reset 1590 * t_dupacks so that we are less aggressive in 1591 * doing a fast retransmit. 1592 */ 1593 if (th->th_ack != tp->snd_una) { 1594 tp->t_dupacks = 0; 1595 break; 1596 } 1597 if (tiwin == tp->snd_wnd) { 1598 tcpstat.tcps_rcvdupack++; 1599 /* 1600 * If we have outstanding data (other than 1601 * a window probe), this is a completely 1602 * duplicate ack (ie, window info didn't 1603 * change), the ack is the biggest we've 1604 * seen and we've seen exactly our rexmt 1605 * threshold of them, assume a packet 1606 * has been dropped and retransmit it. 1607 * Kludge snd_nxt & the congestion 1608 * window so we send only this one 1609 * packet. 
1610 * 1611 * We know we're losing at the current 1612 * window size so do congestion avoidance 1613 * (set ssthresh to half the current window 1614 * and pull our congestion window back to 1615 * the new ssthresh). 1616 * 1617 * Dup acks mean that packets have left the 1618 * network (they're now cached at the receiver) 1619 * so bump cwnd by the amount in the receiver 1620 * to keep a constant cwnd packets in the 1621 * network. 1622 */ 1623 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) 1624 tp->t_dupacks = 0; 1625 #if defined(TCP_SACK) && defined(TCP_FACK) 1626 /* 1627 * In FACK, can enter fast rec. if the receiver 1628 * reports a reass. queue longer than 3 segs. 1629 */ 1630 else if (++tp->t_dupacks == tcprexmtthresh || 1631 ((SEQ_GT(tp->snd_fack, tcprexmtthresh * 1632 tp->t_maxseg + tp->snd_una)) && 1633 SEQ_GT(tp->snd_una, tp->snd_last))) { 1634 #else 1635 else if (++tp->t_dupacks == tcprexmtthresh) { 1636 #endif /* TCP_FACK */ 1637 tcp_seq onxt = tp->snd_nxt; 1638 u_long win = 1639 ulmin(tp->snd_wnd, tp->snd_cwnd) / 1640 2 / tp->t_maxseg; 1641 1642 #if defined(TCP_SACK) || defined(TCP_ECN) 1643 if (SEQ_LT(th->th_ack, tp->snd_last)){ 1644 /* 1645 * False fast retx after 1646 * timeout. Do not cut window. 1647 */ 1648 tp->t_dupacks = 0; 1649 goto drop; 1650 } 1651 #endif 1652 if (win < 2) 1653 win = 2; 1654 tp->snd_ssthresh = win * tp->t_maxseg; 1655 #if defined(TCP_SACK) 1656 tp->snd_last = tp->snd_max; 1657 #endif 1658 #ifdef TCP_SACK 1659 if (tp->sack_enable) { 1660 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1661 tp->t_rtttime = 0; 1662 #ifdef TCP_ECN 1663 tp->t_flags |= TF_SEND_CWR; 1664 #endif 1665 #if 1 /* TCP_ECN */ 1666 tcpstat.tcps_cwr_frecovery++; 1667 #endif 1668 tcpstat.tcps_sack_recovery_episode++; 1669 #if defined(TCP_SACK) && defined(TCP_FACK) 1670 tp->t_dupacks = tcprexmtthresh; 1671 (void) tcp_output(tp); 1672 /* 1673 * During FR, snd_cwnd is held 1674 * constant for FACK. 1675 */ 1676 tp->snd_cwnd = tp->snd_ssthresh; 1677 #else 1678 /* 1679 * tcp_output() will send 1680 * oldest SACK-eligible rtx. 1681 */ 1682 (void) tcp_output(tp); 1683 tp->snd_cwnd = tp->snd_ssthresh+ 1684 tp->t_maxseg * tp->t_dupacks; 1685 #endif /* TCP_FACK */ 1686 goto drop; 1687 } 1688 #endif /* TCP_SACK */ 1689 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1690 tp->t_rtttime = 0; 1691 tp->snd_nxt = th->th_ack; 1692 tp->snd_cwnd = tp->t_maxseg; 1693 #ifdef TCP_ECN 1694 tp->t_flags |= TF_SEND_CWR; 1695 #endif 1696 #if 1 /* TCP_ECN */ 1697 tcpstat.tcps_cwr_frecovery++; 1698 #endif 1699 tcpstat.tcps_sndrexmitfast++; 1700 (void) tcp_output(tp); 1701 1702 tp->snd_cwnd = tp->snd_ssthresh + 1703 tp->t_maxseg * tp->t_dupacks; 1704 if (SEQ_GT(onxt, tp->snd_nxt)) 1705 tp->snd_nxt = onxt; 1706 goto drop; 1707 } else if (tp->t_dupacks > tcprexmtthresh) { 1708 #if defined(TCP_SACK) && defined(TCP_FACK) 1709 /* 1710 * while (awnd < cwnd) 1711 * sendsomething(); 1712 */ 1713 if (tp->sack_enable) { 1714 if (tp->snd_awnd < tp->snd_cwnd) 1715 tcp_output(tp); 1716 goto drop; 1717 } 1718 #endif /* TCP_FACK */ 1719 tp->snd_cwnd += tp->t_maxseg; 1720 (void) tcp_output(tp); 1721 goto drop; 1722 } 1723 } else if (tiwin < tp->snd_wnd) { 1724 /* 1725 * The window was retracted! Previous dup 1726 * ACKs may have been due to packets arriving 1727 * after the shrunken window, not a missing 1728 * packet, so play it safe and reset t_dupacks 1729 */ 1730 tp->t_dupacks = 0; 1731 } 1732 break; 1733 } 1734 /* 1735 * If the congestion window was inflated to account 1736 * for the other side's cached packets, retract it. 
1737 */ 1738 #if defined(TCP_SACK) 1739 if (tp->sack_enable) { 1740 if (tp->t_dupacks >= tcprexmtthresh) { 1741 /* Check for a partial ACK */ 1742 if (tcp_sack_partialack(tp, th)) { 1743 #if defined(TCP_SACK) && defined(TCP_FACK) 1744 /* Force call to tcp_output */ 1745 if (tp->snd_awnd < tp->snd_cwnd) 1746 needoutput = 1; 1747 #else 1748 tp->snd_cwnd += tp->t_maxseg; 1749 needoutput = 1; 1750 #endif /* TCP_FACK */ 1751 } else { 1752 /* Out of fast recovery */ 1753 tp->snd_cwnd = tp->snd_ssthresh; 1754 if (tcp_seq_subtract(tp->snd_max, 1755 th->th_ack) < tp->snd_ssthresh) 1756 tp->snd_cwnd = 1757 tcp_seq_subtract(tp->snd_max, 1758 th->th_ack); 1759 tp->t_dupacks = 0; 1760 #if defined(TCP_SACK) && defined(TCP_FACK) 1761 if (SEQ_GT(th->th_ack, tp->snd_fack)) 1762 tp->snd_fack = th->th_ack; 1763 #endif /* TCP_FACK */ 1764 } 1765 } 1766 } else { 1767 if (tp->t_dupacks >= tcprexmtthresh && 1768 !tcp_newreno(tp, th)) { 1769 /* Out of fast recovery */ 1770 tp->snd_cwnd = tp->snd_ssthresh; 1771 if (tcp_seq_subtract(tp->snd_max, th->th_ack) < 1772 tp->snd_ssthresh) 1773 tp->snd_cwnd = 1774 tcp_seq_subtract(tp->snd_max, 1775 th->th_ack); 1776 tp->t_dupacks = 0; 1777 } 1778 } 1779 if (tp->t_dupacks < tcprexmtthresh) 1780 tp->t_dupacks = 0; 1781 #else /* else no TCP_SACK */ 1782 if (tp->t_dupacks >= tcprexmtthresh && 1783 tp->snd_cwnd > tp->snd_ssthresh) 1784 tp->snd_cwnd = tp->snd_ssthresh; 1785 tp->t_dupacks = 0; 1786 #endif 1787 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1788 tcpstat.tcps_rcvacktoomuch++; 1789 goto dropafterack_ratelim; 1790 } 1791 acked = th->th_ack - tp->snd_una; 1792 tcpstat.tcps_rcvackpack++; 1793 tcpstat.tcps_rcvackbyte += acked; 1794 1795 /* 1796 * If we have a timestamp reply, update smoothed 1797 * round trip time. If no timestamp is present but 1798 * transmit timer is running and timed sequence 1799 * number was acked, update smoothed round trip time. 1800 * Since we now have an rtt measurement, cancel the 1801 * timer backoff (cf., Phil Karn's retransmit alg.). 1802 * Recompute the initial retransmit timer. 1803 */ 1804 if (opti.ts_present && opti.ts_ecr) 1805 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr); 1806 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1807 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1808 1809 /* 1810 * If all outstanding data is acked, stop retransmit 1811 * timer and remember to restart (more output or persist). 1812 * If there is more data to be acked, restart retransmit 1813 * timer, using current (possibly backed-off) value. 1814 */ 1815 if (th->th_ack == tp->snd_max) { 1816 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1817 needoutput = 1; 1818 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1819 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1820 /* 1821 * When new data is acked, open the congestion window. 1822 * If the window gives us less than ssthresh packets 1823 * in flight, open exponentially (maxseg per packet). 1824 * Otherwise open linearly: maxseg per window 1825 * (maxseg^2 / cwnd per packet). 
1826 */ 1827 { 1828 u_int cw = tp->snd_cwnd; 1829 u_int incr = tp->t_maxseg; 1830 1831 if (cw > tp->snd_ssthresh) 1832 incr = incr * incr / cw; 1833 #if defined (TCP_SACK) 1834 if (tp->t_dupacks < tcprexmtthresh) 1835 #endif 1836 tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale); 1837 } 1838 ND6_HINT(tp); 1839 if (acked > so->so_snd.sb_cc) { 1840 tp->snd_wnd -= so->so_snd.sb_cc; 1841 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1842 ourfinisacked = 1; 1843 } else { 1844 sbdrop(&so->so_snd, acked); 1845 tp->snd_wnd -= acked; 1846 ourfinisacked = 0; 1847 } 1848 if (sb_notify(&so->so_snd)) 1849 sowwakeup(so); 1850 1851 /* 1852 * If we had a pending ICMP message that referred to data 1853 * that have just been acknowledged, disregard the recorded 1854 * ICMP message. 1855 */ 1856 if ((tp->t_flags & TF_PMTUD_PEND) && 1857 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 1858 tp->t_flags &= ~TF_PMTUD_PEND; 1859 1860 /* 1861 * Keep track of the largest chunk of data acknowledged 1862 * since last PMTU update 1863 */ 1864 if (tp->t_pmtud_mss_acked < acked) 1865 tp->t_pmtud_mss_acked = acked; 1866 1867 tp->snd_una = th->th_ack; 1868 #ifdef TCP_ECN 1869 /* sync snd_last with snd_una */ 1870 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1871 tp->snd_last = tp->snd_una; 1872 #endif 1873 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1874 tp->snd_nxt = tp->snd_una; 1875 #if defined (TCP_SACK) && defined (TCP_FACK) 1876 if (SEQ_GT(tp->snd_una, tp->snd_fack)) { 1877 tp->snd_fack = tp->snd_una; 1878 /* Update snd_awnd for partial ACK 1879 * without any SACK blocks. 1880 */ 1881 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, 1882 tp->snd_fack) + tp->retran_data; 1883 } 1884 #endif 1885 1886 switch (tp->t_state) { 1887 1888 /* 1889 * In FIN_WAIT_1 STATE in addition to the processing 1890 * for the ESTABLISHED state if our FIN is now acknowledged 1891 * then enter FIN_WAIT_2. 1892 */ 1893 case TCPS_FIN_WAIT_1: 1894 if (ourfinisacked) { 1895 /* 1896 * If we can't receive any more 1897 * data, then closing user can proceed. 1898 * Starting the timer is contrary to the 1899 * specification, but if we don't get a FIN 1900 * we'll hang forever. 1901 */ 1902 if (so->so_state & SS_CANTRCVMORE) { 1903 soisdisconnected(so); 1904 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 1905 } 1906 tp->t_state = TCPS_FIN_WAIT_2; 1907 } 1908 break; 1909 1910 /* 1911 * In CLOSING STATE in addition to the processing for 1912 * the ESTABLISHED state if the ACK acknowledges our FIN 1913 * then enter the TIME-WAIT state, otherwise ignore 1914 * the segment. 1915 */ 1916 case TCPS_CLOSING: 1917 if (ourfinisacked) { 1918 tp->t_state = TCPS_TIME_WAIT; 1919 tcp_canceltimers(tp); 1920 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1921 soisdisconnected(so); 1922 } 1923 break; 1924 1925 /* 1926 * In LAST_ACK, we may still be waiting for data to drain 1927 * and/or to be acked, as well as for the ack of our FIN. 1928 * If our FIN is now acknowledged, delete the TCB, 1929 * enter the closed state and return. 1930 */ 1931 case TCPS_LAST_ACK: 1932 if (ourfinisacked) { 1933 tp = tcp_close(tp); 1934 goto drop; 1935 } 1936 break; 1937 1938 /* 1939 * In TIME_WAIT state the only thing that should arrive 1940 * is a retransmission of the remote FIN. Acknowledge 1941 * it and restart the finack timer. 1942 */ 1943 case TCPS_TIME_WAIT: 1944 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1945 goto dropafterack; 1946 } 1947 } 1948 1949 step6: 1950 /* 1951 * Update window information. 1952 * Don't look at window if no ACK: TAC's send garbage on first SYN. 
1953 */ 1954 if ((tiflags & TH_ACK) && 1955 (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && 1956 (SEQ_LT(tp->snd_wl2, th->th_ack) || 1957 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1958 /* keep track of pure window updates */ 1959 if (tlen == 0 && 1960 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1961 tcpstat.tcps_rcvwinupd++; 1962 tp->snd_wnd = tiwin; 1963 tp->snd_wl1 = th->th_seq; 1964 tp->snd_wl2 = th->th_ack; 1965 if (tp->snd_wnd > tp->max_sndwnd) 1966 tp->max_sndwnd = tp->snd_wnd; 1967 needoutput = 1; 1968 } 1969 1970 /* 1971 * Process segments with URG. 1972 */ 1973 if ((tiflags & TH_URG) && th->th_urp && 1974 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1975 /* 1976 * This is a kludge, but if we receive and accept 1977 * random urgent pointers, we'll crash in 1978 * soreceive. It's hard to imagine someone 1979 * actually wanting to send this much urgent data. 1980 */ 1981 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1982 th->th_urp = 0; /* XXX */ 1983 tiflags &= ~TH_URG; /* XXX */ 1984 goto dodata; /* XXX */ 1985 } 1986 /* 1987 * If this segment advances the known urgent pointer, 1988 * then mark the data stream. This should not happen 1989 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1990 * a FIN has been received from the remote side. 1991 * In these states we ignore the URG. 1992 * 1993 * According to RFC961 (Assigned Protocols), 1994 * the urgent pointer points to the last octet 1995 * of urgent data. We continue, however, 1996 * to consider it to indicate the first octet 1997 * of data past the urgent section as the original 1998 * spec states (in one of two places). 1999 */ 2000 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2001 tp->rcv_up = th->th_seq + th->th_urp; 2002 so->so_oobmark = so->so_rcv.sb_cc + 2003 (tp->rcv_up - tp->rcv_nxt) - 1; 2004 if (so->so_oobmark == 0) 2005 so->so_state |= SS_RCVATMARK; 2006 sohasoutofband(so); 2007 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2008 } 2009 /* 2010 * Remove out of band data so it doesn't get presented to the user. 2011 * This can happen independent of advancing the URG pointer, 2012 * but if two URG's are pending at once, some out-of-band 2013 * data may creep in... ick. 2014 */ 2015 if (th->th_urp <= (u_int16_t) tlen 2016 #ifdef SO_OOBINLINE 2017 && (so->so_options & SO_OOBINLINE) == 0 2018 #endif 2019 ) 2020 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2021 } else 2022 /* 2023 * If no out of band data is expected, 2024 * pull receive urgent pointer along 2025 * with the receive window. 2026 */ 2027 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2028 tp->rcv_up = tp->rcv_nxt; 2029 dodata: /* XXX */ 2030 2031 /* 2032 * Process the segment text, merging it into the TCP sequencing queue, 2033 * and arranging for acknowledgment of receipt if necessary. 2034 * This process logically involves adjusting tp->rcv_wnd as data 2035 * is presented to the user (this happens in tcp_usrreq.c, 2036 * case PRU_RCVD). If a FIN has already been received on this 2037 * connection then we just ignore the text.
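* As a reading aid (describing the code below, not changing it): the common in-order case -- th_seq == rcv_nxt, an empty reassembly queue and ESTABLISHED state -- appends straight to the receive buffer with sbappendstream(); everything else goes through tcp_reass() and forces an immediate ACK via TF_ACKNOW.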
2038 */ 2039 if ((tlen || (tiflags & TH_FIN)) && 2040 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2041 #ifdef TCP_SACK 2042 tcp_seq laststart = th->th_seq; 2043 tcp_seq lastend = th->th_seq + tlen; 2044 #endif 2045 tcp_reass_lock(tp); 2046 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2047 tp->t_state == TCPS_ESTABLISHED) { 2048 tcp_reass_unlock(tp); 2049 TCP_SETUP_ACK(tp, tiflags); 2050 tp->rcv_nxt += tlen; 2051 tiflags = th->th_flags & TH_FIN; 2052 tcpstat.tcps_rcvpack++; 2053 tcpstat.tcps_rcvbyte += tlen; 2054 ND6_HINT(tp); 2055 if (so->so_state & SS_CANTRCVMORE) 2056 m_freem(m); 2057 else { 2058 m_adj(m, hdroptlen); 2059 sbappendstream(&so->so_rcv, m); 2060 } 2061 sorwakeup(so); 2062 } else { 2063 m_adj(m, hdroptlen); 2064 tiflags = tcp_reass(tp, th, m, &tlen); 2065 tcp_reass_unlock(tp); 2066 tp->t_flags |= TF_ACKNOW; 2067 } 2068 #ifdef TCP_SACK 2069 if (tp->sack_enable) 2070 tcp_update_sack_list(tp, laststart, lastend); 2071 #endif 2072 2073 /* 2074 * variable len never referenced again in modern BSD, 2075 * so why bother computing it ?? 2076 */ 2077 #if 0 2078 /* 2079 * Note the amount of data that peer has sent into 2080 * our window, in order to estimate the sender's 2081 * buffer size. 2082 */ 2083 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2084 #endif /* 0 */ 2085 } else { 2086 m_freem(m); 2087 tiflags &= ~TH_FIN; 2088 } 2089 2090 /* 2091 * If FIN is received ACK the FIN and let the user know 2092 * that the connection is closing. Ignore a FIN received before 2093 * the connection is fully established. 2094 */ 2095 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2096 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2097 socantrcvmore(so); 2098 tp->t_flags |= TF_ACKNOW; 2099 tp->rcv_nxt++; 2100 } 2101 switch (tp->t_state) { 2102 2103 /* 2104 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2105 */ 2106 case TCPS_ESTABLISHED: 2107 tp->t_state = TCPS_CLOSE_WAIT; 2108 break; 2109 2110 /* 2111 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2112 * enter the CLOSING state. 2113 */ 2114 case TCPS_FIN_WAIT_1: 2115 tp->t_state = TCPS_CLOSING; 2116 break; 2117 2118 /* 2119 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2120 * starting the time-wait timer, turning off the other 2121 * standard timers. 2122 */ 2123 case TCPS_FIN_WAIT_2: 2124 tp->t_state = TCPS_TIME_WAIT; 2125 tcp_canceltimers(tp); 2126 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2127 soisdisconnected(so); 2128 break; 2129 2130 /* 2131 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2132 */ 2133 case TCPS_TIME_WAIT: 2134 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2135 break; 2136 } 2137 } 2138 if (so->so_options & SO_DEBUG) { 2139 switch (tp->pf) { 2140 #ifdef INET6 2141 case PF_INET6: 2142 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2143 0, tlen); 2144 break; 2145 #endif /* INET6 */ 2146 case PF_INET: 2147 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2148 0, tlen); 2149 break; 2150 } 2151 } 2152 2153 /* 2154 * Return any desired output. 2155 */ 2156 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 2157 (void) tcp_output(tp); 2158 } 2159 return; 2160 2161 badsyn: 2162 /* 2163 * Received a bad SYN. Increment counters and dropwithreset. 2164 */ 2165 tcpstat.tcps_badsyn++; 2166 tp = NULL; 2167 goto dropwithreset; 2168 2169 dropafterack_ratelim: 2170 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2171 tcp_ackdrop_ppslim) == 0) { 2172 /* XXX stat */ 2173 goto drop; 2174 } 2175 /* ...fall into dropafterack... 
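(note: the ppsratecheck() call above caps how many of these ACKs we answer per second -- beyond tcp_ackdrop_ppslim the segment is simply dropped rather than eliciting a reply)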
*/ 2176 2177 dropafterack: 2178 /* 2179 * Generate an ACK dropping incoming segment if it occupies 2180 * sequence space, where the ACK reflects our state. 2181 */ 2182 if (tiflags & TH_RST) 2183 goto drop; 2184 m_freem(m); 2185 tp->t_flags |= TF_ACKNOW; 2186 (void) tcp_output(tp); 2187 return; 2188 2189 dropwithreset_ratelim: 2190 /* 2191 * We may want to rate-limit RSTs in certain situations, 2192 * particularly if we are sending an RST in response to 2193 * an attempt to connect to or otherwise communicate with 2194 * a port for which we have no socket. 2195 */ 2196 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2197 tcp_rst_ppslim) == 0) { 2198 /* XXX stat */ 2199 goto drop; 2200 } 2201 /* ...fall into dropwithreset... */ 2202 2203 dropwithreset: 2204 /* 2205 * Generate a RST, dropping incoming segment. 2206 * Make ACK acceptable to originator of segment. 2207 * Don't bother to respond to RST. 2208 */ 2209 if (tiflags & TH_RST) 2210 goto drop; 2211 if (tiflags & TH_ACK) { 2212 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2213 TH_RST); 2214 } else { 2215 if (tiflags & TH_SYN) 2216 tlen++; 2217 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2218 (tcp_seq)0, TH_RST|TH_ACK); 2219 } 2220 m_freem(m); 2221 return; 2222 2223 drop: 2224 /* 2225 * Drop space held by incoming segment and return. 2226 */ 2227 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2228 switch (tp->pf) { 2229 #ifdef INET6 2230 case PF_INET6: 2231 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2232 0, tlen); 2233 break; 2234 #endif /* INET6 */ 2235 case PF_INET: 2236 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2237 0, tlen); 2238 break; 2239 } 2240 } 2241 2242 m_freem(m); 2243 return; 2244 } 2245 2246 int 2247 tcp_dooptions(tp, cp, cnt, th, m, iphlen, oi) 2248 struct tcpcb *tp; 2249 u_char *cp; 2250 int cnt; 2251 struct tcphdr *th; 2252 struct mbuf *m; 2253 int iphlen; 2254 struct tcp_opt_info *oi; 2255 { 2256 u_int16_t mss = 0; 2257 int opt, optlen; 2258 #ifdef TCP_SIGNATURE 2259 caddr_t sigp = NULL; 2260 struct tdb *tdb = NULL; 2261 #endif /* TCP_SIGNATURE */ 2262 2263 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2264 opt = cp[0]; 2265 if (opt == TCPOPT_EOL) 2266 break; 2267 if (opt == TCPOPT_NOP) 2268 optlen = 1; 2269 else { 2270 if (cnt < 2) 2271 break; 2272 optlen = cp[1]; 2273 if (optlen < 2 || optlen > cnt) 2274 break; 2275 } 2276 switch (opt) { 2277 2278 default: 2279 continue; 2280 2281 case TCPOPT_MAXSEG: 2282 if (optlen != TCPOLEN_MAXSEG) 2283 continue; 2284 if (!(th->th_flags & TH_SYN)) 2285 continue; 2286 if (TCPS_HAVERCVDSYN(tp->t_state)) 2287 continue; 2288 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 2289 NTOHS(mss); 2290 oi->maxseg = mss; 2291 break; 2292 2293 case TCPOPT_WINDOW: 2294 if (optlen != TCPOLEN_WINDOW) 2295 continue; 2296 if (!(th->th_flags & TH_SYN)) 2297 continue; 2298 if (TCPS_HAVERCVDSYN(tp->t_state)) 2299 continue; 2300 tp->t_flags |= TF_RCVD_SCALE; 2301 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2302 break; 2303 2304 case TCPOPT_TIMESTAMP: 2305 if (optlen != TCPOLEN_TIMESTAMP) 2306 continue; 2307 oi->ts_present = 1; 2308 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2309 NTOHL(oi->ts_val); 2310 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2311 NTOHL(oi->ts_ecr); 2312 2313 if (!(th->th_flags & TH_SYN)) 2314 continue; 2315 if (TCPS_HAVERCVDSYN(tp->t_state)) 2316 continue; 2317 /* 2318 * A timestamp received in a SYN makes 2319 * it ok to send timestamp requests and replies. 
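* (Per RFC 1323 the option is negotiated on the SYN; the ts_recent / ts_recent_age pair recorded below is what later feeds the PAWS check and timestamp echoes.)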
2320 */ 2321 tp->t_flags |= TF_RCVD_TSTMP; 2322 tp->ts_recent = oi->ts_val; 2323 tp->ts_recent_age = tcp_now; 2324 break; 2325 2326 #ifdef TCP_SACK 2327 case TCPOPT_SACK_PERMITTED: 2328 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2329 continue; 2330 if (!(th->th_flags & TH_SYN)) 2331 continue; 2332 if (TCPS_HAVERCVDSYN(tp->t_state)) 2333 continue; 2334 /* MUST only be set on SYN */ 2335 tp->t_flags |= TF_SACK_PERMIT; 2336 break; 2337 case TCPOPT_SACK: 2338 tcp_sack_option(tp, th, cp, optlen); 2339 break; 2340 #endif 2341 #ifdef TCP_SIGNATURE 2342 case TCPOPT_SIGNATURE: 2343 if (optlen != TCPOLEN_SIGNATURE) 2344 continue; 2345 2346 if (sigp && bcmp(sigp, cp + 2, 16)) 2347 return (-1); 2348 2349 sigp = cp + 2; 2350 break; 2351 #endif /* TCP_SIGNATURE */ 2352 } 2353 } 2354 2355 #ifdef TCP_SIGNATURE 2356 if (tp->t_flags & TF_SIGNATURE) { 2357 union sockaddr_union src, dst; 2358 2359 memset(&src, 0, sizeof(union sockaddr_union)); 2360 memset(&dst, 0, sizeof(union sockaddr_union)); 2361 2362 switch (tp->pf) { 2363 case 0: 2364 #ifdef INET 2365 case AF_INET: 2366 src.sa.sa_len = sizeof(struct sockaddr_in); 2367 src.sa.sa_family = AF_INET; 2368 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2369 dst.sa.sa_len = sizeof(struct sockaddr_in); 2370 dst.sa.sa_family = AF_INET; 2371 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2372 break; 2373 #endif 2374 #ifdef INET6 2375 case AF_INET6: 2376 src.sa.sa_len = sizeof(struct sockaddr_in6); 2377 src.sa.sa_family = AF_INET6; 2378 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2379 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2380 dst.sa.sa_family = AF_INET6; 2381 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2382 break; 2383 #endif /* INET6 */ 2384 } 2385 2386 tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP); 2387 2388 /* 2389 * We don't have an SA for this peer, so we turn off 2390 * TF_SIGNATURE on the listen socket 2391 */ 2392 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2393 tp->t_flags &= ~TF_SIGNATURE; 2394 2395 } 2396 2397 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2398 tcpstat.tcps_rcvbadsig++; 2399 return (-1); 2400 } 2401 2402 if (sigp) { 2403 char sig[16]; 2404 2405 if (tdb == NULL) { 2406 tcpstat.tcps_rcvbadsig++; 2407 return (-1); 2408 } 2409 2410 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2411 return (-1); 2412 2413 if (bcmp(sig, sigp, 16)) { 2414 tcpstat.tcps_rcvbadsig++; 2415 return (-1); 2416 } 2417 2418 tcpstat.tcps_rcvgoodsig++; 2419 } 2420 #endif /* TCP_SIGNATURE */ 2421 2422 return (0); 2423 } 2424 2425 #if defined(TCP_SACK) 2426 u_long 2427 tcp_seq_subtract(a, b) 2428 u_long a, b; 2429 { 2430 return ((long)(a - b)); 2431 } 2432 #endif 2433 2434 2435 #ifdef TCP_SACK 2436 /* 2437 * This function is called upon receipt of new valid data (while not in header 2438 * prediction mode), and it updates the ordered list of sacks. 2439 */ 2440 void 2441 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2442 tcp_seq rcv_lastend) 2443 { 2444 /* 2445 * First reported block MUST be the most recent one. Subsequent 2446 * blocks SHOULD be in the order in which they arrived at the 2447 * receiver. These two conditions make the implementation fully 2448 * compliant with RFC 2018. 
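* Illustrative walk-through (numbers invented): with rcv_nxt = 1000 and one stored block [2000,3000), newly arrived data [3000,4000) overlaps it, so the merge loop below zeroes the old entry and reports a single first block [2000,4000).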
2449 */ 2450 int i, j = 0, count = 0, lastpos = -1; 2451 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2452 2453 /* First clean up current list of sacks */ 2454 for (i = 0; i < tp->rcv_numsacks; i++) { 2455 sack = tp->sackblks[i]; 2456 if (sack.start == 0 && sack.end == 0) { 2457 count++; /* count = number of blocks to be discarded */ 2458 continue; 2459 } 2460 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2461 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2462 count++; 2463 } else { 2464 temp[j].start = tp->sackblks[i].start; 2465 temp[j++].end = tp->sackblks[i].end; 2466 } 2467 } 2468 tp->rcv_numsacks -= count; 2469 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2470 tcp_clean_sackreport(tp); 2471 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2472 /* ==> need first sack block */ 2473 tp->sackblks[0].start = rcv_laststart; 2474 tp->sackblks[0].end = rcv_lastend; 2475 tp->rcv_numsacks = 1; 2476 } 2477 return; 2478 } 2479 /* Otherwise, sack blocks are already present. */ 2480 for (i = 0; i < tp->rcv_numsacks; i++) 2481 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2482 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2483 return; /* sack list remains unchanged */ 2484 /* 2485 * From here, segment just received should be (part of) the 1st sack. 2486 * Go through list, possibly coalescing sack block entries. 2487 */ 2488 firstsack.start = rcv_laststart; 2489 firstsack.end = rcv_lastend; 2490 for (i = 0; i < tp->rcv_numsacks; i++) { 2491 sack = tp->sackblks[i]; 2492 if (SEQ_LT(sack.end, firstsack.start) || 2493 SEQ_GT(sack.start, firstsack.end)) 2494 continue; /* no overlap */ 2495 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2496 /* 2497 * identical block; delete it here since we will 2498 * move it to the front of the list. 2499 */ 2500 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2501 lastpos = i; /* last posn with a zero entry */ 2502 continue; 2503 } 2504 if (SEQ_LEQ(sack.start, firstsack.start)) 2505 firstsack.start = sack.start; /* merge blocks */ 2506 if (SEQ_GEQ(sack.end, firstsack.end)) 2507 firstsack.end = sack.end; /* merge blocks */ 2508 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2509 lastpos = i; /* last posn with a zero entry */ 2510 } 2511 if (lastpos != -1) { /* at least one merge */ 2512 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2513 sack = tp->sackblks[i]; 2514 if (sack.start == 0 && sack.end == 0) 2515 continue; 2516 temp[j++] = sack; 2517 } 2518 tp->rcv_numsacks = j; /* including first blk (added later) */ 2519 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2520 tp->sackblks[i] = temp[i]; 2521 } else { /* no merges -- shift sacks by 1 */ 2522 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2523 tp->rcv_numsacks++; 2524 for (i = tp->rcv_numsacks-1; i > 0; i--) 2525 tp->sackblks[i] = tp->sackblks[i-1]; 2526 } 2527 tp->sackblks[0] = firstsack; 2528 return; 2529 } 2530 2531 /* 2532 * Process the TCP SACK option. tp->snd_holes is an ordered list 2533 * of holes (oldest to newest, in terms of the sequence space). 2534 */ 2535 void 2536 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2537 { 2538 int tmp_olen; 2539 u_char *tmp_cp; 2540 struct sackhole *cur, *p, *temp; 2541 2542 if (!tp->sack_enable) 2543 return; 2544 /* SACK without ACK doesn't make sense. */ 2545 if ((th->th_flags & TH_ACK) == 0) 2546 return; 2547 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
*/ 2548 if (SEQ_LT(th->th_ack, tp->snd_una) || 2549 SEQ_GT(th->th_ack, tp->snd_max)) 2550 return; 2551 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2552 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2553 return; 2554 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2555 tmp_cp = cp + 2; 2556 tmp_olen = optlen - 2; 2557 tcpstat.tcps_sack_rcv_opts++; 2558 if (tp->snd_numholes < 0) 2559 tp->snd_numholes = 0; 2560 if (tp->t_maxseg == 0) 2561 panic("tcp_sack_option"); /* Should never happen */ 2562 while (tmp_olen > 0) { 2563 struct sackblk sack; 2564 2565 bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq)); 2566 NTOHL(sack.start); 2567 bcopy(tmp_cp + sizeof(tcp_seq), 2568 (char *) &(sack.end), sizeof(tcp_seq)); 2569 NTOHL(sack.end); 2570 tmp_olen -= TCPOLEN_SACK; 2571 tmp_cp += TCPOLEN_SACK; 2572 if (SEQ_LEQ(sack.end, sack.start)) 2573 continue; /* bad SACK fields */ 2574 if (SEQ_LEQ(sack.end, tp->snd_una)) 2575 continue; /* old block */ 2576 #if defined(TCP_SACK) && defined(TCP_FACK) 2577 /* Updates snd_fack. */ 2578 if (SEQ_GT(sack.end, tp->snd_fack)) 2579 tp->snd_fack = sack.end; 2580 #endif /* TCP_FACK */ 2581 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2582 if (SEQ_LT(sack.start, th->th_ack)) 2583 continue; 2584 } 2585 if (SEQ_GT(sack.end, tp->snd_max)) 2586 continue; 2587 if (tp->snd_holes == NULL) { /* first hole */ 2588 tp->snd_holes = (struct sackhole *) 2589 pool_get(&sackhl_pool, PR_NOWAIT); 2590 if (tp->snd_holes == NULL) { 2591 /* ENOBUFS, so ignore SACKed block for now*/ 2592 goto done; 2593 } 2594 cur = tp->snd_holes; 2595 cur->start = th->th_ack; 2596 cur->end = sack.start; 2597 cur->rxmit = cur->start; 2598 cur->next = NULL; 2599 tp->snd_numholes = 1; 2600 tp->rcv_lastsack = sack.end; 2601 /* 2602 * dups is at least one. If more data has been 2603 * SACKed, it can be greater than one. 
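* E.g. (invented numbers): a SACK block extending two segments past the hole, sack.end - cur->end = 2920 with t_maxseg = 1460, yields dups = min(tcprexmtthresh, 2) = 2.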
2604 */ 2605 cur->dups = min(tcprexmtthresh, 2606 ((sack.end - cur->end)/tp->t_maxseg)); 2607 if (cur->dups < 1) 2608 cur->dups = 1; 2609 continue; /* with next sack block */ 2610 } 2611 /* Go thru list of holes: p = previous, cur = current */ 2612 p = cur = tp->snd_holes; 2613 while (cur) { 2614 if (SEQ_LEQ(sack.end, cur->start)) 2615 /* SACKs data before the current hole */ 2616 break; /* no use going through more holes */ 2617 if (SEQ_GEQ(sack.start, cur->end)) { 2618 /* SACKs data beyond the current hole */ 2619 cur->dups++; 2620 if (((sack.end - cur->end)/tp->t_maxseg) >= 2621 tcprexmtthresh) 2622 cur->dups = tcprexmtthresh; 2623 p = cur; 2624 cur = cur->next; 2625 continue; 2626 } 2627 if (SEQ_LEQ(sack.start, cur->start)) { 2628 /* Data acks at least the beginning of hole */ 2629 #if defined(TCP_SACK) && defined(TCP_FACK) 2630 if (SEQ_GT(sack.end, cur->rxmit)) 2631 tp->retran_data -= 2632 tcp_seq_subtract(cur->rxmit, 2633 cur->start); 2634 else 2635 tp->retran_data -= 2636 tcp_seq_subtract(sack.end, 2637 cur->start); 2638 #endif /* TCP_FACK */ 2639 if (SEQ_GEQ(sack.end, cur->end)) { 2640 /* Acks entire hole, so delete hole */ 2641 if (p != cur) { 2642 p->next = cur->next; 2643 pool_put(&sackhl_pool, cur); 2644 cur = p->next; 2645 } else { 2646 cur = cur->next; 2647 pool_put(&sackhl_pool, p); 2648 p = cur; 2649 tp->snd_holes = p; 2650 } 2651 tp->snd_numholes--; 2652 continue; 2653 } 2654 /* otherwise, move start of hole forward */ 2655 cur->start = sack.end; 2656 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2657 p = cur; 2658 cur = cur->next; 2659 continue; 2660 } 2661 /* move end of hole backward */ 2662 if (SEQ_GEQ(sack.end, cur->end)) { 2663 #if defined(TCP_SACK) && defined(TCP_FACK) 2664 if (SEQ_GT(cur->rxmit, sack.start)) 2665 tp->retran_data -= 2666 tcp_seq_subtract(cur->rxmit, 2667 sack.start); 2668 #endif /* TCP_FACK */ 2669 cur->end = sack.start; 2670 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2671 cur->dups++; 2672 if (((sack.end - cur->end)/tp->t_maxseg) >= 2673 tcprexmtthresh) 2674 cur->dups = tcprexmtthresh; 2675 p = cur; 2676 cur = cur->next; 2677 continue; 2678 } 2679 if (SEQ_LT(cur->start, sack.start) && 2680 SEQ_GT(cur->end, sack.end)) { 2681 /* 2682 * ACKs some data in middle of a hole; need to 2683 * split current hole 2684 */ 2685 temp = (struct sackhole *) 2686 pool_get(&sackhl_pool, PR_NOWAIT); 2687 if (temp == NULL) 2688 goto done; /* ENOBUFS */ 2689 #if defined(TCP_SACK) && defined(TCP_FACK) 2690 if (SEQ_GT(cur->rxmit, sack.end)) 2691 tp->retran_data -= 2692 tcp_seq_subtract(sack.end, 2693 sack.start); 2694 else if (SEQ_GT(cur->rxmit, sack.start)) 2695 tp->retran_data -= 2696 tcp_seq_subtract(cur->rxmit, 2697 sack.start); 2698 #endif /* TCP_FACK */ 2699 temp->next = cur->next; 2700 temp->start = sack.end; 2701 temp->end = cur->end; 2702 temp->dups = cur->dups; 2703 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2704 cur->end = sack.start; 2705 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2706 cur->dups++; 2707 if (((sack.end - cur->end)/tp->t_maxseg) >= 2708 tcprexmtthresh) 2709 cur->dups = tcprexmtthresh; 2710 cur->next = temp; 2711 p = temp; 2712 cur = p->next; 2713 tp->snd_numholes++; 2714 } 2715 } 2716 /* At this point, p points to the last hole on the list */ 2717 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2718 /* 2719 * Need to append new hole at end. 2720 * Last hole is p (and it's not NULL). 
2721 */ 2722 temp = (struct sackhole *) 2723 pool_get(&sackhl_pool, PR_NOWAIT); 2724 if (temp == NULL) 2725 goto done; /* ENOBUFS */ 2726 temp->start = tp->rcv_lastsack; 2727 temp->end = sack.start; 2728 temp->dups = min(tcprexmtthresh, 2729 ((sack.end - sack.start)/tp->t_maxseg)); 2730 if (temp->dups < 1) 2731 temp->dups = 1; 2732 temp->rxmit = temp->start; 2733 temp->next = 0; 2734 p->next = temp; 2735 tp->rcv_lastsack = sack.end; 2736 tp->snd_numholes++; 2737 } 2738 } 2739 done: 2740 #if defined(TCP_SACK) && defined(TCP_FACK) 2741 /* 2742 * Update retran_data and snd_awnd. Go through the list of 2743 * holes. Increment retran_data by (hole->rxmit - hole->start). 2744 */ 2745 tp->retran_data = 0; 2746 cur = tp->snd_holes; 2747 while (cur) { 2748 tp->retran_data += cur->rxmit - cur->start; 2749 cur = cur->next; 2750 } 2751 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + 2752 tp->retran_data; 2753 #endif /* TCP_FACK */ 2754 2755 return; 2756 } 2757 2758 /* 2759 * Delete stale (i.e., cumulatively ack'd) holes. Hole is deleted only if 2760 * it is completely acked; otherwise, tcp_sack_option(), called from 2761 * tcp_dooptions(), will fix up the hole. 2762 */ 2763 void 2764 tcp_del_sackholes(tp, th) 2765 struct tcpcb *tp; 2766 struct tcphdr *th; 2767 { 2768 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2769 /* max because this could be an older ack just arrived */ 2770 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2771 th->th_ack : tp->snd_una; 2772 struct sackhole *cur = tp->snd_holes; 2773 struct sackhole *prev; 2774 while (cur) 2775 if (SEQ_LEQ(cur->end, lastack)) { 2776 prev = cur; 2777 cur = cur->next; 2778 pool_put(&sackhl_pool, prev); 2779 tp->snd_numholes--; 2780 } else if (SEQ_LT(cur->start, lastack)) { 2781 cur->start = lastack; 2782 if (SEQ_LT(cur->rxmit, cur->start)) 2783 cur->rxmit = cur->start; 2784 break; 2785 } else 2786 break; 2787 tp->snd_holes = cur; 2788 } 2789 } 2790 2791 /* 2792 * Delete all receiver-side SACK information. 2793 */ 2794 void 2795 tcp_clean_sackreport(tp) 2796 struct tcpcb *tp; 2797 { 2798 int i; 2799 2800 tp->rcv_numsacks = 0; 2801 for (i = 0; i < MAX_SACK_BLKS; i++) 2802 tp->sackblks[i].start = tp->sackblks[i].end=0; 2803 2804 } 2805 2806 /* 2807 * Checks for partial ack. If partial ack arrives, turn off retransmission 2808 * timer, deflate the window, do not clear tp->t_dupacks, and return 1. 2809 * If the ack advances at least to tp->snd_last, return 0. 2810 */ 2811 int 2812 tcp_sack_partialack(tp, th) 2813 struct tcpcb *tp; 2814 struct tcphdr *th; 2815 { 2816 if (SEQ_LT(th->th_ack, tp->snd_last)) { 2817 /* Turn off retx. timer (will start again next segment) */ 2818 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2819 tp->t_rtttime = 0; 2820 #ifndef TCP_FACK 2821 /* 2822 * Partial window deflation. This statement relies on the 2823 * fact that tp->snd_una has not been updated yet. In FACK 2824 * hold snd_cwnd constant during fast recovery. 2825 */ 2826 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2827 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2828 tp->snd_cwnd += tp->t_maxseg; 2829 } else 2830 tp->snd_cwnd = tp->t_maxseg; 2831 #endif 2832 return (1); 2833 } 2834 return (0); 2835 } 2836 #endif /* TCP_SACK */ 2837 2838 /* 2839 * Pull out of band byte out of a segment so 2840 * it doesn't appear in the user's data queue. 2841 * It is still reflected in the segment length for 2842 * sequencing purposes.
2843 */ 2844 void 2845 tcp_pulloutofband(so, urgent, m, off) 2846 struct socket *so; 2847 u_int urgent; 2848 struct mbuf *m; 2849 int off; 2850 { 2851 int cnt = off + urgent - 1; 2852 2853 while (cnt >= 0) { 2854 if (m->m_len > cnt) { 2855 char *cp = mtod(m, caddr_t) + cnt; 2856 struct tcpcb *tp = sototcpcb(so); 2857 2858 tp->t_iobc = *cp; 2859 tp->t_oobflags |= TCPOOB_HAVEDATA; 2860 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2861 m->m_len--; 2862 return; 2863 } 2864 cnt -= m->m_len; 2865 m = m->m_next; 2866 if (m == 0) 2867 break; 2868 } 2869 panic("tcp_pulloutofband"); 2870 } 2871 2872 /* 2873 * Collect new round-trip time estimate 2874 * and update averages and current timeout. 2875 */ 2876 void 2877 tcp_xmit_timer(tp, rtt) 2878 struct tcpcb *tp; 2879 short rtt; 2880 { 2881 short delta; 2882 short rttmin; 2883 2884 if (rtt < 0) 2885 rtt = 0; 2886 else if (rtt > TCP_RTT_MAX) 2887 rtt = TCP_RTT_MAX; 2888 2889 tcpstat.tcps_rttupdated++; 2890 if (tp->t_srtt != 0) { 2891 /* 2892 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2893 * after the binary point (scaled by 4), whereas 2894 * srtt is stored as fixed point with 5 bits after the 2895 * binary point (i.e., scaled by 32). The following magic 2896 * is equivalent to the smoothing algorithm in rfc793 with 2897 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2898 * point). 2899 */ 2900 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2901 (tp->t_srtt >> TCP_RTT_SHIFT); 2902 if ((tp->t_srtt += delta) <= 0) 2903 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2904 /* 2905 * We accumulate a smoothed rtt variance (actually, a 2906 * smoothed mean difference), then set the retransmit 2907 * timer to smoothed rtt + 4 times the smoothed variance. 2908 * rttvar is stored as fixed point with 4 bits after the 2909 * binary point (scaled by 16). The following is 2910 * equivalent to rfc793 smoothing with an alpha of .75 2911 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2912 * rfc793's wired-in beta. 2913 */ 2914 if (delta < 0) 2915 delta = -delta; 2916 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2917 if ((tp->t_rttvar += delta) <= 0) 2918 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2919 } else { 2920 /* 2921 * No rtt measurement yet - use the unsmoothed rtt. 2922 * Set the variance to half the rtt (so our first 2923 * retransmit happens at 3*rtt). 2924 */ 2925 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2926 tp->t_rttvar = (rtt + 1) << 2927 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2928 } 2929 tp->t_rtttime = 0; 2930 tp->t_rxtshift = 0; 2931 2932 /* 2933 * the retransmit should happen at rtt + 4 * rttvar. 2934 * Because of the way we do the smoothing, srtt and rttvar 2935 * will each average +1/2 tick of bias. When we compute 2936 * the retransmit timer, we want 1/2 tick of rounding and 2937 * 1 extra tick because of +-1/2 tick uncertainty in the 2938 * firing of the timer. The bias will give us exactly the 2939 * 1.5 tick we need. But, because the bias is 2940 * statistical, we have to test that we don't drop below 2941 * the minimum feasible timer (which is 2 ticks). 2942 */ 2943 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2944 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2945 2946 /* 2947 * We received an ack for a packet that wasn't retransmitted; 2948 * it is probably safe to discard any error indications we've 2949 * received recently. 
This isn't quite right, but close enough 2950 * for now (a route might have failed after we sent a segment, 2951 * and the return path might not be symmetrical). 2952 */ 2953 tp->t_softerror = 0; 2954 } 2955 2956 /* 2957 * Determine a reasonable value for maxseg size. 2958 * If the route is known, check route for mtu. 2959 * If none, use an mss that can be handled on the outgoing 2960 * interface without forcing IP to fragment; if bigger than 2961 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2962 * to utilize large mbufs. If no route is found, route has no mtu, 2963 * or the destination isn't local, use a default, hopefully conservative 2964 * size (usually 512 or the default IP max size, but no more than the mtu 2965 * of the interface), as we can't discover anything about intervening 2966 * gateways or networks. We also initialize the congestion/slow start 2967 * window to be a single segment if the destination isn't local. 2968 * While looking at the routing entry, we also initialize other path-dependent 2969 * parameters from pre-set or cached values in the routing entry. 2970 * 2971 * Also take into account the space needed for options that we 2972 * send regularly. Make maxseg shorter by that amount to assure 2973 * that we can send maxseg amount of data even when the options 2974 * are present. Store the upper limit of the length of options plus 2975 * data in maxopd. 2976 * 2977 * NOTE: offer == -1 indicates that the maxseg size changed due to 2978 * Path MTU discovery. 2979 */ 2980 int 2981 tcp_mss(tp, offer) 2982 struct tcpcb *tp; 2983 int offer; 2984 { 2985 struct rtentry *rt; 2986 struct ifnet *ifp; 2987 int mss, mssopt; 2988 int iphlen; 2989 struct inpcb *inp; 2990 2991 inp = tp->t_inpcb; 2992 2993 mssopt = mss = tcp_mssdflt; 2994 2995 rt = in_pcbrtentry(inp); 2996 2997 if (rt == NULL) 2998 goto out; 2999 3000 ifp = rt->rt_ifp; 3001 3002 switch (tp->pf) { 3003 #ifdef INET6 3004 case AF_INET6: 3005 iphlen = sizeof(struct ip6_hdr); 3006 break; 3007 #endif 3008 case AF_INET: 3009 iphlen = sizeof(struct ip); 3010 break; 3011 default: 3012 /* the family does not support path MTU discovery */ 3013 goto out; 3014 } 3015 3016 #ifdef RTV_MTU 3017 /* 3018 * if there's an mtu associated with the route and we support 3019 * path MTU discovery for the underlying protocol family, use it. 3020 */ 3021 if (rt->rt_rmx.rmx_mtu) { 3022 /* 3023 * One may wish to lower MSS to take into account options, 3024 * especially security-related options. 3025 */ 3026 if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) { 3027 /* 3028 * RFC2460 section 5, last paragraph: if path MTU is 3029 * smaller than 1280, use 1280 as packet size and 3030 * attach fragment header. 3031 */ 3032 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 3033 sizeof(struct tcphdr); 3034 } else 3035 mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr); 3036 } else 3037 #endif /* RTV_MTU */ 3038 if (!ifp) 3039 /* 3040 * ifp may be null and rmx_mtu may be zero in certain 3041 * v6 cases (e.g., if ND wasn't able to resolve the 3042 * destination host).
3043 */ 3044 goto out; 3045 else if (ifp->if_flags & IFF_LOOPBACK) 3046 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3047 else if (tp->pf == AF_INET) { 3048 if (ip_mtudisc) 3049 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3050 else if (inp && in_localaddr(inp->inp_faddr)) 3051 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3052 } 3053 #ifdef INET6 3054 else if (tp->pf == AF_INET6) { 3055 /* 3056 * for IPv6, path MTU discovery is always turned on, 3057 * or the node must use packet size <= 1280. 3058 */ 3059 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr); 3060 } 3061 #endif /* INET6 */ 3062 3063 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3064 if (offer != -1) { 3065 #ifndef INET6 3066 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3067 #else 3068 if (tp->pf == AF_INET6) 3069 mssopt = IN6_LINKMTU(ifp) - iphlen - 3070 sizeof(struct tcphdr); 3071 else 3072 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3073 #endif 3074 3075 mssopt = max(tcp_mssdflt, mssopt); 3076 } 3077 3078 out: 3079 /* 3080 * The current mss, t_maxseg, is initialized to the default value. 3081 * If we compute a smaller value, reduce the current mss. 3082 * If we compute a larger value, return it for use in sending 3083 * a max seg size option, but don't store it for use 3084 * unless we received an offer at least that large from peer. 3085 * 3086 * However, do not accept offers lower than the minimum of 3087 * the interface MTU and 216. 3088 */ 3089 if (offer > 0) 3090 tp->t_peermss = offer; 3091 if (tp->t_peermss) 3092 mss = min(mss, max(tp->t_peermss, 216)); 3093 3094 /* sanity - at least max opt. space */ 3095 mss = max(mss, 64); 3096 3097 /* 3098 * maxopd stores the maximum length of data AND options 3099 * in a segment; maxseg is the amount of data in a normal 3100 * segment. We need to store this value (maxopd) apart 3101 * from maxseg, because now every segment carries options 3102 * and thus we normally have somewhat less data in segments. 3103 */ 3104 tp->t_maxopd = mss; 3105 3106 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3107 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3108 mss -= TCPOLEN_TSTAMP_APPA; 3109 #ifdef TCP_SIGNATURE 3110 if (tp->t_flags & TF_SIGNATURE) 3111 mss -= TCPOLEN_SIGLEN; 3112 #endif 3113 3114 if (offer == -1) { 3115 /* mss changed due to Path MTU discovery */ 3116 tp->t_flags &= ~TF_PMTUD_PEND; 3117 tp->t_pmtud_mtu_sent = 0; 3118 tp->t_pmtud_mss_acked = 0; 3119 if (mss < tp->t_maxseg) { 3120 /* 3121 * Follow suggestion in RFC 2414 to reduce the 3122 * congestion window by the ratio of the old 3123 * segment size to the new segment size. 3124 */ 3125 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3126 mss, mss); 3127 } 3128 } else if (tcp_do_rfc3390) { 3129 /* increase initial window */ 3130 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3131 } else 3132 tp->snd_cwnd = mss; 3133 3134 tp->t_maxseg = mss; 3135 3136 return (offer != -1 ? 
mssopt : mss); 3137 } 3138 3139 u_int 3140 tcp_hdrsz(struct tcpcb *tp) 3141 { 3142 u_int hlen; 3143 3144 switch (tp->pf) { 3145 #ifdef INET6 3146 case AF_INET6: 3147 hlen = sizeof(struct ip6_hdr); 3148 break; 3149 #endif 3150 case AF_INET: 3151 hlen = sizeof(struct ip); 3152 break; 3153 default: 3154 hlen = 0; 3155 break; 3156 } 3157 hlen += sizeof(struct tcphdr); 3158 3159 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3160 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3161 hlen += TCPOLEN_TSTAMP_APPA; 3162 #ifdef TCP_SIGNATURE 3163 if (tp->t_flags & TF_SIGNATURE) 3164 hlen += TCPOLEN_SIGLEN; 3165 #endif 3166 return (hlen); 3167 } 3168 3169 /* 3170 * Set connection variables based on the effective MSS. 3171 * We are passed the TCPCB for the actual connection. If we 3172 * are the server, we are called by the compressed state engine 3173 * when the 3-way handshake is complete. If we are the client, 3174 * we are called when we receive the SYN,ACK from the server. 3175 * 3176 * NOTE: The t_maxseg value must be initialized in the TCPCB 3177 * before this routine is called! 3178 */ 3179 void 3180 tcp_mss_update(tp) 3181 struct tcpcb *tp; 3182 { 3183 int mss; 3184 u_long bufsize; 3185 struct rtentry *rt; 3186 struct socket *so; 3187 3188 so = tp->t_inpcb->inp_socket; 3189 mss = tp->t_maxseg; 3190 3191 rt = in_pcbrtentry(tp->t_inpcb); 3192 3193 if (rt == NULL) 3194 return; 3195 3196 bufsize = so->so_snd.sb_hiwat; 3197 if (bufsize < mss) { 3198 mss = bufsize; 3199 /* Update t_maxseg and t_maxopd */ 3200 tcp_mss(tp, mss); 3201 } else { 3202 bufsize = roundup(bufsize, mss); 3203 if (bufsize > sb_max) 3204 bufsize = sb_max; 3205 (void)sbreserve(&so->so_snd, bufsize); 3206 } 3207 3208 bufsize = so->so_rcv.sb_hiwat; 3209 if (bufsize > mss) { 3210 bufsize = roundup(bufsize, mss); 3211 if (bufsize > sb_max) 3212 bufsize = sb_max; 3213 (void)sbreserve(&so->so_rcv, bufsize); 3214 } 3215 3216 } 3217 3218 #if defined (TCP_SACK) 3219 /* 3220 * Checks for partial ack. If partial ack arrives, force the retransmission 3221 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 3222 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to 3223 * be started again. If the ack advances at least to tp->snd_last, return 0. 3224 */ 3225 int 3226 tcp_newreno(tp, th) 3227 struct tcpcb *tp; 3228 struct tcphdr *th; 3229 { 3230 if (SEQ_LT(th->th_ack, tp->snd_last)) { 3231 /* 3232 * snd_una has not been updated and the socket send buffer 3233 * not yet drained of the acked data, so we have to leave 3234 * snd_una as it was to get the correct data offset in 3235 * tcp_output(). 3236 */ 3237 tcp_seq onxt = tp->snd_nxt; 3238 u_long ocwnd = tp->snd_cwnd; 3239 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3240 tp->t_rtttime = 0; 3241 tp->snd_nxt = th->th_ack; 3242 /* 3243 * Set snd_cwnd to one segment beyond acknowledged offset 3244 * (tp->snd_una not yet updated when this function is called) 3245 */ 3246 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3247 (void) tcp_output(tp); 3248 tp->snd_cwnd = ocwnd; 3249 if (SEQ_GT(onxt, tp->snd_nxt)) 3250 tp->snd_nxt = onxt; 3251 /* 3252 * Partial window deflation. Relies on fact that tp->snd_una 3253 * not updated yet. 
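* Worked example (invented numbers): snd_cwnd = 11680 and the partial ack covers th_ack - snd_una = 2920; the deflation below leaves 11680 - 2920 + 1460 = 10220 with t_maxseg = 1460.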
3254 */ 3255 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3256 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3257 else 3258 tp->snd_cwnd = 0; 3259 tp->snd_cwnd += tp->t_maxseg; 3260 3261 return 1; 3262 } 3263 return 0; 3264 } 3265 #endif /* TCP_SACK */ 3266 3267 int 3268 tcp_mss_adv(struct ifnet *ifp, int af) 3269 { 3270 int mss = 0; 3271 int iphlen; 3272 3273 switch (af) { 3274 case AF_INET: 3275 if (ifp != NULL) 3276 mss = ifp->if_mtu; 3277 iphlen = sizeof(struct ip); 3278 break; 3279 #ifdef INET6 3280 case AF_INET6: 3281 if (ifp != NULL) 3282 mss = IN6_LINKMTU(ifp); 3283 iphlen = sizeof(struct ip6_hdr); 3284 break; 3285 #endif 3286 } 3287 mss = mss - iphlen - sizeof(struct tcphdr); 3288 return (max(mss, tcp_mssdflt)); 3289 } 3290 3291 /* 3292 * TCP compressed state engine. Currently used to hold compressed 3293 * state for SYN_RECEIVED. 3294 */ 3295 3296 u_long syn_cache_count; 3297 u_int32_t syn_hash1, syn_hash2; 3298 3299 #define SYN_HASH(sa, sp, dp) \ 3300 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3301 ((u_int32_t)(sp)))^syn_hash2))) 3302 #ifndef INET6 3303 #define SYN_HASHALL(hash, src, dst) \ 3304 do { \ 3305 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3306 ((struct sockaddr_in *)(src))->sin_port, \ 3307 ((struct sockaddr_in *)(dst))->sin_port); \ 3308 } while (/*CONSTCOND*/ 0) 3309 #else 3310 #define SYN_HASH6(sa, sp, dp) \ 3311 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3312 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3313 & 0x7fffffff) 3314 3315 #define SYN_HASHALL(hash, src, dst) \ 3316 do { \ 3317 switch ((src)->sa_family) { \ 3318 case AF_INET: \ 3319 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3320 ((struct sockaddr_in *)(src))->sin_port, \ 3321 ((struct sockaddr_in *)(dst))->sin_port); \ 3322 break; \ 3323 case AF_INET6: \ 3324 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 3325 ((struct sockaddr_in6 *)(src))->sin6_port, \ 3326 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 3327 break; \ 3328 default: \ 3329 hash = 0; \ 3330 } \ 3331 } while (/*CONSTCOND*/0) 3332 #endif /* INET6 */ 3333 3334 #define SYN_CACHE_RM(sc) \ 3335 do { \ 3336 (sc)->sc_flags |= SCF_DEAD; \ 3337 TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \ 3338 (sc), sc_bucketq); \ 3339 (sc)->sc_tp = NULL; \ 3340 LIST_REMOVE((sc), sc_tpq); \ 3341 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 3342 timeout_del(&(sc)->sc_timer); \ 3343 syn_cache_count--; \ 3344 } while (/*CONSTCOND*/0) 3345 3346 #define SYN_CACHE_PUT(sc) \ 3347 do { \ 3348 if ((sc)->sc_ipopts) \ 3349 (void) m_free((sc)->sc_ipopts); \ 3350 if ((sc)->sc_route4.ro_rt != NULL) \ 3351 RTFREE((sc)->sc_route4.ro_rt); \ 3352 timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc)); \ 3353 timeout_add(&(sc)->sc_timer, 0); \ 3354 } while (/*CONSTCOND*/0) 3355 3356 struct pool syn_cache_pool; 3357 3358 /* 3359 * We don't estimate RTT with SYNs, so each packet starts with the default 3360 * RTT and each timer step has a fixed timeout value. 
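* With the usual doubling tcp_backoff[] table this retransmits the SYN,ACK at roughly TCPTV_SRTTDFLT, then 2x, 4x, ... that interval (clamped to TCPTV_MIN..TCPTV_REXMTMAX), until syn_cache_timer() sees the accumulated total exceed tcptv_keep_init -- a sketch of typical tuning, not a guarantee.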
3361 */ 3362 #define SYN_CACHE_TIMER_ARM(sc) \ 3363 do { \ 3364 TCPT_RANGESET((sc)->sc_rxtcur, \ 3365 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3366 TCPTV_REXMTMAX); \ 3367 if (!timeout_initialized(&(sc)->sc_timer)) \ 3368 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3369 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3370 } while (/*CONSTCOND*/0) 3371 3372 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3373 3374 void 3375 syn_cache_init() 3376 { 3377 int i; 3378 3379 /* Initialize the hash buckets. */ 3380 for (i = 0; i < tcp_syn_cache_size; i++) 3381 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3382 3383 /* Initialize the syn cache pool. */ 3384 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3385 "synpl", NULL); 3386 } 3387 3388 void 3389 syn_cache_insert(sc, tp) 3390 struct syn_cache *sc; 3391 struct tcpcb *tp; 3392 { 3393 struct syn_cache_head *scp; 3394 struct syn_cache *sc2; 3395 int s; 3396 3397 /* 3398 * If there are no entries in the hash table, reinitialize 3399 * the hash secrets. 3400 */ 3401 if (syn_cache_count == 0) { 3402 syn_hash1 = arc4random(); 3403 syn_hash2 = arc4random(); 3404 } 3405 3406 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3407 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3408 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3409 3410 /* 3411 * Make sure that we don't overflow the per-bucket 3412 * limit or the total cache size limit. 3413 */ 3414 s = splsoftnet(); 3415 if (scp->sch_length >= tcp_syn_bucket_limit) { 3416 tcpstat.tcps_sc_bucketoverflow++; 3417 /* 3418 * The bucket is full. Toss the oldest element in the 3419 * bucket. This will be the first entry in the bucket. 3420 */ 3421 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3422 #ifdef DIAGNOSTIC 3423 /* 3424 * This should never happen; we should always find an 3425 * entry in our bucket. 3426 */ 3427 if (sc2 == NULL) 3428 panic("syn_cache_insert: bucketoverflow: impossible"); 3429 #endif 3430 SYN_CACHE_RM(sc2); 3431 SYN_CACHE_PUT(sc2); 3432 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3433 struct syn_cache_head *scp2, *sce; 3434 3435 tcpstat.tcps_sc_overflowed++; 3436 /* 3437 * The cache is full. Toss the oldest entry in the 3438 * first non-empty bucket we can find. 3439 * 3440 * XXX We would really like to toss the oldest 3441 * entry in the cache, but we hope that this 3442 * condition doesn't happen very often. 3443 */ 3444 scp2 = scp; 3445 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3446 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3447 for (++scp2; scp2 != scp; scp2++) { 3448 if (scp2 >= sce) 3449 scp2 = &tcp_syn_cache[0]; 3450 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3451 break; 3452 } 3453 #ifdef DIAGNOSTIC 3454 /* 3455 * This should never happen; we should always find a 3456 * non-empty bucket. 3457 */ 3458 if (scp2 == scp) 3459 panic("syn_cache_insert: cacheoverflow: " 3460 "impossible"); 3461 #endif 3462 } 3463 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3464 SYN_CACHE_RM(sc2); 3465 SYN_CACHE_PUT(sc2); 3466 } 3467 3468 /* 3469 * Initialize the entry's timer. 3470 */ 3471 sc->sc_rxttot = 0; 3472 sc->sc_rxtshift = 0; 3473 SYN_CACHE_TIMER_ARM(sc); 3474 3475 /* Link it from tcpcb entry */ 3476 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3477 3478 /* Put it into the bucket. 
*/ 3479 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3480 scp->sch_length++; 3481 syn_cache_count++; 3482 3483 tcpstat.tcps_sc_added++; 3484 splx(s); 3485 } 3486 3487 /* 3488 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3489 * If we have retransmitted an entry the maximum number of times, expire 3490 * that entry. 3491 */ 3492 void 3493 syn_cache_timer(void *arg) 3494 { 3495 struct syn_cache *sc = arg; 3496 int s; 3497 3498 s = splsoftnet(); 3499 if (sc->sc_flags & SCF_DEAD) { 3500 splx(s); 3501 return; 3502 } 3503 3504 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3505 /* Drop it -- too many retransmissions. */ 3506 goto dropit; 3507 } 3508 3509 /* 3510 * Compute the total amount of time this entry has 3511 * been on a queue. If this entry has been on longer 3512 * than the keep alive timer would allow, expire it. 3513 */ 3514 sc->sc_rxttot += sc->sc_rxtcur; 3515 if (sc->sc_rxttot >= tcptv_keep_init) 3516 goto dropit; 3517 3518 tcpstat.tcps_sc_retransmitted++; 3519 (void) syn_cache_respond(sc, NULL); 3520 3521 /* Advance the timer back-off. */ 3522 sc->sc_rxtshift++; 3523 SYN_CACHE_TIMER_ARM(sc); 3524 3525 splx(s); 3526 return; 3527 3528 dropit: 3529 tcpstat.tcps_sc_timed_out++; 3530 SYN_CACHE_RM(sc); 3531 SYN_CACHE_PUT(sc); 3532 splx(s); 3533 } 3534 3535 void 3536 syn_cache_reaper(void *arg) 3537 { 3538 struct syn_cache *sc = arg; 3539 int s; 3540 3541 s = splsoftnet(); 3542 pool_put(&syn_cache_pool, (sc)); 3543 splx(s); 3544 return; 3545 } 3546 3547 /* 3548 * Remove the syn cache entries created by the specified tcb entry, 3549 * because it makes no sense to keep them 3550 * (if there's no tcb entry, a syn cache entry will never be used) 3551 */ 3552 void 3553 syn_cache_cleanup(tp) 3554 struct tcpcb *tp; 3555 { 3556 struct syn_cache *sc, *nsc; 3557 int s; 3558 3559 s = splsoftnet(); 3560 3561 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 3562 nsc = LIST_NEXT(sc, sc_tpq); 3563 3564 #ifdef DIAGNOSTIC 3565 if (sc->sc_tp != tp) 3566 panic("invalid sc_tp in syn_cache_cleanup"); 3567 #endif 3568 SYN_CACHE_RM(sc); 3569 SYN_CACHE_PUT(sc); 3570 } 3571 /* just for safety */ 3572 LIST_INIT(&tp->t_sc); 3573 3574 splx(s); 3575 } 3576 3577 /* 3578 * Find an entry in the syn cache. 3579 */ 3580 struct syn_cache * 3581 syn_cache_lookup(src, dst, headp) 3582 struct sockaddr *src; 3583 struct sockaddr *dst; 3584 struct syn_cache_head **headp; 3585 { 3586 struct syn_cache *sc; 3587 struct syn_cache_head *scp; 3588 u_int32_t hash; 3589 int s; 3590 3591 SYN_HASHALL(hash, src, dst); 3592 3593 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 3594 *headp = scp; 3595 s = splsoftnet(); 3596 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; 3597 sc = TAILQ_NEXT(sc, sc_bucketq)) { 3598 if (sc->sc_hash != hash) 3599 continue; 3600 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3601 !bcmp(&sc->sc_dst, dst, dst->sa_len)) { 3602 splx(s); 3603 return (sc); 3604 } 3605 } 3606 splx(s); 3607 return (NULL); 3608 } 3609 3610 /* 3611 * This function gets called when we receive an ACK for a 3612 * socket in the LISTEN state. We look up the connection 3613 * in the syn cache, and if it's there, we pull it out of 3614 * the cache and turn it into a full-blown connection in 3615 * the SYN-RECEIVED state. 3616 * 3617 * The return values may not be immediately obvious, and their effects 3618 * can be subtle, so here they are: 3619 * 3620 * NULL SYN was not found in cache; caller should drop the 3621 * packet and send an RST.
3622 * 3623 * -1 We were unable to create the new connection, and are 3624 * aborting it. An ACK,RST is being sent to the peer 3625 * (unless we got screwy sequence numbers; see below), 3626 * because the 3-way handshake has been completed. Caller 3627 * should not free the mbuf, since we may be using it. If 3628 * we are not, we will free it. 3629 * 3630 * Otherwise, the return value is a pointer to the new socket 3631 * associated with the connection. 3632 */ 3633 struct socket * 3634 syn_cache_get(src, dst, th, hlen, tlen, so, m) 3635 struct sockaddr *src; 3636 struct sockaddr *dst; 3637 struct tcphdr *th; 3638 unsigned int hlen, tlen; 3639 struct socket *so; 3640 struct mbuf *m; 3641 { 3642 struct syn_cache *sc; 3643 struct syn_cache_head *scp; 3644 struct inpcb *inp = NULL; 3645 struct tcpcb *tp = 0; 3646 struct mbuf *am; 3647 int s; 3648 struct socket *oso; 3649 3650 s = splsoftnet(); 3651 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3652 splx(s); 3653 return (NULL); 3654 } 3655 3656 /* 3657 * Verify the sequence and ack numbers. Try getting the correct 3658 * response again. 3659 */ 3660 if ((th->th_ack != sc->sc_iss + 1) || 3661 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3662 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3663 (void) syn_cache_respond(sc, m); 3664 splx(s); 3665 return ((struct socket *)(-1)); 3666 } 3667 3668 /* Remove this cache entry */ 3669 SYN_CACHE_RM(sc); 3670 splx(s); 3671 3672 /* 3673 * Ok, create the full blown connection, and set things up 3674 * as they would have been set up if we had created the 3675 * connection when the SYN arrived. If we can't create 3676 * the connection, abort it. 3677 */ 3678 oso = so; 3679 so = sonewconn(so, SS_ISCONNECTED); 3680 if (so == NULL) 3681 goto resetandabort; 3682 3683 inp = sotoinpcb(oso); 3684 #ifdef IPSEC 3685 /* 3686 * We need to copy the required security levels 3687 * from the old pcb. Ditto for any other 3688 * IPsec-related information. 3689 */ 3690 { 3691 struct inpcb *newinp = (struct inpcb *)so->so_pcb; 3692 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3693 sizeof(inp->inp_seclevel)); 3694 newinp->inp_secrequire = inp->inp_secrequire; 3695 if (inp->inp_ipo != NULL) { 3696 newinp->inp_ipo = inp->inp_ipo; 3697 inp->inp_ipo->ipo_ref_count++; 3698 } 3699 if (inp->inp_ipsec_remotecred != NULL) { 3700 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 3701 inp->inp_ipsec_remotecred->ref_count++; 3702 } 3703 if (inp->inp_ipsec_remoteauth != NULL) { 3704 newinp->inp_ipsec_remoteauth 3705 = inp->inp_ipsec_remoteauth; 3706 inp->inp_ipsec_remoteauth->ref_count++; 3707 } 3708 } 3709 #endif /* IPSEC */ 3710 #ifdef INET6 3711 /* 3712 * inp still has the OLD in_pcb stuff, set the 3713 * v6-related flags on the new guy, too.
3714 */ 3715 { 3716 int flags = inp->inp_flags; 3717 struct inpcb *oldinpcb = inp; 3718 3719 inp = (struct inpcb *)so->so_pcb; 3720 inp->inp_flags |= (flags & INP_IPV6); 3721 if ((inp->inp_flags & INP_IPV6) != 0) { 3722 inp->inp_ipv6.ip6_hlim = 3723 oldinpcb->inp_ipv6.ip6_hlim; 3724 } 3725 } 3726 #else /* INET6 */ 3727 inp = (struct inpcb *)so->so_pcb; 3728 #endif /* INET6 */ 3729 3730 inp->inp_lport = th->th_dport; 3731 switch (src->sa_family) { 3732 #ifdef INET6 3733 case AF_INET6: 3734 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr; 3735 break; 3736 #endif /* INET6 */ 3737 case AF_INET: 3738 3739 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 3740 inp->inp_options = ip_srcroute(); 3741 if (inp->inp_options == NULL) { 3742 inp->inp_options = sc->sc_ipopts; 3743 sc->sc_ipopts = NULL; 3744 } 3745 break; 3746 } 3747 in_pcbrehash(inp); 3748 3749 /* 3750 * Give the new socket our cached route reference. 3751 */ 3752 if (src->sa_family == AF_INET) 3753 inp->inp_route = sc->sc_route4; /* struct assignment */ 3754 #ifdef INET6 3755 else 3756 inp->inp_route6 = sc->sc_route6; 3757 #endif 3758 sc->sc_route4.ro_rt = NULL; 3759 3760 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3761 if (am == NULL) 3762 goto resetandabort; 3763 am->m_len = src->sa_len; 3764 bcopy(src, mtod(am, caddr_t), src->sa_len); 3765 3766 switch (src->sa_family) { 3767 case AF_INET: 3768 /* drop IPv4 packet to AF_INET6 socket */ 3769 if (inp->inp_flags & INP_IPV6) { 3770 (void) m_free(am); 3771 goto resetandabort; 3772 } 3773 if (in_pcbconnect(inp, am)) { 3774 (void) m_free(am); 3775 goto resetandabort; 3776 } 3777 break; 3778 #ifdef INET6 3779 case AF_INET6: 3780 if (in6_pcbconnect(inp, am)) { 3781 (void) m_free(am); 3782 goto resetandabort; 3783 } 3784 break; 3785 #endif 3786 } 3787 (void) m_free(am); 3788 3789 tp = intotcpcb(inp); 3790 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; 3791 if (sc->sc_request_r_scale != 15) { 3792 tp->requested_s_scale = sc->sc_requested_s_scale; 3793 tp->request_r_scale = sc->sc_request_r_scale; 3794 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3795 } 3796 if (sc->sc_flags & SCF_TIMESTAMP) 3797 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3798 3799 tp->t_template = tcp_template(tp); 3800 if (tp->t_template == 0) { 3801 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3802 so = NULL; 3803 m_freem(m); 3804 goto abort; 3805 } 3806 #ifdef TCP_SACK 3807 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3808 #endif 3809 3810 tp->ts_modulate = sc->sc_modulate; 3811 tp->iss = sc->sc_iss; 3812 tp->irs = sc->sc_irs; 3813 tcp_sendseqinit(tp); 3814 #if defined (TCP_SACK) || defined(TCP_ECN) 3815 tp->snd_last = tp->snd_una; 3816 #endif /* TCP_SACK */ 3817 #if defined(TCP_SACK) && defined(TCP_FACK) 3818 tp->snd_fack = tp->snd_una; 3819 tp->retran_data = 0; 3820 tp->snd_awnd = 0; 3821 #endif /* TCP_FACK */ 3822 #ifdef TCP_ECN 3823 if (sc->sc_flags & SCF_ECN_PERMIT) { 3824 tp->t_flags |= TF_ECN_PERMIT; 3825 tcpstat.tcps_ecn_accepts++; 3826 } 3827 #endif 3828 #ifdef TCP_SACK 3829 if (sc->sc_flags & SCF_SACK_PERMIT) 3830 tp->t_flags |= TF_SACK_PERMIT; 3831 #endif 3832 #ifdef TCP_SIGNATURE 3833 if (sc->sc_flags & SCF_SIGNATURE) 3834 tp->t_flags |= TF_SIGNATURE; 3835 #endif 3836 tcp_rcvseqinit(tp); 3837 tp->t_state = TCPS_SYN_RECEIVED; 3838 tp->t_rcvtime = tcp_now; 3839 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3840 tcpstat.tcps_accepts++; 3841 3842 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3843 if (sc->sc_peermaxseg) 3844 tcp_mss_update(tp); 3845 /* Reset initial window to 1 
segment for retransmit */ 3846 if (sc->sc_rxtshift > 0) 3847 tp->snd_cwnd = tp->t_maxseg; 3848 tp->snd_wl1 = sc->sc_irs; 3849 tp->rcv_up = sc->sc_irs + 1; 3850 3851 /* 3852 * This is what would have happened in tcp_output() when 3853 * the SYN,ACK was sent. 3854 */ 3855 tp->snd_up = tp->snd_una; 3856 tp->snd_max = tp->snd_nxt = tp->iss+1; 3857 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3858 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3859 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3860 tp->last_ack_sent = tp->rcv_nxt; 3861 3862 tcpstat.tcps_sc_completed++; 3863 SYN_CACHE_PUT(sc); 3864 return (so); 3865 3866 resetandabort: 3867 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST); 3868 m_freem(m); 3869 abort: 3870 if (so != NULL) 3871 (void) soabort(so); 3872 SYN_CACHE_PUT(sc); 3873 tcpstat.tcps_sc_aborted++; 3874 return ((struct socket *)(-1)); 3875 } 3876 3877 /* 3878 * This function is called when we get a RST for a 3879 * non-existent connection, so that we can see if the 3880 * connection is in the syn cache. If it is, zap it. 3881 */ 3882 3883 void 3884 syn_cache_reset(src, dst, th) 3885 struct sockaddr *src; 3886 struct sockaddr *dst; 3887 struct tcphdr *th; 3888 { 3889 struct syn_cache *sc; 3890 struct syn_cache_head *scp; 3891 int s = splsoftnet(); 3892 3893 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3894 splx(s); 3895 return; 3896 } 3897 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3898 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3899 splx(s); 3900 return; 3901 } 3902 SYN_CACHE_RM(sc); 3903 splx(s); 3904 tcpstat.tcps_sc_reset++; 3905 SYN_CACHE_PUT(sc); 3906 } 3907 3908 void 3909 syn_cache_unreach(src, dst, th) 3910 struct sockaddr *src; 3911 struct sockaddr *dst; 3912 struct tcphdr *th; 3913 { 3914 struct syn_cache *sc; 3915 struct syn_cache_head *scp; 3916 int s; 3917 3918 s = splsoftnet(); 3919 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3920 splx(s); 3921 return; 3922 } 3923 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3924 if (ntohl (th->th_seq) != sc->sc_iss) { 3925 splx(s); 3926 return; 3927 } 3928 3929 /* 3930 * If we've retransmitted 3 times and this is our second error, 3931 * we remove the entry. Otherwise, we allow it to continue on. 3932 * This prevents us from incorrectly nuking an entry during a 3933 * spurious network outage. 3934 * 3935 * See tcp_notify(). 3936 */ 3937 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3938 sc->sc_flags |= SCF_UNREACH; 3939 splx(s); 3940 return; 3941 } 3942 3943 SYN_CACHE_RM(sc); 3944 splx(s); 3945 tcpstat.tcps_sc_unreach++; 3946 SYN_CACHE_PUT(sc); 3947 } 3948 3949 /* 3950 * Given a LISTEN socket and an inbound SYN request, add 3951 * this to the syn cache, and send back a segment: 3952 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3953 * to the source. 3954 * 3955 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3956 * Doing so would require that we hold onto the data and deliver it 3957 * to the application. However, if we are the target of a SYN-flood 3958 * DoS attack, an attacker could send data which would eventually 3959 * consume all available buffer space if it were ACKed. By not ACKing 3960 * the data, we avoid this DoS scenario.
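* (Each cache entry is a small fixed-size pool item with a randomized ISS -- see sc_iss below -- and no buffered data, which keeps the per-SYN cost of a flood low.)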
/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *    <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */
int
syn_cache_add(src, dst, th, iphlen, so, m, optp, optlen, oi, issp)
    struct sockaddr *src;
    struct sockaddr *dst;
    struct tcphdr *th;
    unsigned int iphlen;
    struct socket *so;
    struct mbuf *m;
    u_char *optp;
    int optlen;
    struct tcp_opt_info *oi;
    tcp_seq *issp;
{
    struct tcpcb tb, *tp;
    long win;
    struct syn_cache *sc;
    struct syn_cache_head *scp;
    struct mbuf *ipopts;

    tp = sototcpcb(so);

    /*
     * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
     *
     * Note this check is performed in tcp_input() very early on.
     */

    /*
     * Initialize some local state.
     */
    win = sbspace(&so->so_rcv);
    if (win > TCP_MAXWIN)
        win = TCP_MAXWIN;

#ifdef TCP_SIGNATURE
    if (optp || (tp->t_flags & TF_SIGNATURE)) {
#else
    if (optp) {
#endif
        tb.pf = tp->pf;
#ifdef TCP_SACK
        tb.sack_enable = tp->sack_enable;
#endif
        tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
        if (tp->t_flags & TF_SIGNATURE)
            tb.t_flags |= TF_SIGNATURE;
#endif
        tb.t_state = TCPS_LISTEN;
        if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi))
            return (0);
    } else
        tb.t_flags = 0;

    switch (src->sa_family) {
#ifdef INET
    case AF_INET:
        /*
         * Remember the IP options, if any.
         */
        ipopts = ip_srcroute();
        break;
#endif
    default:
        ipopts = NULL;
    }

    /*
     * See if we already have an entry for this connection.
     * If we do, resend the SYN,ACK.  We do not count this
     * as a retransmission (XXX though maybe we should).
     */
    if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
        tcpstat.tcps_sc_dupesyn++;
        if (ipopts) {
            /*
             * If we were remembering a previous source route,
             * forget it and use the new one we've been given.
             */
            if (sc->sc_ipopts)
                (void) m_free(sc->sc_ipopts);
            sc->sc_ipopts = ipopts;
        }
        sc->sc_timestamp = tb.ts_recent;
        if (syn_cache_respond(sc, m) == 0) {
            tcpstat.tcps_sndacks++;
            tcpstat.tcps_sndtotal++;
        }
        return (1);
    }

    sc = pool_get(&syn_cache_pool, PR_NOWAIT);
    if (sc == NULL) {
        if (ipopts)
            (void) m_free(ipopts);
        return (0);
    }

    /*
     * Fill in the cache, and put the necessary IP and TCP
     * options into the reply.
     */
    bzero(sc, sizeof(struct syn_cache));
    bzero(&sc->sc_timer, sizeof(sc->sc_timer));
    bcopy(src, &sc->sc_src, src->sa_len);
    bcopy(dst, &sc->sc_dst, dst->sa_len);
    sc->sc_flags = 0;
    sc->sc_ipopts = ipopts;
    sc->sc_irs = th->th_seq;

    sc->sc_iss = issp ? *issp : arc4random();
    sc->sc_peermaxseg = oi->maxseg;
    sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
        m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
    sc->sc_win = win;
    sc->sc_timestamp = tb.ts_recent;
    if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
        (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
        sc->sc_flags |= SCF_TIMESTAMP;
        sc->sc_modulate = arc4random();
    }
    if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
        (TF_RCVD_SCALE|TF_REQ_SCALE)) {
        sc->sc_requested_s_scale = tb.requested_s_scale;
        sc->sc_request_r_scale = 0;
        /*
         * Pick the smallest shift for which the scaled 16-bit
         * window still covers our whole receive buffer.
         */
        while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
            TCP_MAXWIN << sc->sc_request_r_scale <
            so->so_rcv.sb_hiwat)
            sc->sc_request_r_scale++;
    } else {
        sc->sc_requested_s_scale = 15;
        sc->sc_request_r_scale = 15;
    }
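    /*
     * Worked example (illustrative): with TCP_MAXWIN == 65535 and a
     * 256 kB receive buffer, the loop above settles on a shift of 3,
     * since 65535 << 2 == 262140 still falls short of 262144 while
     * 65535 << 3 == 524280 covers it.  The value 15 doubles as the
     * "no window scaling" sentinel tested when the SYN,ACK is built.
     */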
#ifdef TCP_ECN
    /*
     * If both ECE and CWR flag bits are set, peer is ECN capable.
     */
    if (tcp_do_ecn &&
        (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
        sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
    /*
     * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
     * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
     */
    if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
        sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
    if (tb.t_flags & TF_SIGNATURE)
        sc->sc_flags |= SCF_SIGNATURE;
#endif
    sc->sc_tp = tp;
    if (syn_cache_respond(sc, m) == 0) {
        syn_cache_insert(sc, tp);
        tcpstat.tcps_sndacks++;
        tcpstat.tcps_sndtotal++;
    } else {
        SYN_CACHE_PUT(sc);
        tcpstat.tcps_sc_dropped++;
    }
    return (1);
}

int
syn_cache_respond(sc, m)
    struct syn_cache *sc;
    struct mbuf *m;
{
    struct route *ro;
    u_int8_t *optp;
    int optlen, error;
    u_int16_t tlen;
    struct ip *ip = NULL;
#ifdef INET6
    struct ip6_hdr *ip6 = NULL;
#endif
    struct tcphdr *th;
    u_int hlen;
    struct inpcb *inp;

    switch (sc->sc_src.sa.sa_family) {
    case AF_INET:
        hlen = sizeof(struct ip);
        ro = &sc->sc_route4;
        break;
#ifdef INET6
    case AF_INET6:
        hlen = sizeof(struct ip6_hdr);
        ro = (struct route *)&sc->sc_route6;
        break;
#endif
    default:
        if (m)
            m_freem(m);
        return (EAFNOSUPPORT);
    }

    /* Compute the size of the TCP options. */
    optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
        ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
        ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
        ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

    tlen = hlen + sizeof(struct tcphdr) + optlen;
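    /*
     * Worked example (illustrative): when MSS, window scaling, SACK
     * and timestamps were all negotiated (no signature), optlen is
     * 4 + 4 + 4 + 12 (TCPOLEN_TSTAMP_APPA) == 24, so an IPv4 SYN,ACK
     * carries tlen == 20 + 20 + 24 == 64 bytes of headers.
     */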
    /*
     * Create the IP+TCP header from scratch.
     */
    if (m)
        m_freem(m);
#ifdef DIAGNOSTIC
    if (max_linkhdr + tlen > MCLBYTES)
        return (ENOBUFS);
#endif
    MGETHDR(m, M_DONTWAIT, MT_DATA);
    if (m && max_linkhdr + tlen > MHLEN) {
        MCLGET(m, M_DONTWAIT);
        if ((m->m_flags & M_EXT) == 0) {
            m_freem(m);
            m = NULL;
        }
    }
    if (m == NULL)
        return (ENOBUFS);

    /* Fixup the mbuf. */
    m->m_data += max_linkhdr;
    m->m_len = m->m_pkthdr.len = tlen;
    m->m_pkthdr.rcvif = NULL;
    memset(mtod(m, u_char *), 0, tlen);

    switch (sc->sc_src.sa.sa_family) {
    case AF_INET:
        ip = mtod(m, struct ip *);
        ip->ip_dst = sc->sc_src.sin.sin_addr;
        ip->ip_src = sc->sc_dst.sin.sin_addr;
        ip->ip_p = IPPROTO_TCP;
        th = (struct tcphdr *)(ip + 1);
        th->th_dport = sc->sc_src.sin.sin_port;
        th->th_sport = sc->sc_dst.sin.sin_port;
        break;
#ifdef INET6
    case AF_INET6:
        ip6 = mtod(m, struct ip6_hdr *);
        ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
        ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
        ip6->ip6_nxt = IPPROTO_TCP;
        /* ip6_plen will be updated in ip6_output() */
        th = (struct tcphdr *)(ip6 + 1);
        th->th_dport = sc->sc_src.sin6.sin6_port;
        th->th_sport = sc->sc_dst.sin6.sin6_port;
        break;
#endif
    default:
        /* Unreachable: the address family was checked above. */
        th = NULL;
    }

    th->th_seq = htonl(sc->sc_iss);
    th->th_ack = htonl(sc->sc_irs + 1);
    th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
    th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
    /* Set ECE for SYN-ACK if peer supports ECN. */
    if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
        th->th_flags |= TH_ECE;
#endif
    th->th_win = htons(sc->sc_win);
    /* th_sum already 0 */
    /* th_urp already 0 */

    /* Tack on the TCP options. */
    optp = (u_int8_t *)(th + 1);
    *optp++ = TCPOPT_MAXSEG;
    *optp++ = 4;
    *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
    *optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
    /* Include SACK_PERMIT_HDR option if peer has already done so. */
    if (sc->sc_flags & SCF_SACK_PERMIT) {
        *((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
        optp += 4;
    }
#endif

    if (sc->sc_request_r_scale != 15) {
        *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
            TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
            sc->sc_request_r_scale);
        optp += 4;
    }
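    /*
     * Illustrative wire layout of the options emitted above, assuming
     * MSS 1460, SACK permitted and a receive shift of 3 (hex bytes;
     * the timestamp option, when negotiated, follows below):
     *
     *    02 04 05 b4    MSS 1460
     *    01 01 04 02    NOP, NOP, SACK permitted
     *    01 03 03 03    NOP, window scale, len 3, shift 3
     */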
    if (sc->sc_flags & SCF_TIMESTAMP) {
        u_int32_t *lp = (u_int32_t *)(optp);

        /* Form timestamp option as shown in appendix A of RFC 1323. */
        *lp++ = htonl(TCPOPT_TSTAMP_HDR);
        *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
        *lp = htonl(sc->sc_timestamp);
        optp += TCPOLEN_TSTAMP_APPA;
    }

#ifdef TCP_SIGNATURE
    if (sc->sc_flags & SCF_SIGNATURE) {
        union sockaddr_union src, dst;
        struct tdb *tdb;

        bzero(&src, sizeof(union sockaddr_union));
        bzero(&dst, sizeof(union sockaddr_union));
        src.sa.sa_len = sc->sc_src.sa.sa_len;
        src.sa.sa_family = sc->sc_src.sa.sa_family;
        dst.sa.sa_len = sc->sc_dst.sa.sa_len;
        dst.sa.sa_family = sc->sc_dst.sa.sa_family;

        switch (sc->sc_src.sa.sa_family) {
        case 0:    /* default to PF_INET */
#ifdef INET
        case AF_INET:
            src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
            dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
            break;
#endif /* INET */
#ifdef INET6
        case AF_INET6:
            src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
            dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
            break;
#endif /* INET6 */
        }

        tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
        if (tdb == NULL) {
            if (m)
                m_freem(m);
            return (EPERM);
        }

        /* Send signature option */
        *(optp++) = TCPOPT_SIGNATURE;
        *(optp++) = TCPOLEN_SIGNATURE;

        if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
            hlen, 0, optp) < 0) {
            if (m)
                m_freem(m);
            return (EINVAL);
        }
        optp += 16;

        /*
         * Pad options list to the next 32 bit boundary and
         * terminate it.
         */
        *optp++ = TCPOPT_NOP;
        *optp++ = TCPOPT_EOL;
    }
#endif /* TCP_SIGNATURE */

    /* Compute the packet's checksum. */
    switch (sc->sc_src.sa.sa_family) {
    case AF_INET:
        ip->ip_len = htons(tlen - hlen);
        th->th_sum = 0;
        th->th_sum = in_cksum(m, tlen);
        break;
#ifdef INET6
    case AF_INET6:
        ip6->ip6_plen = htons(tlen - hlen);
        th->th_sum = 0;
        th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
        break;
#endif
    }

    /* use IPsec policy and ttl from listening socket, on SYN ACK */
    inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

    /*
     * Fill in some straggling IP bits.  Note that ip_len carried a
     * temporary value for the checksum pass above; it is now set to
     * the real total length, in network byte order.
     */
    switch (sc->sc_src.sa.sa_family) {
#ifdef INET
    case AF_INET:
        ip->ip_len = htons(tlen);
        ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
        /* XXX tos? */
        break;
#endif
#ifdef INET6
    case AF_INET6:
        ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
        ip6->ip6_vfc |= IPV6_VERSION;
        ip6->ip6_plen = htons(tlen - hlen);
        /* ip6_hlim will be initialized afterwards */
        /* leave flowlabel = 0; it is legal and requires no state mgmt */
        break;
#endif
    }

    switch (sc->sc_src.sa.sa_family) {
#ifdef INET
    case AF_INET:
        error = ip_output(m, sc->sc_ipopts, ro,
            (ip_mtudisc ? IP_MTUDISC : 0),
            (struct ip_moptions *)NULL, inp);
        break;
#endif
#ifdef INET6
    case AF_INET6:
        ip6->ip6_hlim = in6_selecthlim(NULL,
            ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

        error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
            (struct ip6_moptions *)0, NULL, NULL);
        break;
#endif
    default:
        error = EAFNOSUPPORT;
        break;
    }
    return (error);
}
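/*
 * Illustrative sketch (not compiled): the ECN negotiation handled in
 * syn_cache_add() and syn_cache_respond(), restated in one predicate.
 * A SYN carrying both ECE and CWR offers ECN (RFC 3168); the SYN,ACK
 * built above grants it by answering with ECE alone.  "ecn_offered"
 * is a hypothetical name used only for this example.
 */
#if 0
static int
ecn_offered(struct tcphdr *th)
{
    return ((th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR));
}
#endif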