/*	$OpenBSD: tcp_input.c,v 1.200 2006/12/11 21:31:58 markus Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>

#include <dev/rndvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

struct	tcpiphdr tcp_saveti;

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

extern u_long sb_max;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif
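
/*
 * Editor's illustrative sketch (not part of the original source): the
 * modulo comparison macros above stay correct across 32-bit wraparound
 * because the subtraction is evaluated as a signed int.  A minimal
 * stand-alone demonstration, assuming 32-bit timestamp values:
 */
#if 0
void
tstmp_wrap_demo(void)
{
	u_int32_t a = 0xfffffff0U;	/* shortly before wraparound */
	u_int32_t b = 0x00000010U;	/* shortly after wraparound */

	/* (int)(a - b) == -32, so "a" is still treated as older. */
	KASSERT(TSTMP_LT(a, b));
	KASSERT(TSTMP_GEQ(b, a));
}
#endif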

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define	TCP_SETUP_ACK(tp, tiflags) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(tp, th, m, tlen)
	struct tcpcb *tp;
	struct tcphdr *th;
	struct mbuf *m;
	int *tlen;
{
	struct tcpqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after becoming established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	return (flags);
}
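
/*
 * Editor's usage sketch (not part of the original source): once a
 * connection reaches ESTABLISHED, callers flush any data queued during
 * the handshake by passing a null header, mirroring the call pattern
 * used later in tcp_input():
 */
#if 0
	tcp_reass_lock(tp);
	(void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0, &tlen);
	tcp_reass_unlock(tp);
#endif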

#ifdef INET6
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	struct mbuf *m = *mp;

#if defined(NFAITH) && 0 < NFAITH
	if (m->m_pkthdr.rcvif) {
		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
			/* XXX send icmp6 host/port unreach? */
			m_freem(m);
			return IPPROTO_DONE;
		}
	}
#endif

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	if (m->m_flags & M_ANYCAST6) {
		if (m->m_len >= sizeof(struct ip6_hdr)) {
			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		} else
			m_freem(m);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = 0;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	int hdroptlen = 0;
	short ostate = 0;
	int iss = 0;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error, s;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 * See below for AF specific multicast.
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		if (IN_MULTICAST(ip->ip_dst.s_addr) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
			goto drop;
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		/*
		 * Checksum extended TCP header and data.
		 */
		if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
			if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
				tcpstat.tcps_inhwcsum++;
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
			if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
		} else {
			m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
			tcpstat.tcps_inhwcsum++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Checksum extended TCP header and data.
		 */
		if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
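		/*
		 * Editor's illustrative sketch (not part of the original
		 * source): RFC 1323 appendix A recommends padding the
		 * timestamp option to 12 bytes with two NOPs, which makes
		 * the single 32-bit compare below possible.  Assuming the
		 * usual TCPOPT_TSTAMP_HDR packing from <netinet/tcp.h>,
		 * the option prefix looks like:
		 */
#if 0
		static const u_int8_t ts_appa[4] = {
			TCPOPT_NOP, TCPOPT_NOP,
			TCPOPT_TIMESTAMP, TCPOLEN_TIMESTAMP
		};	/* followed by 4-byte ts_val and 4-byte ts_ecr */
#endif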
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
findpcb:
	switch (af) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, th->th_sport,
		    &ip6->ip6_dst, th->th_dport);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport);
		break;
	}
	if (inp == 0) {
		int inpl_flags = 0;
#if NPF > 0
		struct pf_mtag *t;

		if ((t = pf_find_mtag(m)) != NULL &&
		    t->flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_flags = INPLOOKUP_WILDCARD;
#endif
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_flags);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_flags);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		goto drop;

	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;
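
	/*
	 * Editor's worked example (not part of the original source) for
	 * the unscaling below: window scaling (RFC 1323) shifts the
	 * advertised 16-bit window left by the scale negotiated on SYN.
	 * With a hypothetical scale factor of 6:
	 *
	 *	th->th_win = 65535, snd_scale = 6
	 *	tiwin = 65535 << 6 = 4194240 bytes (~4 MB)
	 *
	 * SYN segments are never scaled, which is why the shift is
	 * skipped when TH_SYN is set.
	 */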
	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				if (tiflags & TH_RST) {
					syn_cache_reset(&src.sa, &dst.sa, th);
				} else if ((tiflags & (TH_ACK|TH_SYN)) ==
				    (TH_ACK|TH_SYN)) {
					/*
					 * Received a SYN,ACK.  This should
					 * never happen while we are in
					 * LISTEN.  Send an RST.
					 */
					goto badsyn;
				} else if (tiflags & TH_ACK) {
					so = syn_cache_get(&src.sa, &dst.sa,
					    th, iphlen, tlen, so, m);
					if (so == NULL) {
						/*
						 * We don't have a SYN for
						 * this ACK; send an RST.
						 */
						goto badsyn;
					} else if (so ==
					    (struct socket *)(-1)) {
						/*
						 * We were unable to create
						 * the connection.  If the
						 * 3-way handshake was
						 * completed, an RST has
						 * been sent to the peer.
						 * Since the mbuf might be
						 * in use for the reply,
						 * do not free it.
						 */
						m = NULL;
					} else {
						/*
						 * We have created a
						 * full-blown connection.
						 */
						tp = NULL;
						inp = (struct inpcb *)so->so_pcb;
						tp = intotcpcb(inp);
						if (tp == NULL)
							goto badsyn;	/*XXX*/

						/*
						 * Compute proper scaling
						 * value from buffer space
						 */
						tcp_rscale(tp, so->so_rcv.sb_hiwat);
						goto after_listen;
					}
				} else {
					/*
					 * None of RST, SYN or ACK was set.
					 * This is an invalid packet for a
					 * TCB in LISTEN state.  Send a RST.
					 */
					goto badsyn;
				}
			} else {
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for peer to send a RST, and
				 * RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says like
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions for
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;

					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
						tp = NULL;
						goto dropwithreset;
					}
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen <= so->so_qlimit &&
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti))
					m = NULL;
			}
			goto drop;
		}
	}

after_listen:
#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	s = splnet();
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp);
	if (error) {
		splx(s);
		goto drop;
	}

	/* Latch SA */
	if (inp->inp_tdb_in != tdb) {
		if (tdb) {
			tdb_add_inp(tdb, inp, 1);
			if (inp->inp_ipo == NULL) {
				inp->inp_ipo = ipsec_add_policy(inp, af,
				    IPSP_DIRECTION_OUT);
				if (inp->inp_ipo == NULL) {
					splx(s);
					goto drop;
				}
			}
			if (inp->inp_ipo->ipo_dstid == NULL &&
			    tdb->tdb_srcid != NULL) {
				inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
				tdb->tdb_srcid->ref_count++;
			}
			if (inp->inp_ipsec_remotecred == NULL &&
			    tdb->tdb_remote_cred != NULL) {
				inp->inp_ipsec_remotecred =
				    tdb->tdb_remote_cred;
				tdb->tdb_remote_cred->ref_count++;
			}
			if (inp->inp_ipsec_remoteauth == NULL &&
			    tdb->tdb_remote_auth != NULL) {
				inp->inp_ipsec_remoteauth =
				    tdb->tdb_remote_auth;
				tdb->tdb_remote_auth->ref_count++;
			}
		} else { /* Just reset */
			TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
			    inp_tdb_in_next);
			inp->inp_tdb_in = NULL;
		}
	}
	splx(s);
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);
				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				if (sb_notify(&so->so_snd))
					sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			sorwakeup(so);
			TCP_SETUP_ACK(tp, tiflags);
			if (tp->t_flags & TF_ACKNOW)
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ACK|TH_ECE) ||
			    (tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ECE|TH_CWR)) {
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_reass_lock(tp);
			(void) tcp_reass(tp, (struct tcphdr *)0,
			    (struct mbuf *)0, &tlen);
			tcp_reass_unlock(tp);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = tlen;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}
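
	/*
	 * Editor's worked example (not part of the original source): the
	 * PAWS idle limit above, TCP_PAWS_IDLE, is 24 days expressed in
	 * slow-timer ticks (24 * 24 * 60 * 60 * PR_SLOWHZ).  Assuming the
	 * classic PR_SLOWHZ of 2 ticks per second, that is 4,147,200
	 * ticks, well inside the RFC 1323 timestamp wraparound bound.
	 */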
	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If a new connection request is received
			 * while in TIME_WAIT, drop the old connection
			 * and start over if the sequence numbers
			 * are above the previous ones.
			 */
			if (tiflags & TH_SYN &&
			    tp->t_state == TCPS_TIME_WAIT &&
			    SEQ_GT(th->th_seq, tp->rcv_nxt)) {
				iss = tp->snd_nxt + TCP_ISSINCR;
				tp = tcp_close(tp);
				goto findpcb;
			}
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * Cf fix from Braden, see Stevens p. 870
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
		    ((tiflags & (TH_SYN|TH_FIN)) != 0)))
			tp->ts_recent = opti.ts_val;
		else
			tp->ts_recent = 0;
		tp->ts_recent_age = tcp_now;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat.tcps_drops++;
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}
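
	/*
	 * Editor's illustrative sketch (not part of the original source):
	 * the RST handling above deliberately accepts only three exact
	 * sequence numbers rather than anything in the receive window,
	 * which limits blind RST injection.  The same predicate written
	 * out on its own:
	 */
#if 0
	/* accept = */ (th->th_seq == tp->last_ack_sent ||
	    th->th_seq == tp->rcv_nxt ||
	    th->th_seq == (tp->rcv_nxt + 1));
#endif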

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat.tcps_connects++;
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
		}
		tcp_reass_lock(tp);
		(void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0,
		    &tlen);
		tcp_reass_unlock(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat.tcps_cwr_ecn++;
				}
			}
			tcpstat.tcps_ecn_rcvece++;
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat.tcps_ecn_rcvcwr++;
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat.tcps_rcvacktooold++;
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#if defined(TCP_SACK)
					tp->snd_last = tp->snd_max;
#endif
#ifdef TCP_SACK
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
#if 1 /* TCP_ECN */
						tcpstat.tcps_cwr_frecovery++;
#endif
						tcpstat.tcps_sack_recovery_episode++;
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
#if 1 /* TCP_ECN */
					tcpstat.tcps_cwr_frecovery++;
#endif
					tcpstat.tcps_sndrexmitfast++;
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
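
		/*
		 * Editor's worked example (not part of the original source):
		 * with t_maxseg = 1000 and min(snd_wnd, snd_cwnd) = 20000,
		 * the fast-retransmit entry above yields
		 *
		 *	win          = 20000 / 2 / 1000 = 10 segments
		 *	snd_ssthresh = 10 * 1000 = 10000 bytes
		 *
		 * cwnd is then pinned to one segment so that tcp_output()
		 * retransmits only the missing packet, and afterwards
		 * reinflated to ssthresh + 3 * t_maxseg = 13000 bytes to
		 * account for the segments the receiver has cached.
		 */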
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						needoutput = 1;
#else
					tp->snd_cwnd += tp->t_maxseg;
					needoutput = 1;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			needoutput = 1;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
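
		/*
		 * Editor's worked example (not part of the original source)
		 * for the cwnd opening below, with t_maxseg = 1000:
		 *
		 *	cwnd <= ssthresh: incr = 1000
		 *	    (one segment per ACK, exponential slow start)
		 *	cwnd = 20000 > ssthresh: incr = 1000 * 1000 / 20000
		 *	    = 50 bytes per ACK, i.e. roughly one segment
		 *	    per window (linear congestion avoidance)
		 */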
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}
		if (sb_notify(&so->so_snd))
			sowwakeup(so);

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
			    tp->snd_fack) + tp->retran_data;
		}
#endif

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		needoutput = 1;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented to
		 * the user.  This can happen independent of advancing the
		 * URG pointer, but if two URG's are pending at once, some
		 * out-of-band data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen
#ifdef SO_OOBINLINE
		    && (so->so_options & SO_OOBINLINE) == 0
#endif
		    )
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */
2030 */ 2031 if ((tlen || (tiflags & TH_FIN)) && 2032 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2033 #ifdef TCP_SACK 2034 tcp_seq laststart = th->th_seq; 2035 tcp_seq lastend = th->th_seq + tlen; 2036 #endif 2037 tcp_reass_lock(tp); 2038 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2039 tp->t_state == TCPS_ESTABLISHED) { 2040 tcp_reass_unlock(tp); 2041 TCP_SETUP_ACK(tp, tiflags); 2042 tp->rcv_nxt += tlen; 2043 tiflags = th->th_flags & TH_FIN; 2044 tcpstat.tcps_rcvpack++; 2045 tcpstat.tcps_rcvbyte += tlen; 2046 ND6_HINT(tp); 2047 if (so->so_state & SS_CANTRCVMORE) 2048 m_freem(m); 2049 else { 2050 m_adj(m, hdroptlen); 2051 sbappendstream(&so->so_rcv, m); 2052 } 2053 sorwakeup(so); 2054 } else { 2055 m_adj(m, hdroptlen); 2056 tiflags = tcp_reass(tp, th, m, &tlen); 2057 tcp_reass_unlock(tp); 2058 tp->t_flags |= TF_ACKNOW; 2059 } 2060 #ifdef TCP_SACK 2061 if (tp->sack_enable) 2062 tcp_update_sack_list(tp, laststart, lastend); 2063 #endif 2064 2065 /* 2066 * variable len never referenced again in modern BSD, 2067 * so why bother computing it ?? 2068 */ 2069 #if 0 2070 /* 2071 * Note the amount of data that peer has sent into 2072 * our window, in order to estimate the sender's 2073 * buffer size. 2074 */ 2075 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2076 #endif /* 0 */ 2077 } else { 2078 m_freem(m); 2079 tiflags &= ~TH_FIN; 2080 } 2081 2082 /* 2083 * If FIN is received ACK the FIN and let the user know 2084 * that the connection is closing. Ignore a FIN received before 2085 * the connection is fully established. 2086 */ 2087 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2088 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2089 socantrcvmore(so); 2090 tp->t_flags |= TF_ACKNOW; 2091 tp->rcv_nxt++; 2092 } 2093 switch (tp->t_state) { 2094 2095 /* 2096 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2097 */ 2098 case TCPS_ESTABLISHED: 2099 tp->t_state = TCPS_CLOSE_WAIT; 2100 break; 2101 2102 /* 2103 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2104 * enter the CLOSING state. 2105 */ 2106 case TCPS_FIN_WAIT_1: 2107 tp->t_state = TCPS_CLOSING; 2108 break; 2109 2110 /* 2111 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2112 * starting the time-wait timer, turning off the other 2113 * standard timers. 2114 */ 2115 case TCPS_FIN_WAIT_2: 2116 tp->t_state = TCPS_TIME_WAIT; 2117 tcp_canceltimers(tp); 2118 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2119 soisdisconnected(so); 2120 break; 2121 2122 /* 2123 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2124 */ 2125 case TCPS_TIME_WAIT: 2126 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2127 break; 2128 } 2129 } 2130 if (so->so_options & SO_DEBUG) { 2131 switch (tp->pf) { 2132 #ifdef INET6 2133 case PF_INET6: 2134 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2135 0, tlen); 2136 break; 2137 #endif /* INET6 */ 2138 case PF_INET: 2139 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2140 0, tlen); 2141 break; 2142 } 2143 } 2144 2145 /* 2146 * Return any desired output. 2147 */ 2148 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 2149 (void) tcp_output(tp); 2150 } 2151 return; 2152 2153 badsyn: 2154 /* 2155 * Received a bad SYN. Increment counters and dropwithreset. 2156 */ 2157 tcpstat.tcps_badsyn++; 2158 tp = NULL; 2159 goto dropwithreset; 2160 2161 dropafterack_ratelim: 2162 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2163 tcp_ackdrop_ppslim) == 0) { 2164 /* XXX stat */ 2165 goto drop; 2166 } 2167 /* ...fall into dropafterack... 
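 */

/*
 * Aside (added commentary, not part of the original code): the guard
 * above is the usual ppsratecheck() idiom.  ppsratecheck() returns
 * nonzero while the caller is still within its packets-per-second
 * budget; a zero return means the budget is spent and the segment
 * should be dropped silently.  A minimal standalone sketch:
 */
#if 0
	static struct timeval lasttime;
	static int curpps;

	if (ppsratecheck(&lasttime, &curpps, 100 /* pps */) == 0)
		goto drop;	/* over budget: drop without responding */
#endif /* 0 */

/*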
*/ 2168 2169 dropafterack: 2170 /* 2171 * Generate an ACK dropping incoming segment if it occupies 2172 * sequence space, where the ACK reflects our state. 2173 */ 2174 if (tiflags & TH_RST) 2175 goto drop; 2176 m_freem(m); 2177 tp->t_flags |= TF_ACKNOW; 2178 (void) tcp_output(tp); 2179 return; 2180 2181 dropwithreset_ratelim: 2182 /* 2183 * We may want to rate-limit RSTs in certain situations, 2184 * particularly if we are sending an RST in response to 2185 * an attempt to connect to or otherwise communicate with 2186 * a port for which we have no socket. 2187 */ 2188 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2189 tcp_rst_ppslim) == 0) { 2190 /* XXX stat */ 2191 goto drop; 2192 } 2193 /* ...fall into dropwithreset... */ 2194 2195 dropwithreset: 2196 /* 2197 * Generate a RST, dropping incoming segment. 2198 * Make ACK acceptable to originator of segment. 2199 * Don't bother to respond to RST. 2200 */ 2201 if (tiflags & TH_RST) 2202 goto drop; 2203 if (tiflags & TH_ACK) { 2204 tcp_respond(tp, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack, 2205 TH_RST); 2206 } else { 2207 if (tiflags & TH_SYN) 2208 tlen++; 2209 tcp_respond(tp, mtod(m, caddr_t), m, th->th_seq + tlen, 2210 (tcp_seq)0, TH_RST|TH_ACK); 2211 } 2212 return; 2213 2214 drop: 2215 /* 2216 * Drop space held by incoming segment and return. 2217 */ 2218 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2219 switch (tp->pf) { 2220 #ifdef INET6 2221 case PF_INET6: 2222 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2223 0, tlen); 2224 break; 2225 #endif /* INET6 */ 2226 case PF_INET: 2227 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2228 0, tlen); 2229 break; 2230 } 2231 } 2232 2233 m_freem(m); 2234 return; 2235 } 2236 2237 int 2238 tcp_dooptions(tp, cp, cnt, th, m, iphlen, oi) 2239 struct tcpcb *tp; 2240 u_char *cp; 2241 int cnt; 2242 struct tcphdr *th; 2243 struct mbuf *m; 2244 int iphlen; 2245 struct tcp_opt_info *oi; 2246 { 2247 u_int16_t mss = 0; 2248 int opt, optlen; 2249 #ifdef TCP_SIGNATURE 2250 caddr_t sigp = NULL; 2251 struct tdb *tdb = NULL; 2252 #endif /* TCP_SIGNATURE */ 2253 2254 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2255 opt = cp[0]; 2256 if (opt == TCPOPT_EOL) 2257 break; 2258 if (opt == TCPOPT_NOP) 2259 optlen = 1; 2260 else { 2261 if (cnt < 2) 2262 break; 2263 optlen = cp[1]; 2264 if (optlen < 2 || optlen > cnt) 2265 break; 2266 } 2267 switch (opt) { 2268 2269 default: 2270 continue; 2271 2272 case TCPOPT_MAXSEG: 2273 if (optlen != TCPOLEN_MAXSEG) 2274 continue; 2275 if (!(th->th_flags & TH_SYN)) 2276 continue; 2277 if (TCPS_HAVERCVDSYN(tp->t_state)) 2278 continue; 2279 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 2280 NTOHS(mss); 2281 oi->maxseg = mss; 2282 break; 2283 2284 case TCPOPT_WINDOW: 2285 if (optlen != TCPOLEN_WINDOW) 2286 continue; 2287 if (!(th->th_flags & TH_SYN)) 2288 continue; 2289 if (TCPS_HAVERCVDSYN(tp->t_state)) 2290 continue; 2291 tp->t_flags |= TF_RCVD_SCALE; 2292 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2293 break; 2294 2295 case TCPOPT_TIMESTAMP: 2296 if (optlen != TCPOLEN_TIMESTAMP) 2297 continue; 2298 oi->ts_present = 1; 2299 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2300 NTOHL(oi->ts_val); 2301 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2302 NTOHL(oi->ts_ecr); 2303 2304 if (!(th->th_flags & TH_SYN)) 2305 continue; 2306 if (TCPS_HAVERCVDSYN(tp->t_state)) 2307 continue; 2308 /* 2309 * A timestamp received in a SYN makes 2310 * it ok to send timestamp requests and replies. 
2311 */ 2312 tp->t_flags |= TF_RCVD_TSTMP; 2313 tp->ts_recent = oi->ts_val; 2314 tp->ts_recent_age = tcp_now; 2315 break; 2316 2317 #ifdef TCP_SACK 2318 case TCPOPT_SACK_PERMITTED: 2319 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2320 continue; 2321 if (!(th->th_flags & TH_SYN)) 2322 continue; 2323 if (TCPS_HAVERCVDSYN(tp->t_state)) 2324 continue; 2325 /* MUST only be set on SYN */ 2326 tp->t_flags |= TF_SACK_PERMIT; 2327 break; 2328 case TCPOPT_SACK: 2329 tcp_sack_option(tp, th, cp, optlen); 2330 break; 2331 #endif 2332 #ifdef TCP_SIGNATURE 2333 case TCPOPT_SIGNATURE: 2334 if (optlen != TCPOLEN_SIGNATURE) 2335 continue; 2336 2337 if (sigp && bcmp(sigp, cp + 2, 16)) 2338 return (-1); 2339 2340 sigp = cp + 2; 2341 break; 2342 #endif /* TCP_SIGNATURE */ 2343 } 2344 } 2345 2346 #ifdef TCP_SIGNATURE 2347 if (tp->t_flags & TF_SIGNATURE) { 2348 union sockaddr_union src, dst; 2349 2350 memset(&src, 0, sizeof(union sockaddr_union)); 2351 memset(&dst, 0, sizeof(union sockaddr_union)); 2352 2353 switch (tp->pf) { 2354 case 0: 2355 #ifdef INET 2356 case AF_INET: 2357 src.sa.sa_len = sizeof(struct sockaddr_in); 2358 src.sa.sa_family = AF_INET; 2359 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2360 dst.sa.sa_len = sizeof(struct sockaddr_in); 2361 dst.sa.sa_family = AF_INET; 2362 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2363 break; 2364 #endif 2365 #ifdef INET6 2366 case AF_INET6: 2367 src.sa.sa_len = sizeof(struct sockaddr_in6); 2368 src.sa.sa_family = AF_INET6; 2369 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2370 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2371 dst.sa.sa_family = AF_INET6; 2372 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2373 break; 2374 #endif /* INET6 */ 2375 } 2376 2377 tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP); 2378 2379 /* 2380 * We don't have an SA for this peer, so we turn off 2381 * TF_SIGNATURE on the listen socket 2382 */ 2383 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2384 tp->t_flags &= ~TF_SIGNATURE; 2385 2386 } 2387 2388 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2389 tcpstat.tcps_rcvbadsig++; 2390 return (-1); 2391 } 2392 2393 if (sigp) { 2394 char sig[16]; 2395 2396 if (tdb == NULL) { 2397 tcpstat.tcps_rcvbadsig++; 2398 return (-1); 2399 } 2400 2401 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2402 return (-1); 2403 2404 if (bcmp(sig, sigp, 16)) { 2405 tcpstat.tcps_rcvbadsig++; 2406 return (-1); 2407 } 2408 2409 tcpstat.tcps_rcvgoodsig++; 2410 } 2411 #endif /* TCP_SIGNATURE */ 2412 2413 return (0); 2414 } 2415 2416 #if defined(TCP_SACK) 2417 u_long 2418 tcp_seq_subtract(a, b) 2419 u_long a, b; 2420 { 2421 return ((long)(a - b)); 2422 } 2423 #endif 2424 2425 2426 #ifdef TCP_SACK 2427 /* 2428 * This function is called upon receipt of new valid data (while not in header 2429 * prediction mode), and it updates the ordered list of sacks. 2430 */ 2431 void 2432 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2433 tcp_seq rcv_lastend) 2434 { 2435 /* 2436 * First reported block MUST be the most recent one. Subsequent 2437 * blocks SHOULD be in the order in which they arrived at the 2438 * receiver. These two conditions make the implementation fully 2439 * compliant with RFC 2018. 
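 */

/*
 * A hypothetical walk-through of the rules above (added for exposition;
 * the numbers are made up): with rcv_nxt = 100, an out-of-order segment
 * covering [200,300) followed by one covering [500,600) leaves the list
 * ordered most-recent-first, exactly as RFC 2018 requires.
 */
#if 0
	tcp_update_sack_list(tp, 200, 300);
	/* sackblks[0] = {200,300}, rcv_numsacks = 1 */
	tcp_update_sack_list(tp, 500, 600);
	/* sackblks[0] = {500,600}, sackblks[1] = {200,300} */
#endif /* 0 */

/*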
2440 */ 2441 int i, j = 0, count = 0, lastpos = -1; 2442 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2443 2444 /* First clean up current list of sacks */ 2445 for (i = 0; i < tp->rcv_numsacks; i++) { 2446 sack = tp->sackblks[i]; 2447 if (sack.start == 0 && sack.end == 0) { 2448 count++; /* count = number of blocks to be discarded */ 2449 continue; 2450 } 2451 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2452 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2453 count++; 2454 } else { 2455 temp[j].start = tp->sackblks[i].start; 2456 temp[j++].end = tp->sackblks[i].end; 2457 } 2458 } 2459 tp->rcv_numsacks -= count; 2460 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2461 tcp_clean_sackreport(tp); 2462 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2463 /* ==> need first sack block */ 2464 tp->sackblks[0].start = rcv_laststart; 2465 tp->sackblks[0].end = rcv_lastend; 2466 tp->rcv_numsacks = 1; 2467 } 2468 return; 2469 } 2470 /* Otherwise, sack blocks are already present. */ 2471 for (i = 0; i < tp->rcv_numsacks; i++) 2472 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2473 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2474 return; /* sack list remains unchanged */ 2475 /* 2476 * From here, segment just received should be (part of) the 1st sack. 2477 * Go through list, possibly coalescing sack block entries. 2478 */ 2479 firstsack.start = rcv_laststart; 2480 firstsack.end = rcv_lastend; 2481 for (i = 0; i < tp->rcv_numsacks; i++) { 2482 sack = tp->sackblks[i]; 2483 if (SEQ_LT(sack.end, firstsack.start) || 2484 SEQ_GT(sack.start, firstsack.end)) 2485 continue; /* no overlap */ 2486 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2487 /* 2488 * identical block; delete it here since we will 2489 * move it to the front of the list. 2490 */ 2491 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2492 lastpos = i; /* last posn with a zero entry */ 2493 continue; 2494 } 2495 if (SEQ_LEQ(sack.start, firstsack.start)) 2496 firstsack.start = sack.start; /* merge blocks */ 2497 if (SEQ_GEQ(sack.end, firstsack.end)) 2498 firstsack.end = sack.end; /* merge blocks */ 2499 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2500 lastpos = i; /* last posn with a zero entry */ 2501 } 2502 if (lastpos != -1) { /* at least one merge */ 2503 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2504 sack = tp->sackblks[i]; 2505 if (sack.start == 0 && sack.end == 0) 2506 continue; 2507 temp[j++] = sack; 2508 } 2509 tp->rcv_numsacks = j; /* including first blk (added later) */ 2510 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2511 tp->sackblks[i] = temp[i]; 2512 } else { /* no merges -- shift sacks by 1 */ 2513 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2514 tp->rcv_numsacks++; 2515 for (i = tp->rcv_numsacks-1; i > 0; i--) 2516 tp->sackblks[i] = tp->sackblks[i-1]; 2517 } 2518 tp->sackblks[0] = firstsack; 2519 return; 2520 } 2521 2522 /* 2523 * Process the TCP SACK option. tp->snd_holes is an ordered list 2524 * of holes (oldest to newest, in terms of the sequence space). 2525 */ 2526 void 2527 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2528 { 2529 int tmp_olen; 2530 u_char *tmp_cp; 2531 struct sackhole *cur, *p, *temp; 2532 2533 if (!tp->sack_enable) 2534 return; 2535 /* SACK without ACK doesn't make sense. */ 2536 if ((th->th_flags & TH_ACK) == 0) 2537 return; 2538 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
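*/

/*
 * Sketch (added commentary, not part of the original code): a SACK
 * option is one kind octet, one length octet, then 1-4 pairs of 32-bit
 * sequence numbers, so a well-formed length is 2 + n*TCPOLEN_SACK with
 * TCPOLEN_SACK == 2*sizeof(tcp_seq) == 8.  The length check below,
 * restated on its own:
 */
#if 0
static int
sack_option_len_ok(int optlen)
{
	return (optlen > 2 && (optlen - 2) % TCPOLEN_SACK == 0);
}
#endif /* 0 */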
2539	if (SEQ_LT(th->th_ack, tp->snd_una) ||
2540	    SEQ_GT(th->th_ack, tp->snd_max))
2541		return;
2542	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
2543	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2544		return;
2546	tmp_cp = cp + 2;
2547	tmp_olen = optlen - 2;
2548	tcpstat.tcps_sack_rcv_opts++;
2549	if (tp->snd_numholes < 0)
2550		tp->snd_numholes = 0;
2551	if (tp->t_maxseg == 0)
2552		panic("tcp_sack_option");	/* Should never happen */
2553	while (tmp_olen > 0) {
2554		struct sackblk sack;
2555
2556		bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
2557		NTOHL(sack.start);
2558		bcopy(tmp_cp + sizeof(tcp_seq),
2559		    (char *) &(sack.end), sizeof(tcp_seq));
2560		NTOHL(sack.end);
2561		tmp_olen -= TCPOLEN_SACK;
2562		tmp_cp += TCPOLEN_SACK;
2563		if (SEQ_LEQ(sack.end, sack.start))
2564			continue;	/* bad SACK fields */
2565		if (SEQ_LEQ(sack.end, tp->snd_una))
2566			continue;	/* old block */
2567 #if defined(TCP_SACK) && defined(TCP_FACK)
2568		/* Updates snd_fack. */
2569		if (SEQ_GT(sack.end, tp->snd_fack))
2570			tp->snd_fack = sack.end;
2571 #endif /* TCP_FACK */
2572		if (SEQ_GT(th->th_ack, tp->snd_una)) {
2573			if (SEQ_LT(sack.start, th->th_ack))
2574				continue;
2575		}
2576		if (SEQ_GT(sack.end, tp->snd_max))
2577			continue;
2578		if (tp->snd_holes == NULL) {	/* first hole */
2579			tp->snd_holes = (struct sackhole *)
2580			    pool_get(&sackhl_pool, PR_NOWAIT);
2581			if (tp->snd_holes == NULL) {
2582				/* ENOBUFS, so ignore SACKed block for now */
2583				goto done;
2584			}
2585			cur = tp->snd_holes;
2586			cur->start = th->th_ack;
2587			cur->end = sack.start;
2588			cur->rxmit = cur->start;
2589			cur->next = NULL;
2590			tp->snd_numholes = 1;
2591			tp->rcv_lastsack = sack.end;
2592			/*
2593			 * dups is at least one.  If more data has been
2594			 * SACKed, it can be greater than one.
2595 */ 2596 cur->dups = min(tcprexmtthresh, 2597 ((sack.end - cur->end)/tp->t_maxseg)); 2598 if (cur->dups < 1) 2599 cur->dups = 1; 2600 continue; /* with next sack block */ 2601 } 2602 /* Go thru list of holes: p = previous, cur = current */ 2603 p = cur = tp->snd_holes; 2604 while (cur) { 2605 if (SEQ_LEQ(sack.end, cur->start)) 2606 /* SACKs data before the current hole */ 2607 break; /* no use going through more holes */ 2608 if (SEQ_GEQ(sack.start, cur->end)) { 2609 /* SACKs data beyond the current hole */ 2610 cur->dups++; 2611 if (((sack.end - cur->end)/tp->t_maxseg) >= 2612 tcprexmtthresh) 2613 cur->dups = tcprexmtthresh; 2614 p = cur; 2615 cur = cur->next; 2616 continue; 2617 } 2618 if (SEQ_LEQ(sack.start, cur->start)) { 2619 /* Data acks at least the beginning of hole */ 2620 #if defined(TCP_SACK) && defined(TCP_FACK) 2621 if (SEQ_GT(sack.end, cur->rxmit)) 2622 tp->retran_data -= 2623 tcp_seq_subtract(cur->rxmit, 2624 cur->start); 2625 else 2626 tp->retran_data -= 2627 tcp_seq_subtract(sack.end, 2628 cur->start); 2629 #endif /* TCP_FACK */ 2630 if (SEQ_GEQ(sack.end, cur->end)) { 2631 /* Acks entire hole, so delete hole */ 2632 if (p != cur) { 2633 p->next = cur->next; 2634 pool_put(&sackhl_pool, cur); 2635 cur = p->next; 2636 } else { 2637 cur = cur->next; 2638 pool_put(&sackhl_pool, p); 2639 p = cur; 2640 tp->snd_holes = p; 2641 } 2642 tp->snd_numholes--; 2643 continue; 2644 } 2645 /* otherwise, move start of hole forward */ 2646 cur->start = sack.end; 2647 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2648 p = cur; 2649 cur = cur->next; 2650 continue; 2651 } 2652 /* move end of hole backward */ 2653 if (SEQ_GEQ(sack.end, cur->end)) { 2654 #if defined(TCP_SACK) && defined(TCP_FACK) 2655 if (SEQ_GT(cur->rxmit, sack.start)) 2656 tp->retran_data -= 2657 tcp_seq_subtract(cur->rxmit, 2658 sack.start); 2659 #endif /* TCP_FACK */ 2660 cur->end = sack.start; 2661 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2662 cur->dups++; 2663 if (((sack.end - cur->end)/tp->t_maxseg) >= 2664 tcprexmtthresh) 2665 cur->dups = tcprexmtthresh; 2666 p = cur; 2667 cur = cur->next; 2668 continue; 2669 } 2670 if (SEQ_LT(cur->start, sack.start) && 2671 SEQ_GT(cur->end, sack.end)) { 2672 /* 2673 * ACKs some data in middle of a hole; need to 2674 * split current hole 2675 */ 2676 temp = (struct sackhole *) 2677 pool_get(&sackhl_pool, PR_NOWAIT); 2678 if (temp == NULL) 2679 goto done; /* ENOBUFS */ 2680 #if defined(TCP_SACK) && defined(TCP_FACK) 2681 if (SEQ_GT(cur->rxmit, sack.end)) 2682 tp->retran_data -= 2683 tcp_seq_subtract(sack.end, 2684 sack.start); 2685 else if (SEQ_GT(cur->rxmit, sack.start)) 2686 tp->retran_data -= 2687 tcp_seq_subtract(cur->rxmit, 2688 sack.start); 2689 #endif /* TCP_FACK */ 2690 temp->next = cur->next; 2691 temp->start = sack.end; 2692 temp->end = cur->end; 2693 temp->dups = cur->dups; 2694 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2695 cur->end = sack.start; 2696 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2697 cur->dups++; 2698 if (((sack.end - cur->end)/tp->t_maxseg) >= 2699 tcprexmtthresh) 2700 cur->dups = tcprexmtthresh; 2701 cur->next = temp; 2702 p = temp; 2703 cur = p->next; 2704 tp->snd_numholes++; 2705 } 2706 } 2707 /* At this point, p points to the last hole on the list */ 2708 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2709 /* 2710 * Need to append new hole at end. 2711 * Last hole is p (and it's not NULL). 
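2712 */

/*
 * Aside (added for exposition, not part of the original code): all of
 * the hole bookkeeping above relies on modular 32-bit sequence
 * arithmetic, so the comparisons stay correct across sequence-space
 * wrap.  A standalone illustration:
 */
#if 0
static void
seq_wrap_example(void)
{
	tcp_seq a = 0xfffffff0;		/* just before the wrap */
	tcp_seq b = 0x00000010;		/* 32 bytes later, after the wrap */

	/* SEQ_LT(a, b) is true: (int)(a - b) == -32 < 0 */
	/* tcp_seq_subtract(b, a) == 32, the forward distance */
}
#endif /* 0 */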
2713			temp = (struct sackhole *)
2714			    pool_get(&sackhl_pool, PR_NOWAIT);
2715			if (temp == NULL)
2716				goto done;	/* ENOBUFS */
2717			temp->start = tp->rcv_lastsack;
2718			temp->end = sack.start;
2719			temp->dups = min(tcprexmtthresh,
2720			    ((sack.end - sack.start)/tp->t_maxseg));
2721			if (temp->dups < 1)
2722				temp->dups = 1;
2723			temp->rxmit = temp->start;
2724			temp->next = 0;
2725			p->next = temp;
2726			tp->rcv_lastsack = sack.end;
2727			tp->snd_numholes++;
2728		}
2729	}
2730 done:
2731 #if defined(TCP_SACK) && defined(TCP_FACK)
2732	/*
2733	 * Update retran_data and snd_awnd.  Go through the list of
2734	 * holes.  Increment retran_data by (hole->rxmit - hole->start).
2735	 */
2736	tp->retran_data = 0;
2737	cur = tp->snd_holes;
2738	while (cur) {
2739		tp->retran_data += cur->rxmit - cur->start;
2740		cur = cur->next;
2741	}
2742	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
2743	    tp->retran_data;
2744 #endif /* TCP_FACK */
2745
2746	return;
2747 }
2748
2749 /*
2750  * Delete stale (i.e., cumulatively ack'd) holes.  A hole is deleted only if
2751  * it is completely acked; otherwise, tcp_sack_option(), called from
2752  * tcp_dooptions(), will fix up the hole.
2753  */
2754 void
2755 tcp_del_sackholes(tp, th)
2756	struct tcpcb *tp;
2757	struct tcphdr *th;
2758 {
2759	if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2760		/* max because this could be an older ack just arrived */
2761		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2762		    th->th_ack : tp->snd_una;
2763		struct sackhole *cur = tp->snd_holes;
2764		struct sackhole *prev;
2765		while (cur)
2766			if (SEQ_LEQ(cur->end, lastack)) {
2767				prev = cur;
2768				cur = cur->next;
2769				pool_put(&sackhl_pool, prev);
2770				tp->snd_numholes--;
2771			} else if (SEQ_LT(cur->start, lastack)) {
2772				cur->start = lastack;
2773				if (SEQ_LT(cur->rxmit, cur->start))
2774					cur->rxmit = cur->start;
2775				break;
2776			} else
2777				break;
2778		tp->snd_holes = cur;
2779	}
2780 }
2781
2782 /*
2783  * Delete all receiver-side SACK information.
2784  */
2785 void
2786 tcp_clean_sackreport(tp)
2787	struct tcpcb *tp;
2788 {
2789	int i;
2790
2791	tp->rcv_numsacks = 0;
2792	for (i = 0; i < MAX_SACK_BLKS; i++)
2793		tp->sackblks[i].start = tp->sackblks[i].end = 0;
2794
2795 }
2796
2797 /*
2798  * Checks for a partial ack.  If a partial ack arrives, turn off the
2799  * retransmission timer, deflate the window, do not clear tp->t_dupacks,
2800  * and return 1.  If the ack advances at least to tp->snd_last, return 0.
2801  */
2802 int
2803 tcp_sack_partialack(tp, th)
2804	struct tcpcb *tp;
2805	struct tcphdr *th;
2806 {
2807	if (SEQ_LT(th->th_ack, tp->snd_last)) {
2808		/* Turn off retx. timer (will start again next segment) */
2809		TCP_TIMER_DISARM(tp, TCPT_REXMT);
2810		tp->t_rtttime = 0;
2811 #ifndef TCP_FACK
2812		/*
2813		 * Partial window deflation.  This statement relies on the
2814		 * fact that tp->snd_una has not been updated yet.  In FACK
2815		 * hold snd_cwnd constant during fast recovery.
2816		 */
2817		if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
2818			tp->snd_cwnd -= th->th_ack - tp->snd_una;
2819			tp->snd_cwnd += tp->t_maxseg;
2820		} else
2821			tp->snd_cwnd = tp->t_maxseg;
2822 #endif
2823		return (1);
2824	}
2825	return (0);
2826 }
2827 #endif /* TCP_SACK */
2828
2829 /*
2830  * Pull the out-of-band byte out of a segment so
2831  * it doesn't appear in the user's data queue.
2832  * It is still reflected in the segment length for
2833  * sequencing purposes.
2834 */ 2835 void 2836 tcp_pulloutofband(so, urgent, m, off) 2837 struct socket *so; 2838 u_int urgent; 2839 struct mbuf *m; 2840 int off; 2841 { 2842 int cnt = off + urgent - 1; 2843 2844 while (cnt >= 0) { 2845 if (m->m_len > cnt) { 2846 char *cp = mtod(m, caddr_t) + cnt; 2847 struct tcpcb *tp = sototcpcb(so); 2848 2849 tp->t_iobc = *cp; 2850 tp->t_oobflags |= TCPOOB_HAVEDATA; 2851 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2852 m->m_len--; 2853 return; 2854 } 2855 cnt -= m->m_len; 2856 m = m->m_next; 2857 if (m == 0) 2858 break; 2859 } 2860 panic("tcp_pulloutofband"); 2861 } 2862 2863 /* 2864 * Collect new round-trip time estimate 2865 * and update averages and current timeout. 2866 */ 2867 void 2868 tcp_xmit_timer(tp, rtt) 2869 struct tcpcb *tp; 2870 short rtt; 2871 { 2872 short delta; 2873 short rttmin; 2874 2875 if (rtt < 0) 2876 rtt = 0; 2877 else if (rtt > TCP_RTT_MAX) 2878 rtt = TCP_RTT_MAX; 2879 2880 tcpstat.tcps_rttupdated++; 2881 if (tp->t_srtt != 0) { 2882 /* 2883 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2884 * after the binary point (scaled by 4), whereas 2885 * srtt is stored as fixed point with 5 bits after the 2886 * binary point (i.e., scaled by 32). The following magic 2887 * is equivalent to the smoothing algorithm in rfc793 with 2888 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2889 * point). 2890 */ 2891 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2892 (tp->t_srtt >> TCP_RTT_SHIFT); 2893 if ((tp->t_srtt += delta) <= 0) 2894 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2895 /* 2896 * We accumulate a smoothed rtt variance (actually, a 2897 * smoothed mean difference), then set the retransmit 2898 * timer to smoothed rtt + 4 times the smoothed variance. 2899 * rttvar is stored as fixed point with 4 bits after the 2900 * binary point (scaled by 16). The following is 2901 * equivalent to rfc793 smoothing with an alpha of .75 2902 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2903 * rfc793's wired-in beta. 2904 */ 2905 if (delta < 0) 2906 delta = -delta; 2907 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2908 if ((tp->t_rttvar += delta) <= 0) 2909 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2910 } else { 2911 /* 2912 * No rtt measurement yet - use the unsmoothed rtt. 2913 * Set the variance to half the rtt (so our first 2914 * retransmit happens at 3*rtt). 2915 */ 2916 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2917 tp->t_rttvar = (rtt + 1) << 2918 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2919 } 2920 tp->t_rtttime = 0; 2921 tp->t_rxtshift = 0; 2922 2923 /* 2924 * the retransmit should happen at rtt + 4 * rttvar. 2925 * Because of the way we do the smoothing, srtt and rttvar 2926 * will each average +1/2 tick of bias. When we compute 2927 * the retransmit timer, we want 1/2 tick of rounding and 2928 * 1 extra tick because of +-1/2 tick uncertainty in the 2929 * firing of the timer. The bias will give us exactly the 2930 * 1.5 tick we need. But, because the bias is 2931 * statistical, we have to test that we don't drop below 2932 * the minimum feasible timer (which is 2 ticks). 2933 */ 2934 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2935 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2936 2937 /* 2938 * We received an ack for a packet that wasn't retransmitted; 2939 * it is probably safe to discard any error indications we've 2940 * received recently. 
This isn't quite right, but close enough
2941	 * for now (a route might have failed after we sent a segment,
2942	 * and the return path might not be symmetrical).
2943	 */
2944	tp->t_softerror = 0;
2945 }
2946
2947 /*
2948  * Determine a reasonable value for maxseg size.
2949  * If the route is known, check the route for an mtu.
2950  * If none, use an mss that can be handled on the outgoing
2951  * interface without forcing IP to fragment; if bigger than
2952  * an mbuf cluster (MCLBYTES), round down to the nearest multiple of MCLBYTES
2953  * to utilize large mbufs.  If no route is found, the route has no mtu,
2954  * or the destination isn't local, use a default, hopefully conservative
2955  * size (usually 512 or the default IP max size, but no more than the mtu
2956  * of the interface), as we can't discover anything about intervening
2957  * gateways or networks.  We also initialize the congestion/slow start
2958  * window to be a single segment if the destination isn't local.
2959  * While looking at the routing entry, we also initialize other path-dependent
2960  * parameters from pre-set or cached values in the routing entry.
2961  *
2962  * Also take into account the space needed for options that we
2963  * send regularly.  Make maxseg shorter by that amount to assure
2964  * that we can send maxseg amount of data even when the options
2965  * are present.  Store the upper limit of the length of options plus
2966  * data in maxopd.
2967  *
2968  * NOTE: offer == -1 indicates that the maxseg size changed due to
2969  * Path MTU discovery.
2970  */
2971 int
2972 tcp_mss(tp, offer)
2973	struct tcpcb *tp;
2974	int offer;
2975 {
2976	struct rtentry *rt;
2977	struct ifnet *ifp;
2978	int mss, mssopt;
2979	int iphlen;
2980	struct inpcb *inp;
2981
2982	inp = tp->t_inpcb;
2983
2984	mssopt = mss = tcp_mssdflt;
2985
2986	rt = in_pcbrtentry(inp);
2987
2988	if (rt == NULL)
2989		goto out;
2990
2991	ifp = rt->rt_ifp;
2992
2993	switch (tp->pf) {
2994 #ifdef INET6
2995	case AF_INET6:
2996		iphlen = sizeof(struct ip6_hdr);
2997		break;
2998 #endif
2999	case AF_INET:
3000		iphlen = sizeof(struct ip);
3001		break;
3002	default:
3003		/* the family does not support path MTU discovery */
3004		goto out;
3005	}
3006
3007 #ifdef RTV_MTU
3008	/*
3009	 * if there's an mtu associated with the route and we support
3010	 * path MTU discovery for the underlying protocol family, use it.
3011	 */
3012	if (rt->rt_rmx.rmx_mtu) {
3013		/*
3014		 * One may wish to lower MSS to take into account options,
3015		 * especially security-related options.
3016		 */
3017		if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
3018			/*
3019			 * RFC2460 section 5, last paragraph: if path MTU is
3020			 * smaller than 1280, use 1280 as packet size and
3021			 * attach a fragment header.
3022			 */
3023			mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
3024			    sizeof(struct tcphdr);
3025		} else
3026			mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
3027	} else
3028 #endif /* RTV_MTU */
3029	if (!ifp)
3030		/*
3031		 * ifp may be null and rmx_mtu may be zero in certain
3032		 * v6 cases (e.g., if ND wasn't able to resolve the
3033		 * destination host).
3034 */ 3035 goto out; 3036 else if (ifp->if_flags & IFF_LOOPBACK) 3037 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3038 else if (tp->pf == AF_INET) { 3039 if (ip_mtudisc) 3040 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3041 else if (inp && in_localaddr(inp->inp_faddr)) 3042 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3043 } 3044 #ifdef INET6 3045 else if (tp->pf == AF_INET6) { 3046 /* 3047 * for IPv6, path MTU discovery is always turned on, 3048 * or the node must use packet size <= 1280. 3049 */ 3050 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr); 3051 } 3052 #endif /* INET6 */ 3053 3054 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3055 if (offer != -1) { 3056 #ifndef INET6 3057 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3058 #else 3059 if (tp->pf == AF_INET6) 3060 mssopt = IN6_LINKMTU(ifp) - iphlen - 3061 sizeof(struct tcphdr); 3062 else 3063 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3064 #endif 3065 3066 mssopt = max(tcp_mssdflt, mssopt); 3067 } 3068 3069 out: 3070 /* 3071 * The current mss, t_maxseg, is initialized to the default value. 3072 * If we compute a smaller value, reduce the current mss. 3073 * If we compute a larger value, return it for use in sending 3074 * a max seg size option, but don't store it for use 3075 * unless we received an offer at least that large from peer. 3076 * 3077 * However, do not accept offers lower than the minimum of 3078 * the interface MTU and 216. 3079 */ 3080 if (offer > 0) 3081 tp->t_peermss = offer; 3082 if (tp->t_peermss) 3083 mss = min(mss, max(tp->t_peermss, 216)); 3084 3085 /* sanity - at least max opt. space */ 3086 mss = max(mss, 64); 3087 3088 /* 3089 * maxopd stores the maximum length of data AND options 3090 * in a segment; maxseg is the amount of data in a normal 3091 * segment. We need to store this value (maxopd) apart 3092 * from maxseg, because now every segment carries options 3093 * and thus we normally have somewhat less data in segments. 3094 */ 3095 tp->t_maxopd = mss; 3096 3097 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3098 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3099 mss -= TCPOLEN_TSTAMP_APPA; 3100 #ifdef TCP_SIGNATURE 3101 if (tp->t_flags & TF_SIGNATURE) 3102 mss -= TCPOLEN_SIGLEN; 3103 #endif 3104 3105 if (offer == -1) { 3106 /* mss changed due to Path MTU discovery */ 3107 tp->t_flags &= ~TF_PMTUD_PEND; 3108 tp->t_pmtud_mtu_sent = 0; 3109 tp->t_pmtud_mss_acked = 0; 3110 if (mss < tp->t_maxseg) { 3111 /* 3112 * Follow suggestion in RFC 2414 to reduce the 3113 * congestion window by the ratio of the old 3114 * segment size to the new segment size. 3115 */ 3116 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3117 mss, mss); 3118 } 3119 } else if (tcp_do_rfc3390) { 3120 /* increase initial window */ 3121 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3122 } else 3123 tp->snd_cwnd = mss; 3124 3125 tp->t_maxseg = mss; 3126 3127 return (offer != -1 ? 
mssopt : mss);
3128 }
3129
3130 u_int
3131 tcp_hdrsz(struct tcpcb *tp)
3132 {
3133	u_int hlen;
3134
3135	switch (tp->pf) {
3136 #ifdef INET6
3137	case AF_INET6:
3138		hlen = sizeof(struct ip6_hdr);
3139		break;
3140 #endif
3141	case AF_INET:
3142		hlen = sizeof(struct ip);
3143		break;
3144	default:
3145		hlen = 0;
3146		break;
3147	}
3148	hlen += sizeof(struct tcphdr);
3149
3150	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
3151	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
3152		hlen += TCPOLEN_TSTAMP_APPA;
3153 #ifdef TCP_SIGNATURE
3154	if (tp->t_flags & TF_SIGNATURE)
3155		hlen += TCPOLEN_SIGLEN;
3156 #endif
3157	return (hlen);
3158 }
3159
3160 /*
3161  * Set connection variables based on the effective MSS.
3162  * We are passed the TCPCB for the actual connection.  If we
3163  * are the server, we are called by the compressed state engine
3164  * when the 3-way handshake is complete.  If we are the client,
3165  * we are called when we receive the SYN,ACK from the server.
3166  *
3167  * NOTE: The t_maxseg value must be initialized in the TCPCB
3168  * before this routine is called!
3169  */
3170 void
3171 tcp_mss_update(tp)
3172	struct tcpcb *tp;
3173 {
3174	int mss;
3175	u_long bufsize;
3176	struct rtentry *rt;
3177	struct socket *so;
3178
3179	so = tp->t_inpcb->inp_socket;
3180	mss = tp->t_maxseg;
3181
3182	rt = in_pcbrtentry(tp->t_inpcb);
3183
3184	if (rt == NULL)
3185		return;
3186
3187	bufsize = so->so_snd.sb_hiwat;
3188	if (bufsize < mss) {
3189		mss = bufsize;
3190		/* Update t_maxseg and t_maxopd */
3191		tcp_mss(tp, mss);
3192	} else {
3193		bufsize = roundup(bufsize, mss);
3194		if (bufsize > sb_max)
3195			bufsize = sb_max;
3196		(void)sbreserve(&so->so_snd, bufsize);
3197	}
3198
3199	bufsize = so->so_rcv.sb_hiwat;
3200	if (bufsize > mss) {
3201		bufsize = roundup(bufsize, mss);
3202		if (bufsize > sb_max)
3203			bufsize = sb_max;
3204		(void)sbreserve(&so->so_rcv, bufsize);
3205	}
3206
3207 }
3208
3209 #if defined (TCP_SACK)
3210 /*
3211  * Checks for a partial ack.  If a partial ack arrives, force the retransmission
3212  * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3213  * 1.  By setting snd_nxt to ti_ack, this forces the retransmission timer to
3214  * be started again.  If the ack advances at least to tp->snd_last, return 0.
3215  */
3216 int
3217 tcp_newreno(tp, th)
3218	struct tcpcb *tp;
3219	struct tcphdr *th;
3220 {
3221	if (SEQ_LT(th->th_ack, tp->snd_last)) {
3222		/*
3223		 * snd_una has not been updated and the socket send buffer
3224		 * not yet drained of the acked data, so we have to leave
3225		 * snd_una as it was to get the correct data offset in
3226		 * tcp_output().
3227		 */
3228		tcp_seq onxt = tp->snd_nxt;
3229		u_long ocwnd = tp->snd_cwnd;
3230		TCP_TIMER_DISARM(tp, TCPT_REXMT);
3231		tp->t_rtttime = 0;
3232		tp->snd_nxt = th->th_ack;
3233		/*
3234		 * Set snd_cwnd to one segment beyond the acknowledged offset
3235		 * (tp->snd_una not yet updated when this function is called).
3236		 */
3237		tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3238		(void) tcp_output(tp);
3239		tp->snd_cwnd = ocwnd;
3240		if (SEQ_GT(onxt, tp->snd_nxt))
3241			tp->snd_nxt = onxt;
3242		/*
3243		 * Partial window deflation.  Relies on the fact that
3244		 * tp->snd_una is not updated yet.
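 */

/*
 * Worked example (hypothetical numbers, added for exposition): with
 * t_maxseg = 1460, snd_cwnd = 14600, and a partial ACK covering 2920
 * bytes (th_ack - snd_una == 2920), the deflation below leaves
 *
 *	snd_cwnd = 14600 - (2920 - 1460) = 13140
 *
 * i.e., the window shrinks by the newly acked amount but is credited
 * one segment for the retransmission just forced out.
 */
/*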
3245 */ 3246 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); 3247 return 1; 3248 } 3249 return 0; 3250 } 3251 #endif /* TCP_SACK */ 3252 3253 static int 3254 tcp_mss_adv(struct ifnet *ifp, int af) 3255 { 3256 int mss = 0; 3257 int iphlen; 3258 3259 switch (af) { 3260 case AF_INET: 3261 if (ifp != NULL) 3262 mss = ifp->if_mtu; 3263 iphlen = sizeof(struct ip); 3264 break; 3265 #ifdef INET6 3266 case AF_INET6: 3267 if (ifp != NULL) 3268 mss = IN6_LINKMTU(ifp); 3269 iphlen = sizeof(struct ip6_hdr); 3270 break; 3271 #endif 3272 } 3273 mss = mss - iphlen - sizeof(struct tcphdr); 3274 return (max(mss, tcp_mssdflt)); 3275 } 3276 3277 /* 3278 * TCP compressed state engine. Currently used to hold compressed 3279 * state for SYN_RECEIVED. 3280 */ 3281 3282 u_long syn_cache_count; 3283 u_int32_t syn_hash1, syn_hash2; 3284 3285 #define SYN_HASH(sa, sp, dp) \ 3286 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3287 ((u_int32_t)(sp)))^syn_hash2))) 3288 #ifndef INET6 3289 #define SYN_HASHALL(hash, src, dst) \ 3290 do { \ 3291 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3292 ((struct sockaddr_in *)(src))->sin_port, \ 3293 ((struct sockaddr_in *)(dst))->sin_port); \ 3294 } while (/*CONSTCOND*/ 0) 3295 #else 3296 #define SYN_HASH6(sa, sp, dp) \ 3297 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3298 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3299 & 0x7fffffff) 3300 3301 #define SYN_HASHALL(hash, src, dst) \ 3302 do { \ 3303 switch ((src)->sa_family) { \ 3304 case AF_INET: \ 3305 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3306 ((struct sockaddr_in *)(src))->sin_port, \ 3307 ((struct sockaddr_in *)(dst))->sin_port); \ 3308 break; \ 3309 case AF_INET6: \ 3310 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 3311 ((struct sockaddr_in6 *)(src))->sin6_port, \ 3312 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 3313 break; \ 3314 default: \ 3315 hash = 0; \ 3316 } \ 3317 } while (/*CONSTCOND*/0) 3318 #endif /* INET6 */ 3319 3320 #define SYN_CACHE_RM(sc) \ 3321 do { \ 3322 (sc)->sc_flags |= SCF_DEAD; \ 3323 TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \ 3324 (sc), sc_bucketq); \ 3325 (sc)->sc_tp = NULL; \ 3326 LIST_REMOVE((sc), sc_tpq); \ 3327 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 3328 timeout_del(&(sc)->sc_timer); \ 3329 syn_cache_count--; \ 3330 } while (/*CONSTCOND*/0) 3331 3332 #define SYN_CACHE_PUT(sc) \ 3333 do { \ 3334 if ((sc)->sc_ipopts) \ 3335 (void) m_free((sc)->sc_ipopts); \ 3336 if ((sc)->sc_route4.ro_rt != NULL) \ 3337 RTFREE((sc)->sc_route4.ro_rt); \ 3338 timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc)); \ 3339 timeout_add(&(sc)->sc_timer, 0); \ 3340 } while (/*CONSTCOND*/0) 3341 3342 struct pool syn_cache_pool; 3343 3344 /* 3345 * We don't estimate RTT with SYNs, so each packet starts with the default 3346 * RTT and each timer step has a fixed timeout value. 3347 */ 3348 #define SYN_CACHE_TIMER_ARM(sc) \ 3349 do { \ 3350 TCPT_RANGESET((sc)->sc_rxtcur, \ 3351 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3352 TCPTV_REXMTMAX); \ 3353 if (!timeout_initialized(&(sc)->sc_timer)) \ 3354 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3355 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3356 } while (/*CONSTCOND*/0) 3357 3358 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3359 3360 void 3361 syn_cache_init() 3362 { 3363 int i; 3364 3365 /* Initialize the hash buckets. 
*/ 3366 for (i = 0; i < tcp_syn_cache_size; i++) 3367 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3368 3369 /* Initialize the syn cache pool. */ 3370 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3371 "synpl", NULL); 3372 } 3373 3374 void 3375 syn_cache_insert(sc, tp) 3376 struct syn_cache *sc; 3377 struct tcpcb *tp; 3378 { 3379 struct syn_cache_head *scp; 3380 struct syn_cache *sc2; 3381 int s; 3382 3383 /* 3384 * If there are no entries in the hash table, reinitialize 3385 * the hash secrets. 3386 */ 3387 if (syn_cache_count == 0) { 3388 syn_hash1 = arc4random(); 3389 syn_hash2 = arc4random(); 3390 } 3391 3392 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3393 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3394 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3395 3396 /* 3397 * Make sure that we don't overflow the per-bucket 3398 * limit or the total cache size limit. 3399 */ 3400 s = splsoftnet(); 3401 if (scp->sch_length >= tcp_syn_bucket_limit) { 3402 tcpstat.tcps_sc_bucketoverflow++; 3403 /* 3404 * The bucket is full. Toss the oldest element in the 3405 * bucket. This will be the first entry in the bucket. 3406 */ 3407 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3408 #ifdef DIAGNOSTIC 3409 /* 3410 * This should never happen; we should always find an 3411 * entry in our bucket. 3412 */ 3413 if (sc2 == NULL) 3414 panic("syn_cache_insert: bucketoverflow: impossible"); 3415 #endif 3416 SYN_CACHE_RM(sc2); 3417 SYN_CACHE_PUT(sc2); 3418 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3419 struct syn_cache_head *scp2, *sce; 3420 3421 tcpstat.tcps_sc_overflowed++; 3422 /* 3423 * The cache is full. Toss the oldest entry in the 3424 * first non-empty bucket we can find. 3425 * 3426 * XXX We would really like to toss the oldest 3427 * entry in the cache, but we hope that this 3428 * condition doesn't happen very often. 3429 */ 3430 scp2 = scp; 3431 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3432 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3433 for (++scp2; scp2 != scp; scp2++) { 3434 if (scp2 >= sce) 3435 scp2 = &tcp_syn_cache[0]; 3436 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3437 break; 3438 } 3439 #ifdef DIAGNOSTIC 3440 /* 3441 * This should never happen; we should always find a 3442 * non-empty bucket. 3443 */ 3444 if (scp2 == scp) 3445 panic("syn_cache_insert: cacheoverflow: " 3446 "impossible"); 3447 #endif 3448 } 3449 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3450 SYN_CACHE_RM(sc2); 3451 SYN_CACHE_PUT(sc2); 3452 } 3453 3454 /* 3455 * Initialize the entry's timer. 3456 */ 3457 sc->sc_rxttot = 0; 3458 sc->sc_rxtshift = 0; 3459 SYN_CACHE_TIMER_ARM(sc); 3460 3461 /* Link it from tcpcb entry */ 3462 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3463 3464 /* Put it into the bucket. */ 3465 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3466 scp->sch_length++; 3467 syn_cache_count++; 3468 3469 tcpstat.tcps_sc_added++; 3470 splx(s); 3471 } 3472 3473 /* 3474 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3475 * If we have retransmitted an entry the maximum number of times, expire 3476 * that entry. 3477 */ 3478 void 3479 syn_cache_timer(void *arg) 3480 { 3481 struct syn_cache *sc = arg; 3482 int s; 3483 3484 s = splsoftnet(); 3485 if (sc->sc_flags & SCF_DEAD) { 3486 splx(s); 3487 return; 3488 } 3489 3490 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3491 /* Drop it -- too many retransmissions. */ 3492 goto dropit; 3493 } 3494 3495 /* 3496 * Compute the total amount of time this entry has 3497 * been on a queue. 
If this entry has been on longer
3498	 * than the keep alive timer would allow, expire it.
3499	 */
3500	sc->sc_rxttot += sc->sc_rxtcur;
3501	if (sc->sc_rxttot >= tcptv_keep_init)
3502		goto dropit;
3503
3504	tcpstat.tcps_sc_retransmitted++;
3505	(void) syn_cache_respond(sc, NULL);
3506
3507	/* Advance the timer back-off. */
3508	sc->sc_rxtshift++;
3509	SYN_CACHE_TIMER_ARM(sc);
3510
3511	splx(s);
3512	return;
3513
3514 dropit:
3515	tcpstat.tcps_sc_timed_out++;
3516	SYN_CACHE_RM(sc);
3517	SYN_CACHE_PUT(sc);
3518	splx(s);
3519 }
3520
3521 void
3522 syn_cache_reaper(void *arg)
3523 {
3524	struct syn_cache *sc = arg;
3525	int s;
3526
3527	s = splsoftnet();
3528	pool_put(&syn_cache_pool, (sc));
3529	splx(s);
3530	return;
3531 }
3532
3533 /*
3534  * Remove the syn cache entries created by the specified tcb entry,
3535  * because it makes no sense to keep them
3536  * (if there's no tcb entry, a syn cache entry will never be used).
3537  */
3538 void
3539 syn_cache_cleanup(tp)
3540	struct tcpcb *tp;
3541 {
3542	struct syn_cache *sc, *nsc;
3543	int s;
3544
3545	s = splsoftnet();
3546
3547	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3548		nsc = LIST_NEXT(sc, sc_tpq);
3549
3550 #ifdef DIAGNOSTIC
3551		if (sc->sc_tp != tp)
3552			panic("invalid sc_tp in syn_cache_cleanup");
3553 #endif
3554		SYN_CACHE_RM(sc);
3555		SYN_CACHE_PUT(sc);
3556	}
3557	/* just for safety */
3558	LIST_INIT(&tp->t_sc);
3559
3560	splx(s);
3561 }
3562
3563 /*
3564  * Find an entry in the syn cache.
3565  */
3566 struct syn_cache *
3567 syn_cache_lookup(src, dst, headp)
3568	struct sockaddr *src;
3569	struct sockaddr *dst;
3570	struct syn_cache_head **headp;
3571 {
3572	struct syn_cache *sc;
3573	struct syn_cache_head *scp;
3574	u_int32_t hash;
3575	int s;
3576
3577	SYN_HASHALL(hash, src, dst);
3578
3579	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3580	*headp = scp;
3581	s = splsoftnet();
3582	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3583	    sc = TAILQ_NEXT(sc, sc_bucketq)) {
3584		if (sc->sc_hash != hash)
3585			continue;
3586		if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3587		    !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
3588			splx(s);
3589			return (sc);
3590		}
3591	}
3592	splx(s);
3593	return (NULL);
3594 }
3595
3596 /*
3597  * This function gets called when we receive an ACK for a
3598  * socket in the LISTEN state.  We look up the connection
3599  * in the syn cache, and if it's there, we pull it out of
3600  * the cache and turn it into a full-blown connection in
3601  * the SYN-RECEIVED state.
3602  *
3603  * The return values may not be immediately obvious, and their effects
3604  * can be subtle, so here they are:
3605  *
3606  *	NULL	SYN was not found in cache; caller should drop the
3607  *		packet and send an RST.
3608  *
3609  *	-1	We were unable to create the new connection, and are
3610  *		aborting it.  An ACK,RST is being sent to the peer
3611  *		(unless we got screwy sequence numbers; see below),
3612  *		because the 3-way handshake has been completed.  Caller
3613  *		should not free the mbuf, since we may be using it.  If
3614  *		we are not, we will free it.
3615  *
3616  * Otherwise, the return value is a pointer to the new socket
3617  * associated with the connection.
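 */

/*
 * A sketch of how a caller is expected to dispatch on those return
 * values (illustrative only; the local names are made up and the real
 * caller lives in the LISTEN-state handling of tcp_input()):
 */
#if 0
	struct socket *nso;

	nso = syn_cache_get(src, dst, th, hlen, tlen, so, m);
	if (nso == NULL) {
		/* not in the cache: drop the segment and send an RST */
	} else if (nso == (struct socket *)(-1)) {
		/* setup failed; the cache code now owns (or freed) m */
	} else {
		/* nso is the new connection, now in SYN_RECEIVED */
	}
#endif /* 0 */

/*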
3618 */ 3619 struct socket * 3620 syn_cache_get(src, dst, th, hlen, tlen, so, m) 3621 struct sockaddr *src; 3622 struct sockaddr *dst; 3623 struct tcphdr *th; 3624 unsigned int hlen, tlen; 3625 struct socket *so; 3626 struct mbuf *m; 3627 { 3628 struct syn_cache *sc; 3629 struct syn_cache_head *scp; 3630 struct inpcb *inp = NULL; 3631 struct tcpcb *tp = 0; 3632 struct mbuf *am; 3633 int s; 3634 struct socket *oso; 3635 3636 s = splsoftnet(); 3637 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3638 splx(s); 3639 return (NULL); 3640 } 3641 3642 /* 3643 * Verify the sequence and ack numbers. Try getting the correct 3644 * response again. 3645 */ 3646 if ((th->th_ack != sc->sc_iss + 1) || 3647 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3648 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3649 (void) syn_cache_respond(sc, m); 3650 splx(s); 3651 return ((struct socket *)(-1)); 3652 } 3653 3654 /* Remove this cache entry */ 3655 SYN_CACHE_RM(sc); 3656 splx(s); 3657 3658 /* 3659 * Ok, create the full blown connection, and set things up 3660 * as they would have been set up if we had created the 3661 * connection when the SYN arrived. If we can't create 3662 * the connection, abort it. 3663 */ 3664 oso = so; 3665 so = sonewconn(so, SS_ISCONNECTED); 3666 if (so == NULL) 3667 goto resetandabort; 3668 3669 inp = sotoinpcb(oso); 3670 #ifdef IPSEC 3671 /* 3672 * We need to copy the required security levels 3673 * from the old pcb. Ditto for any other 3674 * IPsec-related information. 3675 */ 3676 { 3677 struct inpcb *newinp = (struct inpcb *)so->so_pcb; 3678 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3679 sizeof(inp->inp_seclevel)); 3680 newinp->inp_secrequire = inp->inp_secrequire; 3681 if (inp->inp_ipo != NULL) { 3682 newinp->inp_ipo = inp->inp_ipo; 3683 inp->inp_ipo->ipo_ref_count++; 3684 } 3685 if (inp->inp_ipsec_remotecred != NULL) { 3686 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 3687 inp->inp_ipsec_remotecred->ref_count++; 3688 } 3689 if (inp->inp_ipsec_remoteauth != NULL) { 3690 newinp->inp_ipsec_remoteauth 3691 = inp->inp_ipsec_remoteauth; 3692 inp->inp_ipsec_remoteauth->ref_count++; 3693 } 3694 } 3695 #endif /* IPSEC */ 3696 #ifdef INET6 3697 /* 3698 * inp still has the OLD in_pcb stuff, set the 3699 * v6-related flags on the new guy, too. 3700 */ 3701 { 3702 int flags = inp->inp_flags; 3703 struct inpcb *oldinpcb = inp; 3704 3705 inp = (struct inpcb *)so->so_pcb; 3706 inp->inp_flags |= (flags & INP_IPV6); 3707 if ((inp->inp_flags & INP_IPV6) != 0) { 3708 inp->inp_ipv6.ip6_hlim = 3709 oldinpcb->inp_ipv6.ip6_hlim; 3710 } 3711 } 3712 #else /* INET6 */ 3713 inp = (struct inpcb *)so->so_pcb; 3714 #endif /* INET6 */ 3715 3716 inp->inp_lport = th->th_dport; 3717 switch (src->sa_family) { 3718 #ifdef INET6 3719 case AF_INET6: 3720 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr; 3721 break; 3722 #endif /* INET6 */ 3723 case AF_INET: 3724 3725 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 3726 inp->inp_options = ip_srcroute(); 3727 if (inp->inp_options == NULL) { 3728 inp->inp_options = sc->sc_ipopts; 3729 sc->sc_ipopts = NULL; 3730 } 3731 break; 3732 } 3733 in_pcbrehash(inp); 3734 3735 /* 3736 * Give the new socket our cached route reference. 
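3737 */

/*
 * Annotated restatement of the hand-off below (added commentary, not a
 * second copy that runs): the struct assignment copies the embedded
 * route, including its rtentry pointer, and NULLing sc_route4.ro_rt
 * afterwards leaves the new socket as the only holder of the reference,
 * so SYN_CACHE_PUT() will not RTFREE() it.
 */
#if 0
	inp->inp_route = sc->sc_route4;	/* struct assignment, takes ro_rt */
	sc->sc_route4.ro_rt = NULL;	/* ownership moved to the socket */
#endif /* 0 */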
3738	if (src->sa_family == AF_INET)
3739		inp->inp_route = sc->sc_route4;	/* struct assignment */
3740 #ifdef INET6
3741	else
3742		inp->inp_route6 = sc->sc_route6;
3743 #endif
3744	sc->sc_route4.ro_rt = NULL;
3745
3746	am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
3747	if (am == NULL)
3748		goto resetandabort;
3749	am->m_len = src->sa_len;
3750	bcopy(src, mtod(am, caddr_t), src->sa_len);
3751
3752	switch (src->sa_family) {
3753	case AF_INET:
3754		/* drop IPv4 packet to AF_INET6 socket */
3755		if (inp->inp_flags & INP_IPV6) {
3756			(void) m_free(am);
3757			goto resetandabort;
3758		}
3759		if (in_pcbconnect(inp, am)) {
3760			(void) m_free(am);
3761			goto resetandabort;
3762		}
3763		break;
3764 #ifdef INET6
3765	case AF_INET6:
3766		if (in6_pcbconnect(inp, am)) {
3767			(void) m_free(am);
3768			goto resetandabort;
3769		}
3770		break;
3771 #endif
3772	}
3773	(void) m_free(am);
3774
3775	tp = intotcpcb(inp);
3776	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
3777	if (sc->sc_request_r_scale != 15) {
3778		tp->requested_s_scale = sc->sc_requested_s_scale;
3779		tp->request_r_scale = sc->sc_request_r_scale;
3780		tp->snd_scale = sc->sc_requested_s_scale;
3781		tp->rcv_scale = sc->sc_request_r_scale;
3782		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3783	}
3784	if (sc->sc_flags & SCF_TIMESTAMP)
3785		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3786
3787	tp->t_template = tcp_template(tp);
3788	if (tp->t_template == 0) {
3789		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
3790		so = NULL;
3791		m_freem(m);
3792		goto abort;
3793	}
3794 #ifdef TCP_SACK
3795	tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
3796 #endif
3797
3798	tp->ts_modulate = sc->sc_modulate;
3799	tp->iss = sc->sc_iss;
3800	tp->irs = sc->sc_irs;
3801	tcp_sendseqinit(tp);
3802 #if defined (TCP_SACK) || defined(TCP_ECN)
3803	tp->snd_last = tp->snd_una;
3804 #endif /* TCP_SACK */
3805 #if defined(TCP_SACK) && defined(TCP_FACK)
3806	tp->snd_fack = tp->snd_una;
3807	tp->retran_data = 0;
3808	tp->snd_awnd = 0;
3809 #endif /* TCP_FACK */
3810 #ifdef TCP_ECN
3811	if (sc->sc_flags & SCF_ECN_PERMIT) {
3812		tp->t_flags |= TF_ECN_PERMIT;
3813		tcpstat.tcps_ecn_accepts++;
3814	}
3815 #endif
3816 #ifdef TCP_SACK
3817	if (sc->sc_flags & SCF_SACK_PERMIT)
3818		tp->t_flags |= TF_SACK_PERMIT;
3819 #endif
3820 #ifdef TCP_SIGNATURE
3821	if (sc->sc_flags & SCF_SIGNATURE)
3822		tp->t_flags |= TF_SIGNATURE;
3823 #endif
3824	tcp_rcvseqinit(tp);
3825	tp->t_state = TCPS_SYN_RECEIVED;
3826	tp->t_rcvtime = tcp_now;
3827	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
3828	tcpstat.tcps_accepts++;
3829
3830	tcp_mss(tp, sc->sc_peermaxseg);	/* sets t_maxseg */
3831	if (sc->sc_peermaxseg)
3832		tcp_mss_update(tp);
3833	/* Reset initial window to 1 segment for retransmit */
3834	if (sc->sc_rxtshift > 0)
3835		tp->snd_cwnd = tp->t_maxseg;
3836	tp->snd_wl1 = sc->sc_irs;
3837	tp->rcv_up = sc->sc_irs + 1;
3838
3839	/*
3840	 * This is what would have happened in tcp_output() when
3841	 * the SYN,ACK was sent.
3842 */ 3843 tp->snd_up = tp->snd_una; 3844 tp->snd_max = tp->snd_nxt = tp->iss+1; 3845 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3846 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3847 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3848 tp->last_ack_sent = tp->rcv_nxt; 3849 3850 tcpstat.tcps_sc_completed++; 3851 SYN_CACHE_PUT(sc); 3852 return (so); 3853 3854 resetandabort: 3855 tcp_respond(NULL, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack, TH_RST); 3856 abort: 3857 if (so != NULL) 3858 (void) soabort(so); 3859 SYN_CACHE_PUT(sc); 3860 tcpstat.tcps_sc_aborted++; 3861 return ((struct socket *)(-1)); 3862 } 3863 3864 /* 3865 * This function is called when we get a RST for a 3866 * non-existent connection, so that we can see if the 3867 * connection is in the syn cache. If it is, zap it. 3868 */ 3869 3870 void 3871 syn_cache_reset(src, dst, th) 3872 struct sockaddr *src; 3873 struct sockaddr *dst; 3874 struct tcphdr *th; 3875 { 3876 struct syn_cache *sc; 3877 struct syn_cache_head *scp; 3878 int s = splsoftnet(); 3879 3880 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3881 splx(s); 3882 return; 3883 } 3884 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3885 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3886 splx(s); 3887 return; 3888 } 3889 SYN_CACHE_RM(sc); 3890 splx(s); 3891 tcpstat.tcps_sc_reset++; 3892 SYN_CACHE_PUT(sc); 3893 } 3894 3895 void 3896 syn_cache_unreach(src, dst, th) 3897 struct sockaddr *src; 3898 struct sockaddr *dst; 3899 struct tcphdr *th; 3900 { 3901 struct syn_cache *sc; 3902 struct syn_cache_head *scp; 3903 int s; 3904 3905 s = splsoftnet(); 3906 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3907 splx(s); 3908 return; 3909 } 3910 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3911 if (ntohl (th->th_seq) != sc->sc_iss) { 3912 splx(s); 3913 return; 3914 } 3915 3916 /* 3917 * If we've retransmitted 3 times and this is our second error, 3918 * we remove the entry. Otherwise, we allow it to continue on. 3919 * This prevents us from incorrectly nuking an entry during a 3920 * spurious network outage. 3921 * 3922 * See tcp_notify(). 3923 */ 3924 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3925 sc->sc_flags |= SCF_UNREACH; 3926 splx(s); 3927 return; 3928 } 3929 3930 SYN_CACHE_RM(sc); 3931 splx(s); 3932 tcpstat.tcps_sc_unreach++; 3933 SYN_CACHE_PUT(sc); 3934 } 3935 3936 /* 3937 * Given a LISTEN socket and an inbound SYN request, add 3938 * this to the syn cache, and send back a segment: 3939 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3940 * to the source. 3941 * 3942 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3943 * Doing so would require that we hold onto the data and deliver it 3944 * to the application. However, if we are the target of a SYN-flood 3945 * DoS attack, an attacker could send data which would eventually 3946 * consume all available buffer space if it were ACKed. By not ACKing 3947 * the data, we avoid this DoS scenario. 3948 */ 3949 3950 int 3951 syn_cache_add(src, dst, th, iphlen, so, m, optp, optlen, oi) 3952 struct sockaddr *src; 3953 struct sockaddr *dst; 3954 struct tcphdr *th; 3955 unsigned int iphlen; 3956 struct socket *so; 3957 struct mbuf *m; 3958 u_char *optp; 3959 int optlen; 3960 struct tcp_opt_info *oi; 3961 { 3962 struct tcpcb tb, *tp; 3963 long win; 3964 struct syn_cache *sc; 3965 struct syn_cache_head *scp; 3966 struct mbuf *ipopts; 3967 3968 tp = sototcpcb(so); 3969 3970 /* 3971 * RFC1122 4.2.3.10, p. 
104: discard bcast/mcast SYN 3972 * 3973 * Note this check is performed in tcp_input() very early on. 3974 */ 3975 3976 /* 3977 * Initialize some local state. 3978 */ 3979 win = sbspace(&so->so_rcv); 3980 if (win > TCP_MAXWIN) 3981 win = TCP_MAXWIN; 3982 3983 #ifdef TCP_SIGNATURE 3984 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3985 #else 3986 if (optp) { 3987 #endif 3988 tb.pf = tp->pf; 3989 #ifdef TCP_SACK 3990 tb.sack_enable = tp->sack_enable; 3991 #endif 3992 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3993 #ifdef TCP_SIGNATURE 3994 if (tp->t_flags & TF_SIGNATURE) 3995 tb.t_flags |= TF_SIGNATURE; 3996 #endif 3997 tb.t_state = TCPS_LISTEN; 3998 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi)) 3999 return (0); 4000 } else 4001 tb.t_flags = 0; 4002 4003 switch (src->sa_family) { 4004 #ifdef INET 4005 case AF_INET: 4006 /* 4007 * Remember the IP options, if any. 4008 */ 4009 ipopts = ip_srcroute(); 4010 break; 4011 #endif 4012 default: 4013 ipopts = NULL; 4014 } 4015 4016 /* 4017 * See if we already have an entry for this connection. 4018 * If we do, resend the SYN,ACK. We do not count this 4019 * as a retransmission (XXX though maybe we should). 4020 */ 4021 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 4022 tcpstat.tcps_sc_dupesyn++; 4023 if (ipopts) { 4024 /* 4025 * If we were remembering a previous source route, 4026 * forget it and use the new one we've been given. 4027 */ 4028 if (sc->sc_ipopts) 4029 (void) m_free(sc->sc_ipopts); 4030 sc->sc_ipopts = ipopts; 4031 } 4032 sc->sc_timestamp = tb.ts_recent; 4033 if (syn_cache_respond(sc, m) == 0) { 4034 tcpstat.tcps_sndacks++; 4035 tcpstat.tcps_sndtotal++; 4036 } 4037 return (1); 4038 } 4039 4040 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 4041 if (sc == NULL) { 4042 if (ipopts) 4043 (void) m_free(ipopts); 4044 return (0); 4045 } 4046 4047 /* 4048 * Fill in the cache, and put the necessary IP and TCP 4049 * options into the reply. 4050 */ 4051 bzero(sc, sizeof(struct syn_cache)); 4052 bzero(&sc->sc_timer, sizeof(sc->sc_timer)); 4053 bcopy(src, &sc->sc_src, src->sa_len); 4054 bcopy(dst, &sc->sc_dst, dst->sa_len); 4055 sc->sc_flags = 0; 4056 sc->sc_ipopts = ipopts; 4057 sc->sc_irs = th->th_seq; 4058 4059 #ifdef TCP_COMPAT_42 4060 tcp_iss += TCP_ISSINCR/2; 4061 sc->sc_iss = tcp_iss; 4062 #else 4063 sc->sc_iss = tcp_rndiss_next(); 4064 #endif 4065 sc->sc_peermaxseg = oi->maxseg; 4066 sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ? 4067 m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family); 4068 sc->sc_win = win; 4069 sc->sc_timestamp = tb.ts_recent; 4070 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 4071 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 4072 sc->sc_flags |= SCF_TIMESTAMP; 4073 sc->sc_modulate = arc4random(); 4074 } 4075 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 4076 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 4077 sc->sc_requested_s_scale = tb.requested_s_scale; 4078 sc->sc_request_r_scale = 0; 4079 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 4080 TCP_MAXWIN << sc->sc_request_r_scale < 4081 so->so_rcv.sb_hiwat) 4082 sc->sc_request_r_scale++; 4083 } else { 4084 sc->sc_requested_s_scale = 15; 4085 sc->sc_request_r_scale = 15; 4086 } 4087 #ifdef TCP_ECN 4088 /* 4089 * if both ECE and CWR flag bits are set, peer is ECN capable. 
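 */

/*
 * Standalone restatement of the test below (illustrative, not part of
 * the original code): RFC 3168 has an ECN-setup SYN carry both ECE and
 * CWR, so both bits must be present before the peer is believed to be
 * ECN capable.
 */
#if 0
static int
syn_advertises_ecn(struct tcphdr *th)
{
	return ((th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR));
}
#endif /* 0 */

/*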
#ifdef TCP_ECN
	/*
	 * If both ECE and CWR flag bits are set, the peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		SYN_CACHE_PUT(sc);
		tcpstat.tcps_sc_dropped++;
	}
	return (1);
}

int
syn_cache_respond(sc, m)
	struct syn_cache *sc;
	struct mbuf *m;
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;
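	/*
	 * A worked example of the computation above: each option
	 * emitted below occupies a multiple of 4 bytes (MSS 4, window
	 * scale 4, SACK-permitted 4, timestamps TCPOLEN_TSTAMP_APPA =
	 * 12, signature TCPOLEN_SIGLEN = 20 including its NOP/EOL
	 * padding), so optlen stays 32-bit aligned.  With window
	 * scaling, SACK and timestamps negotiated and no signature,
	 * optlen = 4 + 4 + 4 + 12 = 24 and th_off below becomes
	 * (20 + 24) / 4 = 11 words.
	 */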
	/*
	 * Create the IP+TCP header from scratch.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		/*
		 * Unreachable: unsupported families were already
		 * rejected above.  Bail out rather than dereference
		 * a NULL header pointer below.
		 */
		m_freem(m);
		return (EAFNOSUPPORT);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}
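	/*
	 * Explanatory note: TCPOPT_TSTAMP_HDR packs NOP, NOP, the
	 * TIMESTAMP kind (8) and its length (10) into one 32-bit word,
	 * the layout recommended in appendix A of RFC 1323, so the
	 * option occupies TCPOLEN_TSTAMP_APPA (12) bytes in all.  The
	 * second word is our TSval, which presumably mixes in the
	 * per-entry sc_modulate offset chosen in syn_cache_add() so
	 * the raw timestamp clock is not exposed; the third echoes
	 * the peer's TSval saved in sc_timestamp.
	 */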
#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:		/* default to PF_INET */
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* Use IPsec policy and TTL from the listening socket on the SYN,ACK. */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note ip_len now covers the
	 * whole packet, in network byte order as the stack expects.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		/* XXX tos? */
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    (struct ip_moptions *)NULL, inp);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    (struct ip6_moptions *)0, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
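/*
 * Note on the send path above: syn_cache_respond() serves both the
 * initial SYN,ACK and the resend for a duplicate SYN (the
 * tcps_sc_dupesyn path in syn_cache_add()).  The route is presumably
 * cached across calls, since ro points at the sc_route4/sc_route6
 * storage embedded in the cache entry and ip_output()/ip6_output()
 * fill in ro->ro_rt.
 */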