/*	$OpenBSD: tcp_input.c,v 1.221 2008/09/09 15:26:12 mpf Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <dev/rndvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#include "faith.h"

#include "pf.h"
#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct ifnet *, int);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

extern u_long sb_max;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif
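
/*
 * Illustrative sketch (kept under #if 0, never compiled): the signed
 * casts in TSTMP_LT/TSTMP_GEQ make the comparison modulo 2^32, so
 * timestamps stay correctly ordered across counter wraparound.
 */
#if 0
static int
tstmp_wrap_example(void)
{
	u_int32_t a = 0x00000002;	/* tick just after the wrap */
	u_int32_t b = 0xfffffffe;	/* tick just before the wrap */

	/* (int)(a - b) == 4 > 0, so a is "later" although a < b unsigned. */
	return (TSTMP_GEQ(a, b));	/* evaluates to 1 */
}
#endif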

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 * snd_last is used to track a recovery phase.
 * when cwnd is reduced, snd_last is set to snd_max.
 * while snd_last > snd_una, the sender is in a recovery phase and
 * its cwnd should not be reduced again.
 * snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define	TCP_SETUP_ACK(tp, tiflags) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)
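
/*
 * Illustrative sketch (under #if 0, never compiled): with ack-on-push
 * disabled, the first in-order segment only schedules a delayed ACK;
 * the second one finds TF_DELACK set and forces TF_ACKNOW, so at least
 * every second segment is acknowledged immediately.
 */
#if 0
static void
setup_ack_example(struct tcpcb *tp)
{
	TCP_SETUP_ACK(tp, 0);	/* 1st segment: TCP_SET_DELACK(tp) */
	TCP_SETUP_ACK(tp, 0);	/* 2nd segment: t_flags |= TF_ACKNOW */
}
#endif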

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after becoming established to
	 * force pre-ESTABLISHED data up to the user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	return (flags);
}
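
/*
 * Illustrative sketch (under #if 0, never compiled): the overlap
 * arithmetic tcp_reass() uses above.  For a queued segment covering
 * [100, 110) and an arriving segment starting at seq 105,
 * i == 100 + 10 - 105 == 5 bytes of the arriving data are duplicates
 * and get trimmed with m_adj(); the int cast keeps the test valid
 * across sequence-number wraparound, just like TSTMP_LT() above.
 */
#if 0
static int
reass_overlap_example(tcp_seq qseq, u_int16_t qlen, tcp_seq thseq)
{
	return ((int)(qseq + qlen - thseq));	/* > 0 means overlap */
}
#endif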

#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

#if NFAITH > 0
	if (m->m_pkthdr.rcvif) {
		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
			/* XXX send icmp6 host/port unreach? */
			m_freem(m);
			return IPPROTO_DONE;
		}
	}
#endif

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * is there a better place to put this?
	 */
	if (m->m_flags & M_ANYCAST6) {
		if (m->m_len >= sizeof(struct ip6_hdr)) {
			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		} else
			m_freem(m);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = 0;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error, s;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 * See below for AF specific multicast.
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		if (IN_MULTICAST(ip->ip_dst.s_addr) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
			goto drop;
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		/*
		 * Checksum extended TCP header and data.
		 */
		if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
			if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
				tcpstat.tcps_inhwcsum++;
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
			if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
				tcpstat.tcps_rcvbadsum++;
				goto drop;
			}
		} else {
			m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
			tcpstat.tcps_inhwcsum++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Checksum extended TCP header and data.
		 */
		if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.  XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and don't bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
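
	/*
	 * Illustrative sketch (under #if 0, never compiled): the RFC 1323
	 * appendix A option layout the fast path above matches.  The first
	 * four option bytes, read as one 32-bit word, are TCPOPT_TSTAMP_HDR:
	 * NOP, NOP, TIMESTAMP, length 10; the two timestamps follow.
	 */
#if 0
	{
	static const u_int8_t tstamp_appa[TCPOLEN_TSTAMP_APPA] = {
		TCPOPT_NOP, TCPOPT_NOP, TCPOPT_TIMESTAMP, TCPOLEN_TIMESTAMP,
		0, 0, 0, 0,	/* TS Value, from the sender's tcp_now */
		0, 0, 0, 0	/* TS Echo Reply, the peer's echoed value */
	};
	}
#endif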
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	if (m->m_pkthdr.pf.statekey)
		inp = ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->inp;
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport);
			break;
		}
#if NPF > 0
		if (m->m_pkthdr.pf.statekey && inp) {
			((struct pf_state_key *)m->m_pkthdr.pf.statekey)->inp =
			    inp;
			inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
		}
#endif
	}
	if (inp == NULL) {
		int inpl_flags = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_flags = INPLOOKUP_WILDCARD;
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_flags, m);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_flags, m);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		goto drop;

	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
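
	/*
	 * Illustrative sketch (under #if 0, never compiled): with a raw
	 * 16-bit window of 0xffff and a negotiated shift of 3, the
	 * effective window is 0xffff << 3 == 524280 bytes.  SYN segments
	 * are never scaled (RFC 1323), hence the TH_SYN test above.
	 */
#if 0
	{
	u_long scaled_win = (u_long)0xffff << 3;	/* == 524280 */
	}
#endif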

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				if (tiflags & TH_RST) {
					syn_cache_reset(&src.sa, &dst.sa, th);
				} else if ((tiflags & (TH_ACK|TH_SYN)) ==
				    (TH_ACK|TH_SYN)) {
					/*
					 * Received a SYN,ACK.  This should
					 * never happen while we are in
					 * LISTEN.  Send an RST.
					 */
					goto badsyn;
				} else if (tiflags & TH_ACK) {
					so = syn_cache_get(&src.sa, &dst.sa,
					    th, iphlen, tlen, so, m);
					if (so == NULL) {
						/*
						 * We don't have a SYN for
						 * this ACK; send an RST.
						 */
						goto badsyn;
					} else if (so ==
					    (struct socket *)(-1)) {
						/*
						 * We were unable to create
						 * the connection.  If the
						 * 3-way handshake was
						 * completed, an RST has
						 * been sent to the peer.
						 * Since the mbuf might be
						 * in use for the reply,
						 * do not free it.
						 */
						m = NULL;
					} else {
						/*
						 * We have created a
						 * full-blown connection.
						 */
						tp = NULL;
						inp = (struct inpcb *)so->so_pcb;
						tp = intotcpcb(inp);
						if (tp == NULL)
							goto badsyn;	/*XXX*/

						/*
						 * Compute proper scaling
						 * value from buffer space
						 */
						tcp_rscale(tp, so->so_rcv.sb_hiwat);
						goto after_listen;
					}
				} else {
					/*
					 * None of RST, SYN or ACK was set.
					 * This is an invalid packet for a
					 * TCB in LISTEN state.  Send a RST.
					 */
					goto badsyn;
				}
			} else {
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise because it
				 * is much better for the peer to send an RST,
				 * and the RST will be the final packet of the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;

					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
						tp = NULL;
						goto dropwithreset;
					}
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen <= so->so_qlimit &&
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse))
					m = NULL;
			}
			goto drop;
		}
	}

after_listen:
#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	if (m->m_pkthdr.pf.statekey) {
		((struct pf_state_key *)m->m_pkthdr.pf.statekey)->inp =
		    inp;
		inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
	}
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	s = splnet();
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp);
	if (error) {
		splx(s);
		goto drop;
	}

	/* Latch SA */
	if (inp->inp_tdb_in != tdb) {
		if (tdb) {
			tdb_add_inp(tdb, inp, 1);
			if (inp->inp_ipo == NULL) {
				inp->inp_ipo = ipsec_add_policy(inp, af,
				    IPSP_DIRECTION_OUT);
				if (inp->inp_ipo == NULL) {
					splx(s);
					goto drop;
				}
			}
			if (inp->inp_ipo->ipo_dstid == NULL &&
			    tdb->tdb_srcid != NULL) {
				inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
				tdb->tdb_srcid->ref_count++;
			}
			if (inp->inp_ipsec_remotecred == NULL &&
			    tdb->tdb_remote_cred != NULL) {
				inp->inp_ipsec_remotecred =
				    tdb->tdb_remote_cred;
				tdb->tdb_remote_cred->ref_count++;
			}
			if (inp->inp_ipsec_remoteauth == NULL &&
			    tdb->tdb_remote_auth != NULL) {
				inp->inp_ipsec_remoteauth =
				    tdb->tdb_remote_auth;
				tdb->tdb_remote_auth->ref_count++;
			}
		} else { /* Just reset */
			TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
			    inp_tdb_in_next);
			inp->inp_tdb_in = NULL;
		}
	}
	splx(s);
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}
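
	/*
	 * Illustrative sketch (under #if 0, never compiled): after the
	 * modulator is subtracted, an echo from the future or one older
	 * than TCP_RTT_MAX ticks is discarded above; anything else is a
	 * usable round-trip sample at tcp_now granularity.
	 */
#if 0
	{
	int rtt_sample = tcp_now - opti.ts_ecr;	/* valid iff ts_ecr != 0 */
	}
#endif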

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				if (sb_notify(&so->so_snd))
					sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			sorwakeup(so);
			TCP_SETUP_ACK(tp, tiflags);
			if (tp->t_flags & TF_ACKNOW)
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ACK|TH_ECE) ||
			    (tiflags & (TH_ACK|TH_ECE|TH_CWR))
			    == (TH_ECE|TH_CWR)) {
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif
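
		/*
		 * Illustrative sketch (under #if 0, never compiled): the two
		 * flag patterns accepted above.  A conforming ECN SYN-ACK
		 * carries ECE without CWR; a simultaneous open carries both
		 * ECE and CWR without ACK.
		 */
#if 0
		{
		int peer_ecn =
		    (tiflags & (TH_ACK|TH_ECE|TH_CWR)) == (TH_ACK|TH_ECE) ||
		    (tiflags & (TH_ACK|TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR);
		}
#endif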

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			(void) tcp_reass(tp, (struct tcphdr *)0,
			    (struct mbuf *)0, &tlen);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}
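
	/*
	 * Illustrative sketch (under #if 0, never compiled): TCP_PAWS_IDLE
	 * is 24 days of PR_SLOWHZ ticks (24 * 24h * 60m * 60s * PR_SLOWHZ,
	 * i.e. 4147200 ticks at PR_SLOWHZ == 2), far below the 2^31-tick
	 * point where the signed age test itself would wrap.
	 */
#if 0
	{
	int ts_recent_stale =
	    (int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE;
	}
#endif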

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = tlen;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * Cf fix from Braden, see Stevens p. 870
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
		    ((tiflags & (TH_SYN|TH_FIN)) != 0)))
			tp->ts_recent = opti.ts_val;
		else
			tp->ts_recent = 0;
		tp->ts_recent_age = tcp_now;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat.tcps_drops++;
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat.tcps_connects++;
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		(void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0,
		    &tlen);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat.tcps_cwr_ecn++;
				}
			}
			tcpstat.tcps_ecn_rcvece++;
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat.tcps_ecn_rcvcwr++;
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat.tcps_rcvacktooold++;
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#ifdef TCP_SACK
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat.tcps_cwr_frecovery++;
						tcpstat.tcps_sack_recovery_episode++;
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat.tcps_cwr_frecovery++;
					tcpstat.tcps_sndrexmitfast++;
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
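
		/*
		 * Illustrative sketch (under #if 0, never compiled): the
		 * window cut made on the third duplicate ACK above.  With
		 * snd_wnd = 65536, snd_cwnd = 32768 and a 1460-byte maxseg,
		 * win = 32768 / 2 / 1460 = 11 segments, so ssthresh becomes
		 * 11 * 1460 = 16060 bytes before retransmission restarts
		 * from a one-segment cwnd.
		 */
#if 0
		{
		u_long win = ulmin(tp->snd_wnd, tp->snd_cwnd) /
		    2 / tp->t_maxseg;

		if (win < 2)
			win = 2;
		tp->snd_ssthresh = win * tp->t_maxseg;
		}
#endif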
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						needoutput = 1;
#else
					tp->snd_cwnd += tp->t_maxseg;
					needoutput = 1;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			needoutput = 1;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
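
		/*
		 * Illustrative sketch (under #if 0, never compiled): in
		 * slow start each ACK grows cwnd by a full maxseg; in
		 * congestion avoidance the increment is maxseg^2/cwnd,
		 * about one maxseg per round trip.  With maxseg = 1460
		 * and cwnd = 29200 that is 1460 * 1460 / 29200 == 73
		 * bytes per ACK.
		 */
#if 0
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = (cw > tp->snd_ssthresh) ?
		    tp->t_maxseg * tp->t_maxseg / cw : tp->t_maxseg;
		}
#endif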
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}
		if (sb_notify(&so->so_snd))
			sowwakeup(so);

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
			    tp->snd_fack) + tp->retran_data;
		}
#endif

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		needoutput = 1;
	}
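
	/*
	 * Illustrative sketch (under #if 0, never compiled): the window
	 * update test above is a lexicographic (seq, ack) "newer than"
	 * comparison, so a reordered old segment can never shrink our
	 * notion of the peer's window.
	 */
#if 0
	{
	int newer = SEQ_LT(tp->snd_wl1, th->th_seq) ||
	    (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)));
	}
#endif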
1966 */ 1967 if ((tiflags & TH_ACK) && 1968 (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && 1969 (SEQ_LT(tp->snd_wl2, th->th_ack) || 1970 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1971 /* keep track of pure window updates */ 1972 if (tlen == 0 && 1973 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1974 tcpstat.tcps_rcvwinupd++; 1975 tp->snd_wnd = tiwin; 1976 tp->snd_wl1 = th->th_seq; 1977 tp->snd_wl2 = th->th_ack; 1978 if (tp->snd_wnd > tp->max_sndwnd) 1979 tp->max_sndwnd = tp->snd_wnd; 1980 needoutput = 1; 1981 } 1982 1983 /* 1984 * Process segments with URG. 1985 */ 1986 if ((tiflags & TH_URG) && th->th_urp && 1987 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1988 /* 1989 * This is a kludge, but if we receive and accept 1990 * random urgent pointers, we'll crash in 1991 * soreceive. It's hard to imagine someone 1992 * actually wanting to send this much urgent data. 1993 */ 1994 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1995 th->th_urp = 0; /* XXX */ 1996 tiflags &= ~TH_URG; /* XXX */ 1997 goto dodata; /* XXX */ 1998 } 1999 /* 2000 * If this segment advances the known urgent pointer, 2001 * then mark the data stream. This should not happen 2002 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2003 * a FIN has been received from the remote side. 2004 * In these states we ignore the URG. 2005 * 2006 * According to RFC961 (Assigned Protocols), 2007 * the urgent pointer points to the last octet 2008 * of urgent data. We continue, however, 2009 * to consider it to indicate the first octet 2010 * of data past the urgent section as the original 2011 * spec states (in one of two places). 2012 */ 2013 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2014 tp->rcv_up = th->th_seq + th->th_urp; 2015 so->so_oobmark = so->so_rcv.sb_cc + 2016 (tp->rcv_up - tp->rcv_nxt) - 1; 2017 if (so->so_oobmark == 0) 2018 so->so_state |= SS_RCVATMARK; 2019 sohasoutofband(so); 2020 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2021 } 2022 /* 2023 * Remove out of band data so doesn't get presented to user. 2024 * This can happen independent of advancing the URG pointer, 2025 * but if two URG's are pending at once, some out-of-band 2026 * data may creep in... ick. 2027 */ 2028 if (th->th_urp <= (u_int16_t) tlen 2029 #ifdef SO_OOBINLINE 2030 && (so->so_options & SO_OOBINLINE) == 0 2031 #endif 2032 ) 2033 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2034 } else 2035 /* 2036 * If no out of band data is expected, 2037 * pull receive urgent pointer along 2038 * with the receive window. 2039 */ 2040 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2041 tp->rcv_up = tp->rcv_nxt; 2042 dodata: /* XXX */ 2043 2044 /* 2045 * Process the segment text, merging it into the TCP sequencing queue, 2046 * and arranging for acknowledgment of receipt if necessary. 2047 * This process logically involves adjusting tp->rcv_wnd as data 2048 * is presented to the user (this happens in tcp_usrreq.c, 2049 * case PRU_RCVD). If a FIN has already been received on this 2050 * connection then we just ignore the text. 
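 *
 * The common case is handled inline below: if the segment is next in
 * sequence (th_seq == rcv_nxt), the reassembly queue is empty and the
 * connection is ESTABLISHED, the data is appended directly to the
 * socket buffer; anything else goes through tcp_reass() and forces an
 * immediate ACK.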
2051 */ 2052 if ((tlen || (tiflags & TH_FIN)) && 2053 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2054 #ifdef TCP_SACK 2055 tcp_seq laststart = th->th_seq; 2056 tcp_seq lastend = th->th_seq + tlen; 2057 #endif 2058 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2059 tp->t_state == TCPS_ESTABLISHED) { 2060 TCP_SETUP_ACK(tp, tiflags); 2061 tp->rcv_nxt += tlen; 2062 tiflags = th->th_flags & TH_FIN; 2063 tcpstat.tcps_rcvpack++; 2064 tcpstat.tcps_rcvbyte += tlen; 2065 ND6_HINT(tp); 2066 if (so->so_state & SS_CANTRCVMORE) 2067 m_freem(m); 2068 else { 2069 m_adj(m, hdroptlen); 2070 sbappendstream(&so->so_rcv, m); 2071 } 2072 sorwakeup(so); 2073 } else { 2074 m_adj(m, hdroptlen); 2075 tiflags = tcp_reass(tp, th, m, &tlen); 2076 tp->t_flags |= TF_ACKNOW; 2077 } 2078 #ifdef TCP_SACK 2079 if (tp->sack_enable) 2080 tcp_update_sack_list(tp, laststart, lastend); 2081 #endif 2082 2083 /* 2084 * variable len never referenced again in modern BSD, 2085 * so why bother computing it ?? 2086 */ 2087 #if 0 2088 /* 2089 * Note the amount of data that peer has sent into 2090 * our window, in order to estimate the sender's 2091 * buffer size. 2092 */ 2093 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2094 #endif /* 0 */ 2095 } else { 2096 m_freem(m); 2097 tiflags &= ~TH_FIN; 2098 } 2099 2100 /* 2101 * If FIN is received ACK the FIN and let the user know 2102 * that the connection is closing. Ignore a FIN received before 2103 * the connection is fully established. 2104 */ 2105 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2106 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2107 socantrcvmore(so); 2108 tp->t_flags |= TF_ACKNOW; 2109 tp->rcv_nxt++; 2110 } 2111 switch (tp->t_state) { 2112 2113 /* 2114 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2115 */ 2116 case TCPS_ESTABLISHED: 2117 tp->t_state = TCPS_CLOSE_WAIT; 2118 break; 2119 2120 /* 2121 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2122 * enter the CLOSING state. 2123 */ 2124 case TCPS_FIN_WAIT_1: 2125 tp->t_state = TCPS_CLOSING; 2126 break; 2127 2128 /* 2129 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2130 * starting the time-wait timer, turning off the other 2131 * standard timers. 2132 */ 2133 case TCPS_FIN_WAIT_2: 2134 tp->t_state = TCPS_TIME_WAIT; 2135 tcp_canceltimers(tp); 2136 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2137 soisdisconnected(so); 2138 break; 2139 2140 /* 2141 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2142 */ 2143 case TCPS_TIME_WAIT: 2144 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2145 break; 2146 } 2147 } 2148 if (so->so_options & SO_DEBUG) { 2149 switch (tp->pf) { 2150 #ifdef INET6 2151 case PF_INET6: 2152 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2153 0, tlen); 2154 break; 2155 #endif /* INET6 */ 2156 case PF_INET: 2157 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2158 0, tlen); 2159 break; 2160 } 2161 } 2162 2163 /* 2164 * Return any desired output. 2165 */ 2166 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 2167 (void) tcp_output(tp); 2168 } 2169 return; 2170 2171 badsyn: 2172 /* 2173 * Received a bad SYN. Increment counters and dropwithreset. 2174 */ 2175 tcpstat.tcps_badsyn++; 2176 tp = NULL; 2177 goto dropwithreset; 2178 2179 dropafterack_ratelim: 2180 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2181 tcp_ackdrop_ppslim) == 0) { 2182 /* XXX stat */ 2183 goto drop; 2184 } 2185 /* ...fall into dropafterack... 
*/ 2186 2187 dropafterack: 2188 /* 2189 * Generate an ACK dropping incoming segment if it occupies 2190 * sequence space, where the ACK reflects our state. 2191 */ 2192 if (tiflags & TH_RST) 2193 goto drop; 2194 m_freem(m); 2195 tp->t_flags |= TF_ACKNOW; 2196 (void) tcp_output(tp); 2197 return; 2198 2199 dropwithreset_ratelim: 2200 /* 2201 * We may want to rate-limit RSTs in certain situations, 2202 * particularly if we are sending an RST in response to 2203 * an attempt to connect to or otherwise communicate with 2204 * a port for which we have no socket. 2205 */ 2206 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2207 tcp_rst_ppslim) == 0) { 2208 /* XXX stat */ 2209 goto drop; 2210 } 2211 /* ...fall into dropwithreset... */ 2212 2213 dropwithreset: 2214 /* 2215 * Generate a RST, dropping incoming segment. 2216 * Make ACK acceptable to originator of segment. 2217 * Don't bother to respond to RST. 2218 */ 2219 if (tiflags & TH_RST) 2220 goto drop; 2221 if (tiflags & TH_ACK) { 2222 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2223 TH_RST); 2224 } else { 2225 if (tiflags & TH_SYN) 2226 tlen++; 2227 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2228 (tcp_seq)0, TH_RST|TH_ACK); 2229 } 2230 m_freem(m); 2231 return; 2232 2233 drop: 2234 /* 2235 * Drop space held by incoming segment and return. 2236 */ 2237 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2238 switch (tp->pf) { 2239 #ifdef INET6 2240 case PF_INET6: 2241 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2242 0, tlen); 2243 break; 2244 #endif /* INET6 */ 2245 case PF_INET: 2246 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2247 0, tlen); 2248 break; 2249 } 2250 } 2251 2252 m_freem(m); 2253 return; 2254 } 2255 2256 int 2257 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2258 struct mbuf *m, int iphlen, struct tcp_opt_info *oi) 2259 { 2260 u_int16_t mss = 0; 2261 int opt, optlen; 2262 #ifdef TCP_SIGNATURE 2263 caddr_t sigp = NULL; 2264 struct tdb *tdb = NULL; 2265 #endif /* TCP_SIGNATURE */ 2266 2267 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2268 opt = cp[0]; 2269 if (opt == TCPOPT_EOL) 2270 break; 2271 if (opt == TCPOPT_NOP) 2272 optlen = 1; 2273 else { 2274 if (cnt < 2) 2275 break; 2276 optlen = cp[1]; 2277 if (optlen < 2 || optlen > cnt) 2278 break; 2279 } 2280 switch (opt) { 2281 2282 default: 2283 continue; 2284 2285 case TCPOPT_MAXSEG: 2286 if (optlen != TCPOLEN_MAXSEG) 2287 continue; 2288 if (!(th->th_flags & TH_SYN)) 2289 continue; 2290 if (TCPS_HAVERCVDSYN(tp->t_state)) 2291 continue; 2292 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 2293 NTOHS(mss); 2294 oi->maxseg = mss; 2295 break; 2296 2297 case TCPOPT_WINDOW: 2298 if (optlen != TCPOLEN_WINDOW) 2299 continue; 2300 if (!(th->th_flags & TH_SYN)) 2301 continue; 2302 if (TCPS_HAVERCVDSYN(tp->t_state)) 2303 continue; 2304 tp->t_flags |= TF_RCVD_SCALE; 2305 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2306 break; 2307 2308 case TCPOPT_TIMESTAMP: 2309 if (optlen != TCPOLEN_TIMESTAMP) 2310 continue; 2311 oi->ts_present = 1; 2312 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2313 NTOHL(oi->ts_val); 2314 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2315 NTOHL(oi->ts_ecr); 2316 2317 if (!(th->th_flags & TH_SYN)) 2318 continue; 2319 if (TCPS_HAVERCVDSYN(tp->t_state)) 2320 continue; 2321 /* 2322 * A timestamp received in a SYN makes 2323 * it ok to send timestamp requests and replies. 
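 * (Per RFC 1323 the option is only in effect when both sides carried
 * it in their SYNs; once negotiated, every segment we send should
 * carry a timestamp and echo the peer's most recent ts_val.)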
2324 */ 2325 tp->t_flags |= TF_RCVD_TSTMP; 2326 tp->ts_recent = oi->ts_val; 2327 tp->ts_recent_age = tcp_now; 2328 break; 2329 2330 #ifdef TCP_SACK 2331 case TCPOPT_SACK_PERMITTED: 2332 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2333 continue; 2334 if (!(th->th_flags & TH_SYN)) 2335 continue; 2336 if (TCPS_HAVERCVDSYN(tp->t_state)) 2337 continue; 2338 /* MUST only be set on SYN */ 2339 tp->t_flags |= TF_SACK_PERMIT; 2340 break; 2341 case TCPOPT_SACK: 2342 tcp_sack_option(tp, th, cp, optlen); 2343 break; 2344 #endif 2345 #ifdef TCP_SIGNATURE 2346 case TCPOPT_SIGNATURE: 2347 if (optlen != TCPOLEN_SIGNATURE) 2348 continue; 2349 2350 if (sigp && bcmp(sigp, cp + 2, 16)) 2351 return (-1); 2352 2353 sigp = cp + 2; 2354 break; 2355 #endif /* TCP_SIGNATURE */ 2356 } 2357 } 2358 2359 #ifdef TCP_SIGNATURE 2360 if (tp->t_flags & TF_SIGNATURE) { 2361 union sockaddr_union src, dst; 2362 2363 memset(&src, 0, sizeof(union sockaddr_union)); 2364 memset(&dst, 0, sizeof(union sockaddr_union)); 2365 2366 switch (tp->pf) { 2367 case 0: 2368 #ifdef INET 2369 case AF_INET: 2370 src.sa.sa_len = sizeof(struct sockaddr_in); 2371 src.sa.sa_family = AF_INET; 2372 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2373 dst.sa.sa_len = sizeof(struct sockaddr_in); 2374 dst.sa.sa_family = AF_INET; 2375 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2376 break; 2377 #endif 2378 #ifdef INET6 2379 case AF_INET6: 2380 src.sa.sa_len = sizeof(struct sockaddr_in6); 2381 src.sa.sa_family = AF_INET6; 2382 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2383 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2384 dst.sa.sa_family = AF_INET6; 2385 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2386 break; 2387 #endif /* INET6 */ 2388 } 2389 2390 tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP); 2391 2392 /* 2393 * We don't have an SA for this peer, so we turn off 2394 * TF_SIGNATURE on the listen socket 2395 */ 2396 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2397 tp->t_flags &= ~TF_SIGNATURE; 2398 2399 } 2400 2401 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2402 tcpstat.tcps_rcvbadsig++; 2403 return (-1); 2404 } 2405 2406 if (sigp) { 2407 char sig[16]; 2408 2409 if (tdb == NULL) { 2410 tcpstat.tcps_rcvbadsig++; 2411 return (-1); 2412 } 2413 2414 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2415 return (-1); 2416 2417 if (bcmp(sig, sigp, 16)) { 2418 tcpstat.tcps_rcvbadsig++; 2419 return (-1); 2420 } 2421 2422 tcpstat.tcps_rcvgoodsig++; 2423 } 2424 #endif /* TCP_SIGNATURE */ 2425 2426 return (0); 2427 } 2428 2429 #if defined(TCP_SACK) 2430 u_long 2431 tcp_seq_subtract(u_long a, u_long b) 2432 { 2433 return ((long)(a - b)); 2434 } 2435 #endif 2436 2437 2438 #ifdef TCP_SACK 2439 /* 2440 * This function is called upon receipt of new valid data (while not in header 2441 * prediction mode), and it updates the ordered list of sacks. 2442 */ 2443 void 2444 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2445 tcp_seq rcv_lastend) 2446 { 2447 /* 2448 * First reported block MUST be the most recent one. Subsequent 2449 * blocks SHOULD be in the order in which they arrived at the 2450 * receiver. These two conditions make the implementation fully 2451 * compliant with RFC 2018. 
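 *
 * As an illustration (a sketch, not taken from a trace): with
 * rcv_nxt = 100 and an existing block {200,300}, newly arrived data
 * 350:400 is reported as {350,400},{200,300}; if 300:350 then
 * arrives, the overlapping blocks are coalesced and the single block
 * {200,400} moves to the front of the list.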
2452 */ 2453 int i, j = 0, count = 0, lastpos = -1; 2454 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2455 2456 /* First clean up current list of sacks */ 2457 for (i = 0; i < tp->rcv_numsacks; i++) { 2458 sack = tp->sackblks[i]; 2459 if (sack.start == 0 && sack.end == 0) { 2460 count++; /* count = number of blocks to be discarded */ 2461 continue; 2462 } 2463 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2464 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2465 count++; 2466 } else { 2467 temp[j].start = tp->sackblks[i].start; 2468 temp[j++].end = tp->sackblks[i].end; 2469 } 2470 } 2471 tp->rcv_numsacks -= count; 2472 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2473 tcp_clean_sackreport(tp); 2474 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2475 /* ==> need first sack block */ 2476 tp->sackblks[0].start = rcv_laststart; 2477 tp->sackblks[0].end = rcv_lastend; 2478 tp->rcv_numsacks = 1; 2479 } 2480 return; 2481 } 2482 /* Otherwise, sack blocks are already present. */ 2483 for (i = 0; i < tp->rcv_numsacks; i++) 2484 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2485 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2486 return; /* sack list remains unchanged */ 2487 /* 2488 * From here, segment just received should be (part of) the 1st sack. 2489 * Go through list, possibly coalescing sack block entries. 2490 */ 2491 firstsack.start = rcv_laststart; 2492 firstsack.end = rcv_lastend; 2493 for (i = 0; i < tp->rcv_numsacks; i++) { 2494 sack = tp->sackblks[i]; 2495 if (SEQ_LT(sack.end, firstsack.start) || 2496 SEQ_GT(sack.start, firstsack.end)) 2497 continue; /* no overlap */ 2498 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2499 /* 2500 * identical block; delete it here since we will 2501 * move it to the front of the list. 2502 */ 2503 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2504 lastpos = i; /* last posn with a zero entry */ 2505 continue; 2506 } 2507 if (SEQ_LEQ(sack.start, firstsack.start)) 2508 firstsack.start = sack.start; /* merge blocks */ 2509 if (SEQ_GEQ(sack.end, firstsack.end)) 2510 firstsack.end = sack.end; /* merge blocks */ 2511 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2512 lastpos = i; /* last posn with a zero entry */ 2513 } 2514 if (lastpos != -1) { /* at least one merge */ 2515 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2516 sack = tp->sackblks[i]; 2517 if (sack.start == 0 && sack.end == 0) 2518 continue; 2519 temp[j++] = sack; 2520 } 2521 tp->rcv_numsacks = j; /* including first blk (added later) */ 2522 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2523 tp->sackblks[i] = temp[i]; 2524 } else { /* no merges -- shift sacks by 1 */ 2525 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2526 tp->rcv_numsacks++; 2527 for (i = tp->rcv_numsacks-1; i > 0; i--) 2528 tp->sackblks[i] = tp->sackblks[i-1]; 2529 } 2530 tp->sackblks[0] = firstsack; 2531 return; 2532 } 2533 2534 /* 2535 * Process the TCP SACK option. tp->snd_holes is an ordered list 2536 * of holes (oldest to newest, in terms of the sequence space). 2537 */ 2538 void 2539 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2540 { 2541 int tmp_olen; 2542 u_char *tmp_cp; 2543 struct sackhole *cur, *p, *temp; 2544 2545 if (!tp->sack_enable) 2546 return; 2547 /* SACK without ACK doesn't make sense. */ 2548 if ((th->th_flags & TH_ACK) == 0) 2549 return; 2550 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
*/ 2551 if (SEQ_LT(th->th_ack, tp->snd_una) || 2552 SEQ_GT(th->th_ack, tp->snd_max)) 2553 return; 2554 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2555 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2556 return; 2557 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2558 tmp_cp = cp + 2; 2559 tmp_olen = optlen - 2; 2560 tcpstat.tcps_sack_rcv_opts++; 2561 if (tp->snd_numholes < 0) 2562 tp->snd_numholes = 0; 2563 if (tp->t_maxseg == 0) 2564 panic("tcp_sack_option"); /* Should never happen */ 2565 while (tmp_olen > 0) { 2566 struct sackblk sack; 2567 2568 bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq)); 2569 NTOHL(sack.start); 2570 bcopy(tmp_cp + sizeof(tcp_seq), 2571 (char *) &(sack.end), sizeof(tcp_seq)); 2572 NTOHL(sack.end); 2573 tmp_olen -= TCPOLEN_SACK; 2574 tmp_cp += TCPOLEN_SACK; 2575 if (SEQ_LEQ(sack.end, sack.start)) 2576 continue; /* bad SACK fields */ 2577 if (SEQ_LEQ(sack.end, tp->snd_una)) 2578 continue; /* old block */ 2579 #if defined(TCP_SACK) && defined(TCP_FACK) 2580 /* Updates snd_fack. */ 2581 if (SEQ_GT(sack.end, tp->snd_fack)) 2582 tp->snd_fack = sack.end; 2583 #endif /* TCP_FACK */ 2584 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2585 if (SEQ_LT(sack.start, th->th_ack)) 2586 continue; 2587 } 2588 if (SEQ_GT(sack.end, tp->snd_max)) 2589 continue; 2590 if (tp->snd_holes == NULL) { /* first hole */ 2591 tp->snd_holes = (struct sackhole *) 2592 pool_get(&sackhl_pool, PR_NOWAIT); 2593 if (tp->snd_holes == NULL) { 2594 /* ENOBUFS, so ignore SACKed block for now*/ 2595 goto done; 2596 } 2597 cur = tp->snd_holes; 2598 cur->start = th->th_ack; 2599 cur->end = sack.start; 2600 cur->rxmit = cur->start; 2601 cur->next = NULL; 2602 tp->snd_numholes = 1; 2603 tp->rcv_lastsack = sack.end; 2604 /* 2605 * dups is at least one. If more data has been 2606 * SACKed, it can be greater than one. 
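 * E.g. with t_maxseg = 1460, a SACK block whose end lies 4380 bytes
 * beyond the end of the hole yields dups = min(tcprexmtthresh, 3).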
2607 */ 2608 cur->dups = min(tcprexmtthresh, 2609 ((sack.end - cur->end)/tp->t_maxseg)); 2610 if (cur->dups < 1) 2611 cur->dups = 1; 2612 continue; /* with next sack block */ 2613 } 2614 /* Go thru list of holes: p = previous, cur = current */ 2615 p = cur = tp->snd_holes; 2616 while (cur) { 2617 if (SEQ_LEQ(sack.end, cur->start)) 2618 /* SACKs data before the current hole */ 2619 break; /* no use going through more holes */ 2620 if (SEQ_GEQ(sack.start, cur->end)) { 2621 /* SACKs data beyond the current hole */ 2622 cur->dups++; 2623 if (((sack.end - cur->end)/tp->t_maxseg) >= 2624 tcprexmtthresh) 2625 cur->dups = tcprexmtthresh; 2626 p = cur; 2627 cur = cur->next; 2628 continue; 2629 } 2630 if (SEQ_LEQ(sack.start, cur->start)) { 2631 /* Data acks at least the beginning of hole */ 2632 #if defined(TCP_SACK) && defined(TCP_FACK) 2633 if (SEQ_GT(sack.end, cur->rxmit)) 2634 tp->retran_data -= 2635 tcp_seq_subtract(cur->rxmit, 2636 cur->start); 2637 else 2638 tp->retran_data -= 2639 tcp_seq_subtract(sack.end, 2640 cur->start); 2641 #endif /* TCP_FACK */ 2642 if (SEQ_GEQ(sack.end, cur->end)) { 2643 /* Acks entire hole, so delete hole */ 2644 if (p != cur) { 2645 p->next = cur->next; 2646 pool_put(&sackhl_pool, cur); 2647 cur = p->next; 2648 } else { 2649 cur = cur->next; 2650 pool_put(&sackhl_pool, p); 2651 p = cur; 2652 tp->snd_holes = p; 2653 } 2654 tp->snd_numholes--; 2655 continue; 2656 } 2657 /* otherwise, move start of hole forward */ 2658 cur->start = sack.end; 2659 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2660 p = cur; 2661 cur = cur->next; 2662 continue; 2663 } 2664 /* move end of hole backward */ 2665 if (SEQ_GEQ(sack.end, cur->end)) { 2666 #if defined(TCP_SACK) && defined(TCP_FACK) 2667 if (SEQ_GT(cur->rxmit, sack.start)) 2668 tp->retran_data -= 2669 tcp_seq_subtract(cur->rxmit, 2670 sack.start); 2671 #endif /* TCP_FACK */ 2672 cur->end = sack.start; 2673 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2674 cur->dups++; 2675 if (((sack.end - cur->end)/tp->t_maxseg) >= 2676 tcprexmtthresh) 2677 cur->dups = tcprexmtthresh; 2678 p = cur; 2679 cur = cur->next; 2680 continue; 2681 } 2682 if (SEQ_LT(cur->start, sack.start) && 2683 SEQ_GT(cur->end, sack.end)) { 2684 /* 2685 * ACKs some data in middle of a hole; need to 2686 * split current hole 2687 */ 2688 temp = (struct sackhole *) 2689 pool_get(&sackhl_pool, PR_NOWAIT); 2690 if (temp == NULL) 2691 goto done; /* ENOBUFS */ 2692 #if defined(TCP_SACK) && defined(TCP_FACK) 2693 if (SEQ_GT(cur->rxmit, sack.end)) 2694 tp->retran_data -= 2695 tcp_seq_subtract(sack.end, 2696 sack.start); 2697 else if (SEQ_GT(cur->rxmit, sack.start)) 2698 tp->retran_data -= 2699 tcp_seq_subtract(cur->rxmit, 2700 sack.start); 2701 #endif /* TCP_FACK */ 2702 temp->next = cur->next; 2703 temp->start = sack.end; 2704 temp->end = cur->end; 2705 temp->dups = cur->dups; 2706 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2707 cur->end = sack.start; 2708 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2709 cur->dups++; 2710 if (((sack.end - cur->end)/tp->t_maxseg) >= 2711 tcprexmtthresh) 2712 cur->dups = tcprexmtthresh; 2713 cur->next = temp; 2714 p = temp; 2715 cur = p->next; 2716 tp->snd_numholes++; 2717 } 2718 } 2719 /* At this point, p points to the last hole on the list */ 2720 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2721 /* 2722 * Need to append new hole at end. 2723 * Last hole is p (and it's not NULL). 
2724 */ 2725 temp = (struct sackhole *) 2726 pool_get(&sackhl_pool, PR_NOWAIT); 2727 if (temp == NULL) 2728 goto done; /* ENOBUFS */ 2729 temp->start = tp->rcv_lastsack; 2730 temp->end = sack.start; 2731 temp->dups = min(tcprexmtthresh, 2732 ((sack.end - sack.start)/tp->t_maxseg)); 2733 if (temp->dups < 1) 2734 temp->dups = 1; 2735 temp->rxmit = temp->start; 2736 temp->next = 0; 2737 p->next = temp; 2738 tp->rcv_lastsack = sack.end; 2739 tp->snd_numholes++; 2740 } 2741 } 2742 done: 2743 #if defined(TCP_SACK) && defined(TCP_FACK) 2744 /* 2745 * Update retran_data and snd_awnd. Go through the list of 2746 * holes. Increment retran_data by (hole->rxmit - hole->start). 2747 */ 2748 tp->retran_data = 0; 2749 cur = tp->snd_holes; 2750 while (cur) { 2751 tp->retran_data += cur->rxmit - cur->start; 2752 cur = cur->next; 2753 } 2754 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + 2755 tp->retran_data; 2756 #endif /* TCP_FACK */ 2757 2758 return; 2759 } 2760 2761 /* 2762 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if 2763 * it is completely acked; otherwise, tcp_sack_option(), called from 2764 * tcp_dooptions(), will fix up the hole. 2765 */ 2766 void 2767 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2768 { 2769 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2770 /* max because this could be an older ack just arrived */ 2771 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2772 th->th_ack : tp->snd_una; 2773 struct sackhole *cur = tp->snd_holes; 2774 struct sackhole *prev; 2775 while (cur) 2776 if (SEQ_LEQ(cur->end, lastack)) { 2777 prev = cur; 2778 cur = cur->next; 2779 pool_put(&sackhl_pool, prev); 2780 tp->snd_numholes--; 2781 } else if (SEQ_LT(cur->start, lastack)) { 2782 cur->start = lastack; 2783 if (SEQ_LT(cur->rxmit, cur->start)) 2784 cur->rxmit = cur->start; 2785 break; 2786 } else 2787 break; 2788 tp->snd_holes = cur; 2789 } 2790 } 2791 2792 /* 2793 * Delete all receiver-side SACK information. 2794 */ 2795 void 2796 tcp_clean_sackreport(struct tcpcb *tp) 2797 { 2798 int i; 2799 2800 tp->rcv_numsacks = 0; 2801 for (i = 0; i < MAX_SACK_BLKS; i++) 2802 tp->sackblks[i].start = tp->sackblks[i].end=0; 2803 2804 } 2805 2806 /* 2807 * Checks for partial ack. If partial ack arrives, turn off retransmission 2808 * timer, deflate the window, do not clear tp->t_dupacks, and return 1. 2809 * If the ack advances at least to tp->snd_last, return 0. 2810 */ 2811 int 2812 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2813 { 2814 if (SEQ_LT(th->th_ack, tp->snd_last)) { 2815 /* Turn off retx. timer (will start again next segment) */ 2816 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2817 tp->t_rtttime = 0; 2818 #ifndef TCP_FACK 2819 /* 2820 * Partial window deflation. This statement relies on the 2821 * fact that tp->snd_una has not been updated yet. In FACK 2822 * hold snd_cwnd constant during fast recovery. 2823 */ 2824 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2825 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2826 tp->snd_cwnd += tp->t_maxseg; 2827 } else 2828 tp->snd_cwnd = tp->t_maxseg; 2829 #endif 2830 return (1); 2831 } 2832 return (0); 2833 } 2834 #endif /* TCP_SACK */ 2835 2836 /* 2837 * Pull out of band byte out of a segment so 2838 * it doesn't appear in the user's data queue. 2839 * It is still reflected in the segment length for 2840 * sequencing purposes. 
2841 */ 2842 void 2843 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2844 { 2845 int cnt = off + urgent - 1; 2846 2847 while (cnt >= 0) { 2848 if (m->m_len > cnt) { 2849 char *cp = mtod(m, caddr_t) + cnt; 2850 struct tcpcb *tp = sototcpcb(so); 2851 2852 tp->t_iobc = *cp; 2853 tp->t_oobflags |= TCPOOB_HAVEDATA; 2854 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2855 m->m_len--; 2856 return; 2857 } 2858 cnt -= m->m_len; 2859 m = m->m_next; 2860 if (m == 0) 2861 break; 2862 } 2863 panic("tcp_pulloutofband"); 2864 } 2865 2866 /* 2867 * Collect new round-trip time estimate 2868 * and update averages and current timeout. 2869 */ 2870 void 2871 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2872 { 2873 short delta; 2874 short rttmin; 2875 2876 if (rtt < 0) 2877 rtt = 0; 2878 else if (rtt > TCP_RTT_MAX) 2879 rtt = TCP_RTT_MAX; 2880 2881 tcpstat.tcps_rttupdated++; 2882 if (tp->t_srtt != 0) { 2883 /* 2884 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2885 * after the binary point (scaled by 4), whereas 2886 * srtt is stored as fixed point with 5 bits after the 2887 * binary point (i.e., scaled by 32). The following magic 2888 * is equivalent to the smoothing algorithm in rfc793 with 2889 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2890 * point). 2891 */ 2892 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2893 (tp->t_srtt >> TCP_RTT_SHIFT); 2894 if ((tp->t_srtt += delta) <= 0) 2895 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2896 /* 2897 * We accumulate a smoothed rtt variance (actually, a 2898 * smoothed mean difference), then set the retransmit 2899 * timer to smoothed rtt + 4 times the smoothed variance. 2900 * rttvar is stored as fixed point with 4 bits after the 2901 * binary point (scaled by 16). The following is 2902 * equivalent to rfc793 smoothing with an alpha of .75 2903 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2904 * rfc793's wired-in beta. 2905 */ 2906 if (delta < 0) 2907 delta = -delta; 2908 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2909 if ((tp->t_rttvar += delta) <= 0) 2910 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2911 } else { 2912 /* 2913 * No rtt measurement yet - use the unsmoothed rtt. 2914 * Set the variance to half the rtt (so our first 2915 * retransmit happens at 3*rtt). 2916 */ 2917 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2918 tp->t_rttvar = (rtt + 1) << 2919 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2920 } 2921 tp->t_rtttime = 0; 2922 tp->t_rxtshift = 0; 2923 2924 /* 2925 * the retransmit should happen at rtt + 4 * rttvar. 2926 * Because of the way we do the smoothing, srtt and rttvar 2927 * will each average +1/2 tick of bias. When we compute 2928 * the retransmit timer, we want 1/2 tick of rounding and 2929 * 1 extra tick because of +-1/2 tick uncertainty in the 2930 * firing of the timer. The bias will give us exactly the 2931 * 1.5 tick we need. But, because the bias is 2932 * statistical, we have to test that we don't drop below 2933 * the minimum feasible timer (which is 2 ticks). 2934 */ 2935 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2936 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2937 2938 /* 2939 * We received an ack for a packet that wasn't retransmitted; 2940 * it is probably safe to discard any error indications we've 2941 * received recently. This isn't quite right, but close enough 2942 * for now (a route might have failed after we sent a segment, 2943 * and the return path might not be symmetrical). 
 */
	tp->t_softerror = 0;
}

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing
 * interface without forcing IP to fragment; if bigger than
 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 * to utilize large mbufs. If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks. We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * Also take into account the space needed for options that we
 * send regularly. Make maxseg shorter by that amount to assure
 * that we can send maxseg amount of data even when the options
 * are present. Store the upper limit of the length of options plus
 * data in maxopd.
 *
 * NOTE: offer == -1 indicates that the maxseg size changed due to
 * Path MTU discovery.
 */
int
tcp_mss(struct tcpcb *tp, int offer)
{
	struct rtentry *rt;
	struct ifnet *ifp;
	int mss, mssopt;
	int iphlen;
	struct inpcb *inp;

	inp = tp->t_inpcb;

	mssopt = mss = tcp_mssdflt;

	rt = in_pcbrtentry(inp);

	if (rt == NULL)
		goto out;

	ifp = rt->rt_ifp;

	switch (tp->pf) {
#ifdef INET6
	case AF_INET6:
		iphlen = sizeof(struct ip6_hdr);
		break;
#endif
	case AF_INET:
		iphlen = sizeof(struct ip);
		break;
	default:
		/* the family does not support path MTU discovery */
		goto out;
	}

#ifdef RTV_MTU
	/*
	 * if there's an mtu associated with the route and we support
	 * path MTU discovery for the underlying protocol family, use it.
	 */
	if (rt->rt_rmx.rmx_mtu) {
		/*
		 * One may wish to lower MSS to take into account options,
		 * especially security-related options.
		 */
		if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
			/*
			 * RFC2460 section 5, last paragraph: if path MTU is
			 * smaller than 1280, use 1280 as packet size and
			 * attach fragment header.
			 */
			mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
			    sizeof(struct tcphdr);
		} else
			mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
	} else
#endif /* RTV_MTU */
	if (!ifp)
		/*
		 * ifp may be null and rmx_mtu may be zero in certain
		 * v6 cases (e.g., if ND wasn't able to resolve the
		 * destination host).
		 */
		goto out;
	else if (ifp->if_flags & IFF_LOOPBACK)
		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	else if (tp->pf == AF_INET) {
		if (ip_mtudisc)
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
		else if (inp && in_localaddr(inp->inp_faddr))
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	}
#ifdef INET6
	else if (tp->pf == AF_INET6) {
		/*
		 * for IPv6, path MTU discovery is always turned on,
		 * or the node must use packet size <= 1280.
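		 * E.g. a 1500 byte link MTU yields an mss of
		 * 1500 - 40 (ip6 hdr) - 20 (tcp hdr) = 1440 bytes.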
3048 */ 3049 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr); 3050 } 3051 #endif /* INET6 */ 3052 3053 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3054 if (offer != -1) { 3055 #ifndef INET6 3056 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3057 #else 3058 if (tp->pf == AF_INET6) 3059 mssopt = IN6_LINKMTU(ifp) - iphlen - 3060 sizeof(struct tcphdr); 3061 else 3062 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3063 #endif 3064 3065 mssopt = max(tcp_mssdflt, mssopt); 3066 } 3067 3068 out: 3069 /* 3070 * The current mss, t_maxseg, is initialized to the default value. 3071 * If we compute a smaller value, reduce the current mss. 3072 * If we compute a larger value, return it for use in sending 3073 * a max seg size option, but don't store it for use 3074 * unless we received an offer at least that large from peer. 3075 * 3076 * However, do not accept offers lower than the minimum of 3077 * the interface MTU and 216. 3078 */ 3079 if (offer > 0) 3080 tp->t_peermss = offer; 3081 if (tp->t_peermss) 3082 mss = min(mss, max(tp->t_peermss, 216)); 3083 3084 /* sanity - at least max opt. space */ 3085 mss = max(mss, 64); 3086 3087 /* 3088 * maxopd stores the maximum length of data AND options 3089 * in a segment; maxseg is the amount of data in a normal 3090 * segment. We need to store this value (maxopd) apart 3091 * from maxseg, because now every segment carries options 3092 * and thus we normally have somewhat less data in segments. 3093 */ 3094 tp->t_maxopd = mss; 3095 3096 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3097 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3098 mss -= TCPOLEN_TSTAMP_APPA; 3099 #ifdef TCP_SIGNATURE 3100 if (tp->t_flags & TF_SIGNATURE) 3101 mss -= TCPOLEN_SIGLEN; 3102 #endif 3103 3104 if (offer == -1) { 3105 /* mss changed due to Path MTU discovery */ 3106 tp->t_flags &= ~TF_PMTUD_PEND; 3107 tp->t_pmtud_mtu_sent = 0; 3108 tp->t_pmtud_mss_acked = 0; 3109 if (mss < tp->t_maxseg) { 3110 /* 3111 * Follow suggestion in RFC 2414 to reduce the 3112 * congestion window by the ratio of the old 3113 * segment size to the new segment size. 3114 */ 3115 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3116 mss, mss); 3117 } 3118 } else if (tcp_do_rfc3390) { 3119 /* increase initial window */ 3120 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3121 } else 3122 tp->snd_cwnd = mss; 3123 3124 tp->t_maxseg = mss; 3125 3126 return (offer != -1 ? mssopt : mss); 3127 } 3128 3129 u_int 3130 tcp_hdrsz(struct tcpcb *tp) 3131 { 3132 u_int hlen; 3133 3134 switch (tp->pf) { 3135 #ifdef INET6 3136 case AF_INET6: 3137 hlen = sizeof(struct ip6_hdr); 3138 break; 3139 #endif 3140 case AF_INET: 3141 hlen = sizeof(struct ip); 3142 break; 3143 default: 3144 hlen = 0; 3145 break; 3146 } 3147 hlen += sizeof(struct tcphdr); 3148 3149 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3150 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3151 hlen += TCPOLEN_TSTAMP_APPA; 3152 #ifdef TCP_SIGNATURE 3153 if (tp->t_flags & TF_SIGNATURE) 3154 hlen += TCPOLEN_SIGLEN; 3155 #endif 3156 return (hlen); 3157 } 3158 3159 /* 3160 * Set connection variables based on the effective MSS. 3161 * We are passed the TCPCB for the actual connection. If we 3162 * are the server, we are called by the compressed state engine 3163 * when the 3-way handshake is complete. If we are the client, 3164 * we are called when we receive the SYN,ACK from the server. 
3165 * 3166 * NOTE: The t_maxseg value must be initialized in the TCPCB 3167 * before this routine is called! 3168 */ 3169 void 3170 tcp_mss_update(struct tcpcb *tp) 3171 { 3172 int mss; 3173 u_long bufsize; 3174 struct rtentry *rt; 3175 struct socket *so; 3176 3177 so = tp->t_inpcb->inp_socket; 3178 mss = tp->t_maxseg; 3179 3180 rt = in_pcbrtentry(tp->t_inpcb); 3181 3182 if (rt == NULL) 3183 return; 3184 3185 bufsize = so->so_snd.sb_hiwat; 3186 if (bufsize < mss) { 3187 mss = bufsize; 3188 /* Update t_maxseg and t_maxopd */ 3189 tcp_mss(tp, mss); 3190 } else { 3191 bufsize = roundup(bufsize, mss); 3192 if (bufsize > sb_max) 3193 bufsize = sb_max; 3194 (void)sbreserve(&so->so_snd, bufsize); 3195 } 3196 3197 bufsize = so->so_rcv.sb_hiwat; 3198 if (bufsize > mss) { 3199 bufsize = roundup(bufsize, mss); 3200 if (bufsize > sb_max) 3201 bufsize = sb_max; 3202 (void)sbreserve(&so->so_rcv, bufsize); 3203 } 3204 3205 } 3206 3207 #if defined (TCP_SACK) 3208 /* 3209 * Checks for partial ack. If partial ack arrives, force the retransmission 3210 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 3211 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to 3212 * be started again. If the ack advances at least to tp->snd_last, return 0. 3213 */ 3214 int 3215 tcp_newreno(struct tcpcb *tp, struct tcphdr *th) 3216 { 3217 if (SEQ_LT(th->th_ack, tp->snd_last)) { 3218 /* 3219 * snd_una has not been updated and the socket send buffer 3220 * not yet drained of the acked data, so we have to leave 3221 * snd_una as it was to get the correct data offset in 3222 * tcp_output(). 3223 */ 3224 tcp_seq onxt = tp->snd_nxt; 3225 u_long ocwnd = tp->snd_cwnd; 3226 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3227 tp->t_rtttime = 0; 3228 tp->snd_nxt = th->th_ack; 3229 /* 3230 * Set snd_cwnd to one segment beyond acknowledged offset 3231 * (tp->snd_una not yet updated when this function is called) 3232 */ 3233 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3234 (void) tcp_output(tp); 3235 tp->snd_cwnd = ocwnd; 3236 if (SEQ_GT(onxt, tp->snd_nxt)) 3237 tp->snd_nxt = onxt; 3238 /* 3239 * Partial window deflation. Relies on fact that tp->snd_una 3240 * not updated yet. 3241 */ 3242 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3243 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3244 else 3245 tp->snd_cwnd = 0; 3246 tp->snd_cwnd += tp->t_maxseg; 3247 3248 return 1; 3249 } 3250 return 0; 3251 } 3252 #endif /* TCP_SACK */ 3253 3254 int 3255 tcp_mss_adv(struct ifnet *ifp, int af) 3256 { 3257 int mss = 0; 3258 int iphlen; 3259 3260 switch (af) { 3261 case AF_INET: 3262 if (ifp != NULL) 3263 mss = ifp->if_mtu; 3264 iphlen = sizeof(struct ip); 3265 break; 3266 #ifdef INET6 3267 case AF_INET6: 3268 if (ifp != NULL) 3269 mss = IN6_LINKMTU(ifp); 3270 iphlen = sizeof(struct ip6_hdr); 3271 break; 3272 #endif 3273 } 3274 mss = mss - iphlen - sizeof(struct tcphdr); 3275 return (max(mss, tcp_mssdflt)); 3276 } 3277 3278 /* 3279 * TCP compressed state engine. Currently used to hold compressed 3280 * state for SYN_RECEIVED. 
3281 */ 3282 3283 u_long syn_cache_count; 3284 u_int32_t syn_hash1, syn_hash2; 3285 3286 #define SYN_HASH(sa, sp, dp) \ 3287 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3288 ((u_int32_t)(sp)))^syn_hash2))) 3289 #ifndef INET6 3290 #define SYN_HASHALL(hash, src, dst) \ 3291 do { \ 3292 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3293 ((struct sockaddr_in *)(src))->sin_port, \ 3294 ((struct sockaddr_in *)(dst))->sin_port); \ 3295 } while (/*CONSTCOND*/ 0) 3296 #else 3297 #define SYN_HASH6(sa, sp, dp) \ 3298 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3299 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3300 & 0x7fffffff) 3301 3302 #define SYN_HASHALL(hash, src, dst) \ 3303 do { \ 3304 switch ((src)->sa_family) { \ 3305 case AF_INET: \ 3306 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3307 ((struct sockaddr_in *)(src))->sin_port, \ 3308 ((struct sockaddr_in *)(dst))->sin_port); \ 3309 break; \ 3310 case AF_INET6: \ 3311 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 3312 ((struct sockaddr_in6 *)(src))->sin6_port, \ 3313 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 3314 break; \ 3315 default: \ 3316 hash = 0; \ 3317 } \ 3318 } while (/*CONSTCOND*/0) 3319 #endif /* INET6 */ 3320 3321 #define SYN_CACHE_RM(sc) \ 3322 do { \ 3323 (sc)->sc_flags |= SCF_DEAD; \ 3324 TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \ 3325 (sc), sc_bucketq); \ 3326 (sc)->sc_tp = NULL; \ 3327 LIST_REMOVE((sc), sc_tpq); \ 3328 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 3329 timeout_del(&(sc)->sc_timer); \ 3330 syn_cache_count--; \ 3331 } while (/*CONSTCOND*/0) 3332 3333 #define SYN_CACHE_PUT(sc) \ 3334 do { \ 3335 if ((sc)->sc_ipopts) \ 3336 (void) m_free((sc)->sc_ipopts); \ 3337 if ((sc)->sc_route4.ro_rt != NULL) \ 3338 RTFREE((sc)->sc_route4.ro_rt); \ 3339 timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc)); \ 3340 timeout_add(&(sc)->sc_timer, 0); \ 3341 } while (/*CONSTCOND*/0) 3342 3343 struct pool syn_cache_pool; 3344 3345 /* 3346 * We don't estimate RTT with SYNs, so each packet starts with the default 3347 * RTT and each timer step has a fixed timeout value. 3348 */ 3349 #define SYN_CACHE_TIMER_ARM(sc) \ 3350 do { \ 3351 TCPT_RANGESET((sc)->sc_rxtcur, \ 3352 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3353 TCPTV_REXMTMAX); \ 3354 if (!timeout_initialized(&(sc)->sc_timer)) \ 3355 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3356 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3357 } while (/*CONSTCOND*/0) 3358 3359 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3360 3361 void 3362 syn_cache_init() 3363 { 3364 int i; 3365 3366 /* Initialize the hash buckets. */ 3367 for (i = 0; i < tcp_syn_cache_size; i++) 3368 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3369 3370 /* Initialize the syn cache pool. */ 3371 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3372 "synpl", NULL); 3373 } 3374 3375 void 3376 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3377 { 3378 struct syn_cache_head *scp; 3379 struct syn_cache *sc2; 3380 int s; 3381 3382 /* 3383 * If there are no entries in the hash table, reinitialize 3384 * the hash secrets. 
3385 */ 3386 if (syn_cache_count == 0) { 3387 syn_hash1 = arc4random(); 3388 syn_hash2 = arc4random(); 3389 } 3390 3391 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3392 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3393 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3394 3395 /* 3396 * Make sure that we don't overflow the per-bucket 3397 * limit or the total cache size limit. 3398 */ 3399 s = splsoftnet(); 3400 if (scp->sch_length >= tcp_syn_bucket_limit) { 3401 tcpstat.tcps_sc_bucketoverflow++; 3402 /* 3403 * The bucket is full. Toss the oldest element in the 3404 * bucket. This will be the first entry in the bucket. 3405 */ 3406 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3407 #ifdef DIAGNOSTIC 3408 /* 3409 * This should never happen; we should always find an 3410 * entry in our bucket. 3411 */ 3412 if (sc2 == NULL) 3413 panic("syn_cache_insert: bucketoverflow: impossible"); 3414 #endif 3415 SYN_CACHE_RM(sc2); 3416 SYN_CACHE_PUT(sc2); 3417 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3418 struct syn_cache_head *scp2, *sce; 3419 3420 tcpstat.tcps_sc_overflowed++; 3421 /* 3422 * The cache is full. Toss the oldest entry in the 3423 * first non-empty bucket we can find. 3424 * 3425 * XXX We would really like to toss the oldest 3426 * entry in the cache, but we hope that this 3427 * condition doesn't happen very often. 3428 */ 3429 scp2 = scp; 3430 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3431 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3432 for (++scp2; scp2 != scp; scp2++) { 3433 if (scp2 >= sce) 3434 scp2 = &tcp_syn_cache[0]; 3435 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3436 break; 3437 } 3438 #ifdef DIAGNOSTIC 3439 /* 3440 * This should never happen; we should always find a 3441 * non-empty bucket. 3442 */ 3443 if (scp2 == scp) 3444 panic("syn_cache_insert: cacheoverflow: " 3445 "impossible"); 3446 #endif 3447 } 3448 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3449 SYN_CACHE_RM(sc2); 3450 SYN_CACHE_PUT(sc2); 3451 } 3452 3453 /* 3454 * Initialize the entry's timer. 3455 */ 3456 sc->sc_rxttot = 0; 3457 sc->sc_rxtshift = 0; 3458 SYN_CACHE_TIMER_ARM(sc); 3459 3460 /* Link it from tcpcb entry */ 3461 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3462 3463 /* Put it into the bucket. */ 3464 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3465 scp->sch_length++; 3466 syn_cache_count++; 3467 3468 tcpstat.tcps_sc_added++; 3469 splx(s); 3470 } 3471 3472 /* 3473 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3474 * If we have retransmitted an entry the maximum number of times, expire 3475 * that entry. 3476 */ 3477 void 3478 syn_cache_timer(void *arg) 3479 { 3480 struct syn_cache *sc = arg; 3481 int s; 3482 3483 s = splsoftnet(); 3484 if (sc->sc_flags & SCF_DEAD) { 3485 splx(s); 3486 return; 3487 } 3488 3489 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3490 /* Drop it -- too many retransmissions. */ 3491 goto dropit; 3492 } 3493 3494 /* 3495 * Compute the total amount of time this entry has 3496 * been on a queue. If this entry has been on longer 3497 * than the keep alive timer would allow, expire it. 3498 */ 3499 sc->sc_rxttot += sc->sc_rxtcur; 3500 if (sc->sc_rxttot >= tcptv_keep_init) 3501 goto dropit; 3502 3503 tcpstat.tcps_sc_retransmitted++; 3504 (void) syn_cache_respond(sc, NULL); 3505 3506 /* Advance the timer back-off. 
	 */
	sc->sc_rxtshift++;
	SYN_CACHE_TIMER_ARM(sc);

	splx(s);
	return;

 dropit:
	tcpstat.tcps_sc_timed_out++;
	SYN_CACHE_RM(sc);
	SYN_CACHE_PUT(sc);
	splx(s);
}

void
syn_cache_reaper(void *arg)
{
	struct syn_cache *sc = arg;
	int s;

	s = splsoftnet();
	pool_put(&syn_cache_pool, (sc));
	splx(s);
	return;
}

/*
 * Remove the syn cache entries created by the specified tcb entry;
 * it makes no sense to keep them around, since without a tcb entry
 * a syn cache entry can never be used.
 */
void
syn_cache_cleanup(struct tcpcb *tp)
{
	struct syn_cache *sc, *nsc;
	int s;

	s = splsoftnet();

	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
		nsc = LIST_NEXT(sc, sc_tpq);

#ifdef DIAGNOSTIC
		if (sc->sc_tp != tp)
			panic("invalid sc_tp in syn_cache_cleanup");
#endif
		SYN_CACHE_RM(sc);
		SYN_CACHE_PUT(sc);
	}
	/* just for safety */
	LIST_INIT(&tp->t_sc);

	splx(s);
}

/*
 * Find an entry in the syn cache.
 */
struct syn_cache *
syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
    struct syn_cache_head **headp)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	u_int32_t hash;
	int s;

	SYN_HASHALL(hash, src, dst);

	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
	*headp = scp;
	s = splsoftnet();
	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
	    sc = TAILQ_NEXT(sc, sc_bucketq)) {
		if (sc->sc_hash != hash)
			continue;
		if (!bcmp(&sc->sc_src, src, src->sa_len) &&
		    !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
			splx(s);
			return (sc);
		}
	}
	splx(s);
	return (NULL);
}

/*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state. We look up the connection
 * in the syn cache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 *
 * The return values may not be immediately obvious, and their effects
 * can be subtle, so here they are:
 *
 *	NULL	SYN was not found in cache; caller should drop the
 *		packet and send an RST.
 *
 *	-1	We were unable to create the new connection, and are
 *		aborting it. An ACK,RST is being sent to the peer
 *		(unless we got screwy sequence numbers; see below),
 *		because the 3-way handshake has been completed. Caller
 *		should not free the mbuf, since we may be using it. If
 *		we are not, we will free it.
 *
 *	Otherwise, the return value is a pointer to the new socket
 *	associated with the connection.
 */
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int hlen, u_int tlen, struct socket *so, struct mbuf *m)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = 0;
	struct mbuf *am;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return (NULL);
	}

	/*
	 * Verify the sequence and ack numbers. Try getting the correct
	 * response again.
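	 * To be acceptable the ACK must equal iss + 1 and the peer's
	 * sequence number must fall in (irs, irs + 1 + win], the window
	 * advertised in our SYN,ACK; anything else just gets the SYN,ACK
	 * retransmitted.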
3636 */ 3637 if ((th->th_ack != sc->sc_iss + 1) || 3638 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3639 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3640 (void) syn_cache_respond(sc, m); 3641 splx(s); 3642 return ((struct socket *)(-1)); 3643 } 3644 3645 /* Remove this cache entry */ 3646 SYN_CACHE_RM(sc); 3647 splx(s); 3648 3649 /* 3650 * Ok, create the full blown connection, and set things up 3651 * as they would have been set up if we had created the 3652 * connection when the SYN arrived. If we can't create 3653 * the connection, abort it. 3654 */ 3655 oso = so; 3656 so = sonewconn(so, SS_ISCONNECTED); 3657 if (so == NULL) 3658 goto resetandabort; 3659 3660 inp = sotoinpcb(oso); 3661 #ifdef IPSEC 3662 /* 3663 * We need to copy the required security levels 3664 * from the old pcb. Ditto for any other 3665 * IPsec-related information. 3666 */ 3667 { 3668 struct inpcb *newinp = (struct inpcb *)so->so_pcb; 3669 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3670 sizeof(inp->inp_seclevel)); 3671 newinp->inp_secrequire = inp->inp_secrequire; 3672 if (inp->inp_ipo != NULL) { 3673 newinp->inp_ipo = inp->inp_ipo; 3674 inp->inp_ipo->ipo_ref_count++; 3675 } 3676 if (inp->inp_ipsec_remotecred != NULL) { 3677 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 3678 inp->inp_ipsec_remotecred->ref_count++; 3679 } 3680 if (inp->inp_ipsec_remoteauth != NULL) { 3681 newinp->inp_ipsec_remoteauth 3682 = inp->inp_ipsec_remoteauth; 3683 inp->inp_ipsec_remoteauth->ref_count++; 3684 } 3685 } 3686 #endif /* IPSEC */ 3687 #ifdef INET6 3688 /* 3689 * inp still has the OLD in_pcb stuff, set the 3690 * v6-related flags on the new guy, too. 3691 */ 3692 { 3693 int flags = inp->inp_flags; 3694 struct inpcb *oldinpcb = inp; 3695 3696 inp = (struct inpcb *)so->so_pcb; 3697 inp->inp_flags |= (flags & INP_IPV6); 3698 if ((inp->inp_flags & INP_IPV6) != 0) { 3699 inp->inp_ipv6.ip6_hlim = 3700 oldinpcb->inp_ipv6.ip6_hlim; 3701 } 3702 } 3703 #else /* INET6 */ 3704 inp = (struct inpcb *)so->so_pcb; 3705 #endif /* INET6 */ 3706 3707 inp->inp_lport = th->th_dport; 3708 switch (src->sa_family) { 3709 #ifdef INET6 3710 case AF_INET6: 3711 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr; 3712 break; 3713 #endif /* INET6 */ 3714 case AF_INET: 3715 3716 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 3717 inp->inp_options = ip_srcroute(); 3718 if (inp->inp_options == NULL) { 3719 inp->inp_options = sc->sc_ipopts; 3720 sc->sc_ipopts = NULL; 3721 } 3722 break; 3723 } 3724 in_pcbrehash(inp); 3725 3726 /* 3727 * Give the new socket our cached route reference. 
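	 * (The entry's own ro_rt pointer is cleared just below, so the
	 * reference is handed over rather than freed twice.)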
3728 */ 3729 if (src->sa_family == AF_INET) 3730 inp->inp_route = sc->sc_route4; /* struct assignment */ 3731 #ifdef INET6 3732 else 3733 inp->inp_route6 = sc->sc_route6; 3734 #endif 3735 sc->sc_route4.ro_rt = NULL; 3736 3737 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3738 if (am == NULL) 3739 goto resetandabort; 3740 am->m_len = src->sa_len; 3741 bcopy(src, mtod(am, caddr_t), src->sa_len); 3742 3743 switch (src->sa_family) { 3744 case AF_INET: 3745 /* drop IPv4 packet to AF_INET6 socket */ 3746 if (inp->inp_flags & INP_IPV6) { 3747 (void) m_free(am); 3748 goto resetandabort; 3749 } 3750 if (in_pcbconnect(inp, am)) { 3751 (void) m_free(am); 3752 goto resetandabort; 3753 } 3754 break; 3755 #ifdef INET6 3756 case AF_INET6: 3757 if (in6_pcbconnect(inp, am)) { 3758 (void) m_free(am); 3759 goto resetandabort; 3760 } 3761 break; 3762 #endif 3763 } 3764 (void) m_free(am); 3765 3766 tp = intotcpcb(inp); 3767 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; 3768 if (sc->sc_request_r_scale != 15) { 3769 tp->requested_s_scale = sc->sc_requested_s_scale; 3770 tp->request_r_scale = sc->sc_request_r_scale; 3771 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3772 } 3773 if (sc->sc_flags & SCF_TIMESTAMP) 3774 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3775 3776 tp->t_template = tcp_template(tp); 3777 if (tp->t_template == 0) { 3778 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3779 so = NULL; 3780 m_freem(m); 3781 goto abort; 3782 } 3783 #ifdef TCP_SACK 3784 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3785 #endif 3786 3787 tp->ts_modulate = sc->sc_modulate; 3788 tp->iss = sc->sc_iss; 3789 tp->irs = sc->sc_irs; 3790 tcp_sendseqinit(tp); 3791 #if defined (TCP_SACK) || defined(TCP_ECN) 3792 tp->snd_last = tp->snd_una; 3793 #endif /* TCP_SACK */ 3794 #if defined(TCP_SACK) && defined(TCP_FACK) 3795 tp->snd_fack = tp->snd_una; 3796 tp->retran_data = 0; 3797 tp->snd_awnd = 0; 3798 #endif /* TCP_FACK */ 3799 #ifdef TCP_ECN 3800 if (sc->sc_flags & SCF_ECN_PERMIT) { 3801 tp->t_flags |= TF_ECN_PERMIT; 3802 tcpstat.tcps_ecn_accepts++; 3803 } 3804 #endif 3805 #ifdef TCP_SACK 3806 if (sc->sc_flags & SCF_SACK_PERMIT) 3807 tp->t_flags |= TF_SACK_PERMIT; 3808 #endif 3809 #ifdef TCP_SIGNATURE 3810 if (sc->sc_flags & SCF_SIGNATURE) 3811 tp->t_flags |= TF_SIGNATURE; 3812 #endif 3813 tcp_rcvseqinit(tp); 3814 tp->t_state = TCPS_SYN_RECEIVED; 3815 tp->t_rcvtime = tcp_now; 3816 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3817 tcpstat.tcps_accepts++; 3818 3819 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3820 if (sc->sc_peermaxseg) 3821 tcp_mss_update(tp); 3822 /* Reset initial window to 1 segment for retransmit */ 3823 if (sc->sc_rxtshift > 0) 3824 tp->snd_cwnd = tp->t_maxseg; 3825 tp->snd_wl1 = sc->sc_irs; 3826 tp->rcv_up = sc->sc_irs + 1; 3827 3828 /* 3829 * This is what whould have happened in tcp_output() when 3830 * the SYN,ACK was sent. 
3831 */ 3832 tp->snd_up = tp->snd_una; 3833 tp->snd_max = tp->snd_nxt = tp->iss+1; 3834 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3835 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3836 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3837 tp->last_ack_sent = tp->rcv_nxt; 3838 3839 tcpstat.tcps_sc_completed++; 3840 SYN_CACHE_PUT(sc); 3841 return (so); 3842 3843 resetandabort: 3844 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST); 3845 m_freem(m); 3846 abort: 3847 if (so != NULL) 3848 (void) soabort(so); 3849 SYN_CACHE_PUT(sc); 3850 tcpstat.tcps_sc_aborted++; 3851 return ((struct socket *)(-1)); 3852 } 3853 3854 /* 3855 * This function is called when we get a RST for a 3856 * non-existent connection, so that we can see if the 3857 * connection is in the syn cache. If it is, zap it. 3858 */ 3859 3860 void 3861 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th) 3862 { 3863 struct syn_cache *sc; 3864 struct syn_cache_head *scp; 3865 int s = splsoftnet(); 3866 3867 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3868 splx(s); 3869 return; 3870 } 3871 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3872 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3873 splx(s); 3874 return; 3875 } 3876 SYN_CACHE_RM(sc); 3877 splx(s); 3878 tcpstat.tcps_sc_reset++; 3879 SYN_CACHE_PUT(sc); 3880 } 3881 3882 void 3883 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th) 3884 { 3885 struct syn_cache *sc; 3886 struct syn_cache_head *scp; 3887 int s; 3888 3889 s = splsoftnet(); 3890 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3891 splx(s); 3892 return; 3893 } 3894 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3895 if (ntohl (th->th_seq) != sc->sc_iss) { 3896 splx(s); 3897 return; 3898 } 3899 3900 /* 3901 * If we've retransmitted 3 times and this is our second error, 3902 * we remove the entry. Otherwise, we allow it to continue on. 3903 * This prevents us from incorrectly nuking an entry during a 3904 * spurious network outage. 3905 * 3906 * See tcp_notify(). 3907 */ 3908 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3909 sc->sc_flags |= SCF_UNREACH; 3910 splx(s); 3911 return; 3912 } 3913 3914 SYN_CACHE_RM(sc); 3915 splx(s); 3916 tcpstat.tcps_sc_unreach++; 3917 SYN_CACHE_PUT(sc); 3918 } 3919 3920 /* 3921 * Given a LISTEN socket and an inbound SYN request, add 3922 * this to the syn cache, and send back a segment: 3923 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3924 * to the source. 3925 * 3926 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3927 * Doing so would require that we hold onto the data and deliver it 3928 * to the application. However, if we are the target of a SYN-flood 3929 * DoS attack, an attacker could send data which would eventually 3930 * consume all available buffer space if it were ACKed. By not ACKing 3931 * the data, we avoid this DoS scenario. 3932 */ 3933 3934 int 3935 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3936 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3937 struct tcp_opt_info *oi, tcp_seq *issp) 3938 { 3939 struct tcpcb tb, *tp; 3940 long win; 3941 struct syn_cache *sc; 3942 struct syn_cache_head *scp; 3943 struct mbuf *ipopts; 3944 3945 tp = sototcpcb(so); 3946 3947 /* 3948 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3949 * 3950 * Note this check is performed in tcp_input() very early on. 3951 */ 3952 3953 /* 3954 * Initialize some local state. 
/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */
int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
    struct tcp_opt_info *oi, tcp_seq *issp)
{
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;

	tp = sototcpcb(so);

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 *
	 * Note this check is performed in tcp_input() very early on.
	 */

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE)) {
#else
	if (optp) {
#endif
		tb.pf = tp->pf;
#ifdef TCP_SACK
		tb.sack_enable = tp->sack_enable;
#endif
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		if (tp->t_flags & TF_SIGNATURE)
			tb.t_flags |= TF_SIGNATURE;
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi))
			return (0);
	} else
		tb.t_flags = 0;

	switch (src->sa_family) {
#ifdef INET
	case AF_INET:
		/*
		 * Remember the IP options, if any.
		 */
		ipopts = ip_srcroute();
		break;
#endif
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
		tcpstat.tcps_sc_dupesyn++;
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void) m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		sc->sc_timestamp = tb.ts_recent;
		if (syn_cache_respond(sc, m) == 0) {
			tcpstat.tcps_sndacks++;
			tcpstat.tcps_sndtotal++;
		}
		return (1);
	}

	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
	if (sc == NULL) {
		if (ipopts)
			(void) m_free(ipopts);
		return (0);
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	bzero(sc, sizeof(struct syn_cache));
	bzero(&sc->sc_timer, sizeof(sc->sc_timer));
	bcopy(src, &sc->sc_src, src->sa_len);
	bcopy(dst, &sc->sc_dst, dst->sa_len);
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;

	sc->sc_iss = issp ? *issp : arc4random();
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
	    m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timestamp = tb.ts_recent;
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
		sc->sc_flags |= SCF_TIMESTAMP;
		sc->sc_modulate = arc4random();
	}
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    TCP_MAXWIN << sc->sc_request_r_scale <
		    so->so_rcv.sb_hiwat)
			sc->sc_request_r_scale++;
	} else {
		/* 15 is a sentinel meaning "no window scaling" */
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
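
	/*
	 * Worked example (editorial): with TCP_MAXWIN (65535) and a
	 * 256 kB listener receive buffer, the loop above settles on
	 * request_r_scale = 3:
	 *
	 *	65535 << 0 =  65535 < 262144
	 *	65535 << 1 = 131070 < 262144
	 *	65535 << 2 = 262140 < 262144
	 *	65535 << 3 = 524280 >= 262144	-> stop at 3
	 *
	 * TCP_MAX_WINSHIFT (14) caps the shift as required by RFC 1323.
	 */
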
#ifdef TCP_ECN
	/*
	 * If both ECE and CWR flag bits are set, the peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		SYN_CACHE_PUT(sc);
		tcpstat.tcps_sc_dropped++;
	}
	return (1);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		/* not reached: the switch above already rejected it */
		th = NULL;
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */
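
	/*
	 * Illustrative layout (editorial): with every option negotiated,
	 * the 24 option bytes appended below look like
	 *
	 *	02 04 05 b4	MSS (kind 2, len 4; e.g. 1460)
	 *	01 01 04 02	NOP, NOP, SACK permitted (kind 4, len 2)
	 *	01 03 03 ss	NOP, window scale (kind 3, len 3, shift ss)
	 *	01 01 08 0a	NOP, NOP, timestamps (kind 8, len 10)
	 *	vv vv vv vv	TSval = SYN_CACHE_TIMESTAMP(sc)
	 *	ee ee ee ee	TSecr = sc_timestamp (echo of the peer's)
	 *
	 * matching optlen = 4 + 4 + 4 + TCPOLEN_TSTAMP_APPA (12) = 24,
	 * so th_off = (20 + 24) >> 2 = 11 32-bit words.
	 */
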
	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in Appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:				/* default to PF_INET */
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;	/* skip the 16-byte MD5 digest */

		/*
		 * Pad options list to the next 32-bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */
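
	/*
	 * Illustrative layout (editorial): a signed SYN,ACK carries
	 * TCPOLEN_SIGLEN (20) further option bytes, per RFC 2385:
	 *
	 *	13 12		kind 19 (TCP MD5 signature), len 18
	 *	<16 bytes>	digest written by tcp_signature()
	 *	01 00		NOP, EOL pad to a 32-bit boundary
	 *
	 * The digest covers the pseudo-header, the TCP header (with the
	 * checksum zeroed, options excluded) and the payload, which is
	 * why it is produced before the real checksum below is filled in.
	 */
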
	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		/*
		 * ip_len temporarily holds the TCP length so the IP
		 * header doubles as the pseudo-header; rewritten below.
		 */
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in network byte order.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		/* XXX tos? */
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0; it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    (struct ip_moptions *)NULL, inp);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    (struct ip6_moptions *)0, NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
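
/*
 * Editorial summary of the syn cache entry points above: the cache
 * implements the server side of the three-way handshake without
 * committing a socket per SYN.
 *
 *	syn_cache_add()		SYN on a LISTEN socket: record it and
 *				send the SYN,ACK
 *	syn_cache_respond()	build and (re)transmit that SYN,ACK
 *	syn_cache_reset()	matching RST: zap the cached entry
 *	syn_cache_unreach()	repeated ICMP errors: drop the entry
 *
 * The completion path (the code at the top of this block, which fills
 * in the new tcpcb and returns the accepted socket) runs only once the
 * final ACK of the handshake arrives.
 */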