/*	$OpenBSD: tcp_input.c,v 1.276 2014/04/25 09:44:38 mpi Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <dev/rndvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct ifnet *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0, \
		    tp->t_inpcb->inp_rtableid); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (m && (m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif && \
	    (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
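	 * For example (illustrative numbers only): if the arriving segment
	 * covers [100, 200) and a queued segment covers [150, 250), then
	 * i = 200 - 150 = 50 < 100, so the queued segment is trimmed to
	 * [200, 250).  Had the queued segment been [120, 180), i would be
	 * 80 >= its length, so it would be dequeued and freed entirely.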
267 */ 268 for (; q != NULL; q = nq) { 269 struct tcphdr *qhdr = q->tcpqe_tcp; 270 int i = (th->th_seq + *tlen) - qhdr->th_seq; 271 272 if (i <= 0) 273 break; 274 if (i < qhdr->th_reseqlen) { 275 qhdr->th_seq += i; 276 qhdr->th_reseqlen -= i; 277 m_adj(q->tcpqe_m, i); 278 break; 279 } 280 nq = TAILQ_NEXT(q, tcpqe_q); 281 m_freem(q->tcpqe_m); 282 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q); 283 pool_put(&tcpqe_pool, q); 284 } 285 286 /* Insert the new segment queue entry into place. */ 287 tiqe->tcpqe_m = m; 288 th->th_reseqlen = *tlen; 289 tiqe->tcpqe_tcp = th; 290 if (p == NULL) { 291 TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q); 292 } else { 293 TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q); 294 } 295 296 if (th->th_seq != tp->rcv_nxt) 297 return (0); 298 299 return (tcp_flush_queue(tp)); 300 } 301 302 int 303 tcp_flush_queue(struct tcpcb *tp) 304 { 305 struct socket *so = tp->t_inpcb->inp_socket; 306 struct tcpqent *q, *nq; 307 int flags; 308 309 /* 310 * Present data to user, advancing rcv_nxt through 311 * completed sequence space. 312 */ 313 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 314 return (0); 315 q = TAILQ_FIRST(&tp->t_segq); 316 if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt) 317 return (0); 318 if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen) 319 return (0); 320 do { 321 tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen; 322 flags = q->tcpqe_tcp->th_flags & TH_FIN; 323 324 nq = TAILQ_NEXT(q, tcpqe_q); 325 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q); 326 ND6_HINT(tp); 327 if (so->so_state & SS_CANTRCVMORE) 328 m_freem(q->tcpqe_m); 329 else 330 sbappendstream(&so->so_rcv, q->tcpqe_m); 331 pool_put(&tcpqe_pool, q); 332 q = nq; 333 } while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt); 334 tp->t_flags |= TF_BLOCKOUTPUT; 335 sorwakeup(so); 336 tp->t_flags &= ~TF_BLOCKOUTPUT; 337 return (flags); 338 } 339 340 #ifdef INET6 341 int 342 tcp6_input(struct mbuf **mp, int *offp, int proto) 343 { 344 struct mbuf *m = *mp; 345 346 tcp_input(m, *offp, proto); 347 return IPPROTO_DONE; 348 } 349 #endif 350 351 /* 352 * TCP input routine, follows pages 65-76 of the 353 * protocol specification dated September, 1981 very closely. 354 */ 355 void 356 tcp_input(struct mbuf *m, ...) 357 { 358 struct ip *ip; 359 struct inpcb *inp = NULL; 360 u_int8_t *optp = NULL; 361 int optlen = 0; 362 int tlen, off; 363 struct tcpcb *tp = NULL; 364 int tiflags; 365 struct socket *so = NULL; 366 int todrop, acked, ourfinisacked; 367 int hdroptlen = 0; 368 short ostate = 0; 369 tcp_seq iss, *reuse = NULL; 370 u_long tiwin; 371 struct tcp_opt_info opti; 372 int iphlen; 373 va_list ap; 374 struct tcphdr *th; 375 #ifdef INET6 376 struct ip6_hdr *ip6 = NULL; 377 #endif /* INET6 */ 378 #ifdef IPSEC 379 struct m_tag *mtag; 380 struct tdb_ident *tdbi; 381 struct tdb *tdb; 382 int error; 383 #endif /* IPSEC */ 384 int af; 385 #ifdef TCP_ECN 386 u_char iptos; 387 #endif 388 389 va_start(ap, m); 390 iphlen = va_arg(ap, int); 391 va_end(ap); 392 393 tcpstat.tcps_rcvtotal++; 394 395 opti.ts_present = 0; 396 opti.maxseg = 0; 397 398 /* 399 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 400 * See below for AF specific multicast. 401 */ 402 if (m->m_flags & (M_BCAST|M_MCAST)) 403 goto drop; 404 405 /* 406 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or 407 * TCP/IPv4. 
408 */ 409 switch (mtod(m, struct ip *)->ip_v) { 410 #ifdef INET6 411 case 6: 412 af = AF_INET6; 413 break; 414 #endif 415 case 4: 416 af = AF_INET; 417 break; 418 default: 419 m_freem(m); 420 return; /*EAFNOSUPPORT*/ 421 } 422 423 /* 424 * Get IP and TCP header together in first mbuf. 425 * Note: IP leaves IP header in first mbuf. 426 */ 427 switch (af) { 428 case AF_INET: 429 #ifdef DIAGNOSTIC 430 if (iphlen < sizeof(struct ip)) { 431 m_freem(m); 432 return; 433 } 434 #endif /* DIAGNOSTIC */ 435 break; 436 #ifdef INET6 437 case AF_INET6: 438 #ifdef DIAGNOSTIC 439 if (iphlen < sizeof(struct ip6_hdr)) { 440 m_freem(m); 441 return; 442 } 443 #endif /* DIAGNOSTIC */ 444 break; 445 #endif 446 default: 447 m_freem(m); 448 return; 449 } 450 451 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th)); 452 if (!th) { 453 tcpstat.tcps_rcvshort++; 454 return; 455 } 456 457 tlen = m->m_pkthdr.len - iphlen; 458 ip = NULL; 459 #ifdef INET6 460 ip6 = NULL; 461 #endif 462 switch (af) { 463 case AF_INET: 464 ip = mtod(m, struct ip *); 465 if (IN_MULTICAST(ip->ip_dst.s_addr) || 466 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif, 467 m->m_pkthdr.ph_rtableid)) 468 goto drop; 469 #ifdef TCP_ECN 470 /* save ip_tos before clearing it for checksum */ 471 iptos = ip->ip_tos; 472 #endif 473 break; 474 #ifdef INET6 475 case AF_INET6: 476 ip6 = mtod(m, struct ip6_hdr *); 477 #ifdef TCP_ECN 478 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 479 #endif 480 481 /* Be proactive about malicious use of IPv4 mapped address */ 482 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 483 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 484 /* XXX stat */ 485 goto drop; 486 } 487 488 /* 489 * Be proactive about unspecified IPv6 address in source. 490 * As we use all-zero to indicate unbounded/unconnected pcb, 491 * unspecified IPv6 address can be used to confuse us. 492 * 493 * Note that packets with unspecified IPv6 destination is 494 * already dropped in ip6_input. 495 */ 496 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 497 /* XXX stat */ 498 goto drop; 499 } 500 501 /* Discard packets to multicast */ 502 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 503 /* XXX stat */ 504 goto drop; 505 } 506 break; 507 #endif 508 } 509 510 /* 511 * Checksum extended TCP header and data. 512 */ 513 if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) { 514 int sum; 515 516 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) { 517 tcpstat.tcps_rcvbadsum++; 518 goto drop; 519 } 520 tcpstat.tcps_inswcsum++; 521 switch (af) { 522 case AF_INET: 523 sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen); 524 break; 525 #ifdef INET6 526 case AF_INET6: 527 sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 528 tlen); 529 break; 530 #endif 531 } 532 if (sum != 0) { 533 tcpstat.tcps_rcvbadsum++; 534 goto drop; 535 } 536 } 537 538 /* 539 * Check that TCP offset makes sense, 540 * pull out TCP options and adjust length. XXX 541 */ 542 off = th->th_off << 2; 543 if (off < sizeof(struct tcphdr) || off > tlen) { 544 tcpstat.tcps_rcvbadoff++; 545 goto drop; 546 } 547 tlen -= off; 548 if (off > sizeof(struct tcphdr)) { 549 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off); 550 if (!th) { 551 tcpstat.tcps_rcvshort++; 552 return; 553 } 554 optlen = off - sizeof(struct tcphdr); 555 optp = (u_int8_t *)(th + 1); 556 /* 557 * Do quick retrieval of timestamp options ("options 558 * prediction?"). If timestamp is the only option and it's 559 * formatted as recommended in RFC 1323 appendix A, we 560 * quickly get the values now and not bother calling 561 * tcp_dooptions(), etc. 
562 */ 563 if ((optlen == TCPOLEN_TSTAMP_APPA || 564 (optlen > TCPOLEN_TSTAMP_APPA && 565 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 566 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 567 (th->th_flags & TH_SYN) == 0) { 568 opti.ts_present = 1; 569 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 570 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 571 optp = NULL; /* we've parsed the options */ 572 } 573 } 574 tiflags = th->th_flags; 575 576 /* 577 * Convert TCP protocol specific fields to host format. 578 */ 579 NTOHL(th->th_seq); 580 NTOHL(th->th_ack); 581 NTOHS(th->th_win); 582 NTOHS(th->th_urp); 583 584 /* 585 * Locate pcb for segment. 586 */ 587 #if NPF > 0 588 if (m->m_pkthdr.pf.statekey) 589 inp = m->m_pkthdr.pf.statekey->inp; 590 #endif 591 findpcb: 592 if (inp == NULL) { 593 switch (af) { 594 #ifdef INET6 595 case AF_INET6: 596 inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, 597 th->th_sport, &ip6->ip6_dst, th->th_dport, 598 m->m_pkthdr.ph_rtableid); 599 break; 600 #endif 601 case AF_INET: 602 inp = in_pcbhashlookup(&tcbtable, ip->ip_src, 603 th->th_sport, ip->ip_dst, th->th_dport, 604 m->m_pkthdr.ph_rtableid); 605 break; 606 } 607 #if NPF > 0 608 if (m->m_pkthdr.pf.statekey && inp) { 609 m->m_pkthdr.pf.statekey->inp = inp; 610 inp->inp_pf_sk = m->m_pkthdr.pf.statekey; 611 } 612 #endif 613 } 614 if (inp == NULL) { 615 int inpl_reverse = 0; 616 if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST) 617 inpl_reverse = 1; 618 ++tcpstat.tcps_pcbhashmiss; 619 switch (af) { 620 #ifdef INET6 621 case AF_INET6: 622 inp = in6_pcblookup_listen(&tcbtable, 623 &ip6->ip6_dst, th->th_dport, inpl_reverse, m, 624 m->m_pkthdr.ph_rtableid); 625 break; 626 #endif /* INET6 */ 627 case AF_INET: 628 inp = in_pcblookup_listen(&tcbtable, 629 ip->ip_dst, th->th_dport, inpl_reverse, m, 630 m->m_pkthdr.ph_rtableid); 631 break; 632 } 633 /* 634 * If the state is CLOSED (i.e., TCB does not exist) then 635 * all data in the incoming segment is discarded. 636 * If the TCB exists but is in CLOSED state, it is embryonic, 637 * but should either do a listen or a connect soon. 638 */ 639 if (inp == 0) { 640 ++tcpstat.tcps_noport; 641 goto dropwithreset_ratelim; 642 } 643 } 644 KASSERT(sotoinpcb(inp->inp_socket) == inp); 645 KASSERT(intotcpcb(inp)->t_inpcb == inp); 646 647 /* Check the minimum TTL for socket. */ 648 if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) 649 goto drop; 650 651 tp = intotcpcb(inp); 652 if (tp == 0) 653 goto dropwithreset_ratelim; 654 if (tp->t_state == TCPS_CLOSED) 655 goto drop; 656 657 /* Unscale the window into a 32-bit value. 
	 */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise here, as it
				 * is much better for the peer to get an RST,
				 * and the RST will be the final packet of the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says the
				 * following:
				 *   1. use of deprecated addr with existing
				 *      communication is okay - "SHOULD continue
				 *      to be used"
				 *   2. use of it with new communication:
				 *     (2a) "SHOULD NOT be used if alternate
				 *          address with sufficient scope is
				 *          available"
				 *     (2b) nothing mentioned otherwise.
				 * Here we fall into the (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;

					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
						tp = NULL;
						goto dropwithreset;
					}
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1)
					goto drop;
				return;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	if (m->m_pkthdr.pf.statekey && !m->m_pkthdr.pf.statekey->inp &&
	    !inp->inp_pf_sk) {
		m->m_pkthdr.pf.statekey->inp = inp;
		inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
	}
	/* The statekey has finished finding the inp, it is no longer needed. */
	m->m_pkthdr.pf.statekey = NULL;
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat.tcps_rcvnosec++;
		goto drop;
	}

	/* Latch SA */
	if (inp->inp_tdb_in != tdb) {
		if (tdb) {
			tdb_add_inp(tdb, inp, 1);
			if (inp->inp_ipo == NULL) {
				inp->inp_ipo = ipsec_add_policy(inp, af,
				    IPSP_DIRECTION_OUT);
				if (inp->inp_ipo == NULL) {
					goto drop;
				}
			}
			if (inp->inp_ipo->ipo_dstid == NULL &&
			    tdb->tdb_srcid != NULL) {
				inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
				tdb->tdb_srcid->ref_count++;
			}
			if (inp->inp_ipsec_remotecred == NULL &&
			    tdb->tdb_remote_cred != NULL) {
				inp->inp_ipsec_remotecred =
				    tdb->tdb_remote_cred;
				tdb->tdb_remote_cred->ref_count++;
			}
			if (inp->inp_ipsec_remoteauth == NULL &&
			    tdb->tdb_remote_auth != NULL) {
				inp->inp_ipsec_remoteauth =
				    tdb->tdb_remote_auth;
				tdb->tdb_remote_auth->ref_count++;
			}
		} else { /* Just reset */
			TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
			    inp_tdb_in_next);
			inp->inp_tdb_in = NULL;
		}
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
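	 * (The two branches below mirror these cases: a pure ACK that
	 * advances snd_una, and an in-order data segment arriving while
	 * the reassembly queue is empty.)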
997 */ 998 if (tp->t_state == TCPS_ESTABLISHED && 999 #ifdef TCP_ECN 1000 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK && 1001 #else 1002 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1003 #endif 1004 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1005 th->th_seq == tp->rcv_nxt && 1006 tiwin && tiwin == tp->snd_wnd && 1007 tp->snd_nxt == tp->snd_max) { 1008 1009 /* 1010 * If last ACK falls within this segment's sequence numbers, 1011 * record the timestamp. 1012 * Fix from Braden, see Stevens p. 870 1013 */ 1014 if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1015 tp->ts_recent_age = tcp_now; 1016 tp->ts_recent = opti.ts_val; 1017 } 1018 1019 if (tlen == 0) { 1020 if (SEQ_GT(th->th_ack, tp->snd_una) && 1021 SEQ_LEQ(th->th_ack, tp->snd_max) && 1022 tp->snd_cwnd >= tp->snd_wnd && 1023 tp->t_dupacks == 0) { 1024 /* 1025 * this is a pure ack for outstanding data. 1026 */ 1027 ++tcpstat.tcps_predack; 1028 if (opti.ts_present && opti.ts_ecr) 1029 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr); 1030 else if (tp->t_rtttime && 1031 SEQ_GT(th->th_ack, tp->t_rtseq)) 1032 tcp_xmit_timer(tp, 1033 tcp_now - tp->t_rtttime); 1034 acked = th->th_ack - tp->snd_una; 1035 tcpstat.tcps_rcvackpack++; 1036 tcpstat.tcps_rcvackbyte += acked; 1037 ND6_HINT(tp); 1038 sbdrop(&so->so_snd, acked); 1039 1040 /* 1041 * If we had a pending ICMP message that 1042 * referres to data that have just been 1043 * acknowledged, disregard the recorded ICMP 1044 * message. 1045 */ 1046 if ((tp->t_flags & TF_PMTUD_PEND) && 1047 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 1048 tp->t_flags &= ~TF_PMTUD_PEND; 1049 1050 /* 1051 * Keep track of the largest chunk of data 1052 * acknowledged since last PMTU update 1053 */ 1054 if (tp->t_pmtud_mss_acked < acked) 1055 tp->t_pmtud_mss_acked = acked; 1056 1057 tp->snd_una = th->th_ack; 1058 #if defined(TCP_SACK) || defined(TCP_ECN) 1059 /* 1060 * We want snd_last to track snd_una so 1061 * as to avoid sequence wraparound problems 1062 * for very large transfers. 1063 */ 1064 #ifdef TCP_ECN 1065 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1066 #endif 1067 tp->snd_last = tp->snd_una; 1068 #endif /* TCP_SACK */ 1069 #if defined(TCP_SACK) && defined(TCP_FACK) 1070 tp->snd_fack = tp->snd_una; 1071 tp->retran_data = 0; 1072 #endif /* TCP_FACK */ 1073 m_freem(m); 1074 1075 /* 1076 * If all outstanding data are acked, stop 1077 * retransmit timer, otherwise restart timer 1078 * using current (possibly backed-off) value. 1079 * If process is waiting for space, 1080 * wakeup/selwakeup/signal. If data 1081 * are ready to send, let tcp_output 1082 * decide between more output or persist. 1083 */ 1084 if (tp->snd_una == tp->snd_max) 1085 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1086 else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1087 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1088 1089 tcp_update_sndspace(tp); 1090 if (sb_notify(&so->so_snd)) { 1091 tp->t_flags |= TF_BLOCKOUTPUT; 1092 sowwakeup(so); 1093 tp->t_flags &= ~TF_BLOCKOUTPUT; 1094 } 1095 if (so->so_snd.sb_cc || 1096 tp->t_flags & TF_NEEDOUTPUT) 1097 (void) tcp_output(tp); 1098 return; 1099 } 1100 } else if (th->th_ack == tp->snd_una && 1101 TAILQ_EMPTY(&tp->t_segq) && 1102 tlen <= sbspace(&so->so_rcv)) { 1103 /* 1104 * This is a pure, in-sequence data packet 1105 * with nothing on the reassembly queue and 1106 * we have enough buffer space to take it. 
1107 */ 1108 #ifdef TCP_SACK 1109 /* Clean receiver SACK report if present */ 1110 if (tp->sack_enable && tp->rcv_numsacks) 1111 tcp_clean_sackreport(tp); 1112 #endif /* TCP_SACK */ 1113 ++tcpstat.tcps_preddat; 1114 tp->rcv_nxt += tlen; 1115 tcpstat.tcps_rcvpack++; 1116 tcpstat.tcps_rcvbyte += tlen; 1117 ND6_HINT(tp); 1118 1119 TCP_SETUP_ACK(tp, tiflags, m); 1120 /* 1121 * Drop TCP, IP headers and TCP options then add data 1122 * to socket buffer. 1123 */ 1124 if (so->so_state & SS_CANTRCVMORE) 1125 m_freem(m); 1126 else { 1127 if (opti.ts_present && opti.ts_ecr) { 1128 if (tp->rfbuf_ts < opti.ts_ecr && 1129 opti.ts_ecr - tp->rfbuf_ts < hz) { 1130 tcp_update_rcvspace(tp); 1131 /* Start over with next RTT. */ 1132 tp->rfbuf_cnt = 0; 1133 tp->rfbuf_ts = 0; 1134 } else 1135 tp->rfbuf_cnt += tlen; 1136 } 1137 m_adj(m, iphlen + off); 1138 sbappendstream(&so->so_rcv, m); 1139 } 1140 tp->t_flags |= TF_BLOCKOUTPUT; 1141 sorwakeup(so); 1142 tp->t_flags &= ~TF_BLOCKOUTPUT; 1143 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 1144 (void) tcp_output(tp); 1145 return; 1146 } 1147 } 1148 1149 /* 1150 * Compute mbuf offset to TCP data segment. 1151 */ 1152 hdroptlen = iphlen + off; 1153 1154 /* 1155 * Calculate amount of space in receive window, 1156 * and then do TCP input processing. 1157 * Receive window is amount of space in rcv queue, 1158 * but not less than advertised window. 1159 */ 1160 { int win; 1161 1162 win = sbspace(&so->so_rcv); 1163 if (win < 0) 1164 win = 0; 1165 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1166 } 1167 1168 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 1169 tp->rfbuf_cnt = 0; 1170 tp->rfbuf_ts = 0; 1171 1172 switch (tp->t_state) { 1173 1174 /* 1175 * If the state is SYN_RECEIVED: 1176 * if seg contains SYN/ACK, send an RST. 1177 * if seg contains an ACK, but not for our SYN/ACK, send an RST 1178 */ 1179 1180 case TCPS_SYN_RECEIVED: 1181 if (tiflags & TH_ACK) { 1182 if (tiflags & TH_SYN) { 1183 tcpstat.tcps_badsyn++; 1184 goto dropwithreset; 1185 } 1186 if (SEQ_LEQ(th->th_ack, tp->snd_una) || 1187 SEQ_GT(th->th_ack, tp->snd_max)) 1188 goto dropwithreset; 1189 } 1190 break; 1191 1192 /* 1193 * If the state is SYN_SENT: 1194 * if seg contains an ACK, but not for our SYN, drop the input. 1195 * if seg contains a RST, then drop the connection. 1196 * if seg does not contain SYN, then drop it. 
	 *	Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
1298 */ 1299 th->th_seq++; 1300 if (tlen > tp->rcv_wnd) { 1301 todrop = tlen - tp->rcv_wnd; 1302 m_adj(m, -todrop); 1303 tlen = tp->rcv_wnd; 1304 tiflags &= ~TH_FIN; 1305 tcpstat.tcps_rcvpackafterwin++; 1306 tcpstat.tcps_rcvbyteafterwin += todrop; 1307 } 1308 tp->snd_wl1 = th->th_seq - 1; 1309 tp->rcv_up = th->th_seq; 1310 goto step6; 1311 /* 1312 * If a new connection request is received while in TIME_WAIT, 1313 * drop the old connection and start over if the if the 1314 * timestamp or the sequence numbers are above the previous 1315 * ones. 1316 */ 1317 case TCPS_TIME_WAIT: 1318 if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) && 1319 ((opti.ts_present && 1320 TSTMP_LT(tp->ts_recent, opti.ts_val)) || 1321 SEQ_GT(th->th_seq, tp->rcv_nxt))) { 1322 #if NPF > 0 1323 /* 1324 * The socket will be recreated but the new state 1325 * has already been linked to the socket. Remove the 1326 * link between old socket and new state. 1327 */ 1328 if (inp->inp_pf_sk) { 1329 inp->inp_pf_sk->inp = NULL; 1330 inp->inp_pf_sk = NULL; 1331 } 1332 #endif 1333 /* 1334 * Advance the iss by at least 32768, but 1335 * clear the msb in order to make sure 1336 * that SEG_LT(snd_nxt, iss). 1337 */ 1338 iss = tp->snd_nxt + 1339 ((arc4random() & 0x7fffffff) | 0x8000); 1340 reuse = &iss; 1341 tp = tcp_close(tp); 1342 inp = NULL; 1343 goto findpcb; 1344 } 1345 } 1346 1347 /* 1348 * States other than LISTEN or SYN_SENT. 1349 * First check timestamp, if present. 1350 * Then check that at least some bytes of segment are within 1351 * receive window. If segment begins before rcv_nxt, 1352 * drop leading data (and SYN); if nothing left, just ack. 1353 * 1354 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1355 * and it's less than opti.ts_recent, drop it. 1356 */ 1357 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1358 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1359 1360 /* Check to see if ts_recent is over 24 days old. */ 1361 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1362 /* 1363 * Invalidate ts_recent. If this segment updates 1364 * ts_recent, the age will be reset later and ts_recent 1365 * will get a valid value. If it does not, setting 1366 * ts_recent to zero will at least satisfy the 1367 * requirement that zero be placed in the timestamp 1368 * echo reply when ts_recent isn't valid. The 1369 * age isn't reset until we get a valid ts_recent 1370 * because we don't want out-of-order segments to be 1371 * dropped when ts_recent is old. 1372 */ 1373 tp->ts_recent = 0; 1374 } else { 1375 tcpstat.tcps_rcvduppack++; 1376 tcpstat.tcps_rcvdupbyte += tlen; 1377 tcpstat.tcps_pawsdrop++; 1378 goto dropafterack; 1379 } 1380 } 1381 1382 todrop = tp->rcv_nxt - th->th_seq; 1383 if (todrop > 0) { 1384 if (tiflags & TH_SYN) { 1385 tiflags &= ~TH_SYN; 1386 th->th_seq++; 1387 if (th->th_urp > 1) 1388 th->th_urp--; 1389 else 1390 tiflags &= ~TH_URG; 1391 todrop--; 1392 } 1393 if (todrop > tlen || 1394 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1395 /* 1396 * Any valid FIN must be to the left of the 1397 * window. At this point, FIN must be a 1398 * duplicate or out-of-sequence, so drop it. 1399 */ 1400 tiflags &= ~TH_FIN; 1401 /* 1402 * Send ACK to resynchronize, and drop any data, 1403 * but keep on processing for RST or ACK. 
1404 */ 1405 tp->t_flags |= TF_ACKNOW; 1406 tcpstat.tcps_rcvdupbyte += todrop = tlen; 1407 tcpstat.tcps_rcvduppack++; 1408 } else { 1409 tcpstat.tcps_rcvpartduppack++; 1410 tcpstat.tcps_rcvpartdupbyte += todrop; 1411 } 1412 hdroptlen += todrop; /* drop from head afterwards */ 1413 th->th_seq += todrop; 1414 tlen -= todrop; 1415 if (th->th_urp > todrop) 1416 th->th_urp -= todrop; 1417 else { 1418 tiflags &= ~TH_URG; 1419 th->th_urp = 0; 1420 } 1421 } 1422 1423 /* 1424 * If new data are received on a connection after the 1425 * user processes are gone, then RST the other end. 1426 */ 1427 if ((so->so_state & SS_NOFDREF) && 1428 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1429 tp = tcp_close(tp); 1430 tcpstat.tcps_rcvafterclose++; 1431 goto dropwithreset; 1432 } 1433 1434 /* 1435 * If segment ends after window, drop trailing data 1436 * (and PUSH and FIN); if nothing left, just ACK. 1437 */ 1438 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1439 if (todrop > 0) { 1440 tcpstat.tcps_rcvpackafterwin++; 1441 if (todrop >= tlen) { 1442 tcpstat.tcps_rcvbyteafterwin += tlen; 1443 /* 1444 * If window is closed can only take segments at 1445 * window edge, and have to drop data and PUSH from 1446 * incoming segments. Continue processing, but 1447 * remember to ack. Otherwise, drop segment 1448 * and ack. 1449 */ 1450 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1451 tp->t_flags |= TF_ACKNOW; 1452 tcpstat.tcps_rcvwinprobe++; 1453 } else 1454 goto dropafterack; 1455 } else 1456 tcpstat.tcps_rcvbyteafterwin += todrop; 1457 m_adj(m, -todrop); 1458 tlen -= todrop; 1459 tiflags &= ~(TH_PUSH|TH_FIN); 1460 } 1461 1462 /* 1463 * If last ACK falls within this segment's sequence numbers, 1464 * record its timestamp if it's more recent. 1465 * Cf fix from Braden, see Stevens p. 870 1466 */ 1467 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1468 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1469 if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1470 ((tiflags & (TH_SYN|TH_FIN)) != 0))) 1471 tp->ts_recent = opti.ts_val; 1472 else 1473 tp->ts_recent = 0; 1474 tp->ts_recent_age = tcp_now; 1475 } 1476 1477 /* 1478 * If the RST bit is set examine the state: 1479 * SYN_RECEIVED STATE: 1480 * If passive open, return to LISTEN state. 1481 * If active open, inform user that connection was refused. 1482 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1483 * Inform user that connection was reset, and close tcb. 1484 * CLOSING, LAST_ACK, TIME_WAIT STATES 1485 * Close the tcb. 1486 */ 1487 if (tiflags & TH_RST) { 1488 if (th->th_seq != tp->last_ack_sent && 1489 th->th_seq != tp->rcv_nxt && 1490 th->th_seq != (tp->rcv_nxt + 1)) 1491 goto drop; 1492 1493 switch (tp->t_state) { 1494 case TCPS_SYN_RECEIVED: 1495 #ifdef TCP_ECN 1496 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1497 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1498 goto drop; 1499 #endif 1500 so->so_error = ECONNREFUSED; 1501 goto close; 1502 1503 case TCPS_ESTABLISHED: 1504 case TCPS_FIN_WAIT_1: 1505 case TCPS_FIN_WAIT_2: 1506 case TCPS_CLOSE_WAIT: 1507 so->so_error = ECONNRESET; 1508 close: 1509 tp->t_state = TCPS_CLOSED; 1510 tcpstat.tcps_drops++; 1511 tp = tcp_close(tp); 1512 goto drop; 1513 case TCPS_CLOSING: 1514 case TCPS_LAST_ACK: 1515 case TCPS_TIME_WAIT: 1516 tp = tcp_close(tp); 1517 goto drop; 1518 } 1519 } 1520 1521 /* 1522 * If a SYN is in the window, then this is an 1523 * error and we ACK and drop the packet. 
1524 */ 1525 if (tiflags & TH_SYN) 1526 goto dropafterack_ratelim; 1527 1528 /* 1529 * If the ACK bit is off we drop the segment and return. 1530 */ 1531 if ((tiflags & TH_ACK) == 0) { 1532 if (tp->t_flags & TF_ACKNOW) 1533 goto dropafterack; 1534 else 1535 goto drop; 1536 } 1537 1538 /* 1539 * Ack processing. 1540 */ 1541 switch (tp->t_state) { 1542 1543 /* 1544 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1545 * ESTABLISHED state and continue processing. 1546 * The ACK was checked above. 1547 */ 1548 case TCPS_SYN_RECEIVED: 1549 tcpstat.tcps_connects++; 1550 soisconnected(so); 1551 tp->t_state = TCPS_ESTABLISHED; 1552 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1553 /* Do window scaling? */ 1554 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1555 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1556 tp->snd_scale = tp->requested_s_scale; 1557 tp->rcv_scale = tp->request_r_scale; 1558 tiwin = th->th_win << tp->snd_scale; 1559 } 1560 tcp_flush_queue(tp); 1561 tp->snd_wl1 = th->th_seq - 1; 1562 /* fall into ... */ 1563 1564 /* 1565 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1566 * ACKs. If the ack is in the range 1567 * tp->snd_una < th->th_ack <= tp->snd_max 1568 * then advance tp->snd_una to th->th_ack and drop 1569 * data from the retransmission queue. If this ACK reflects 1570 * more up to date window information we update our window information. 1571 */ 1572 case TCPS_ESTABLISHED: 1573 case TCPS_FIN_WAIT_1: 1574 case TCPS_FIN_WAIT_2: 1575 case TCPS_CLOSE_WAIT: 1576 case TCPS_CLOSING: 1577 case TCPS_LAST_ACK: 1578 case TCPS_TIME_WAIT: 1579 #ifdef TCP_ECN 1580 /* 1581 * if we receive ECE and are not already in recovery phase, 1582 * reduce cwnd by half but don't slow-start. 1583 * advance snd_last to snd_max not to reduce cwnd again 1584 * until all outstanding packets are acked. 1585 */ 1586 if (tcp_do_ecn && (tiflags & TH_ECE)) { 1587 if ((tp->t_flags & TF_ECN_PERMIT) && 1588 SEQ_GEQ(tp->snd_una, tp->snd_last)) { 1589 u_int win; 1590 1591 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg; 1592 if (win > 1) { 1593 tp->snd_ssthresh = win / 2 * tp->t_maxseg; 1594 tp->snd_cwnd = tp->snd_ssthresh; 1595 tp->snd_last = tp->snd_max; 1596 tp->t_flags |= TF_SEND_CWR; 1597 tcpstat.tcps_cwr_ecn++; 1598 } 1599 } 1600 tcpstat.tcps_ecn_rcvece++; 1601 } 1602 /* 1603 * if we receive CWR, we know that the peer has reduced 1604 * its congestion window. stop sending ecn-echo. 1605 */ 1606 if ((tiflags & TH_CWR)) { 1607 tp->t_flags &= ~TF_RCVD_CE; 1608 tcpstat.tcps_ecn_rcvcwr++; 1609 } 1610 #endif /* TCP_ECN */ 1611 1612 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1613 /* 1614 * Duplicate/old ACK processing. 1615 * Increments t_dupacks: 1616 * Pure duplicate (same seq/ack/window, no data) 1617 * Doesn't affect t_dupacks: 1618 * Data packets. 1619 * Normal window updates (window opens) 1620 * Resets t_dupacks: 1621 * New data ACKed. 1622 * Window shrinks 1623 * Old ACK 1624 */ 1625 if (tlen) { 1626 /* Drop very old ACKs unless th_seq matches */ 1627 if (th->th_seq != tp->rcv_nxt && 1628 SEQ_LT(th->th_ack, 1629 tp->snd_una - tp->max_sndwnd)) { 1630 tcpstat.tcps_rcvacktooold++; 1631 goto drop; 1632 } 1633 break; 1634 } 1635 /* 1636 * If we get an old ACK, there is probably packet 1637 * reordering going on. Be conservative and reset 1638 * t_dupacks so that we are less aggressive in 1639 * doing a fast retransmit. 
1640 */ 1641 if (th->th_ack != tp->snd_una) { 1642 tp->t_dupacks = 0; 1643 break; 1644 } 1645 if (tiwin == tp->snd_wnd) { 1646 tcpstat.tcps_rcvdupack++; 1647 /* 1648 * If we have outstanding data (other than 1649 * a window probe), this is a completely 1650 * duplicate ack (ie, window info didn't 1651 * change), the ack is the biggest we've 1652 * seen and we've seen exactly our rexmt 1653 * threshold of them, assume a packet 1654 * has been dropped and retransmit it. 1655 * Kludge snd_nxt & the congestion 1656 * window so we send only this one 1657 * packet. 1658 * 1659 * We know we're losing at the current 1660 * window size so do congestion avoidance 1661 * (set ssthresh to half the current window 1662 * and pull our congestion window back to 1663 * the new ssthresh). 1664 * 1665 * Dup acks mean that packets have left the 1666 * network (they're now cached at the receiver) 1667 * so bump cwnd by the amount in the receiver 1668 * to keep a constant cwnd packets in the 1669 * network. 1670 */ 1671 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) 1672 tp->t_dupacks = 0; 1673 #if defined(TCP_SACK) && defined(TCP_FACK) 1674 /* 1675 * In FACK, can enter fast rec. if the receiver 1676 * reports a reass. queue longer than 3 segs. 1677 */ 1678 else if (++tp->t_dupacks == tcprexmtthresh || 1679 ((SEQ_GT(tp->snd_fack, tcprexmtthresh * 1680 tp->t_maxseg + tp->snd_una)) && 1681 SEQ_GT(tp->snd_una, tp->snd_last))) { 1682 #else 1683 else if (++tp->t_dupacks == tcprexmtthresh) { 1684 #endif /* TCP_FACK */ 1685 tcp_seq onxt = tp->snd_nxt; 1686 u_long win = 1687 ulmin(tp->snd_wnd, tp->snd_cwnd) / 1688 2 / tp->t_maxseg; 1689 1690 #if defined(TCP_SACK) || defined(TCP_ECN) 1691 if (SEQ_LT(th->th_ack, tp->snd_last)){ 1692 /* 1693 * False fast retx after 1694 * timeout. Do not cut window. 1695 */ 1696 tp->t_dupacks = 0; 1697 goto drop; 1698 } 1699 #endif 1700 if (win < 2) 1701 win = 2; 1702 tp->snd_ssthresh = win * tp->t_maxseg; 1703 #ifdef TCP_SACK 1704 tp->snd_last = tp->snd_max; 1705 if (tp->sack_enable) { 1706 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1707 tp->t_rtttime = 0; 1708 #ifdef TCP_ECN 1709 tp->t_flags |= TF_SEND_CWR; 1710 #endif 1711 tcpstat.tcps_cwr_frecovery++; 1712 tcpstat.tcps_sack_recovery_episode++; 1713 #if defined(TCP_SACK) && defined(TCP_FACK) 1714 tp->t_dupacks = tcprexmtthresh; 1715 (void) tcp_output(tp); 1716 /* 1717 * During FR, snd_cwnd is held 1718 * constant for FACK. 1719 */ 1720 tp->snd_cwnd = tp->snd_ssthresh; 1721 #else 1722 /* 1723 * tcp_output() will send 1724 * oldest SACK-eligible rtx. 
1725 */ 1726 (void) tcp_output(tp); 1727 tp->snd_cwnd = tp->snd_ssthresh+ 1728 tp->t_maxseg * tp->t_dupacks; 1729 #endif /* TCP_FACK */ 1730 goto drop; 1731 } 1732 #endif /* TCP_SACK */ 1733 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1734 tp->t_rtttime = 0; 1735 tp->snd_nxt = th->th_ack; 1736 tp->snd_cwnd = tp->t_maxseg; 1737 #ifdef TCP_ECN 1738 tp->t_flags |= TF_SEND_CWR; 1739 #endif 1740 tcpstat.tcps_cwr_frecovery++; 1741 tcpstat.tcps_sndrexmitfast++; 1742 (void) tcp_output(tp); 1743 1744 tp->snd_cwnd = tp->snd_ssthresh + 1745 tp->t_maxseg * tp->t_dupacks; 1746 if (SEQ_GT(onxt, tp->snd_nxt)) 1747 tp->snd_nxt = onxt; 1748 goto drop; 1749 } else if (tp->t_dupacks > tcprexmtthresh) { 1750 #if defined(TCP_SACK) && defined(TCP_FACK) 1751 /* 1752 * while (awnd < cwnd) 1753 * sendsomething(); 1754 */ 1755 if (tp->sack_enable) { 1756 if (tp->snd_awnd < tp->snd_cwnd) 1757 tcp_output(tp); 1758 goto drop; 1759 } 1760 #endif /* TCP_FACK */ 1761 tp->snd_cwnd += tp->t_maxseg; 1762 (void) tcp_output(tp); 1763 goto drop; 1764 } 1765 } else if (tiwin < tp->snd_wnd) { 1766 /* 1767 * The window was retracted! Previous dup 1768 * ACKs may have been due to packets arriving 1769 * after the shrunken window, not a missing 1770 * packet, so play it safe and reset t_dupacks 1771 */ 1772 tp->t_dupacks = 0; 1773 } 1774 break; 1775 } 1776 /* 1777 * If the congestion window was inflated to account 1778 * for the other side's cached packets, retract it. 1779 */ 1780 #if defined(TCP_SACK) 1781 if (tp->sack_enable) { 1782 if (tp->t_dupacks >= tcprexmtthresh) { 1783 /* Check for a partial ACK */ 1784 if (tcp_sack_partialack(tp, th)) { 1785 #if defined(TCP_SACK) && defined(TCP_FACK) 1786 /* Force call to tcp_output */ 1787 if (tp->snd_awnd < tp->snd_cwnd) 1788 tp->t_flags |= TF_NEEDOUTPUT; 1789 #else 1790 tp->snd_cwnd += tp->t_maxseg; 1791 tp->t_flags |= TF_NEEDOUTPUT; 1792 #endif /* TCP_FACK */ 1793 } else { 1794 /* Out of fast recovery */ 1795 tp->snd_cwnd = tp->snd_ssthresh; 1796 if (tcp_seq_subtract(tp->snd_max, 1797 th->th_ack) < tp->snd_ssthresh) 1798 tp->snd_cwnd = 1799 tcp_seq_subtract(tp->snd_max, 1800 th->th_ack); 1801 tp->t_dupacks = 0; 1802 #if defined(TCP_SACK) && defined(TCP_FACK) 1803 if (SEQ_GT(th->th_ack, tp->snd_fack)) 1804 tp->snd_fack = th->th_ack; 1805 #endif /* TCP_FACK */ 1806 } 1807 } 1808 } else { 1809 if (tp->t_dupacks >= tcprexmtthresh && 1810 !tcp_newreno(tp, th)) { 1811 /* Out of fast recovery */ 1812 tp->snd_cwnd = tp->snd_ssthresh; 1813 if (tcp_seq_subtract(tp->snd_max, th->th_ack) < 1814 tp->snd_ssthresh) 1815 tp->snd_cwnd = 1816 tcp_seq_subtract(tp->snd_max, 1817 th->th_ack); 1818 tp->t_dupacks = 0; 1819 } 1820 } 1821 if (tp->t_dupacks < tcprexmtthresh) 1822 tp->t_dupacks = 0; 1823 #else /* else no TCP_SACK */ 1824 if (tp->t_dupacks >= tcprexmtthresh && 1825 tp->snd_cwnd > tp->snd_ssthresh) 1826 tp->snd_cwnd = tp->snd_ssthresh; 1827 tp->t_dupacks = 0; 1828 #endif 1829 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1830 tcpstat.tcps_rcvacktoomuch++; 1831 goto dropafterack_ratelim; 1832 } 1833 acked = th->th_ack - tp->snd_una; 1834 tcpstat.tcps_rcvackpack++; 1835 tcpstat.tcps_rcvackbyte += acked; 1836 1837 /* 1838 * If we have a timestamp reply, update smoothed 1839 * round trip time. If no timestamp is present but 1840 * transmit timer is running and timed sequence 1841 * number was acked, update smoothed round trip time. 1842 * Since we now have an rtt measurement, cancel the 1843 * timer backoff (cf., Phil Karn's retransmit alg.). 1844 * Recompute the initial retransmit timer. 
1845 */ 1846 if (opti.ts_present && opti.ts_ecr) 1847 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr); 1848 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1849 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1850 1851 /* 1852 * If all outstanding data is acked, stop retransmit 1853 * timer and remember to restart (more output or persist). 1854 * If there is more data to be acked, restart retransmit 1855 * timer, using current (possibly backed-off) value. 1856 */ 1857 if (th->th_ack == tp->snd_max) { 1858 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1859 tp->t_flags |= TF_NEEDOUTPUT; 1860 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1861 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1862 /* 1863 * When new data is acked, open the congestion window. 1864 * If the window gives us less than ssthresh packets 1865 * in flight, open exponentially (maxseg per packet). 1866 * Otherwise open linearly: maxseg per window 1867 * (maxseg^2 / cwnd per packet). 1868 */ 1869 { 1870 u_int cw = tp->snd_cwnd; 1871 u_int incr = tp->t_maxseg; 1872 1873 if (cw > tp->snd_ssthresh) 1874 incr = incr * incr / cw; 1875 #if defined (TCP_SACK) 1876 if (tp->t_dupacks < tcprexmtthresh) 1877 #endif 1878 tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale); 1879 } 1880 ND6_HINT(tp); 1881 if (acked > so->so_snd.sb_cc) { 1882 tp->snd_wnd -= so->so_snd.sb_cc; 1883 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1884 ourfinisacked = 1; 1885 } else { 1886 sbdrop(&so->so_snd, acked); 1887 tp->snd_wnd -= acked; 1888 ourfinisacked = 0; 1889 } 1890 1891 tcp_update_sndspace(tp); 1892 if (sb_notify(&so->so_snd)) { 1893 tp->t_flags |= TF_BLOCKOUTPUT; 1894 sowwakeup(so); 1895 tp->t_flags &= ~TF_BLOCKOUTPUT; 1896 } 1897 1898 /* 1899 * If we had a pending ICMP message that referred to data 1900 * that have just been acknowledged, disregard the recorded 1901 * ICMP message. 1902 */ 1903 if ((tp->t_flags & TF_PMTUD_PEND) && 1904 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 1905 tp->t_flags &= ~TF_PMTUD_PEND; 1906 1907 /* 1908 * Keep track of the largest chunk of data acknowledged 1909 * since last PMTU update 1910 */ 1911 if (tp->t_pmtud_mss_acked < acked) 1912 tp->t_pmtud_mss_acked = acked; 1913 1914 tp->snd_una = th->th_ack; 1915 #ifdef TCP_ECN 1916 /* sync snd_last with snd_una */ 1917 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1918 tp->snd_last = tp->snd_una; 1919 #endif 1920 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1921 tp->snd_nxt = tp->snd_una; 1922 #if defined (TCP_SACK) && defined (TCP_FACK) 1923 if (SEQ_GT(tp->snd_una, tp->snd_fack)) { 1924 tp->snd_fack = tp->snd_una; 1925 /* Update snd_awnd for partial ACK 1926 * without any SACK blocks. 1927 */ 1928 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, 1929 tp->snd_fack) + tp->retran_data; 1930 } 1931 #endif 1932 1933 switch (tp->t_state) { 1934 1935 /* 1936 * In FIN_WAIT_1 STATE in addition to the processing 1937 * for the ESTABLISHED state if our FIN is now acknowledged 1938 * then enter FIN_WAIT_2. 1939 */ 1940 case TCPS_FIN_WAIT_1: 1941 if (ourfinisacked) { 1942 /* 1943 * If we can't receive any more 1944 * data, then closing user can proceed. 1945 * Starting the timer is contrary to the 1946 * specification, but if we don't get a FIN 1947 * we'll hang forever. 
1948 */
1949 if (so->so_state & SS_CANTRCVMORE) {
1950 soisdisconnected(so);
1951 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
1952 }
1953 tp->t_state = TCPS_FIN_WAIT_2;
1954 }
1955 break;
1956
1957 /*
1958 * In CLOSING STATE in addition to the processing for
1959 * the ESTABLISHED state if the ACK acknowledges our FIN
1960 * then enter the TIME-WAIT state, otherwise ignore
1961 * the segment.
1962 */
1963 case TCPS_CLOSING:
1964 if (ourfinisacked) {
1965 tp->t_state = TCPS_TIME_WAIT;
1966 tcp_canceltimers(tp);
1967 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
1968 soisdisconnected(so);
1969 }
1970 break;
1971
1972 /*
1973 * In LAST_ACK, we may still be waiting for data to drain
1974 * and/or to be acked, as well as for the ack of our FIN.
1975 * If our FIN is now acknowledged, delete the TCB,
1976 * enter the closed state and return.
1977 */
1978 case TCPS_LAST_ACK:
1979 if (ourfinisacked) {
1980 tp = tcp_close(tp);
1981 goto drop;
1982 }
1983 break;
1984
1985 /*
1986 * In TIME_WAIT state the only thing that should arrive
1987 * is a retransmission of the remote FIN. Acknowledge
1988 * it and restart the finack timer.
1989 */
1990 case TCPS_TIME_WAIT:
1991 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
1992 goto dropafterack;
1993 }
1994 }
1995
1996 step6:
1997 /*
1998 * Update window information.
1999 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2000 */
2001 if ((tiflags & TH_ACK) &&
2002 (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
2003 (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2004 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2005 /* keep track of pure window updates */
2006 if (tlen == 0 &&
2007 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2008 tcpstat.tcps_rcvwinupd++;
2009 tp->snd_wnd = tiwin;
2010 tp->snd_wl1 = th->th_seq;
2011 tp->snd_wl2 = th->th_ack;
2012 if (tp->snd_wnd > tp->max_sndwnd)
2013 tp->max_sndwnd = tp->snd_wnd;
2014 tp->t_flags |= TF_NEEDOUTPUT;
2015 }
2016
2017 /*
2018 * Process segments with URG.
2019 */
2020 if ((tiflags & TH_URG) && th->th_urp &&
2021 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2022 /*
2023 * This is a kludge, but if we receive and accept
2024 * random urgent pointers, we'll crash in
2025 * soreceive. It's hard to imagine someone
2026 * actually wanting to send this much urgent data.
2027 */
2028 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2029 th->th_urp = 0; /* XXX */
2030 tiflags &= ~TH_URG; /* XXX */
2031 goto dodata; /* XXX */
2032 }
2033 /*
2034 * If this segment advances the known urgent pointer,
2035 * then mark the data stream. This should not happen
2036 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2037 * a FIN has been received from the remote side.
2038 * In these states we ignore the URG.
2039 *
2040 * According to RFC961 (Assigned Protocols),
2041 * the urgent pointer points to the last octet
2042 * of urgent data. We continue, however,
2043 * to consider it to indicate the first octet
2044 * of data past the urgent section as the original
2045 * spec states (in one of two places).
2046 */
2047 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2048 tp->rcv_up = th->th_seq + th->th_urp;
2049 so->so_oobmark = so->so_rcv.sb_cc +
2050 (tp->rcv_up - tp->rcv_nxt) - 1;
2051 if (so->so_oobmark == 0)
2052 so->so_state |= SS_RCVATMARK;
2053 sohasoutofband(so);
2054 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2055 }
2056 /*
2057 * Remove out of band data so it doesn't get presented to user.
2058 * This can happen independent of advancing the URG pointer, 2059 * but if two URG's are pending at once, some out-of-band 2060 * data may creep in... ick. 2061 */ 2062 if (th->th_urp <= (u_int16_t) tlen 2063 #ifdef SO_OOBINLINE 2064 && (so->so_options & SO_OOBINLINE) == 0 2065 #endif 2066 ) 2067 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2068 } else 2069 /* 2070 * If no out of band data is expected, 2071 * pull receive urgent pointer along 2072 * with the receive window. 2073 */ 2074 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2075 tp->rcv_up = tp->rcv_nxt; 2076 dodata: /* XXX */ 2077 2078 /* 2079 * Process the segment text, merging it into the TCP sequencing queue, 2080 * and arranging for acknowledgment of receipt if necessary. 2081 * This process logically involves adjusting tp->rcv_wnd as data 2082 * is presented to the user (this happens in tcp_usrreq.c, 2083 * case PRU_RCVD). If a FIN has already been received on this 2084 * connection then we just ignore the text. 2085 */ 2086 if ((tlen || (tiflags & TH_FIN)) && 2087 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2088 #ifdef TCP_SACK 2089 tcp_seq laststart = th->th_seq; 2090 tcp_seq lastend = th->th_seq + tlen; 2091 #endif 2092 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2093 tp->t_state == TCPS_ESTABLISHED) { 2094 TCP_SETUP_ACK(tp, tiflags, m); 2095 tp->rcv_nxt += tlen; 2096 tiflags = th->th_flags & TH_FIN; 2097 tcpstat.tcps_rcvpack++; 2098 tcpstat.tcps_rcvbyte += tlen; 2099 ND6_HINT(tp); 2100 if (so->so_state & SS_CANTRCVMORE) 2101 m_freem(m); 2102 else { 2103 m_adj(m, hdroptlen); 2104 sbappendstream(&so->so_rcv, m); 2105 } 2106 tp->t_flags |= TF_BLOCKOUTPUT; 2107 sorwakeup(so); 2108 tp->t_flags &= ~TF_BLOCKOUTPUT; 2109 } else { 2110 m_adj(m, hdroptlen); 2111 tiflags = tcp_reass(tp, th, m, &tlen); 2112 tp->t_flags |= TF_ACKNOW; 2113 } 2114 #ifdef TCP_SACK 2115 if (tp->sack_enable) 2116 tcp_update_sack_list(tp, laststart, lastend); 2117 #endif 2118 2119 /* 2120 * variable len never referenced again in modern BSD, 2121 * so why bother computing it ?? 2122 */ 2123 #if 0 2124 /* 2125 * Note the amount of data that peer has sent into 2126 * our window, in order to estimate the sender's 2127 * buffer size. 2128 */ 2129 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2130 #endif /* 0 */ 2131 } else { 2132 m_freem(m); 2133 tiflags &= ~TH_FIN; 2134 } 2135 2136 /* 2137 * If FIN is received ACK the FIN and let the user know 2138 * that the connection is closing. Ignore a FIN received before 2139 * the connection is fully established. 2140 */ 2141 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2142 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2143 socantrcvmore(so); 2144 tp->t_flags |= TF_ACKNOW; 2145 tp->rcv_nxt++; 2146 } 2147 switch (tp->t_state) { 2148 2149 /* 2150 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2151 */ 2152 case TCPS_ESTABLISHED: 2153 tp->t_state = TCPS_CLOSE_WAIT; 2154 break; 2155 2156 /* 2157 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2158 * enter the CLOSING state. 2159 */ 2160 case TCPS_FIN_WAIT_1: 2161 tp->t_state = TCPS_CLOSING; 2162 break; 2163 2164 /* 2165 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2166 * starting the time-wait timer, turning off the other 2167 * standard timers. 2168 */ 2169 case TCPS_FIN_WAIT_2: 2170 tp->t_state = TCPS_TIME_WAIT; 2171 tcp_canceltimers(tp); 2172 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2173 soisdisconnected(so); 2174 break; 2175 2176 /* 2177 * In TIME_WAIT state restart the 2 MSL time_wait timer. 
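 *
 * (A retransmitted FIN from the peer thus extends TIME_WAIT
 * by a full 2 * TCPTV_MSL, so the connection block lingers
 * until the peer has been silent for that long.)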
2178 */ 2179 case TCPS_TIME_WAIT: 2180 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2181 break; 2182 } 2183 } 2184 if (so->so_options & SO_DEBUG) { 2185 switch (tp->pf) { 2186 #ifdef INET6 2187 case PF_INET6: 2188 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2189 0, tlen); 2190 break; 2191 #endif /* INET6 */ 2192 case PF_INET: 2193 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2194 0, tlen); 2195 break; 2196 } 2197 } 2198 2199 /* 2200 * Return any desired output. 2201 */ 2202 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2203 (void) tcp_output(tp); 2204 return; 2205 2206 badsyn: 2207 /* 2208 * Received a bad SYN. Increment counters and dropwithreset. 2209 */ 2210 tcpstat.tcps_badsyn++; 2211 tp = NULL; 2212 goto dropwithreset; 2213 2214 dropafterack_ratelim: 2215 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2216 tcp_ackdrop_ppslim) == 0) { 2217 /* XXX stat */ 2218 goto drop; 2219 } 2220 /* ...fall into dropafterack... */ 2221 2222 dropafterack: 2223 /* 2224 * Generate an ACK dropping incoming segment if it occupies 2225 * sequence space, where the ACK reflects our state. 2226 */ 2227 if (tiflags & TH_RST) 2228 goto drop; 2229 m_freem(m); 2230 tp->t_flags |= TF_ACKNOW; 2231 (void) tcp_output(tp); 2232 return; 2233 2234 dropwithreset_ratelim: 2235 /* 2236 * We may want to rate-limit RSTs in certain situations, 2237 * particularly if we are sending an RST in response to 2238 * an attempt to connect to or otherwise communicate with 2239 * a port for which we have no socket. 2240 */ 2241 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2242 tcp_rst_ppslim) == 0) { 2243 /* XXX stat */ 2244 goto drop; 2245 } 2246 /* ...fall into dropwithreset... */ 2247 2248 dropwithreset: 2249 /* 2250 * Generate a RST, dropping incoming segment. 2251 * Make ACK acceptable to originator of segment. 2252 * Don't bother to respond to RST. 2253 */ 2254 if (tiflags & TH_RST) 2255 goto drop; 2256 if (tiflags & TH_ACK) { 2257 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2258 TH_RST, m->m_pkthdr.ph_rtableid); 2259 } else { 2260 if (tiflags & TH_SYN) 2261 tlen++; 2262 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2263 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2264 } 2265 m_freem(m); 2266 return; 2267 2268 drop: 2269 /* 2270 * Drop space held by incoming segment and return. 
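 *
 * (When SO_DEBUG is set on the socket the drop is first
 * recorded via tcp_trace() below, using the state snapshot
 * saved when the segment was parsed.)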
2271 */ 2272 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2273 switch (tp->pf) { 2274 #ifdef INET6 2275 case PF_INET6: 2276 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2277 0, tlen); 2278 break; 2279 #endif /* INET6 */ 2280 case PF_INET: 2281 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2282 0, tlen); 2283 break; 2284 } 2285 } 2286 2287 m_freem(m); 2288 return; 2289 } 2290 2291 int 2292 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2293 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2294 u_int rtableid) 2295 { 2296 u_int16_t mss = 0; 2297 int opt, optlen; 2298 #ifdef TCP_SIGNATURE 2299 caddr_t sigp = NULL; 2300 struct tdb *tdb = NULL; 2301 #endif /* TCP_SIGNATURE */ 2302 2303 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2304 opt = cp[0]; 2305 if (opt == TCPOPT_EOL) 2306 break; 2307 if (opt == TCPOPT_NOP) 2308 optlen = 1; 2309 else { 2310 if (cnt < 2) 2311 break; 2312 optlen = cp[1]; 2313 if (optlen < 2 || optlen > cnt) 2314 break; 2315 } 2316 switch (opt) { 2317 2318 default: 2319 continue; 2320 2321 case TCPOPT_MAXSEG: 2322 if (optlen != TCPOLEN_MAXSEG) 2323 continue; 2324 if (!(th->th_flags & TH_SYN)) 2325 continue; 2326 if (TCPS_HAVERCVDSYN(tp->t_state)) 2327 continue; 2328 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 2329 NTOHS(mss); 2330 oi->maxseg = mss; 2331 break; 2332 2333 case TCPOPT_WINDOW: 2334 if (optlen != TCPOLEN_WINDOW) 2335 continue; 2336 if (!(th->th_flags & TH_SYN)) 2337 continue; 2338 if (TCPS_HAVERCVDSYN(tp->t_state)) 2339 continue; 2340 tp->t_flags |= TF_RCVD_SCALE; 2341 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2342 break; 2343 2344 case TCPOPT_TIMESTAMP: 2345 if (optlen != TCPOLEN_TIMESTAMP) 2346 continue; 2347 oi->ts_present = 1; 2348 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2349 NTOHL(oi->ts_val); 2350 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2351 NTOHL(oi->ts_ecr); 2352 2353 if (!(th->th_flags & TH_SYN)) 2354 continue; 2355 if (TCPS_HAVERCVDSYN(tp->t_state)) 2356 continue; 2357 /* 2358 * A timestamp received in a SYN makes 2359 * it ok to send timestamp requests and replies. 
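 *
 * (For reference, the 10-byte option parsed above is laid out
 *	kind=8, len=10, ts_val[4], ts_ecr[4]
 * so ts_val sits at offset 2 and ts_ecr at offset 6, matching
 * the two bcopy() calls above.)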
2360 */ 2361 tp->t_flags |= TF_RCVD_TSTMP; 2362 tp->ts_recent = oi->ts_val; 2363 tp->ts_recent_age = tcp_now; 2364 break; 2365 2366 #ifdef TCP_SACK 2367 case TCPOPT_SACK_PERMITTED: 2368 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2369 continue; 2370 if (!(th->th_flags & TH_SYN)) 2371 continue; 2372 if (TCPS_HAVERCVDSYN(tp->t_state)) 2373 continue; 2374 /* MUST only be set on SYN */ 2375 tp->t_flags |= TF_SACK_PERMIT; 2376 break; 2377 case TCPOPT_SACK: 2378 tcp_sack_option(tp, th, cp, optlen); 2379 break; 2380 #endif 2381 #ifdef TCP_SIGNATURE 2382 case TCPOPT_SIGNATURE: 2383 if (optlen != TCPOLEN_SIGNATURE) 2384 continue; 2385 2386 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2387 return (-1); 2388 2389 sigp = cp + 2; 2390 break; 2391 #endif /* TCP_SIGNATURE */ 2392 } 2393 } 2394 2395 #ifdef TCP_SIGNATURE 2396 if (tp->t_flags & TF_SIGNATURE) { 2397 union sockaddr_union src, dst; 2398 2399 memset(&src, 0, sizeof(union sockaddr_union)); 2400 memset(&dst, 0, sizeof(union sockaddr_union)); 2401 2402 switch (tp->pf) { 2403 case 0: 2404 #ifdef INET 2405 case AF_INET: 2406 src.sa.sa_len = sizeof(struct sockaddr_in); 2407 src.sa.sa_family = AF_INET; 2408 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2409 dst.sa.sa_len = sizeof(struct sockaddr_in); 2410 dst.sa.sa_family = AF_INET; 2411 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2412 break; 2413 #endif 2414 #ifdef INET6 2415 case AF_INET6: 2416 src.sa.sa_len = sizeof(struct sockaddr_in6); 2417 src.sa.sa_family = AF_INET6; 2418 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2419 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2420 dst.sa.sa_family = AF_INET6; 2421 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2422 break; 2423 #endif /* INET6 */ 2424 } 2425 2426 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2427 0, &src, &dst, IPPROTO_TCP); 2428 2429 /* 2430 * We don't have an SA for this peer, so we turn off 2431 * TF_SIGNATURE on the listen socket 2432 */ 2433 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2434 tp->t_flags &= ~TF_SIGNATURE; 2435 2436 } 2437 2438 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2439 tcpstat.tcps_rcvbadsig++; 2440 return (-1); 2441 } 2442 2443 if (sigp) { 2444 char sig[16]; 2445 2446 if (tdb == NULL) { 2447 tcpstat.tcps_rcvbadsig++; 2448 return (-1); 2449 } 2450 2451 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2452 return (-1); 2453 2454 if (timingsafe_bcmp(sig, sigp, 16)) { 2455 tcpstat.tcps_rcvbadsig++; 2456 return (-1); 2457 } 2458 2459 tcpstat.tcps_rcvgoodsig++; 2460 } 2461 #endif /* TCP_SIGNATURE */ 2462 2463 return (0); 2464 } 2465 2466 #if defined(TCP_SACK) 2467 u_long 2468 tcp_seq_subtract(u_long a, u_long b) 2469 { 2470 return ((long)(a - b)); 2471 } 2472 #endif 2473 2474 2475 #ifdef TCP_SACK 2476 /* 2477 * This function is called upon receipt of new valid data (while not in header 2478 * prediction mode), and it updates the ordered list of sacks. 2479 */ 2480 void 2481 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2482 tcp_seq rcv_lastend) 2483 { 2484 /* 2485 * First reported block MUST be the most recent one. Subsequent 2486 * blocks SHOULD be in the order in which they arrived at the 2487 * receiver. These two conditions make the implementation fully 2488 * compliant with RFC 2018. 
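 *
 * E.g., if [5,10) was reported earlier and the segment just
 * received fills [15,20), the new block must occupy slot 0 of
 * sackblks[] and the older [5,10) block slides down a slot.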
2489 */ 2490 int i, j = 0, count = 0, lastpos = -1; 2491 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2492 2493 /* First clean up current list of sacks */ 2494 for (i = 0; i < tp->rcv_numsacks; i++) { 2495 sack = tp->sackblks[i]; 2496 if (sack.start == 0 && sack.end == 0) { 2497 count++; /* count = number of blocks to be discarded */ 2498 continue; 2499 } 2500 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2501 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2502 count++; 2503 } else { 2504 temp[j].start = tp->sackblks[i].start; 2505 temp[j++].end = tp->sackblks[i].end; 2506 } 2507 } 2508 tp->rcv_numsacks -= count; 2509 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2510 tcp_clean_sackreport(tp); 2511 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2512 /* ==> need first sack block */ 2513 tp->sackblks[0].start = rcv_laststart; 2514 tp->sackblks[0].end = rcv_lastend; 2515 tp->rcv_numsacks = 1; 2516 } 2517 return; 2518 } 2519 /* Otherwise, sack blocks are already present. */ 2520 for (i = 0; i < tp->rcv_numsacks; i++) 2521 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2522 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2523 return; /* sack list remains unchanged */ 2524 /* 2525 * From here, segment just received should be (part of) the 1st sack. 2526 * Go through list, possibly coalescing sack block entries. 2527 */ 2528 firstsack.start = rcv_laststart; 2529 firstsack.end = rcv_lastend; 2530 for (i = 0; i < tp->rcv_numsacks; i++) { 2531 sack = tp->sackblks[i]; 2532 if (SEQ_LT(sack.end, firstsack.start) || 2533 SEQ_GT(sack.start, firstsack.end)) 2534 continue; /* no overlap */ 2535 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2536 /* 2537 * identical block; delete it here since we will 2538 * move it to the front of the list. 2539 */ 2540 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2541 lastpos = i; /* last posn with a zero entry */ 2542 continue; 2543 } 2544 if (SEQ_LEQ(sack.start, firstsack.start)) 2545 firstsack.start = sack.start; /* merge blocks */ 2546 if (SEQ_GEQ(sack.end, firstsack.end)) 2547 firstsack.end = sack.end; /* merge blocks */ 2548 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2549 lastpos = i; /* last posn with a zero entry */ 2550 } 2551 if (lastpos != -1) { /* at least one merge */ 2552 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2553 sack = tp->sackblks[i]; 2554 if (sack.start == 0 && sack.end == 0) 2555 continue; 2556 temp[j++] = sack; 2557 } 2558 tp->rcv_numsacks = j; /* including first blk (added later) */ 2559 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2560 tp->sackblks[i] = temp[i]; 2561 } else { /* no merges -- shift sacks by 1 */ 2562 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2563 tp->rcv_numsacks++; 2564 for (i = tp->rcv_numsacks-1; i > 0; i--) 2565 tp->sackblks[i] = tp->sackblks[i-1]; 2566 } 2567 tp->sackblks[0] = firstsack; 2568 return; 2569 } 2570 2571 /* 2572 * Process the TCP SACK option. tp->snd_holes is an ordered list 2573 * of holes (oldest to newest, in terms of the sequence space). 2574 */ 2575 void 2576 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2577 { 2578 int tmp_olen; 2579 u_char *tmp_cp; 2580 struct sackhole *cur, *p, *temp; 2581 2582 if (!tp->sack_enable) 2583 return; 2584 /* SACK without ACK doesn't make sense. */ 2585 if ((th->th_flags & TH_ACK) == 0) 2586 return; 2587 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
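 * An ACK below snd_una is stale, and one above snd_max would
 * cover data we never sent, so SACK blocks carried on such an
 * ACK cannot be trusted.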
*/ 2588 if (SEQ_LT(th->th_ack, tp->snd_una) || 2589 SEQ_GT(th->th_ack, tp->snd_max)) 2590 return; 2591 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2592 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2593 return; 2594 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2595 tmp_cp = cp + 2; 2596 tmp_olen = optlen - 2; 2597 tcpstat.tcps_sack_rcv_opts++; 2598 if (tp->snd_numholes < 0) 2599 tp->snd_numholes = 0; 2600 if (tp->t_maxseg == 0) 2601 panic("tcp_sack_option"); /* Should never happen */ 2602 while (tmp_olen > 0) { 2603 struct sackblk sack; 2604 2605 bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq)); 2606 NTOHL(sack.start); 2607 bcopy(tmp_cp + sizeof(tcp_seq), 2608 (char *) &(sack.end), sizeof(tcp_seq)); 2609 NTOHL(sack.end); 2610 tmp_olen -= TCPOLEN_SACK; 2611 tmp_cp += TCPOLEN_SACK; 2612 if (SEQ_LEQ(sack.end, sack.start)) 2613 continue; /* bad SACK fields */ 2614 if (SEQ_LEQ(sack.end, tp->snd_una)) 2615 continue; /* old block */ 2616 #if defined(TCP_SACK) && defined(TCP_FACK) 2617 /* Updates snd_fack. */ 2618 if (SEQ_GT(sack.end, tp->snd_fack)) 2619 tp->snd_fack = sack.end; 2620 #endif /* TCP_FACK */ 2621 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2622 if (SEQ_LT(sack.start, th->th_ack)) 2623 continue; 2624 } 2625 if (SEQ_GT(sack.end, tp->snd_max)) 2626 continue; 2627 if (tp->snd_holes == NULL) { /* first hole */ 2628 tp->snd_holes = (struct sackhole *) 2629 pool_get(&sackhl_pool, PR_NOWAIT); 2630 if (tp->snd_holes == NULL) { 2631 /* ENOBUFS, so ignore SACKed block for now*/ 2632 goto done; 2633 } 2634 cur = tp->snd_holes; 2635 cur->start = th->th_ack; 2636 cur->end = sack.start; 2637 cur->rxmit = cur->start; 2638 cur->next = NULL; 2639 tp->snd_numholes = 1; 2640 tp->rcv_lastsack = sack.end; 2641 /* 2642 * dups is at least one. If more data has been 2643 * SACKed, it can be greater than one. 
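 *
 * E.g., with a 1460-byte t_maxseg, a first SACK block 4380
 * bytes long yields 4380/1460 == 3 dups, already at the
 * tcprexmtthresh clamp applied below.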
2644 */ 2645 cur->dups = min(tcprexmtthresh, 2646 ((sack.end - cur->end)/tp->t_maxseg)); 2647 if (cur->dups < 1) 2648 cur->dups = 1; 2649 continue; /* with next sack block */ 2650 } 2651 /* Go thru list of holes: p = previous, cur = current */ 2652 p = cur = tp->snd_holes; 2653 while (cur) { 2654 if (SEQ_LEQ(sack.end, cur->start)) 2655 /* SACKs data before the current hole */ 2656 break; /* no use going through more holes */ 2657 if (SEQ_GEQ(sack.start, cur->end)) { 2658 /* SACKs data beyond the current hole */ 2659 cur->dups++; 2660 if (((sack.end - cur->end)/tp->t_maxseg) >= 2661 tcprexmtthresh) 2662 cur->dups = tcprexmtthresh; 2663 p = cur; 2664 cur = cur->next; 2665 continue; 2666 } 2667 if (SEQ_LEQ(sack.start, cur->start)) { 2668 /* Data acks at least the beginning of hole */ 2669 #if defined(TCP_SACK) && defined(TCP_FACK) 2670 if (SEQ_GT(sack.end, cur->rxmit)) 2671 tp->retran_data -= 2672 tcp_seq_subtract(cur->rxmit, 2673 cur->start); 2674 else 2675 tp->retran_data -= 2676 tcp_seq_subtract(sack.end, 2677 cur->start); 2678 #endif /* TCP_FACK */ 2679 if (SEQ_GEQ(sack.end, cur->end)) { 2680 /* Acks entire hole, so delete hole */ 2681 if (p != cur) { 2682 p->next = cur->next; 2683 pool_put(&sackhl_pool, cur); 2684 cur = p->next; 2685 } else { 2686 cur = cur->next; 2687 pool_put(&sackhl_pool, p); 2688 p = cur; 2689 tp->snd_holes = p; 2690 } 2691 tp->snd_numholes--; 2692 continue; 2693 } 2694 /* otherwise, move start of hole forward */ 2695 cur->start = sack.end; 2696 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2697 p = cur; 2698 cur = cur->next; 2699 continue; 2700 } 2701 /* move end of hole backward */ 2702 if (SEQ_GEQ(sack.end, cur->end)) { 2703 #if defined(TCP_SACK) && defined(TCP_FACK) 2704 if (SEQ_GT(cur->rxmit, sack.start)) 2705 tp->retran_data -= 2706 tcp_seq_subtract(cur->rxmit, 2707 sack.start); 2708 #endif /* TCP_FACK */ 2709 cur->end = sack.start; 2710 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2711 cur->dups++; 2712 if (((sack.end - cur->end)/tp->t_maxseg) >= 2713 tcprexmtthresh) 2714 cur->dups = tcprexmtthresh; 2715 p = cur; 2716 cur = cur->next; 2717 continue; 2718 } 2719 if (SEQ_LT(cur->start, sack.start) && 2720 SEQ_GT(cur->end, sack.end)) { 2721 /* 2722 * ACKs some data in middle of a hole; need to 2723 * split current hole 2724 */ 2725 temp = (struct sackhole *) 2726 pool_get(&sackhl_pool, PR_NOWAIT); 2727 if (temp == NULL) 2728 goto done; /* ENOBUFS */ 2729 #if defined(TCP_SACK) && defined(TCP_FACK) 2730 if (SEQ_GT(cur->rxmit, sack.end)) 2731 tp->retran_data -= 2732 tcp_seq_subtract(sack.end, 2733 sack.start); 2734 else if (SEQ_GT(cur->rxmit, sack.start)) 2735 tp->retran_data -= 2736 tcp_seq_subtract(cur->rxmit, 2737 sack.start); 2738 #endif /* TCP_FACK */ 2739 temp->next = cur->next; 2740 temp->start = sack.end; 2741 temp->end = cur->end; 2742 temp->dups = cur->dups; 2743 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2744 cur->end = sack.start; 2745 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2746 cur->dups++; 2747 if (((sack.end - cur->end)/tp->t_maxseg) >= 2748 tcprexmtthresh) 2749 cur->dups = tcprexmtthresh; 2750 cur->next = temp; 2751 p = temp; 2752 cur = p->next; 2753 tp->snd_numholes++; 2754 } 2755 } 2756 /* At this point, p points to the last hole on the list */ 2757 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2758 /* 2759 * Need to append new hole at end. 2760 * Last hole is p (and it's not NULL). 
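 *
 * The appended hole spans [rcv_lastsack, sack.start), the
 * un-SACKed gap between the highest block seen so far and
 * the block just parsed.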
2761 */
2762 temp = (struct sackhole *)
2763 pool_get(&sackhl_pool, PR_NOWAIT);
2764 if (temp == NULL)
2765 goto done; /* ENOBUFS */
2766 temp->start = tp->rcv_lastsack;
2767 temp->end = sack.start;
2768 temp->dups = min(tcprexmtthresh,
2769 ((sack.end - sack.start)/tp->t_maxseg));
2770 if (temp->dups < 1)
2771 temp->dups = 1;
2772 temp->rxmit = temp->start;
2773 temp->next = 0;
2774 p->next = temp;
2775 tp->rcv_lastsack = sack.end;
2776 tp->snd_numholes++;
2777 }
2778 }
2779 done:
2780 #if defined(TCP_SACK) && defined(TCP_FACK)
2781 /*
2782 * Update retran_data and snd_awnd. Go through the list of
2783 * holes. Increment retran_data by (hole->rxmit - hole->start).
2784 */
2785 tp->retran_data = 0;
2786 cur = tp->snd_holes;
2787 while (cur) {
2788 tp->retran_data += cur->rxmit - cur->start;
2789 cur = cur->next;
2790 }
2791 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
2792 tp->retran_data;
2793 #endif /* TCP_FACK */
2794
2795 return;
2796 }
2797
2798 /*
2799 * Delete stale (i.e., cumulatively ack'd) holes. Hole is deleted only if
2800 * it is completely acked; otherwise, tcp_sack_option(), called from
2801 * tcp_dooptions(), will fix up the hole.
2802 */
2803 void
2804 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
2805 {
2806 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2807 /* max because this could be an older ack just arrived */
2808 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2809 th->th_ack : tp->snd_una;
2810 struct sackhole *cur = tp->snd_holes;
2811 struct sackhole *prev;
2812 while (cur)
2813 if (SEQ_LEQ(cur->end, lastack)) {
2814 prev = cur;
2815 cur = cur->next;
2816 pool_put(&sackhl_pool, prev);
2817 tp->snd_numholes--;
2818 } else if (SEQ_LT(cur->start, lastack)) {
2819 cur->start = lastack;
2820 if (SEQ_LT(cur->rxmit, cur->start))
2821 cur->rxmit = cur->start;
2822 break;
2823 } else
2824 break;
2825 tp->snd_holes = cur;
2826 }
2827 }
2828
2829 /*
2830 * Delete all receiver-side SACK information.
2831 */
2832 void
2833 tcp_clean_sackreport(struct tcpcb *tp)
2834 {
2835 int i;
2836
2837 tp->rcv_numsacks = 0;
2838 for (i = 0; i < MAX_SACK_BLKS; i++)
2839 tp->sackblks[i].start = tp->sackblks[i].end=0;
2840
2841 }
2842
2843 /*
2844 * Checks for partial ack. If partial ack arrives, turn off retransmission
2845 * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
2846 * If the ack advances at least to tp->snd_last, return 0.
2847 */
2848 int
2849 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
2850 {
2851 if (SEQ_LT(th->th_ack, tp->snd_last)) {
2852 /* Turn off retx. timer (will start again next segment) */
2853 TCP_TIMER_DISARM(tp, TCPT_REXMT);
2854 tp->t_rtttime = 0;
2855 #ifndef TCP_FACK
2856 /*
2857 * Partial window deflation. This statement relies on the
2858 * fact that tp->snd_una has not been updated yet. In FACK
2859 * hold snd_cwnd constant during fast recovery.
2860 */
2861 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
2862 tp->snd_cwnd -= th->th_ack - tp->snd_una;
2863 tp->snd_cwnd += tp->t_maxseg;
2864 } else
2865 tp->snd_cwnd = tp->t_maxseg;
2866 #endif
2867 return (1);
2868 }
2869 return (0);
2870 }
2871 #endif /* TCP_SACK */
2872
2873 /*
2874 * Pull out of band byte out of a segment so
2875 * it doesn't appear in the user's data queue.
2876 * It is still reflected in the segment length for
2877 * sequencing purposes.
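 *
 * (The byte is squeezed out of the mbuf chain by the bcopy()
 * below and parked in t_iobc, from where PRU_RCVOOB can later
 * hand it to the application.)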
2878 */ 2879 void 2880 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2881 { 2882 int cnt = off + urgent - 1; 2883 2884 while (cnt >= 0) { 2885 if (m->m_len > cnt) { 2886 char *cp = mtod(m, caddr_t) + cnt; 2887 struct tcpcb *tp = sototcpcb(so); 2888 2889 tp->t_iobc = *cp; 2890 tp->t_oobflags |= TCPOOB_HAVEDATA; 2891 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2892 m->m_len--; 2893 return; 2894 } 2895 cnt -= m->m_len; 2896 m = m->m_next; 2897 if (m == 0) 2898 break; 2899 } 2900 panic("tcp_pulloutofband"); 2901 } 2902 2903 /* 2904 * Collect new round-trip time estimate 2905 * and update averages and current timeout. 2906 */ 2907 void 2908 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2909 { 2910 short delta; 2911 short rttmin; 2912 2913 if (rtt < 0) 2914 rtt = 0; 2915 else if (rtt > TCP_RTT_MAX) 2916 rtt = TCP_RTT_MAX; 2917 2918 tcpstat.tcps_rttupdated++; 2919 if (tp->t_srtt != 0) { 2920 /* 2921 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2922 * after the binary point (scaled by 4), whereas 2923 * srtt is stored as fixed point with 5 bits after the 2924 * binary point (i.e., scaled by 32). The following magic 2925 * is equivalent to the smoothing algorithm in rfc793 with 2926 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2927 * point). 2928 */ 2929 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2930 (tp->t_srtt >> TCP_RTT_SHIFT); 2931 if ((tp->t_srtt += delta) <= 0) 2932 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2933 /* 2934 * We accumulate a smoothed rtt variance (actually, a 2935 * smoothed mean difference), then set the retransmit 2936 * timer to smoothed rtt + 4 times the smoothed variance. 2937 * rttvar is stored as fixed point with 4 bits after the 2938 * binary point (scaled by 16). The following is 2939 * equivalent to rfc793 smoothing with an alpha of .75 2940 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2941 * rfc793's wired-in beta. 2942 */ 2943 if (delta < 0) 2944 delta = -delta; 2945 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2946 if ((tp->t_rttvar += delta) <= 0) 2947 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2948 } else { 2949 /* 2950 * No rtt measurement yet - use the unsmoothed rtt. 2951 * Set the variance to half the rtt (so our first 2952 * retransmit happens at 3*rtt). 2953 */ 2954 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2955 tp->t_rttvar = (rtt + 1) << 2956 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2957 } 2958 tp->t_rtttime = 0; 2959 tp->t_rxtshift = 0; 2960 2961 /* 2962 * the retransmit should happen at rtt + 4 * rttvar. 2963 * Because of the way we do the smoothing, srtt and rttvar 2964 * will each average +1/2 tick of bias. When we compute 2965 * the retransmit timer, we want 1/2 tick of rounding and 2966 * 1 extra tick because of +-1/2 tick uncertainty in the 2967 * firing of the timer. The bias will give us exactly the 2968 * 1.5 tick we need. But, because the bias is 2969 * statistical, we have to test that we don't drop below 2970 * the minimum feasible timer (which is 2 ticks). 2971 */ 2972 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2973 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2974 2975 /* 2976 * We received an ack for a packet that wasn't retransmitted; 2977 * it is probably safe to discard any error indications we've 2978 * received recently. This isn't quite right, but close enough 2979 * for now (a route might have failed after we sent a segment, 2980 * and the return path might not be symmetrical). 
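 *
 * As a worked example of the smoothing above: with srtt
 * converged at 10 ticks (t_srtt == 320 at the 32x scale), a
 * fresh 10-tick sample gives delta == 40 - 40 == 0, so srtt
 * is unchanged while rttvar decays to 3/4 of its value,
 * reflecting the lower measured variance.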
2981 */
2982 tp->t_softerror = 0;
2983 }
2984
2985 /*
2986 * Determine a reasonable value for maxseg size.
2987 * If the route is known, check route for mtu.
2988 * If none, use an mss that can be handled on the outgoing
2989 * interface without forcing IP to fragment; if bigger than
2990 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2991 * to utilize large mbufs. If no route is found, route has no mtu,
2992 * or the destination isn't local, use a default, hopefully conservative
2993 * size (usually 512 or the default IP max size, but no more than the mtu
2994 * of the interface), as we can't discover anything about intervening
2995 * gateways or networks. We also initialize the congestion/slow start
2996 * window to be a single segment if the destination isn't local.
2997 * While looking at the routing entry, we also initialize other path-dependent
2998 * parameters from pre-set or cached values in the routing entry.
2999 *
3000 * Also take into account the space needed for options that we
3001 * send regularly. Make maxseg shorter by that amount to assure
3002 * that we can send maxseg amount of data even when the options
3003 * are present. Store the upper limit of the length of options plus
3004 * data in maxopd.
3005 *
3006 * NOTE: offer == -1 indicates that the maxseg size changed due to
3007 * Path MTU discovery.
3008 */
3009 int
3010 tcp_mss(struct tcpcb *tp, int offer)
3011 {
3012 struct rtentry *rt;
3013 struct ifnet *ifp;
3014 int mss, mssopt;
3015 int iphlen;
3016 struct inpcb *inp;
3017
3018 inp = tp->t_inpcb;
3019
3020 mssopt = mss = tcp_mssdflt;
3021
3022 rt = in_pcbrtentry(inp);
3023
3024 if (rt == NULL)
3025 goto out;
3026
3027 ifp = rt->rt_ifp;
3028
3029 switch (tp->pf) {
3030 #ifdef INET6
3031 case AF_INET6:
3032 iphlen = sizeof(struct ip6_hdr);
3033 break;
3034 #endif
3035 case AF_INET:
3036 iphlen = sizeof(struct ip);
3037 break;
3038 default:
3039 /* the family does not support path MTU discovery */
3040 goto out;
3041 }
3042
3043 /*
3044 * if there's an mtu associated with the route and we support
3045 * path MTU discovery for the underlying protocol family, use it.
3046 */
3047 if (rt->rt_rmx.rmx_mtu) {
3048 /*
3049 * One may wish to lower MSS to take into account options,
3050 * especially security-related options.
3051 */
3052 if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
3053 /*
3054 * RFC2460 section 5, last paragraph: if path MTU is
3055 * smaller than 1280, use 1280 as packet size and
3056 * attach fragment header.
3057 */
3058 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
3059 sizeof(struct tcphdr);
3060 } else {
3061 mss = rt->rt_rmx.rmx_mtu - iphlen -
3062 sizeof(struct tcphdr);
3063 }
3064 } else if (!ifp) {
3065 /*
3066 * ifp may be null and rmx_mtu may be zero in certain
3067 * v6 cases (e.g., if ND wasn't able to resolve the
3068 * destination host).
3069 */
3070 goto out;
3071 } else if (ifp->if_flags & IFF_LOOPBACK) {
3072 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3073 } else if (tp->pf == AF_INET) {
3074 if (ip_mtudisc)
3075 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3076 }
3077 #ifdef INET6
3078 else if (tp->pf == AF_INET6) {
3079 /*
3080 * for IPv6, path MTU discovery is always turned on,
3081 * or the node must use packet size <= 1280.
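 *
 * E.g., on a 1500-byte ethernet this works out to a 1440-byte
 * mss for IPv6 (40-byte header) versus 1460 for IPv4.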
3082 */ 3083 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr); 3084 } 3085 #endif /* INET6 */ 3086 3087 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3088 if (offer != -1) { 3089 #ifndef INET6 3090 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3091 #else 3092 if (tp->pf == AF_INET6) 3093 mssopt = IN6_LINKMTU(ifp) - iphlen - 3094 sizeof(struct tcphdr); 3095 else 3096 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3097 #endif 3098 3099 mssopt = max(tcp_mssdflt, mssopt); 3100 } 3101 3102 out: 3103 /* 3104 * The current mss, t_maxseg, is initialized to the default value. 3105 * If we compute a smaller value, reduce the current mss. 3106 * If we compute a larger value, return it for use in sending 3107 * a max seg size option, but don't store it for use 3108 * unless we received an offer at least that large from peer. 3109 * 3110 * However, do not accept offers lower than the minimum of 3111 * the interface MTU and 216. 3112 */ 3113 if (offer > 0) 3114 tp->t_peermss = offer; 3115 if (tp->t_peermss) 3116 mss = min(mss, max(tp->t_peermss, 216)); 3117 3118 /* sanity - at least max opt. space */ 3119 mss = max(mss, 64); 3120 3121 /* 3122 * maxopd stores the maximum length of data AND options 3123 * in a segment; maxseg is the amount of data in a normal 3124 * segment. We need to store this value (maxopd) apart 3125 * from maxseg, because now every segment carries options 3126 * and thus we normally have somewhat less data in segments. 3127 */ 3128 tp->t_maxopd = mss; 3129 3130 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3131 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3132 mss -= TCPOLEN_TSTAMP_APPA; 3133 #ifdef TCP_SIGNATURE 3134 if (tp->t_flags & TF_SIGNATURE) 3135 mss -= TCPOLEN_SIGLEN; 3136 #endif 3137 3138 if (offer == -1) { 3139 /* mss changed due to Path MTU discovery */ 3140 tp->t_flags &= ~TF_PMTUD_PEND; 3141 tp->t_pmtud_mtu_sent = 0; 3142 tp->t_pmtud_mss_acked = 0; 3143 if (mss < tp->t_maxseg) { 3144 /* 3145 * Follow suggestion in RFC 2414 to reduce the 3146 * congestion window by the ratio of the old 3147 * segment size to the new segment size. 3148 */ 3149 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3150 mss, mss); 3151 } 3152 } else if (tcp_do_rfc3390 == 2) { 3153 /* increase initial window */ 3154 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 3155 } else if (tcp_do_rfc3390) { 3156 /* increase initial window */ 3157 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3158 } else 3159 tp->snd_cwnd = mss; 3160 3161 tp->t_maxseg = mss; 3162 3163 return (offer != -1 ? mssopt : mss); 3164 } 3165 3166 u_int 3167 tcp_hdrsz(struct tcpcb *tp) 3168 { 3169 u_int hlen; 3170 3171 switch (tp->pf) { 3172 #ifdef INET6 3173 case AF_INET6: 3174 hlen = sizeof(struct ip6_hdr); 3175 break; 3176 #endif 3177 case AF_INET: 3178 hlen = sizeof(struct ip); 3179 break; 3180 default: 3181 hlen = 0; 3182 break; 3183 } 3184 hlen += sizeof(struct tcphdr); 3185 3186 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3187 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3188 hlen += TCPOLEN_TSTAMP_APPA; 3189 #ifdef TCP_SIGNATURE 3190 if (tp->t_flags & TF_SIGNATURE) 3191 hlen += TCPOLEN_SIGLEN; 3192 #endif 3193 return (hlen); 3194 } 3195 3196 /* 3197 * Set connection variables based on the effective MSS. 3198 * We are passed the TCPCB for the actual connection. If we 3199 * are the server, we are called by the compressed state engine 3200 * when the 3-way handshake is complete. 
If we are the client,
3201 * we are called when we receive the SYN,ACK from the server.
3202 *
3203 * NOTE: The t_maxseg value must be initialized in the TCPCB
3204 * before this routine is called!
3205 */
3206 void
3207 tcp_mss_update(struct tcpcb *tp)
3208 {
3209 int mss;
3210 u_long bufsize;
3211 struct rtentry *rt;
3212 struct socket *so;
3213
3214 so = tp->t_inpcb->inp_socket;
3215 mss = tp->t_maxseg;
3216
3217 rt = in_pcbrtentry(tp->t_inpcb);
3218
3219 if (rt == NULL)
3220 return;
3221
3222 bufsize = so->so_snd.sb_hiwat;
3223 if (bufsize < mss) {
3224 mss = bufsize;
3225 /* Update t_maxseg and t_maxopd */
3226 tcp_mss(tp, mss);
3227 } else {
3228 bufsize = roundup(bufsize, mss);
3229 if (bufsize > sb_max)
3230 bufsize = sb_max;
3231 (void)sbreserve(&so->so_snd, bufsize);
3232 }
3233
3234 bufsize = so->so_rcv.sb_hiwat;
3235 if (bufsize > mss) {
3236 bufsize = roundup(bufsize, mss);
3237 if (bufsize > sb_max)
3238 bufsize = sb_max;
3239 (void)sbreserve(&so->so_rcv, bufsize);
3240 }
3241
3242 }
3243
3244 #if defined (TCP_SACK)
3245 /*
3246 * Checks for partial ack. If partial ack arrives, force the retransmission
3247 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3248 * 1. By setting snd_nxt to th_ack, this forces the retransmission timer to
3249 * be started again. If the ack advances at least to tp->snd_last, return 0.
3250 */
3251 int
3252 tcp_newreno(struct tcpcb *tp, struct tcphdr *th)
3253 {
3254 if (SEQ_LT(th->th_ack, tp->snd_last)) {
3255 /*
3256 * snd_una has not been updated and the socket send buffer
3257 * not yet drained of the acked data, so we have to leave
3258 * snd_una as it was to get the correct data offset in
3259 * tcp_output().
3260 */
3261 tcp_seq onxt = tp->snd_nxt;
3262 u_long ocwnd = tp->snd_cwnd;
3263 TCP_TIMER_DISARM(tp, TCPT_REXMT);
3264 tp->t_rtttime = 0;
3265 tp->snd_nxt = th->th_ack;
3266 /*
3267 * Set snd_cwnd to one segment beyond acknowledged offset
3268 * (tp->snd_una not yet updated when this function is called)
3269 */
3270 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3271 (void) tcp_output(tp);
3272 tp->snd_cwnd = ocwnd;
3273 if (SEQ_GT(onxt, tp->snd_nxt))
3274 tp->snd_nxt = onxt;
3275 /*
3276 * Partial window deflation. Relies on fact that tp->snd_una
3277 * not updated yet.
3278 */
3279 if (tp->snd_cwnd > th->th_ack - tp->snd_una)
3280 tp->snd_cwnd -= th->th_ack - tp->snd_una;
3281 else
3282 tp->snd_cwnd = 0;
3283 tp->snd_cwnd += tp->t_maxseg;
3284
3285 return 1;
3286 }
3287 return 0;
3288 }
3289 #endif /* TCP_SACK */
3290
3291 int
3292 tcp_mss_adv(struct ifnet *ifp, int af)
3293 {
3294 int mss = 0;
3295 int iphlen;
3296
3297 switch (af) {
3298 case AF_INET:
3299 if (ifp != NULL)
3300 mss = ifp->if_mtu;
3301 iphlen = sizeof(struct ip);
3302 break;
3303 #ifdef INET6
3304 case AF_INET6:
3305 if (ifp != NULL)
3306 mss = IN6_LINKMTU(ifp);
3307 iphlen = sizeof(struct ip6_hdr);
3308 break;
3309 #endif
3310 }
3311 mss = mss - iphlen - sizeof(struct tcphdr);
3312 return (max(mss, tcp_mssdflt));
3313 }
3314
3315 /*
3316 * TCP compressed state engine. Currently used to hold compressed
3317 * state for SYN_RECEIVED.
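 *
 * Each embryonic connection is held in a small syn_cache
 * entry, hashed below over the source address and the port
 * pair, instead of a full tcpcb; that is what keeps a SYN
 * flood from exhausting connection blocks.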
3318 */ 3319 3320 u_long syn_cache_count; 3321 u_int32_t syn_hash1, syn_hash2; 3322 3323 #define SYN_HASH(sa, sp, dp) \ 3324 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3325 ((u_int32_t)(sp)))^syn_hash2))) 3326 #ifndef INET6 3327 #define SYN_HASHALL(hash, src, dst) \ 3328 do { \ 3329 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3330 ((struct sockaddr_in *)(src))->sin_port, \ 3331 ((struct sockaddr_in *)(dst))->sin_port); \ 3332 } while (/*CONSTCOND*/ 0) 3333 #else 3334 #define SYN_HASH6(sa, sp, dp) \ 3335 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3336 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3337 & 0x7fffffff) 3338 3339 #define SYN_HASHALL(hash, src, dst) \ 3340 do { \ 3341 switch ((src)->sa_family) { \ 3342 case AF_INET: \ 3343 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3344 ((struct sockaddr_in *)(src))->sin_port, \ 3345 ((struct sockaddr_in *)(dst))->sin_port); \ 3346 break; \ 3347 case AF_INET6: \ 3348 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 3349 ((struct sockaddr_in6 *)(src))->sin6_port, \ 3350 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 3351 break; \ 3352 default: \ 3353 hash = 0; \ 3354 } \ 3355 } while (/*CONSTCOND*/0) 3356 #endif /* INET6 */ 3357 3358 void 3359 syn_cache_rm(struct syn_cache *sc) 3360 { 3361 sc->sc_flags |= SCF_DEAD; 3362 TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, 3363 sc, sc_bucketq); 3364 sc->sc_tp = NULL; 3365 LIST_REMOVE(sc, sc_tpq); 3366 tcp_syn_cache[sc->sc_bucketidx].sch_length--; 3367 timeout_del(&sc->sc_timer); 3368 syn_cache_count--; 3369 } 3370 3371 void 3372 syn_cache_put(struct syn_cache *sc) 3373 { 3374 if (sc->sc_ipopts) 3375 (void) m_free(sc->sc_ipopts); 3376 if (sc->sc_route4.ro_rt != NULL) 3377 RTFREE(sc->sc_route4.ro_rt); 3378 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3379 timeout_add(&sc->sc_timer, 0); 3380 } 3381 3382 struct pool syn_cache_pool; 3383 3384 /* 3385 * We don't estimate RTT with SYNs, so each packet starts with the default 3386 * RTT and each timer step has a fixed timeout value. 3387 */ 3388 #define SYN_CACHE_TIMER_ARM(sc) \ 3389 do { \ 3390 TCPT_RANGESET((sc)->sc_rxtcur, \ 3391 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3392 TCPTV_REXMTMAX); \ 3393 if (!timeout_initialized(&(sc)->sc_timer)) \ 3394 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3395 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3396 } while (/*CONSTCOND*/0) 3397 3398 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3399 3400 void 3401 syn_cache_init() 3402 { 3403 int i; 3404 3405 /* Initialize the hash buckets. */ 3406 for (i = 0; i < tcp_syn_cache_size; i++) 3407 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3408 3409 /* Initialize the syn cache pool. */ 3410 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3411 "synpl", NULL); 3412 } 3413 3414 void 3415 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3416 { 3417 struct syn_cache_head *scp; 3418 struct syn_cache *sc2; 3419 int s; 3420 3421 /* 3422 * If there are no entries in the hash table, reinitialize 3423 * the hash secrets. 
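 *
 * (Rekeying only while the cache is empty means no live
 * entry's bucket index is invalidated by the new secrets.)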
3424 */ 3425 if (syn_cache_count == 0) { 3426 syn_hash1 = arc4random(); 3427 syn_hash2 = arc4random(); 3428 } 3429 3430 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3431 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3432 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3433 3434 /* 3435 * Make sure that we don't overflow the per-bucket 3436 * limit or the total cache size limit. 3437 */ 3438 s = splsoftnet(); 3439 if (scp->sch_length >= tcp_syn_bucket_limit) { 3440 tcpstat.tcps_sc_bucketoverflow++; 3441 /* 3442 * The bucket is full. Toss the oldest element in the 3443 * bucket. This will be the first entry in the bucket. 3444 */ 3445 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3446 #ifdef DIAGNOSTIC 3447 /* 3448 * This should never happen; we should always find an 3449 * entry in our bucket. 3450 */ 3451 if (sc2 == NULL) 3452 panic("syn_cache_insert: bucketoverflow: impossible"); 3453 #endif 3454 syn_cache_rm(sc2); 3455 syn_cache_put(sc2); 3456 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3457 struct syn_cache_head *scp2, *sce; 3458 3459 tcpstat.tcps_sc_overflowed++; 3460 /* 3461 * The cache is full. Toss the oldest entry in the 3462 * first non-empty bucket we can find. 3463 * 3464 * XXX We would really like to toss the oldest 3465 * entry in the cache, but we hope that this 3466 * condition doesn't happen very often. 3467 */ 3468 scp2 = scp; 3469 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3470 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3471 for (++scp2; scp2 != scp; scp2++) { 3472 if (scp2 >= sce) 3473 scp2 = &tcp_syn_cache[0]; 3474 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3475 break; 3476 } 3477 #ifdef DIAGNOSTIC 3478 /* 3479 * This should never happen; we should always find a 3480 * non-empty bucket. 3481 */ 3482 if (scp2 == scp) 3483 panic("syn_cache_insert: cacheoverflow: " 3484 "impossible"); 3485 #endif 3486 } 3487 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3488 syn_cache_rm(sc2); 3489 syn_cache_put(sc2); 3490 } 3491 3492 /* 3493 * Initialize the entry's timer. 3494 */ 3495 sc->sc_rxttot = 0; 3496 sc->sc_rxtshift = 0; 3497 SYN_CACHE_TIMER_ARM(sc); 3498 3499 /* Link it from tcpcb entry */ 3500 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3501 3502 /* Put it into the bucket. */ 3503 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3504 scp->sch_length++; 3505 syn_cache_count++; 3506 3507 tcpstat.tcps_sc_added++; 3508 splx(s); 3509 } 3510 3511 /* 3512 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3513 * If we have retransmitted an entry the maximum number of times, expire 3514 * that entry. 3515 */ 3516 void 3517 syn_cache_timer(void *arg) 3518 { 3519 struct syn_cache *sc = arg; 3520 int s; 3521 3522 s = splsoftnet(); 3523 if (sc->sc_flags & SCF_DEAD) { 3524 splx(s); 3525 return; 3526 } 3527 3528 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3529 /* Drop it -- too many retransmissions. */ 3530 goto dropit; 3531 } 3532 3533 /* 3534 * Compute the total amount of time this entry has 3535 * been on a queue. If this entry has been on longer 3536 * than the keep alive timer would allow, expire it. 3537 */ 3538 sc->sc_rxttot += sc->sc_rxtcur; 3539 if (sc->sc_rxttot >= tcptv_keep_init) 3540 goto dropit; 3541 3542 tcpstat.tcps_sc_retransmitted++; 3543 (void) syn_cache_respond(sc, NULL); 3544 3545 /* Advance the timer back-off. 
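 * SYN_CACHE_TIMER_ARM() below then waits TCPTV_SRTTDFLT scaled
 * by tcp_backoff[sc_rxtshift], the usual exponential schedule;
 * the entry is dropped for good once sc_rxttot passes
 * tcptv_keep_init above.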
*/
3546 sc->sc_rxtshift++;
3547 SYN_CACHE_TIMER_ARM(sc);
3548
3549 splx(s);
3550 return;
3551
3552 dropit:
3553 tcpstat.tcps_sc_timed_out++;
3554 syn_cache_rm(sc);
3555 syn_cache_put(sc);
3556 splx(s);
3557 }
3558
3559 void
3560 syn_cache_reaper(void *arg)
3561 {
3562 struct syn_cache *sc = arg;
3563 int s;
3564
3565 s = splsoftnet();
3566 pool_put(&syn_cache_pool, (sc));
3567 splx(s);
3568 return;
3569 }
3570
3571 /*
3572 * Remove the syn cache entries created by the specified tcb entry,
3573 * since it makes no sense to keep them
3574 * (if there's no tcb entry, the syn cache entries will never be used)
3575 */
3576 void
3577 syn_cache_cleanup(struct tcpcb *tp)
3578 {
3579 struct syn_cache *sc, *nsc;
3580 int s;
3581
3582 s = splsoftnet();
3583
3584 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3585 nsc = LIST_NEXT(sc, sc_tpq);
3586
3587 #ifdef DIAGNOSTIC
3588 if (sc->sc_tp != tp)
3589 panic("invalid sc_tp in syn_cache_cleanup");
3590 #endif
3591 syn_cache_rm(sc);
3592 syn_cache_put(sc);
3593 }
3594 /* just for safety */
3595 LIST_INIT(&tp->t_sc);
3596
3597 splx(s);
3598 }
3599
3600 /*
3601 * Find an entry in the syn cache.
3602 */
3603 struct syn_cache *
3604 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3605 struct syn_cache_head **headp, u_int rtableid)
3606 {
3607 struct syn_cache *sc;
3608 struct syn_cache_head *scp;
3609 u_int32_t hash;
3610 int s;
3611
3612 SYN_HASHALL(hash, src, dst);
3613
3614 scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3615 *headp = scp;
3616 s = splsoftnet();
3617 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3618 sc = TAILQ_NEXT(sc, sc_bucketq)) {
3619 if (sc->sc_hash != hash)
3620 continue;
3621 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3622 !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3623 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) {
3624 splx(s);
3625 return (sc);
3626 }
3627 }
3628 splx(s);
3629 return (NULL);
3630 }
3631
3632 /*
3633 * This function gets called when we receive an ACK for a
3634 * socket in the LISTEN state. We look up the connection
3635 * in the syn cache, and if it's there, we pull it out of
3636 * the cache and turn it into a full-blown connection in
3637 * the SYN-RECEIVED state.
3638 *
3639 * The return values may not be immediately obvious, and their effects
3640 * can be subtle, so here they are:
3641 *
3642 * NULL SYN was not found in cache; caller should drop the
3643 * packet and send an RST.
3644 *
3645 * -1 We were unable to create the new connection, and are
3646 * aborting it. An ACK,RST is being sent to the peer
3647 * (unless we got screwy sequence numbers; see below),
3648 * because the 3-way handshake has been completed. Caller
3649 * should not free the mbuf, since we may be using it. If
3650 * we are not, we will free it.
3651 *
3652 * Otherwise, the return value is a pointer to the new socket
3653 * associated with the connection.
3654 */
3655 struct socket *
3656 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3657 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m)
3658 {
3659 struct syn_cache *sc;
3660 struct syn_cache_head *scp;
3661 struct inpcb *inp = NULL;
3662 struct tcpcb *tp = NULL;
3663 struct mbuf *am;
3664 int s;
3665 struct socket *oso;
3666 #if NPF > 0
3667 struct pf_divert *divert = NULL;
3668 #endif
3669
3670 s = splsoftnet();
3671 if ((sc = syn_cache_lookup(src, dst, &scp,
3672 sotoinpcb(so)->inp_rtableid)) == NULL) {
3673 splx(s);
3674 return (NULL);
3675 }
3676
3677 /*
3678 * Verify the sequence and ack numbers.
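 * The ACK must equal sc_iss + 1 and the sequence number must
 * land just past sc_irs, inside the window we offered in the
 * SYN,ACK.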
Try getting the correct 3679 * response again. 3680 */ 3681 if ((th->th_ack != sc->sc_iss + 1) || 3682 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3683 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3684 (void) syn_cache_respond(sc, m); 3685 splx(s); 3686 return ((struct socket *)(-1)); 3687 } 3688 3689 /* Remove this cache entry */ 3690 syn_cache_rm(sc); 3691 splx(s); 3692 3693 /* 3694 * Ok, create the full blown connection, and set things up 3695 * as they would have been set up if we had created the 3696 * connection when the SYN arrived. If we can't create 3697 * the connection, abort it. 3698 */ 3699 oso = so; 3700 so = sonewconn(so, SS_ISCONNECTED); 3701 if (so == NULL) 3702 goto resetandabort; 3703 3704 inp = sotoinpcb(oso); 3705 3706 #ifdef IPSEC 3707 /* 3708 * We need to copy the required security levels 3709 * from the old pcb. Ditto for any other 3710 * IPsec-related information. 3711 */ 3712 { 3713 struct inpcb *newinp = sotoinpcb(so); 3714 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3715 sizeof(inp->inp_seclevel)); 3716 newinp->inp_secrequire = inp->inp_secrequire; 3717 if (inp->inp_ipo != NULL) { 3718 newinp->inp_ipo = inp->inp_ipo; 3719 inp->inp_ipo->ipo_ref_count++; 3720 } 3721 if (inp->inp_ipsec_remotecred != NULL) { 3722 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 3723 inp->inp_ipsec_remotecred->ref_count++; 3724 } 3725 if (inp->inp_ipsec_remoteauth != NULL) { 3726 newinp->inp_ipsec_remoteauth 3727 = inp->inp_ipsec_remoteauth; 3728 inp->inp_ipsec_remoteauth->ref_count++; 3729 } 3730 } 3731 #endif /* IPSEC */ 3732 #ifdef INET6 3733 /* 3734 * inp still has the OLD in_pcb stuff, set the 3735 * v6-related flags on the new guy, too. 3736 */ 3737 { 3738 int flags = inp->inp_flags; 3739 struct inpcb *oldinpcb = inp; 3740 3741 inp = sotoinpcb(so); 3742 inp->inp_flags |= (flags & INP_IPV6); 3743 if ((inp->inp_flags & INP_IPV6) != 0) { 3744 inp->inp_ipv6.ip6_hlim = 3745 oldinpcb->inp_ipv6.ip6_hlim; 3746 } 3747 } 3748 #else /* INET6 */ 3749 inp = sotoinpcb(so); 3750 #endif /* INET6 */ 3751 3752 #if NPF > 0 3753 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED && 3754 (divert = pf_find_divert(m)) != NULL) 3755 inp->inp_rtableid = divert->rdomain; 3756 else 3757 #endif 3758 /* inherit rtable from listening socket */ 3759 inp->inp_rtableid = sc->sc_rtableid; 3760 3761 inp->inp_lport = th->th_dport; 3762 switch (src->sa_family) { 3763 #ifdef INET6 3764 case AF_INET6: 3765 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr; 3766 break; 3767 #endif /* INET6 */ 3768 case AF_INET: 3769 3770 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 3771 inp->inp_options = ip_srcroute(m); 3772 if (inp->inp_options == NULL) { 3773 inp->inp_options = sc->sc_ipopts; 3774 sc->sc_ipopts = NULL; 3775 } 3776 break; 3777 } 3778 in_pcbrehash(inp); 3779 3780 /* 3781 * Give the new socket our cached route reference. 
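 *
 * (The struct assignment below hands over the rtentry
 * reference held by the cache entry, so sc_route4.ro_rt is
 * cleared afterwards instead of being RTFREE'd.)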
3782 */
3783 if (src->sa_family == AF_INET)
3784 inp->inp_route = sc->sc_route4; /* struct assignment */
3785 #ifdef INET6
3786 else
3787 inp->inp_route6 = sc->sc_route6;
3788 #endif
3789 sc->sc_route4.ro_rt = NULL;
3790
3791 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
3792 if (am == NULL)
3793 goto resetandabort;
3794 am->m_len = src->sa_len;
3795 bcopy(src, mtod(am, caddr_t), src->sa_len);
3796
3797 switch (src->sa_family) {
3798 case AF_INET:
3799 /* drop IPv4 packet to AF_INET6 socket */
3800 if (inp->inp_flags & INP_IPV6) {
3801 (void) m_free(am);
3802 goto resetandabort;
3803 }
3804 if (in_pcbconnect(inp, am)) {
3805 (void) m_free(am);
3806 goto resetandabort;
3807 }
3808 break;
3809 #ifdef INET6
3810 case AF_INET6:
3811 if (in6_pcbconnect(inp, am)) {
3812 (void) m_free(am);
3813 goto resetandabort;
3814 }
3815 break;
3816 #endif
3817 }
3818 (void) m_free(am);
3819
3820 tp = intotcpcb(inp);
3821 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
3822 if (sc->sc_request_r_scale != 15) {
3823 tp->requested_s_scale = sc->sc_requested_s_scale;
3824 tp->request_r_scale = sc->sc_request_r_scale;
3825 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3826 }
3827 if (sc->sc_flags & SCF_TIMESTAMP)
3828 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3829
3830 tp->t_template = tcp_template(tp);
3831 if (tp->t_template == 0) {
3832 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
3833 so = NULL;
3834 m_freem(m);
3835 goto abort;
3836 }
3837 #ifdef TCP_SACK
3838 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
3839 #endif
3840
3841 tp->ts_modulate = sc->sc_modulate;
3842 tp->ts_recent = sc->sc_timestamp;
3843 tp->iss = sc->sc_iss;
3844 tp->irs = sc->sc_irs;
3845 tcp_sendseqinit(tp);
3846 #if defined (TCP_SACK) || defined(TCP_ECN)
3847 tp->snd_last = tp->snd_una;
3848 #endif /* TCP_SACK */
3849 #if defined(TCP_SACK) && defined(TCP_FACK)
3850 tp->snd_fack = tp->snd_una;
3851 tp->retran_data = 0;
3852 tp->snd_awnd = 0;
3853 #endif /* TCP_FACK */
3854 #ifdef TCP_ECN
3855 if (sc->sc_flags & SCF_ECN_PERMIT) {
3856 tp->t_flags |= TF_ECN_PERMIT;
3857 tcpstat.tcps_ecn_accepts++;
3858 }
3859 #endif
3860 #ifdef TCP_SACK
3861 if (sc->sc_flags & SCF_SACK_PERMIT)
3862 tp->t_flags |= TF_SACK_PERMIT;
3863 #endif
3864 #ifdef TCP_SIGNATURE
3865 if (sc->sc_flags & SCF_SIGNATURE)
3866 tp->t_flags |= TF_SIGNATURE;
3867 #endif
3868 tcp_rcvseqinit(tp);
3869 tp->t_state = TCPS_SYN_RECEIVED;
3870 tp->t_rcvtime = tcp_now;
3871 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
3872 tcpstat.tcps_accepts++;
3873
3874 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
3875 if (sc->sc_peermaxseg)
3876 tcp_mss_update(tp);
3877 /* Reset initial window to 1 segment for retransmit */
3878 if (sc->sc_rxtshift > 0)
3879 tp->snd_cwnd = tp->t_maxseg;
3880 tp->snd_wl1 = sc->sc_irs;
3881 tp->rcv_up = sc->sc_irs + 1;
3882
3883 /*
3884 * This is what would have happened in tcp_output() when
3885 * the SYN,ACK was sent.
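 *
 * In particular, iss+1 below accounts for the sequence space
 * consumed by our SYN, and rcv_adv records the window edge
 * that the SYN,ACK advertised.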
3886 */ 3887 tp->snd_up = tp->snd_una; 3888 tp->snd_max = tp->snd_nxt = tp->iss+1; 3889 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3890 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3891 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3892 tp->last_ack_sent = tp->rcv_nxt; 3893 3894 tcpstat.tcps_sc_completed++; 3895 syn_cache_put(sc); 3896 return (so); 3897 3898 resetandabort: 3899 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3900 m->m_pkthdr.ph_rtableid); 3901 m_freem(m); 3902 abort: 3903 if (so != NULL) 3904 (void) soabort(so); 3905 syn_cache_put(sc); 3906 tcpstat.tcps_sc_aborted++; 3907 return ((struct socket *)(-1)); 3908 } 3909 3910 /* 3911 * This function is called when we get a RST for a 3912 * non-existent connection, so that we can see if the 3913 * connection is in the syn cache. If it is, zap it. 3914 */ 3915 3916 void 3917 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3918 u_int rtableid) 3919 { 3920 struct syn_cache *sc; 3921 struct syn_cache_head *scp; 3922 int s = splsoftnet(); 3923 3924 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) { 3925 splx(s); 3926 return; 3927 } 3928 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3929 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3930 splx(s); 3931 return; 3932 } 3933 syn_cache_rm(sc); 3934 splx(s); 3935 tcpstat.tcps_sc_reset++; 3936 syn_cache_put(sc); 3937 } 3938 3939 void 3940 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3941 u_int rtableid) 3942 { 3943 struct syn_cache *sc; 3944 struct syn_cache_head *scp; 3945 int s; 3946 3947 s = splsoftnet(); 3948 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) { 3949 splx(s); 3950 return; 3951 } 3952 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3953 if (ntohl (th->th_seq) != sc->sc_iss) { 3954 splx(s); 3955 return; 3956 } 3957 3958 /* 3959 * If we've retransmitted 3 times and this is our second error, 3960 * we remove the entry. Otherwise, we allow it to continue on. 3961 * This prevents us from incorrectly nuking an entry during a 3962 * spurious network outage. 3963 * 3964 * See tcp_notify(). 3965 */ 3966 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3967 sc->sc_flags |= SCF_UNREACH; 3968 splx(s); 3969 return; 3970 } 3971 3972 syn_cache_rm(sc); 3973 splx(s); 3974 tcpstat.tcps_sc_unreach++; 3975 syn_cache_put(sc); 3976 } 3977 3978 /* 3979 * Given a LISTEN socket and an inbound SYN request, add 3980 * this to the syn cache, and send back a segment: 3981 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3982 * to the source. 3983 * 3984 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3985 * Doing so would require that we hold onto the data and deliver it 3986 * to the application. However, if we are the target of a SYN-flood 3987 * DoS attack, an attacker could send data which would eventually 3988 * consume all available buffer space if it were ACKed. By not ACKing 3989 * the data, we avoid this DoS scenario. 3990 */ 3991 3992 int 3993 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3994 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3995 struct tcp_opt_info *oi, tcp_seq *issp) 3996 { 3997 struct tcpcb tb, *tp; 3998 long win; 3999 struct syn_cache *sc; 4000 struct syn_cache_head *scp; 4001 struct mbuf *ipopts; 4002 4003 tp = sototcpcb(so); 4004 4005 /* 4006 * RFC1122 4.2.3.10, p. 
	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
		sc->sc_flags |= SCF_UNREACH;
		splx(s);
		return;
	}

	syn_cache_rm(sc);
	splx(s);
	tcpstat.tcps_sc_unreach++;
	syn_cache_put(sc);
}

/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */

int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
    struct tcp_opt_info *oi, tcp_seq *issp)
{
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;

	tp = sototcpcb(so);

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 *
	 * Note this check is performed in tcp_input() very early on.
	 */

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

	bzero(&tb, sizeof(tb));
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE)) {
#else
	if (optp) {
#endif
		tb.pf = tp->pf;
#ifdef TCP_SACK
		tb.sack_enable = tp->sack_enable;
#endif
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		if (tp->t_flags & TF_SIGNATURE)
			tb.t_flags |= TF_SIGNATURE;
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
		    sotoinpcb(so)->inp_rtableid))
			return (-1);
	}

	switch (src->sa_family) {
#ifdef INET
	case AF_INET:
		/*
		 * Remember the IP options, if any.
		 */
		ipopts = ip_srcroute(m);
		break;
#endif
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	if ((sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid))
	    != NULL) {
		tcpstat.tcps_sc_dupesyn++;
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void) m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		sc->sc_timestamp = tb.ts_recent;
		if (syn_cache_respond(sc, m) == 0) {
			tcpstat.tcps_sndacks++;
			tcpstat.tcps_sndtotal++;
		}
		return (0);
	}

	sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
	if (sc == NULL) {
		if (ipopts)
			(void) m_free(ipopts);
		return (-1);
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	bcopy(src, &sc->sc_src, src->sa_len);
	bcopy(dst, &sc->sc_dst, dst->sa_len);
	sc->sc_rtableid = sotoinpcb(so)->inp_rtableid;
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;

	sc->sc_iss = issp ? *issp : arc4random();
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
	    m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timestamp = tb.ts_recent;
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
		sc->sc_flags |= SCF_TIMESTAMP;
		sc->sc_modulate = arc4random();
	}
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		/*
		 * Pick the smallest possible scaling factor that
		 * will still allow us to scale up to sb_max.
		 *
		 * We do this because there are broken firewalls that
		 * will corrupt the window scale option, leading to
		 * the other endpoint believing that our advertised
		 * window is unscaled.  At scale factors larger than
		 * 5 the unscaled window will drop below 1500 bytes,
		 * leading to serious problems when traversing these
		 * broken firewalls.
		 *
		 * With the default sbmax of 256K, a scale factor
		 * of 3 will be chosen by this algorithm.  Those who
		 * choose a larger sbmax should watch out
		 * for the compatibility problems mentioned above.
		 *
		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
		 * or <SYN,ACK>) segment itself is never scaled.
		 */
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
			sc->sc_request_r_scale++;
	} else {
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
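	/*
	 * E.g. (illustrative): with TCP_MAXWIN == 65535 and the
	 * default sb_max of 256K (262144), 65535 << 2 == 262140 is
	 * still below sb_max but 65535 << 3 is not, so the loop above
	 * settles on a shift of 3.
	 */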
#ifdef TCP_ECN
	/*
	 * if both ECE and CWR flag bits are set, peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		syn_cache_put(sc);
		tcpstat.tcps_sc_dropped++;
	}

	return (0);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;
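
	/*
	 * Illustrative sizing (not from the original source): an IPv4
	 * SYN,ACK carrying MSS, SACK-permitted, window scale and
	 * timestamp options has optlen = 4 + 4 + 4 + 12 == 24, so
	 * tlen = 20 + 20 + 24 == 64 bytes.
	 */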

	/*
	 * Create the IP+TCP header from scratch.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		th = NULL;	/* not reached; family was validated above */
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */
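
	/*
	 * Wire layout of the option block assembled below when all
	 * four options are present (illustrative; kinds per RFC 793,
	 * RFC 1323 and RFC 2018: MSS=2, window scale=3,
	 * SACK-permitted=4, timestamps=8):
	 *   02 04 mm mm  01 01 04 02  01 03 03 ss  01 01 08 0a <TSval><TSecr>
	 */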

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;	/* MD5 digest (RFC 2385) is 16 bytes */

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in network byte order, as the htons() here shows.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    NULL, NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}