/*	$OpenBSD: tcp_input.c,v 1.284 2014/11/20 11:05:19 mpi Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct ifnet *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
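
/*
 * Illustrative note on the modulo comparisons above: with 32-bit
 * wraparound, TSTMP_LT(0xfffffff0, 0x10) is true because
 * (int)(0xfffffff0 - 0x10) == (int)0xffffffe0 < 0, i.e. 0xfffffff0 is
 * "before" 0x10 even though it is numerically larger.  SEQ_LT() and
 * friends rely on the same signed-difference trick for sequence numbers.
 */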

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0, \
		    tp->t_inpcb->inp_rtableid); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (m && (m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif && \
	    (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;
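
	/*
	 * Trimming example (illustrative numbers): if the queue holds a
	 * segment covering 100-199 (seq 100, len 100) and this segment
	 * arrives with seq 150, len 100, the preceding-overlap check
	 * above computes i = 100 + 100 - 150 = 50 and drops the first
	 * 50 bytes, leaving 200-249.  The loop below then trims or
	 * drops any queued segments that the remainder overlaps.
	 */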

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbound/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		tcpstat.tcps_inswcsum++;
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
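
	/*
	 * Illustration of the "appendix A" fast path above: the
	 * recommended RFC 1323 layout is NOP, NOP, TIMESTAMP, length 10,
	 * 4-byte TSval, 4-byte TSecr, 12 bytes total
	 * (TCPOLEN_TSTAMP_APPA).  Read as a 32-bit word, the first four
	 * bytes are 0x0101080a (TCPOPT_TSTAMP_HDR), which is why a
	 * single word compare suffices before pulling out the two values.
	 */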
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	if (m->m_pkthdr.pf.statekey) {
		inp = m->m_pkthdr.pf.statekey->inp;
		if (inp && inp->inp_pf_sk)
			KASSERT(m->m_pkthdr.pf.statekey == inp->inp_pf_sk);
	}
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
#if NPF > 0
		if (m->m_pkthdr.pf.statekey && inp) {
			m->m_pkthdr.pf.statekey->inp = inp;
			inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
		}
#endif
	}
	if (inp == NULL) {
		int	inpl_reverse = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_reverse = 1;
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}
	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp)->t_inpcb == inp);

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		goto drop;

	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
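
	/*
	 * Scaling example (illustrative): with snd_scale 6, an advertised
	 * th_win of 1024 yields an effective window of 1024 << 6 = 65536
	 * bytes.  SYN segments are exempt because RFC 1323 never scales
	 * the window field of a SYN; the scale factor only takes effect
	 * once both sides have exchanged the window scale option.
	 */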

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for peer to send a RST, and
				 * RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says like
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;

					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
						tp = NULL;
						goto dropwithreset;
					}
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1)
					goto drop;
				return;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	if (m->m_pkthdr.pf.statekey && !m->m_pkthdr.pf.statekey->inp &&
	    !inp->inp_pf_sk) {
		m->m_pkthdr.pf.statekey->inp = inp;
		inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
	}
	/* The statekey has finished finding the inp, it is no longer needed. */
	m->m_pkthdr.pf.statekey = NULL;
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat.tcps_rcvnosec++;
		goto drop;
	}

	/* Latch SA */
	if (inp->inp_tdb_in != tdb) {
		if (tdb) {
			tdb_add_inp(tdb, inp, 1);
			if (inp->inp_ipo == NULL) {
				inp->inp_ipo = ipsec_add_policy(inp, af,
				    IPSP_DIRECTION_OUT);
				if (inp->inp_ipo == NULL) {
					goto drop;
				}
			}
			if (inp->inp_ipo->ipo_dstid == NULL &&
			    tdb->tdb_srcid != NULL) {
				inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
				tdb->tdb_srcid->ref_count++;
			}
			if (inp->inp_ipsec_remotecred == NULL &&
			    tdb->tdb_remote_cred != NULL) {
				inp->inp_ipsec_remotecred =
				    tdb->tdb_remote_cred;
				tdb->tdb_remote_cred->ref_count++;
			}
			if (inp->inp_ipsec_remoteauth == NULL &&
			    tdb->tdb_remote_auth != NULL) {
				inp->inp_ipsec_remoteauth =
				    tdb->tdb_remote_auth;
				tdb->tdb_remote_auth->ref_count++;
			}
		} else { /* Just reset */
			TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
			    inp_tdb_in_next);
			inp->inp_tdb_in = NULL;
		}
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
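	/*
	 * Concretely (illustrative): a bulk receiver returning pure ACKs
	 * exercises the first case below; each ACK advances snd_una, the
	 * RTT is timed from the echoed timestamp, acked data is dropped
	 * from so_snd and the sending process is woken, all without
	 * entering the full state machine.  A bulk sender streaming
	 * in-order data to us exercises the second case.
	 */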
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(&so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
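
	/*
	 * A note on the TIME_WAIT reuse below (illustrative): the new iss
	 * is the old snd_nxt plus ((arc4random() & 0x7fffffff) | 0x8000),
	 * i.e. a random advance between 32768 and 2^31-1.  The lower
	 * bound keeps the new connection's sequence space clear of the
	 * old one; the upper bound keeps the advance positive in 32-bit
	 * sequence arithmetic, so SEQ_LT(old snd_nxt, iss) holds.
	 */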
1293 */ 1294 th->th_seq++; 1295 if (tlen > tp->rcv_wnd) { 1296 todrop = tlen - tp->rcv_wnd; 1297 m_adj(m, -todrop); 1298 tlen = tp->rcv_wnd; 1299 tiflags &= ~TH_FIN; 1300 tcpstat.tcps_rcvpackafterwin++; 1301 tcpstat.tcps_rcvbyteafterwin += todrop; 1302 } 1303 tp->snd_wl1 = th->th_seq - 1; 1304 tp->rcv_up = th->th_seq; 1305 goto step6; 1306 /* 1307 * If a new connection request is received while in TIME_WAIT, 1308 * drop the old connection and start over if the if the 1309 * timestamp or the sequence numbers are above the previous 1310 * ones. 1311 */ 1312 case TCPS_TIME_WAIT: 1313 if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) && 1314 ((opti.ts_present && 1315 TSTMP_LT(tp->ts_recent, opti.ts_val)) || 1316 SEQ_GT(th->th_seq, tp->rcv_nxt))) { 1317 #if NPF > 0 1318 /* 1319 * The socket will be recreated but the new state 1320 * has already been linked to the socket. Remove the 1321 * link between old socket and new state. 1322 */ 1323 if (inp->inp_pf_sk) { 1324 inp->inp_pf_sk->inp = NULL; 1325 inp->inp_pf_sk = NULL; 1326 } 1327 #endif 1328 /* 1329 * Advance the iss by at least 32768, but 1330 * clear the msb in order to make sure 1331 * that SEG_LT(snd_nxt, iss). 1332 */ 1333 iss = tp->snd_nxt + 1334 ((arc4random() & 0x7fffffff) | 0x8000); 1335 reuse = &iss; 1336 tp = tcp_close(tp); 1337 inp = NULL; 1338 goto findpcb; 1339 } 1340 } 1341 1342 /* 1343 * States other than LISTEN or SYN_SENT. 1344 * First check timestamp, if present. 1345 * Then check that at least some bytes of segment are within 1346 * receive window. If segment begins before rcv_nxt, 1347 * drop leading data (and SYN); if nothing left, just ack. 1348 * 1349 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1350 * and it's less than opti.ts_recent, drop it. 1351 */ 1352 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1353 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1354 1355 /* Check to see if ts_recent is over 24 days old. */ 1356 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1357 /* 1358 * Invalidate ts_recent. If this segment updates 1359 * ts_recent, the age will be reset later and ts_recent 1360 * will get a valid value. If it does not, setting 1361 * ts_recent to zero will at least satisfy the 1362 * requirement that zero be placed in the timestamp 1363 * echo reply when ts_recent isn't valid. The 1364 * age isn't reset until we get a valid ts_recent 1365 * because we don't want out-of-order segments to be 1366 * dropped when ts_recent is old. 1367 */ 1368 tp->ts_recent = 0; 1369 } else { 1370 tcpstat.tcps_rcvduppack++; 1371 tcpstat.tcps_rcvdupbyte += tlen; 1372 tcpstat.tcps_pawsdrop++; 1373 goto dropafterack; 1374 } 1375 } 1376 1377 todrop = tp->rcv_nxt - th->th_seq; 1378 if (todrop > 0) { 1379 if (tiflags & TH_SYN) { 1380 tiflags &= ~TH_SYN; 1381 th->th_seq++; 1382 if (th->th_urp > 1) 1383 th->th_urp--; 1384 else 1385 tiflags &= ~TH_URG; 1386 todrop--; 1387 } 1388 if (todrop > tlen || 1389 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1390 /* 1391 * Any valid FIN must be to the left of the 1392 * window. At this point, FIN must be a 1393 * duplicate or out-of-sequence, so drop it. 1394 */ 1395 tiflags &= ~TH_FIN; 1396 /* 1397 * Send ACK to resynchronize, and drop any data, 1398 * but keep on processing for RST or ACK. 
1399 */ 1400 tp->t_flags |= TF_ACKNOW; 1401 tcpstat.tcps_rcvdupbyte += todrop = tlen; 1402 tcpstat.tcps_rcvduppack++; 1403 } else { 1404 tcpstat.tcps_rcvpartduppack++; 1405 tcpstat.tcps_rcvpartdupbyte += todrop; 1406 } 1407 hdroptlen += todrop; /* drop from head afterwards */ 1408 th->th_seq += todrop; 1409 tlen -= todrop; 1410 if (th->th_urp > todrop) 1411 th->th_urp -= todrop; 1412 else { 1413 tiflags &= ~TH_URG; 1414 th->th_urp = 0; 1415 } 1416 } 1417 1418 /* 1419 * If new data are received on a connection after the 1420 * user processes are gone, then RST the other end. 1421 */ 1422 if ((so->so_state & SS_NOFDREF) && 1423 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1424 tp = tcp_close(tp); 1425 tcpstat.tcps_rcvafterclose++; 1426 goto dropwithreset; 1427 } 1428 1429 /* 1430 * If segment ends after window, drop trailing data 1431 * (and PUSH and FIN); if nothing left, just ACK. 1432 */ 1433 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1434 if (todrop > 0) { 1435 tcpstat.tcps_rcvpackafterwin++; 1436 if (todrop >= tlen) { 1437 tcpstat.tcps_rcvbyteafterwin += tlen; 1438 /* 1439 * If window is closed can only take segments at 1440 * window edge, and have to drop data and PUSH from 1441 * incoming segments. Continue processing, but 1442 * remember to ack. Otherwise, drop segment 1443 * and ack. 1444 */ 1445 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1446 tp->t_flags |= TF_ACKNOW; 1447 tcpstat.tcps_rcvwinprobe++; 1448 } else 1449 goto dropafterack; 1450 } else 1451 tcpstat.tcps_rcvbyteafterwin += todrop; 1452 m_adj(m, -todrop); 1453 tlen -= todrop; 1454 tiflags &= ~(TH_PUSH|TH_FIN); 1455 } 1456 1457 /* 1458 * If last ACK falls within this segment's sequence numbers, 1459 * record its timestamp if it's more recent. 1460 * Cf fix from Braden, see Stevens p. 870 1461 */ 1462 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1463 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1464 if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1465 ((tiflags & (TH_SYN|TH_FIN)) != 0))) 1466 tp->ts_recent = opti.ts_val; 1467 else 1468 tp->ts_recent = 0; 1469 tp->ts_recent_age = tcp_now; 1470 } 1471 1472 /* 1473 * If the RST bit is set examine the state: 1474 * SYN_RECEIVED STATE: 1475 * If passive open, return to LISTEN state. 1476 * If active open, inform user that connection was refused. 1477 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1478 * Inform user that connection was reset, and close tcb. 1479 * CLOSING, LAST_ACK, TIME_WAIT STATES 1480 * Close the tcb. 1481 */ 1482 if (tiflags & TH_RST) { 1483 if (th->th_seq != tp->last_ack_sent && 1484 th->th_seq != tp->rcv_nxt && 1485 th->th_seq != (tp->rcv_nxt + 1)) 1486 goto drop; 1487 1488 switch (tp->t_state) { 1489 case TCPS_SYN_RECEIVED: 1490 #ifdef TCP_ECN 1491 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1492 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1493 goto drop; 1494 #endif 1495 so->so_error = ECONNREFUSED; 1496 goto close; 1497 1498 case TCPS_ESTABLISHED: 1499 case TCPS_FIN_WAIT_1: 1500 case TCPS_FIN_WAIT_2: 1501 case TCPS_CLOSE_WAIT: 1502 so->so_error = ECONNRESET; 1503 close: 1504 tp->t_state = TCPS_CLOSED; 1505 tcpstat.tcps_drops++; 1506 tp = tcp_close(tp); 1507 goto drop; 1508 case TCPS_CLOSING: 1509 case TCPS_LAST_ACK: 1510 case TCPS_TIME_WAIT: 1511 tp = tcp_close(tp); 1512 goto drop; 1513 } 1514 } 1515 1516 /* 1517 * If a SYN is in the window, then this is an 1518 * error and we ACK and drop the packet. 
1519 */ 1520 if (tiflags & TH_SYN) 1521 goto dropafterack_ratelim; 1522 1523 /* 1524 * If the ACK bit is off we drop the segment and return. 1525 */ 1526 if ((tiflags & TH_ACK) == 0) { 1527 if (tp->t_flags & TF_ACKNOW) 1528 goto dropafterack; 1529 else 1530 goto drop; 1531 } 1532 1533 /* 1534 * Ack processing. 1535 */ 1536 switch (tp->t_state) { 1537 1538 /* 1539 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1540 * ESTABLISHED state and continue processing. 1541 * The ACK was checked above. 1542 */ 1543 case TCPS_SYN_RECEIVED: 1544 tcpstat.tcps_connects++; 1545 soisconnected(so); 1546 tp->t_state = TCPS_ESTABLISHED; 1547 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1548 /* Do window scaling? */ 1549 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1550 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1551 tp->snd_scale = tp->requested_s_scale; 1552 tp->rcv_scale = tp->request_r_scale; 1553 tiwin = th->th_win << tp->snd_scale; 1554 } 1555 tcp_flush_queue(tp); 1556 tp->snd_wl1 = th->th_seq - 1; 1557 /* fall into ... */ 1558 1559 /* 1560 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1561 * ACKs. If the ack is in the range 1562 * tp->snd_una < th->th_ack <= tp->snd_max 1563 * then advance tp->snd_una to th->th_ack and drop 1564 * data from the retransmission queue. If this ACK reflects 1565 * more up to date window information we update our window information. 1566 */ 1567 case TCPS_ESTABLISHED: 1568 case TCPS_FIN_WAIT_1: 1569 case TCPS_FIN_WAIT_2: 1570 case TCPS_CLOSE_WAIT: 1571 case TCPS_CLOSING: 1572 case TCPS_LAST_ACK: 1573 case TCPS_TIME_WAIT: 1574 #ifdef TCP_ECN 1575 /* 1576 * if we receive ECE and are not already in recovery phase, 1577 * reduce cwnd by half but don't slow-start. 1578 * advance snd_last to snd_max not to reduce cwnd again 1579 * until all outstanding packets are acked. 1580 */ 1581 if (tcp_do_ecn && (tiflags & TH_ECE)) { 1582 if ((tp->t_flags & TF_ECN_PERMIT) && 1583 SEQ_GEQ(tp->snd_una, tp->snd_last)) { 1584 u_int win; 1585 1586 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg; 1587 if (win > 1) { 1588 tp->snd_ssthresh = win / 2 * tp->t_maxseg; 1589 tp->snd_cwnd = tp->snd_ssthresh; 1590 tp->snd_last = tp->snd_max; 1591 tp->t_flags |= TF_SEND_CWR; 1592 tcpstat.tcps_cwr_ecn++; 1593 } 1594 } 1595 tcpstat.tcps_ecn_rcvece++; 1596 } 1597 /* 1598 * if we receive CWR, we know that the peer has reduced 1599 * its congestion window. stop sending ecn-echo. 1600 */ 1601 if ((tiflags & TH_CWR)) { 1602 tp->t_flags &= ~TF_RCVD_CE; 1603 tcpstat.tcps_ecn_rcvcwr++; 1604 } 1605 #endif /* TCP_ECN */ 1606 1607 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1608 /* 1609 * Duplicate/old ACK processing. 1610 * Increments t_dupacks: 1611 * Pure duplicate (same seq/ack/window, no data) 1612 * Doesn't affect t_dupacks: 1613 * Data packets. 1614 * Normal window updates (window opens) 1615 * Resets t_dupacks: 1616 * New data ACKed. 1617 * Window shrinks 1618 * Old ACK 1619 */ 1620 if (tlen) { 1621 /* Drop very old ACKs unless th_seq matches */ 1622 if (th->th_seq != tp->rcv_nxt && 1623 SEQ_LT(th->th_ack, 1624 tp->snd_una - tp->max_sndwnd)) { 1625 tcpstat.tcps_rcvacktooold++; 1626 goto drop; 1627 } 1628 break; 1629 } 1630 /* 1631 * If we get an old ACK, there is probably packet 1632 * reordering going on. Be conservative and reset 1633 * t_dupacks so that we are less aggressive in 1634 * doing a fast retransmit. 
1635 */ 1636 if (th->th_ack != tp->snd_una) { 1637 tp->t_dupacks = 0; 1638 break; 1639 } 1640 if (tiwin == tp->snd_wnd) { 1641 tcpstat.tcps_rcvdupack++; 1642 /* 1643 * If we have outstanding data (other than 1644 * a window probe), this is a completely 1645 * duplicate ack (ie, window info didn't 1646 * change), the ack is the biggest we've 1647 * seen and we've seen exactly our rexmt 1648 * threshold of them, assume a packet 1649 * has been dropped and retransmit it. 1650 * Kludge snd_nxt & the congestion 1651 * window so we send only this one 1652 * packet. 1653 * 1654 * We know we're losing at the current 1655 * window size so do congestion avoidance 1656 * (set ssthresh to half the current window 1657 * and pull our congestion window back to 1658 * the new ssthresh). 1659 * 1660 * Dup acks mean that packets have left the 1661 * network (they're now cached at the receiver) 1662 * so bump cwnd by the amount in the receiver 1663 * to keep a constant cwnd packets in the 1664 * network. 1665 */ 1666 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) 1667 tp->t_dupacks = 0; 1668 #if defined(TCP_SACK) && defined(TCP_FACK) 1669 /* 1670 * In FACK, can enter fast rec. if the receiver 1671 * reports a reass. queue longer than 3 segs. 1672 */ 1673 else if (++tp->t_dupacks == tcprexmtthresh || 1674 ((SEQ_GT(tp->snd_fack, tcprexmtthresh * 1675 tp->t_maxseg + tp->snd_una)) && 1676 SEQ_GT(tp->snd_una, tp->snd_last))) { 1677 #else 1678 else if (++tp->t_dupacks == tcprexmtthresh) { 1679 #endif /* TCP_FACK */ 1680 tcp_seq onxt = tp->snd_nxt; 1681 u_long win = 1682 ulmin(tp->snd_wnd, tp->snd_cwnd) / 1683 2 / tp->t_maxseg; 1684 1685 #if defined(TCP_SACK) || defined(TCP_ECN) 1686 if (SEQ_LT(th->th_ack, tp->snd_last)){ 1687 /* 1688 * False fast retx after 1689 * timeout. Do not cut window. 1690 */ 1691 tp->t_dupacks = 0; 1692 goto drop; 1693 } 1694 #endif 1695 if (win < 2) 1696 win = 2; 1697 tp->snd_ssthresh = win * tp->t_maxseg; 1698 #ifdef TCP_SACK 1699 tp->snd_last = tp->snd_max; 1700 if (tp->sack_enable) { 1701 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1702 tp->t_rtttime = 0; 1703 #ifdef TCP_ECN 1704 tp->t_flags |= TF_SEND_CWR; 1705 #endif 1706 tcpstat.tcps_cwr_frecovery++; 1707 tcpstat.tcps_sack_recovery_episode++; 1708 #if defined(TCP_SACK) && defined(TCP_FACK) 1709 tp->t_dupacks = tcprexmtthresh; 1710 (void) tcp_output(tp); 1711 /* 1712 * During FR, snd_cwnd is held 1713 * constant for FACK. 1714 */ 1715 tp->snd_cwnd = tp->snd_ssthresh; 1716 #else 1717 /* 1718 * tcp_output() will send 1719 * oldest SACK-eligible rtx. 
1720 */ 1721 (void) tcp_output(tp); 1722 tp->snd_cwnd = tp->snd_ssthresh+ 1723 tp->t_maxseg * tp->t_dupacks; 1724 #endif /* TCP_FACK */ 1725 goto drop; 1726 } 1727 #endif /* TCP_SACK */ 1728 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1729 tp->t_rtttime = 0; 1730 tp->snd_nxt = th->th_ack; 1731 tp->snd_cwnd = tp->t_maxseg; 1732 #ifdef TCP_ECN 1733 tp->t_flags |= TF_SEND_CWR; 1734 #endif 1735 tcpstat.tcps_cwr_frecovery++; 1736 tcpstat.tcps_sndrexmitfast++; 1737 (void) tcp_output(tp); 1738 1739 tp->snd_cwnd = tp->snd_ssthresh + 1740 tp->t_maxseg * tp->t_dupacks; 1741 if (SEQ_GT(onxt, tp->snd_nxt)) 1742 tp->snd_nxt = onxt; 1743 goto drop; 1744 } else if (tp->t_dupacks > tcprexmtthresh) { 1745 #if defined(TCP_SACK) && defined(TCP_FACK) 1746 /* 1747 * while (awnd < cwnd) 1748 * sendsomething(); 1749 */ 1750 if (tp->sack_enable) { 1751 if (tp->snd_awnd < tp->snd_cwnd) 1752 tcp_output(tp); 1753 goto drop; 1754 } 1755 #endif /* TCP_FACK */ 1756 tp->snd_cwnd += tp->t_maxseg; 1757 (void) tcp_output(tp); 1758 goto drop; 1759 } 1760 } else if (tiwin < tp->snd_wnd) { 1761 /* 1762 * The window was retracted! Previous dup 1763 * ACKs may have been due to packets arriving 1764 * after the shrunken window, not a missing 1765 * packet, so play it safe and reset t_dupacks 1766 */ 1767 tp->t_dupacks = 0; 1768 } 1769 break; 1770 } 1771 /* 1772 * If the congestion window was inflated to account 1773 * for the other side's cached packets, retract it. 1774 */ 1775 #if defined(TCP_SACK) 1776 if (tp->sack_enable) { 1777 if (tp->t_dupacks >= tcprexmtthresh) { 1778 /* Check for a partial ACK */ 1779 if (tcp_sack_partialack(tp, th)) { 1780 #if defined(TCP_SACK) && defined(TCP_FACK) 1781 /* Force call to tcp_output */ 1782 if (tp->snd_awnd < tp->snd_cwnd) 1783 tp->t_flags |= TF_NEEDOUTPUT; 1784 #else 1785 tp->snd_cwnd += tp->t_maxseg; 1786 tp->t_flags |= TF_NEEDOUTPUT; 1787 #endif /* TCP_FACK */ 1788 } else { 1789 /* Out of fast recovery */ 1790 tp->snd_cwnd = tp->snd_ssthresh; 1791 if (tcp_seq_subtract(tp->snd_max, 1792 th->th_ack) < tp->snd_ssthresh) 1793 tp->snd_cwnd = 1794 tcp_seq_subtract(tp->snd_max, 1795 th->th_ack); 1796 tp->t_dupacks = 0; 1797 #if defined(TCP_SACK) && defined(TCP_FACK) 1798 if (SEQ_GT(th->th_ack, tp->snd_fack)) 1799 tp->snd_fack = th->th_ack; 1800 #endif /* TCP_FACK */ 1801 } 1802 } 1803 } else { 1804 if (tp->t_dupacks >= tcprexmtthresh && 1805 !tcp_newreno(tp, th)) { 1806 /* Out of fast recovery */ 1807 tp->snd_cwnd = tp->snd_ssthresh; 1808 if (tcp_seq_subtract(tp->snd_max, th->th_ack) < 1809 tp->snd_ssthresh) 1810 tp->snd_cwnd = 1811 tcp_seq_subtract(tp->snd_max, 1812 th->th_ack); 1813 tp->t_dupacks = 0; 1814 } 1815 } 1816 if (tp->t_dupacks < tcprexmtthresh) 1817 tp->t_dupacks = 0; 1818 #else /* else no TCP_SACK */ 1819 if (tp->t_dupacks >= tcprexmtthresh && 1820 tp->snd_cwnd > tp->snd_ssthresh) 1821 tp->snd_cwnd = tp->snd_ssthresh; 1822 tp->t_dupacks = 0; 1823 #endif 1824 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1825 tcpstat.tcps_rcvacktoomuch++; 1826 goto dropafterack_ratelim; 1827 } 1828 acked = th->th_ack - tp->snd_una; 1829 tcpstat.tcps_rcvackpack++; 1830 tcpstat.tcps_rcvackbyte += acked; 1831 1832 /* 1833 * If we have a timestamp reply, update smoothed 1834 * round trip time. If no timestamp is present but 1835 * transmit timer is running and timed sequence 1836 * number was acked, update smoothed round trip time. 1837 * Since we now have an rtt measurement, cancel the 1838 * timer backoff (cf., Phil Karn's retransmit alg.). 1839 * Recompute the initial retransmit timer. 
1840 */ 1841 if (opti.ts_present && opti.ts_ecr) 1842 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr); 1843 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1844 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1845 1846 /* 1847 * If all outstanding data is acked, stop retransmit 1848 * timer and remember to restart (more output or persist). 1849 * If there is more data to be acked, restart retransmit 1850 * timer, using current (possibly backed-off) value. 1851 */ 1852 if (th->th_ack == tp->snd_max) { 1853 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1854 tp->t_flags |= TF_NEEDOUTPUT; 1855 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1856 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1857 /* 1858 * When new data is acked, open the congestion window. 1859 * If the window gives us less than ssthresh packets 1860 * in flight, open exponentially (maxseg per packet). 1861 * Otherwise open linearly: maxseg per window 1862 * (maxseg^2 / cwnd per packet). 1863 */ 1864 { 1865 u_int cw = tp->snd_cwnd; 1866 u_int incr = tp->t_maxseg; 1867 1868 if (cw > tp->snd_ssthresh) 1869 incr = incr * incr / cw; 1870 #if defined (TCP_SACK) 1871 if (tp->t_dupacks < tcprexmtthresh) 1872 #endif 1873 tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale); 1874 } 1875 ND6_HINT(tp); 1876 if (acked > so->so_snd.sb_cc) { 1877 tp->snd_wnd -= so->so_snd.sb_cc; 1878 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1879 ourfinisacked = 1; 1880 } else { 1881 sbdrop(&so->so_snd, acked); 1882 tp->snd_wnd -= acked; 1883 ourfinisacked = 0; 1884 } 1885 1886 tcp_update_sndspace(tp); 1887 if (sb_notify(&so->so_snd)) { 1888 tp->t_flags |= TF_BLOCKOUTPUT; 1889 sowwakeup(so); 1890 tp->t_flags &= ~TF_BLOCKOUTPUT; 1891 } 1892 1893 /* 1894 * If we had a pending ICMP message that referred to data 1895 * that have just been acknowledged, disregard the recorded 1896 * ICMP message. 1897 */ 1898 if ((tp->t_flags & TF_PMTUD_PEND) && 1899 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 1900 tp->t_flags &= ~TF_PMTUD_PEND; 1901 1902 /* 1903 * Keep track of the largest chunk of data acknowledged 1904 * since last PMTU update 1905 */ 1906 if (tp->t_pmtud_mss_acked < acked) 1907 tp->t_pmtud_mss_acked = acked; 1908 1909 tp->snd_una = th->th_ack; 1910 #ifdef TCP_ECN 1911 /* sync snd_last with snd_una */ 1912 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1913 tp->snd_last = tp->snd_una; 1914 #endif 1915 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1916 tp->snd_nxt = tp->snd_una; 1917 #if defined (TCP_SACK) && defined (TCP_FACK) 1918 if (SEQ_GT(tp->snd_una, tp->snd_fack)) { 1919 tp->snd_fack = tp->snd_una; 1920 /* Update snd_awnd for partial ACK 1921 * without any SACK blocks. 1922 */ 1923 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, 1924 tp->snd_fack) + tp->retran_data; 1925 } 1926 #endif 1927 1928 switch (tp->t_state) { 1929 1930 /* 1931 * In FIN_WAIT_1 STATE in addition to the processing 1932 * for the ESTABLISHED state if our FIN is now acknowledged 1933 * then enter FIN_WAIT_2. 1934 */ 1935 case TCPS_FIN_WAIT_1: 1936 if (ourfinisacked) { 1937 /* 1938 * If we can't receive any more 1939 * data, then closing user can proceed. 1940 * Starting the timer is contrary to the 1941 * specification, but if we don't get a FIN 1942 * we'll hang forever. 
1943 */ 1944 if (so->so_state & SS_CANTRCVMORE) { 1945 soisdisconnected(so); 1946 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 1947 } 1948 tp->t_state = TCPS_FIN_WAIT_2; 1949 } 1950 break; 1951 1952 /* 1953 * In CLOSING STATE in addition to the processing for 1954 * the ESTABLISHED state if the ACK acknowledges our FIN 1955 * then enter the TIME-WAIT state, otherwise ignore 1956 * the segment. 1957 */ 1958 case TCPS_CLOSING: 1959 if (ourfinisacked) { 1960 tp->t_state = TCPS_TIME_WAIT; 1961 tcp_canceltimers(tp); 1962 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1963 soisdisconnected(so); 1964 } 1965 break; 1966 1967 /* 1968 * In LAST_ACK, we may still be waiting for data to drain 1969 * and/or to be acked, as well as for the ack of our FIN. 1970 * If our FIN is now acknowledged, delete the TCB, 1971 * enter the closed state and return. 1972 */ 1973 case TCPS_LAST_ACK: 1974 if (ourfinisacked) { 1975 tp = tcp_close(tp); 1976 goto drop; 1977 } 1978 break; 1979 1980 /* 1981 * In TIME_WAIT state the only thing that should arrive 1982 * is a retransmission of the remote FIN. Acknowledge 1983 * it and restart the finack timer. 1984 */ 1985 case TCPS_TIME_WAIT: 1986 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1987 goto dropafterack; 1988 } 1989 } 1990 1991 step6: 1992 /* 1993 * Update window information. 1994 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1995 */ 1996 if ((tiflags & TH_ACK) && 1997 (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && 1998 (SEQ_LT(tp->snd_wl2, th->th_ack) || 1999 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2000 /* keep track of pure window updates */ 2001 if (tlen == 0 && 2002 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2003 tcpstat.tcps_rcvwinupd++; 2004 tp->snd_wnd = tiwin; 2005 tp->snd_wl1 = th->th_seq; 2006 tp->snd_wl2 = th->th_ack; 2007 if (tp->snd_wnd > tp->max_sndwnd) 2008 tp->max_sndwnd = tp->snd_wnd; 2009 tp->t_flags |= TF_NEEDOUTPUT; 2010 } 2011 2012 /* 2013 * Process segments with URG. 2014 */ 2015 if ((tiflags & TH_URG) && th->th_urp && 2016 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2017 /* 2018 * This is a kludge, but if we receive and accept 2019 * random urgent pointers, we'll crash in 2020 * soreceive. It's hard to imagine someone 2021 * actually wanting to send this much urgent data. 2022 */ 2023 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2024 th->th_urp = 0; /* XXX */ 2025 tiflags &= ~TH_URG; /* XXX */ 2026 goto dodata; /* XXX */ 2027 } 2028 /* 2029 * If this segment advances the known urgent pointer, 2030 * then mark the data stream. This should not happen 2031 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2032 * a FIN has been received from the remote side. 2033 * In these states we ignore the URG. 2034 * 2035 * According to RFC961 (Assigned Protocols), 2036 * the urgent pointer points to the last octet 2037 * of urgent data. We continue, however, 2038 * to consider it to indicate the first octet 2039 * of data past the urgent section as the original 2040 * spec states (in one of two places). 2041 */ 2042 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2043 tp->rcv_up = th->th_seq + th->th_urp; 2044 so->so_oobmark = so->so_rcv.sb_cc + 2045 (tp->rcv_up - tp->rcv_nxt) - 1; 2046 if (so->so_oobmark == 0) 2047 so->so_state |= SS_RCVATMARK; 2048 sohasoutofband(so); 2049 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2050 } 2051 /* 2052 * Remove out of band data so doesn't get presented to user. 
2053 * This can happen independent of advancing the URG pointer, 2054 * but if two URG's are pending at once, some out-of-band 2055 * data may creep in... ick. 2056 */ 2057 if (th->th_urp <= (u_int16_t) tlen && 2058 (so->so_options & SO_OOBINLINE) == 0) 2059 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2060 } else 2061 /* 2062 * If no out of band data is expected, 2063 * pull receive urgent pointer along 2064 * with the receive window. 2065 */ 2066 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2067 tp->rcv_up = tp->rcv_nxt; 2068 dodata: /* XXX */ 2069 2070 /* 2071 * Process the segment text, merging it into the TCP sequencing queue, 2072 * and arranging for acknowledgment of receipt if necessary. 2073 * This process logically involves adjusting tp->rcv_wnd as data 2074 * is presented to the user (this happens in tcp_usrreq.c, 2075 * case PRU_RCVD). If a FIN has already been received on this 2076 * connection then we just ignore the text. 2077 */ 2078 if ((tlen || (tiflags & TH_FIN)) && 2079 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2080 #ifdef TCP_SACK 2081 tcp_seq laststart = th->th_seq; 2082 tcp_seq lastend = th->th_seq + tlen; 2083 #endif 2084 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2085 tp->t_state == TCPS_ESTABLISHED) { 2086 TCP_SETUP_ACK(tp, tiflags, m); 2087 tp->rcv_nxt += tlen; 2088 tiflags = th->th_flags & TH_FIN; 2089 tcpstat.tcps_rcvpack++; 2090 tcpstat.tcps_rcvbyte += tlen; 2091 ND6_HINT(tp); 2092 if (so->so_state & SS_CANTRCVMORE) 2093 m_freem(m); 2094 else { 2095 m_adj(m, hdroptlen); 2096 sbappendstream(&so->so_rcv, m); 2097 } 2098 tp->t_flags |= TF_BLOCKOUTPUT; 2099 sorwakeup(so); 2100 tp->t_flags &= ~TF_BLOCKOUTPUT; 2101 } else { 2102 m_adj(m, hdroptlen); 2103 tiflags = tcp_reass(tp, th, m, &tlen); 2104 tp->t_flags |= TF_ACKNOW; 2105 } 2106 #ifdef TCP_SACK 2107 if (tp->sack_enable) 2108 tcp_update_sack_list(tp, laststart, lastend); 2109 #endif 2110 2111 /* 2112 * variable len never referenced again in modern BSD, 2113 * so why bother computing it ?? 2114 */ 2115 #if 0 2116 /* 2117 * Note the amount of data that peer has sent into 2118 * our window, in order to estimate the sender's 2119 * buffer size. 2120 */ 2121 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2122 #endif /* 0 */ 2123 } else { 2124 m_freem(m); 2125 tiflags &= ~TH_FIN; 2126 } 2127 2128 /* 2129 * If FIN is received ACK the FIN and let the user know 2130 * that the connection is closing. Ignore a FIN received before 2131 * the connection is fully established. 2132 */ 2133 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2134 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2135 socantrcvmore(so); 2136 tp->t_flags |= TF_ACKNOW; 2137 tp->rcv_nxt++; 2138 } 2139 switch (tp->t_state) { 2140 2141 /* 2142 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2143 */ 2144 case TCPS_ESTABLISHED: 2145 tp->t_state = TCPS_CLOSE_WAIT; 2146 break; 2147 2148 /* 2149 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2150 * enter the CLOSING state. 2151 */ 2152 case TCPS_FIN_WAIT_1: 2153 tp->t_state = TCPS_CLOSING; 2154 break; 2155 2156 /* 2157 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2158 * starting the time-wait timer, turning off the other 2159 * standard timers. 2160 */ 2161 case TCPS_FIN_WAIT_2: 2162 tp->t_state = TCPS_TIME_WAIT; 2163 tcp_canceltimers(tp); 2164 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2165 soisdisconnected(so); 2166 break; 2167 2168 /* 2169 * In TIME_WAIT state restart the 2 MSL time_wait timer. 
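* A retransmitted FIN implies the peer never saw our final ACK,
* so the full 2 * TCPTV_MSL quiet period starts over.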
2170 */ 2171 case TCPS_TIME_WAIT: 2172 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2173 break; 2174 } 2175 } 2176 if (so->so_options & SO_DEBUG) { 2177 switch (tp->pf) { 2178 #ifdef INET6 2179 case PF_INET6: 2180 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2181 0, tlen); 2182 break; 2183 #endif /* INET6 */ 2184 case PF_INET: 2185 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2186 0, tlen); 2187 break; 2188 } 2189 } 2190 2191 /* 2192 * Return any desired output. 2193 */ 2194 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2195 (void) tcp_output(tp); 2196 return; 2197 2198 badsyn: 2199 /* 2200 * Received a bad SYN. Increment counters and dropwithreset. 2201 */ 2202 tcpstat.tcps_badsyn++; 2203 tp = NULL; 2204 goto dropwithreset; 2205 2206 dropafterack_ratelim: 2207 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2208 tcp_ackdrop_ppslim) == 0) { 2209 /* XXX stat */ 2210 goto drop; 2211 } 2212 /* ...fall into dropafterack... */ 2213 2214 dropafterack: 2215 /* 2216 * Generate an ACK dropping incoming segment if it occupies 2217 * sequence space, where the ACK reflects our state. 2218 */ 2219 if (tiflags & TH_RST) 2220 goto drop; 2221 m_freem(m); 2222 tp->t_flags |= TF_ACKNOW; 2223 (void) tcp_output(tp); 2224 return; 2225 2226 dropwithreset_ratelim: 2227 /* 2228 * We may want to rate-limit RSTs in certain situations, 2229 * particularly if we are sending an RST in response to 2230 * an attempt to connect to or otherwise communicate with 2231 * a port for which we have no socket. 2232 */ 2233 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2234 tcp_rst_ppslim) == 0) { 2235 /* XXX stat */ 2236 goto drop; 2237 } 2238 /* ...fall into dropwithreset... */ 2239 2240 dropwithreset: 2241 /* 2242 * Generate a RST, dropping incoming segment. 2243 * Make ACK acceptable to originator of segment. 2244 * Don't bother to respond to RST. 2245 */ 2246 if (tiflags & TH_RST) 2247 goto drop; 2248 if (tiflags & TH_ACK) { 2249 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2250 TH_RST, m->m_pkthdr.ph_rtableid); 2251 } else { 2252 if (tiflags & TH_SYN) 2253 tlen++; 2254 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2255 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2256 } 2257 m_freem(m); 2258 return; 2259 2260 drop: 2261 /* 2262 * Drop space held by incoming segment and return. 
2263 */ 2264 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2265 switch (tp->pf) { 2266 #ifdef INET6 2267 case PF_INET6: 2268 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2269 0, tlen); 2270 break; 2271 #endif /* INET6 */ 2272 case PF_INET: 2273 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2274 0, tlen); 2275 break; 2276 } 2277 } 2278 2279 m_freem(m); 2280 return; 2281 } 2282 2283 int 2284 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2285 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2286 u_int rtableid) 2287 { 2288 u_int16_t mss = 0; 2289 int opt, optlen; 2290 #ifdef TCP_SIGNATURE 2291 caddr_t sigp = NULL; 2292 struct tdb *tdb = NULL; 2293 #endif /* TCP_SIGNATURE */ 2294 2295 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2296 opt = cp[0]; 2297 if (opt == TCPOPT_EOL) 2298 break; 2299 if (opt == TCPOPT_NOP) 2300 optlen = 1; 2301 else { 2302 if (cnt < 2) 2303 break; 2304 optlen = cp[1]; 2305 if (optlen < 2 || optlen > cnt) 2306 break; 2307 } 2308 switch (opt) { 2309 2310 default: 2311 continue; 2312 2313 case TCPOPT_MAXSEG: 2314 if (optlen != TCPOLEN_MAXSEG) 2315 continue; 2316 if (!(th->th_flags & TH_SYN)) 2317 continue; 2318 if (TCPS_HAVERCVDSYN(tp->t_state)) 2319 continue; 2320 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 2321 NTOHS(mss); 2322 oi->maxseg = mss; 2323 break; 2324 2325 case TCPOPT_WINDOW: 2326 if (optlen != TCPOLEN_WINDOW) 2327 continue; 2328 if (!(th->th_flags & TH_SYN)) 2329 continue; 2330 if (TCPS_HAVERCVDSYN(tp->t_state)) 2331 continue; 2332 tp->t_flags |= TF_RCVD_SCALE; 2333 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2334 break; 2335 2336 case TCPOPT_TIMESTAMP: 2337 if (optlen != TCPOLEN_TIMESTAMP) 2338 continue; 2339 oi->ts_present = 1; 2340 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2341 NTOHL(oi->ts_val); 2342 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2343 NTOHL(oi->ts_ecr); 2344 2345 if (!(th->th_flags & TH_SYN)) 2346 continue; 2347 if (TCPS_HAVERCVDSYN(tp->t_state)) 2348 continue; 2349 /* 2350 * A timestamp received in a SYN makes 2351 * it ok to send timestamp requests and replies. 
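* The exchange is symmetric: we emit timestamps only once both
* TF_REQ_TSTMP and TF_RCVD_TSTMP are set, which is also when
* tcp_mss() makes room by subtracting TCPOLEN_TSTAMP_APPA.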
2352 */ 2353 tp->t_flags |= TF_RCVD_TSTMP; 2354 tp->ts_recent = oi->ts_val; 2355 tp->ts_recent_age = tcp_now; 2356 break; 2357 2358 #ifdef TCP_SACK 2359 case TCPOPT_SACK_PERMITTED: 2360 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2361 continue; 2362 if (!(th->th_flags & TH_SYN)) 2363 continue; 2364 if (TCPS_HAVERCVDSYN(tp->t_state)) 2365 continue; 2366 /* MUST only be set on SYN */ 2367 tp->t_flags |= TF_SACK_PERMIT; 2368 break; 2369 case TCPOPT_SACK: 2370 tcp_sack_option(tp, th, cp, optlen); 2371 break; 2372 #endif 2373 #ifdef TCP_SIGNATURE 2374 case TCPOPT_SIGNATURE: 2375 if (optlen != TCPOLEN_SIGNATURE) 2376 continue; 2377 2378 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2379 return (-1); 2380 2381 sigp = cp + 2; 2382 break; 2383 #endif /* TCP_SIGNATURE */ 2384 } 2385 } 2386 2387 #ifdef TCP_SIGNATURE 2388 if (tp->t_flags & TF_SIGNATURE) { 2389 union sockaddr_union src, dst; 2390 2391 memset(&src, 0, sizeof(union sockaddr_union)); 2392 memset(&dst, 0, sizeof(union sockaddr_union)); 2393 2394 switch (tp->pf) { 2395 case 0: 2396 #ifdef INET 2397 case AF_INET: 2398 src.sa.sa_len = sizeof(struct sockaddr_in); 2399 src.sa.sa_family = AF_INET; 2400 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2401 dst.sa.sa_len = sizeof(struct sockaddr_in); 2402 dst.sa.sa_family = AF_INET; 2403 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2404 break; 2405 #endif 2406 #ifdef INET6 2407 case AF_INET6: 2408 src.sa.sa_len = sizeof(struct sockaddr_in6); 2409 src.sa.sa_family = AF_INET6; 2410 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2411 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2412 dst.sa.sa_family = AF_INET6; 2413 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2414 break; 2415 #endif /* INET6 */ 2416 } 2417 2418 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2419 0, &src, &dst, IPPROTO_TCP); 2420 2421 /* 2422 * We don't have an SA for this peer, so we turn off 2423 * TF_SIGNATURE on the listen socket 2424 */ 2425 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2426 tp->t_flags &= ~TF_SIGNATURE; 2427 2428 } 2429 2430 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2431 tcpstat.tcps_rcvbadsig++; 2432 return (-1); 2433 } 2434 2435 if (sigp) { 2436 char sig[16]; 2437 2438 if (tdb == NULL) { 2439 tcpstat.tcps_rcvbadsig++; 2440 return (-1); 2441 } 2442 2443 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2444 return (-1); 2445 2446 if (timingsafe_bcmp(sig, sigp, 16)) { 2447 tcpstat.tcps_rcvbadsig++; 2448 return (-1); 2449 } 2450 2451 tcpstat.tcps_rcvgoodsig++; 2452 } 2453 #endif /* TCP_SIGNATURE */ 2454 2455 return (0); 2456 } 2457 2458 #if defined(TCP_SACK) 2459 u_long 2460 tcp_seq_subtract(u_long a, u_long b) 2461 { 2462 return ((long)(a - b)); 2463 } 2464 #endif 2465 2466 2467 #ifdef TCP_SACK 2468 /* 2469 * This function is called upon receipt of new valid data (while not in header 2470 * prediction mode), and it updates the ordered list of sacks. 2471 */ 2472 void 2473 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2474 tcp_seq rcv_lastend) 2475 { 2476 /* 2477 * First reported block MUST be the most recent one. Subsequent 2478 * blocks SHOULD be in the order in which they arrived at the 2479 * receiver. These two conditions make the implementation fully 2480 * compliant with RFC 2018. 
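* E.g., with rcv_nxt at 100, out-of-order arrivals creating
* blocks [200,300) and then [400,500) are reported as
* [400,500),[200,300); a later segment filling [300,400) is
* coalesced below so that a single block [200,500) is reported
* first.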
2481 */ 2482 int i, j = 0, count = 0, lastpos = -1; 2483 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2484 2485 /* First clean up current list of sacks */ 2486 for (i = 0; i < tp->rcv_numsacks; i++) { 2487 sack = tp->sackblks[i]; 2488 if (sack.start == 0 && sack.end == 0) { 2489 count++; /* count = number of blocks to be discarded */ 2490 continue; 2491 } 2492 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2493 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2494 count++; 2495 } else { 2496 temp[j].start = tp->sackblks[i].start; 2497 temp[j++].end = tp->sackblks[i].end; 2498 } 2499 } 2500 tp->rcv_numsacks -= count; 2501 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2502 tcp_clean_sackreport(tp); 2503 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2504 /* ==> need first sack block */ 2505 tp->sackblks[0].start = rcv_laststart; 2506 tp->sackblks[0].end = rcv_lastend; 2507 tp->rcv_numsacks = 1; 2508 } 2509 return; 2510 } 2511 /* Otherwise, sack blocks are already present. */ 2512 for (i = 0; i < tp->rcv_numsacks; i++) 2513 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2514 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2515 return; /* sack list remains unchanged */ 2516 /* 2517 * From here, segment just received should be (part of) the 1st sack. 2518 * Go through list, possibly coalescing sack block entries. 2519 */ 2520 firstsack.start = rcv_laststart; 2521 firstsack.end = rcv_lastend; 2522 for (i = 0; i < tp->rcv_numsacks; i++) { 2523 sack = tp->sackblks[i]; 2524 if (SEQ_LT(sack.end, firstsack.start) || 2525 SEQ_GT(sack.start, firstsack.end)) 2526 continue; /* no overlap */ 2527 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2528 /* 2529 * identical block; delete it here since we will 2530 * move it to the front of the list. 2531 */ 2532 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2533 lastpos = i; /* last posn with a zero entry */ 2534 continue; 2535 } 2536 if (SEQ_LEQ(sack.start, firstsack.start)) 2537 firstsack.start = sack.start; /* merge blocks */ 2538 if (SEQ_GEQ(sack.end, firstsack.end)) 2539 firstsack.end = sack.end; /* merge blocks */ 2540 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2541 lastpos = i; /* last posn with a zero entry */ 2542 } 2543 if (lastpos != -1) { /* at least one merge */ 2544 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2545 sack = tp->sackblks[i]; 2546 if (sack.start == 0 && sack.end == 0) 2547 continue; 2548 temp[j++] = sack; 2549 } 2550 tp->rcv_numsacks = j; /* including first blk (added later) */ 2551 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2552 tp->sackblks[i] = temp[i]; 2553 } else { /* no merges -- shift sacks by 1 */ 2554 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2555 tp->rcv_numsacks++; 2556 for (i = tp->rcv_numsacks-1; i > 0; i--) 2557 tp->sackblks[i] = tp->sackblks[i-1]; 2558 } 2559 tp->sackblks[0] = firstsack; 2560 return; 2561 } 2562 2563 /* 2564 * Process the TCP SACK option. tp->snd_holes is an ordered list 2565 * of holes (oldest to newest, in terms of the sequence space). 2566 */ 2567 void 2568 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2569 { 2570 int tmp_olen; 2571 u_char *tmp_cp; 2572 struct sackhole *cur, *p, *temp; 2573 2574 if (!tp->sack_enable) 2575 return; 2576 /* SACK without ACK doesn't make sense. */ 2577 if ((th->th_flags & TH_ACK) == 0) 2578 return; 2579 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
*/ 2580 if (SEQ_LT(th->th_ack, tp->snd_una) || 2581 SEQ_GT(th->th_ack, tp->snd_max)) 2582 return; 2583 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2584 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2585 return; 2586 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2587 tmp_cp = cp + 2; 2588 tmp_olen = optlen - 2; 2589 tcpstat.tcps_sack_rcv_opts++; 2590 if (tp->snd_numholes < 0) 2591 tp->snd_numholes = 0; 2592 if (tp->t_maxseg == 0) 2593 panic("tcp_sack_option"); /* Should never happen */ 2594 while (tmp_olen > 0) { 2595 struct sackblk sack; 2596 2597 bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq)); 2598 NTOHL(sack.start); 2599 bcopy(tmp_cp + sizeof(tcp_seq), 2600 (char *) &(sack.end), sizeof(tcp_seq)); 2601 NTOHL(sack.end); 2602 tmp_olen -= TCPOLEN_SACK; 2603 tmp_cp += TCPOLEN_SACK; 2604 if (SEQ_LEQ(sack.end, sack.start)) 2605 continue; /* bad SACK fields */ 2606 if (SEQ_LEQ(sack.end, tp->snd_una)) 2607 continue; /* old block */ 2608 #if defined(TCP_SACK) && defined(TCP_FACK) 2609 /* Updates snd_fack. */ 2610 if (SEQ_GT(sack.end, tp->snd_fack)) 2611 tp->snd_fack = sack.end; 2612 #endif /* TCP_FACK */ 2613 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2614 if (SEQ_LT(sack.start, th->th_ack)) 2615 continue; 2616 } 2617 if (SEQ_GT(sack.end, tp->snd_max)) 2618 continue; 2619 if (tp->snd_holes == NULL) { /* first hole */ 2620 tp->snd_holes = (struct sackhole *) 2621 pool_get(&sackhl_pool, PR_NOWAIT); 2622 if (tp->snd_holes == NULL) { 2623 /* ENOBUFS, so ignore SACKed block for now*/ 2624 goto done; 2625 } 2626 cur = tp->snd_holes; 2627 cur->start = th->th_ack; 2628 cur->end = sack.start; 2629 cur->rxmit = cur->start; 2630 cur->next = NULL; 2631 tp->snd_numholes = 1; 2632 tp->rcv_lastsack = sack.end; 2633 /* 2634 * dups is at least one. If more data has been 2635 * SACKed, it can be greater than one. 
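* E.g., with a t_maxseg of 1460, a first SACK block extending
* 4380 bytes past the hole gives dups = min(tcprexmtthresh,
* 4380 / 1460) = 3, reaching the default threshold at once.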
2636 */ 2637 cur->dups = min(tcprexmtthresh, 2638 ((sack.end - cur->end)/tp->t_maxseg)); 2639 if (cur->dups < 1) 2640 cur->dups = 1; 2641 continue; /* with next sack block */ 2642 } 2643 /* Go thru list of holes: p = previous, cur = current */ 2644 p = cur = tp->snd_holes; 2645 while (cur) { 2646 if (SEQ_LEQ(sack.end, cur->start)) 2647 /* SACKs data before the current hole */ 2648 break; /* no use going through more holes */ 2649 if (SEQ_GEQ(sack.start, cur->end)) { 2650 /* SACKs data beyond the current hole */ 2651 cur->dups++; 2652 if (((sack.end - cur->end)/tp->t_maxseg) >= 2653 tcprexmtthresh) 2654 cur->dups = tcprexmtthresh; 2655 p = cur; 2656 cur = cur->next; 2657 continue; 2658 } 2659 if (SEQ_LEQ(sack.start, cur->start)) { 2660 /* Data acks at least the beginning of hole */ 2661 #if defined(TCP_SACK) && defined(TCP_FACK) 2662 if (SEQ_GT(sack.end, cur->rxmit)) 2663 tp->retran_data -= 2664 tcp_seq_subtract(cur->rxmit, 2665 cur->start); 2666 else 2667 tp->retran_data -= 2668 tcp_seq_subtract(sack.end, 2669 cur->start); 2670 #endif /* TCP_FACK */ 2671 if (SEQ_GEQ(sack.end, cur->end)) { 2672 /* Acks entire hole, so delete hole */ 2673 if (p != cur) { 2674 p->next = cur->next; 2675 pool_put(&sackhl_pool, cur); 2676 cur = p->next; 2677 } else { 2678 cur = cur->next; 2679 pool_put(&sackhl_pool, p); 2680 p = cur; 2681 tp->snd_holes = p; 2682 } 2683 tp->snd_numholes--; 2684 continue; 2685 } 2686 /* otherwise, move start of hole forward */ 2687 cur->start = sack.end; 2688 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2689 p = cur; 2690 cur = cur->next; 2691 continue; 2692 } 2693 /* move end of hole backward */ 2694 if (SEQ_GEQ(sack.end, cur->end)) { 2695 #if defined(TCP_SACK) && defined(TCP_FACK) 2696 if (SEQ_GT(cur->rxmit, sack.start)) 2697 tp->retran_data -= 2698 tcp_seq_subtract(cur->rxmit, 2699 sack.start); 2700 #endif /* TCP_FACK */ 2701 cur->end = sack.start; 2702 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2703 cur->dups++; 2704 if (((sack.end - cur->end)/tp->t_maxseg) >= 2705 tcprexmtthresh) 2706 cur->dups = tcprexmtthresh; 2707 p = cur; 2708 cur = cur->next; 2709 continue; 2710 } 2711 if (SEQ_LT(cur->start, sack.start) && 2712 SEQ_GT(cur->end, sack.end)) { 2713 /* 2714 * ACKs some data in middle of a hole; need to 2715 * split current hole 2716 */ 2717 temp = (struct sackhole *) 2718 pool_get(&sackhl_pool, PR_NOWAIT); 2719 if (temp == NULL) 2720 goto done; /* ENOBUFS */ 2721 #if defined(TCP_SACK) && defined(TCP_FACK) 2722 if (SEQ_GT(cur->rxmit, sack.end)) 2723 tp->retran_data -= 2724 tcp_seq_subtract(sack.end, 2725 sack.start); 2726 else if (SEQ_GT(cur->rxmit, sack.start)) 2727 tp->retran_data -= 2728 tcp_seq_subtract(cur->rxmit, 2729 sack.start); 2730 #endif /* TCP_FACK */ 2731 temp->next = cur->next; 2732 temp->start = sack.end; 2733 temp->end = cur->end; 2734 temp->dups = cur->dups; 2735 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2736 cur->end = sack.start; 2737 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2738 cur->dups++; 2739 if (((sack.end - cur->end)/tp->t_maxseg) >= 2740 tcprexmtthresh) 2741 cur->dups = tcprexmtthresh; 2742 cur->next = temp; 2743 p = temp; 2744 cur = p->next; 2745 tp->snd_numholes++; 2746 } 2747 } 2748 /* At this point, p points to the last hole on the list */ 2749 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2750 /* 2751 * Need to append new hole at end. 2752 * Last hole is p (and it's not NULL). 
2753 */ 2754 temp = (struct sackhole *) 2755 pool_get(&sackhl_pool, PR_NOWAIT); 2756 if (temp == NULL) 2757 goto done; /* ENOBUFS */ 2758 temp->start = tp->rcv_lastsack; 2759 temp->end = sack.start; 2760 temp->dups = min(tcprexmtthresh, 2761 ((sack.end - sack.start)/tp->t_maxseg)); 2762 if (temp->dups < 1) 2763 temp->dups = 1; 2764 temp->rxmit = temp->start; 2765 temp->next = 0; 2766 p->next = temp; 2767 tp->rcv_lastsack = sack.end; 2768 tp->snd_numholes++; 2769 } 2770 } 2771 done: 2772 #if defined(TCP_SACK) && defined(TCP_FACK) 2773 /* 2774 * Update retran_data and snd_awnd. Go through the list of 2775 * holes. Increment retran_data by (hole->rxmit - hole->start). 2776 */ 2777 tp->retran_data = 0; 2778 cur = tp->snd_holes; 2779 while (cur) { 2780 tp->retran_data += cur->rxmit - cur->start; 2781 cur = cur->next; 2782 } 2783 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + 2784 tp->retran_data; 2785 #endif /* TCP_FACK */ 2786 2787 return; 2788 } 2789 2790 /* 2791 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if 2792 * it is completely acked; otherwise, tcp_sack_option(), called from 2793 * tcp_dooptions(), will fix up the hole. 2794 */ 2795 void 2796 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2797 { 2798 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2799 /* max because this could be an older ack just arrived */ 2800 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2801 th->th_ack : tp->snd_una; 2802 struct sackhole *cur = tp->snd_holes; 2803 struct sackhole *prev; 2804 while (cur) 2805 if (SEQ_LEQ(cur->end, lastack)) { 2806 prev = cur; 2807 cur = cur->next; 2808 pool_put(&sackhl_pool, prev); 2809 tp->snd_numholes--; 2810 } else if (SEQ_LT(cur->start, lastack)) { 2811 cur->start = lastack; 2812 if (SEQ_LT(cur->rxmit, cur->start)) 2813 cur->rxmit = cur->start; 2814 break; 2815 } else 2816 break; 2817 tp->snd_holes = cur; 2818 } 2819 } 2820 2821 /* 2822 * Delete all receiver-side SACK information. 2823 */ 2824 void 2825 tcp_clean_sackreport(struct tcpcb *tp) 2826 { 2827 int i; 2828 2829 tp->rcv_numsacks = 0; 2830 for (i = 0; i < MAX_SACK_BLKS; i++) 2831 tp->sackblks[i].start = tp->sackblks[i].end=0; 2832 2833 } 2834 2835 /* 2836 * Checks for partial ack. If partial ack arrives, turn off retransmission 2837 * timer, deflate the window, do not clear tp->t_dupacks, and return 1. 2838 * If the ack advances at least to tp->snd_last, return 0. 2839 */ 2840 int 2841 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2842 { 2843 if (SEQ_LT(th->th_ack, tp->snd_last)) { 2844 /* Turn off retx. timer (will start again next segment) */ 2845 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2846 tp->t_rtttime = 0; 2847 #ifndef TCP_FACK 2848 /* 2849 * Partial window deflation. This statement relies on the 2850 * fact that tp->snd_una has not been updated yet. In FACK 2851 * hold snd_cwnd constant during fast recovery. 2852 */ 2853 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2854 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2855 tp->snd_cwnd += tp->t_maxseg; 2856 } else 2857 tp->snd_cwnd = tp->t_maxseg; 2858 #endif 2859 return (1); 2860 } 2861 return (0); 2862 } 2863 #endif /* TCP_SACK */ 2864 2865 /* 2866 * Pull out of band byte out of a segment so 2867 * it doesn't appear in the user's data queue. 2868 * It is still reflected in the segment length for 2869 * sequencing purposes. 
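* The byte is parked in t_iobc below; a receiver that did not
* select SO_OOBINLINE typically fetches it with recv(..., MSG_OOB).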
2870 */ 2871 void 2872 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2873 { 2874 int cnt = off + urgent - 1; 2875 2876 while (cnt >= 0) { 2877 if (m->m_len > cnt) { 2878 char *cp = mtod(m, caddr_t) + cnt; 2879 struct tcpcb *tp = sototcpcb(so); 2880 2881 tp->t_iobc = *cp; 2882 tp->t_oobflags |= TCPOOB_HAVEDATA; 2883 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2884 m->m_len--; 2885 return; 2886 } 2887 cnt -= m->m_len; 2888 m = m->m_next; 2889 if (m == 0) 2890 break; 2891 } 2892 panic("tcp_pulloutofband"); 2893 } 2894 2895 /* 2896 * Collect new round-trip time estimate 2897 * and update averages and current timeout. 2898 */ 2899 void 2900 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2901 { 2902 short delta; 2903 short rttmin; 2904 2905 if (rtt < 0) 2906 rtt = 0; 2907 else if (rtt > TCP_RTT_MAX) 2908 rtt = TCP_RTT_MAX; 2909 2910 tcpstat.tcps_rttupdated++; 2911 if (tp->t_srtt != 0) { 2912 /* 2913 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2914 * after the binary point (scaled by 4), whereas 2915 * srtt is stored as fixed point with 5 bits after the 2916 * binary point (i.e., scaled by 32). The following magic 2917 * is equivalent to the smoothing algorithm in rfc793 with 2918 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2919 * point). 2920 */ 2921 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2922 (tp->t_srtt >> TCP_RTT_SHIFT); 2923 if ((tp->t_srtt += delta) <= 0) 2924 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2925 /* 2926 * We accumulate a smoothed rtt variance (actually, a 2927 * smoothed mean difference), then set the retransmit 2928 * timer to smoothed rtt + 4 times the smoothed variance. 2929 * rttvar is stored as fixed point with 4 bits after the 2930 * binary point (scaled by 16). The following is 2931 * equivalent to rfc793 smoothing with an alpha of .75 2932 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2933 * rfc793's wired-in beta. 2934 */ 2935 if (delta < 0) 2936 delta = -delta; 2937 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2938 if ((tp->t_rttvar += delta) <= 0) 2939 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2940 } else { 2941 /* 2942 * No rtt measurement yet - use the unsmoothed rtt. 2943 * Set the variance to half the rtt (so our first 2944 * retransmit happens at 3*rtt). 2945 */ 2946 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2947 tp->t_rttvar = (rtt + 1) << 2948 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2949 } 2950 tp->t_rtttime = 0; 2951 tp->t_rxtshift = 0; 2952 2953 /* 2954 * the retransmit should happen at rtt + 4 * rttvar. 2955 * Because of the way we do the smoothing, srtt and rttvar 2956 * will each average +1/2 tick of bias. When we compute 2957 * the retransmit timer, we want 1/2 tick of rounding and 2958 * 1 extra tick because of +-1/2 tick uncertainty in the 2959 * firing of the timer. The bias will give us exactly the 2960 * 1.5 tick we need. But, because the bias is 2961 * statistical, we have to test that we don't drop below 2962 * the minimum feasible timer (which is 2 ticks). 2963 */ 2964 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2965 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2966 2967 /* 2968 * We received an ack for a packet that wasn't retransmitted; 2969 * it is probably safe to discard any error indications we've 2970 * received recently. This isn't quite right, but close enough 2971 * for now (a route might have failed after we sent a segment, 2972 * and the return path might not be symmetrical). 
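* (The retransmit timer normally hands t_softerror to tcp_drop()
* when it finally gives up, so a stale error should not condemn
* a connection that is demonstrably making progress.)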
2973 */
2974 tp->t_softerror = 0;
2975 }
2976
2977 /*
2978 * Determine a reasonable value for maxseg size.
2979 * If the route is known, check route for mtu.
2980 * If none, use an mss that can be handled on the outgoing
2981 * interface without forcing IP to fragment; if bigger than
2982 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2983 * to utilize large mbufs. If no route is found, route has no mtu,
2984 * or the destination isn't local, use a default, hopefully conservative
2985 * size (usually 512 or the default IP max size, but no more than the mtu
2986 * of the interface), as we can't discover anything about intervening
2987 * gateways or networks. We also initialize the congestion/slow start
2988 * window to be a single segment if the destination isn't local.
2989 * While looking at the routing entry, we also initialize other path-dependent
2990 * parameters from pre-set or cached values in the routing entry.
2991 *
2992 * Also take into account the space needed for options that we
2993 * send regularly. Make maxseg shorter by that amount to assure
2994 * that we can send maxseg amount of data even when the options
2995 * are present. Store the upper limit of the length of options plus
2996 * data in maxopd.
2997 *
2998 * NOTE: offer == -1 indicates that the maxseg size changed due to
2999 * Path MTU discovery.
3000 */
3001 int
3002 tcp_mss(struct tcpcb *tp, int offer)
3003 {
3004 struct rtentry *rt;
3005 struct ifnet *ifp;
3006 int mss, mssopt;
3007 int iphlen;
3008 struct inpcb *inp;
3009
3010 inp = tp->t_inpcb;
3011
3012 mssopt = mss = tcp_mssdflt;
3013
3014 rt = in_pcbrtentry(inp);
3015
3016 if (rt == NULL)
3017 goto out;
3018
3019 ifp = rt->rt_ifp;
3020
3021 switch (tp->pf) {
3022 #ifdef INET6
3023 case AF_INET6:
3024 iphlen = sizeof(struct ip6_hdr);
3025 break;
3026 #endif
3027 case AF_INET:
3028 iphlen = sizeof(struct ip);
3029 break;
3030 default:
3031 /* the family does not support path MTU discovery */
3032 goto out;
3033 }
3034
3035 /*
3036 * if there's an mtu associated with the route and we support
3037 * path MTU discovery for the underlying protocol family, use it.
3038 */
3039 if (rt->rt_rmx.rmx_mtu) {
3040 /*
3041 * One may wish to lower MSS to take into account options,
3042 * especially security-related options.
3043 */
3044 if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
3045 /*
3046 * RFC2460 section 5, last paragraph: if path MTU is
3047 * smaller than 1280, use 1280 as packet size and
3048 * attach fragment header.
3049 */
3050 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
3051 sizeof(struct tcphdr);
3052 } else {
3053 mss = rt->rt_rmx.rmx_mtu - iphlen -
3054 sizeof(struct tcphdr);
3055 }
3056 } else if (!ifp) {
3057 /*
3058 * ifp may be null and rmx_mtu may be zero in certain
3059 * v6 cases (e.g., if ND wasn't able to resolve the
3060 * destination host).
3061 */
3062 goto out;
3063 } else if (ifp->if_flags & IFF_LOOPBACK) {
3064 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3065 } else if (tp->pf == AF_INET) {
3066 if (ip_mtudisc)
3067 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3068 }
3069 #ifdef INET6
3070 else if (tp->pf == AF_INET6) {
3071 /*
3072 * for IPv6, path MTU discovery is always turned on,
3073 * or the node must use packet size <= 1280.
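* At that floor the usable MSS works out to 1280 - 40 - 20 = 1220,
* or 1280 - 40 - 8 - 20 = 1212 when room for a fragment header is
* reserved, as in the IPV6_MMTU clamp above.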
3074 */ 3075 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr); 3076 } 3077 #endif /* INET6 */ 3078 3079 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3080 if (offer != -1) { 3081 #ifndef INET6 3082 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3083 #else 3084 if (tp->pf == AF_INET6) 3085 mssopt = IN6_LINKMTU(ifp) - iphlen - 3086 sizeof(struct tcphdr); 3087 else 3088 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3089 #endif 3090 3091 mssopt = max(tcp_mssdflt, mssopt); 3092 } 3093 3094 out: 3095 /* 3096 * The current mss, t_maxseg, is initialized to the default value. 3097 * If we compute a smaller value, reduce the current mss. 3098 * If we compute a larger value, return it for use in sending 3099 * a max seg size option, but don't store it for use 3100 * unless we received an offer at least that large from peer. 3101 * 3102 * However, do not accept offers lower than the minimum of 3103 * the interface MTU and 216. 3104 */ 3105 if (offer > 0) 3106 tp->t_peermss = offer; 3107 if (tp->t_peermss) 3108 mss = min(mss, max(tp->t_peermss, 216)); 3109 3110 /* sanity - at least max opt. space */ 3111 mss = max(mss, 64); 3112 3113 /* 3114 * maxopd stores the maximum length of data AND options 3115 * in a segment; maxseg is the amount of data in a normal 3116 * segment. We need to store this value (maxopd) apart 3117 * from maxseg, because now every segment carries options 3118 * and thus we normally have somewhat less data in segments. 3119 */ 3120 tp->t_maxopd = mss; 3121 3122 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3123 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3124 mss -= TCPOLEN_TSTAMP_APPA; 3125 #ifdef TCP_SIGNATURE 3126 if (tp->t_flags & TF_SIGNATURE) 3127 mss -= TCPOLEN_SIGLEN; 3128 #endif 3129 3130 if (offer == -1) { 3131 /* mss changed due to Path MTU discovery */ 3132 tp->t_flags &= ~TF_PMTUD_PEND; 3133 tp->t_pmtud_mtu_sent = 0; 3134 tp->t_pmtud_mss_acked = 0; 3135 if (mss < tp->t_maxseg) { 3136 /* 3137 * Follow suggestion in RFC 2414 to reduce the 3138 * congestion window by the ratio of the old 3139 * segment size to the new segment size. 3140 */ 3141 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3142 mss, mss); 3143 } 3144 } else if (tcp_do_rfc3390 == 2) { 3145 /* increase initial window */ 3146 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 3147 } else if (tcp_do_rfc3390) { 3148 /* increase initial window */ 3149 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3150 } else 3151 tp->snd_cwnd = mss; 3152 3153 tp->t_maxseg = mss; 3154 3155 return (offer != -1 ? mssopt : mss); 3156 } 3157 3158 u_int 3159 tcp_hdrsz(struct tcpcb *tp) 3160 { 3161 u_int hlen; 3162 3163 switch (tp->pf) { 3164 #ifdef INET6 3165 case AF_INET6: 3166 hlen = sizeof(struct ip6_hdr); 3167 break; 3168 #endif 3169 case AF_INET: 3170 hlen = sizeof(struct ip); 3171 break; 3172 default: 3173 hlen = 0; 3174 break; 3175 } 3176 hlen += sizeof(struct tcphdr); 3177 3178 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3179 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3180 hlen += TCPOLEN_TSTAMP_APPA; 3181 #ifdef TCP_SIGNATURE 3182 if (tp->t_flags & TF_SIGNATURE) 3183 hlen += TCPOLEN_SIGLEN; 3184 #endif 3185 return (hlen); 3186 } 3187 3188 /* 3189 * Set connection variables based on the effective MSS. 3190 * We are passed the TCPCB for the actual connection. If we 3191 * are the server, we are called by the compressed state engine 3192 * when the 3-way handshake is complete. 
If we are the client,
3193 * we are called when we receive the SYN,ACK from the server.
3194 *
3195 * NOTE: The t_maxseg value must be initialized in the TCPCB
3196 * before this routine is called!
3197 */
3198 void
3199 tcp_mss_update(struct tcpcb *tp)
3200 {
3201 int mss;
3202 u_long bufsize;
3203 struct rtentry *rt;
3204 struct socket *so;
3205
3206 so = tp->t_inpcb->inp_socket;
3207 mss = tp->t_maxseg;
3208
3209 rt = in_pcbrtentry(tp->t_inpcb);
3210
3211 if (rt == NULL)
3212 return;
3213
3214 bufsize = so->so_snd.sb_hiwat;
3215 if (bufsize < mss) {
3216 mss = bufsize;
3217 /* Update t_maxseg and t_maxopd */
3218 tcp_mss(tp, mss);
3219 } else {
3220 bufsize = roundup(bufsize, mss);
3221 if (bufsize > sb_max)
3222 bufsize = sb_max;
3223 (void)sbreserve(&so->so_snd, bufsize);
3224 }
3225
3226 bufsize = so->so_rcv.sb_hiwat;
3227 if (bufsize > mss) {
3228 bufsize = roundup(bufsize, mss);
3229 if (bufsize > sb_max)
3230 bufsize = sb_max;
3231 (void)sbreserve(&so->so_rcv, bufsize);
3232 }
3233
3234 }
3235
3236 #if defined (TCP_SACK)
3237 /*
3238 * Checks for partial ack. If partial ack arrives, force the retransmission
3239 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3240 * 1. By setting snd_nxt to th_ack, this forces retransmission timer to
3241 * be started again. If the ack advances at least to tp->snd_last, return 0.
3242 */
3243 int
3244 tcp_newreno(struct tcpcb *tp, struct tcphdr *th)
3245 {
3246 if (SEQ_LT(th->th_ack, tp->snd_last)) {
3247 /*
3248 * snd_una has not been updated and the socket send buffer
3249 * not yet drained of the acked data, so we have to leave
3250 * snd_una as it was to get the correct data offset in
3251 * tcp_output().
3252 */
3253 tcp_seq onxt = tp->snd_nxt;
3254 u_long ocwnd = tp->snd_cwnd;
3255 TCP_TIMER_DISARM(tp, TCPT_REXMT);
3256 tp->t_rtttime = 0;
3257 tp->snd_nxt = th->th_ack;
3258 /*
3259 * Set snd_cwnd to one segment beyond acknowledged offset
3260 * (tp->snd_una not yet updated when this function is called)
3261 */
3262 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3263 (void) tcp_output(tp);
3264 tp->snd_cwnd = ocwnd;
3265 if (SEQ_GT(onxt, tp->snd_nxt))
3266 tp->snd_nxt = onxt;
3267 /*
3268 * Partial window deflation. Relies on fact that tp->snd_una
3269 * not updated yet.
3270 */
3271 if (tp->snd_cwnd > th->th_ack - tp->snd_una)
3272 tp->snd_cwnd -= th->th_ack - tp->snd_una;
3273 else
3274 tp->snd_cwnd = 0;
3275 tp->snd_cwnd += tp->t_maxseg;
3276
3277 return 1;
3278 }
3279 return 0;
3280 }
3281 #endif /* TCP_SACK */
3282
3283 int
3284 tcp_mss_adv(struct ifnet *ifp, int af)
3285 {
3286 int mss = 0;
3287 int iphlen;
3288
3289 switch (af) {
3290 case AF_INET:
3291 if (ifp != NULL)
3292 mss = ifp->if_mtu;
3293 iphlen = sizeof(struct ip);
3294 break;
3295 #ifdef INET6
3296 case AF_INET6:
3297 if (ifp != NULL)
3298 mss = IN6_LINKMTU(ifp);
3299 iphlen = sizeof(struct ip6_hdr);
3300 break;
3301 #endif
3302 }
3303 mss = mss - iphlen - sizeof(struct tcphdr);
3304 return (max(mss, tcp_mssdflt));
3305 }
3306
3307 /*
3308 * TCP compressed state engine. Currently used to hold compressed
3309 * state for SYN_RECEIVED.
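* A cache entry records just enough of the handshake (ISS, IRS,
* window, negotiated options) to build the real connection once
* the final ACK arrives, so a SYN flood costs one small pool
* entry per SYN instead of a full socket and tcpcb.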
3310 */ 3311 3312 u_long syn_cache_count; 3313 u_int32_t syn_hash1, syn_hash2; 3314 3315 #define SYN_HASH(sa, sp, dp) \ 3316 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3317 ((u_int32_t)(sp)))^syn_hash2))) 3318 #ifndef INET6 3319 #define SYN_HASHALL(hash, src, dst) \ 3320 do { \ 3321 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3322 ((struct sockaddr_in *)(src))->sin_port, \ 3323 ((struct sockaddr_in *)(dst))->sin_port); \ 3324 } while (/*CONSTCOND*/ 0) 3325 #else 3326 #define SYN_HASH6(sa, sp, dp) \ 3327 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3328 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3329 & 0x7fffffff) 3330 3331 #define SYN_HASHALL(hash, src, dst) \ 3332 do { \ 3333 switch ((src)->sa_family) { \ 3334 case AF_INET: \ 3335 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3336 ((struct sockaddr_in *)(src))->sin_port, \ 3337 ((struct sockaddr_in *)(dst))->sin_port); \ 3338 break; \ 3339 case AF_INET6: \ 3340 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 3341 ((struct sockaddr_in6 *)(src))->sin6_port, \ 3342 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 3343 break; \ 3344 default: \ 3345 hash = 0; \ 3346 } \ 3347 } while (/*CONSTCOND*/0) 3348 #endif /* INET6 */ 3349 3350 void 3351 syn_cache_rm(struct syn_cache *sc) 3352 { 3353 sc->sc_flags |= SCF_DEAD; 3354 TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, 3355 sc, sc_bucketq); 3356 sc->sc_tp = NULL; 3357 LIST_REMOVE(sc, sc_tpq); 3358 tcp_syn_cache[sc->sc_bucketidx].sch_length--; 3359 timeout_del(&sc->sc_timer); 3360 syn_cache_count--; 3361 } 3362 3363 void 3364 syn_cache_put(struct syn_cache *sc) 3365 { 3366 if (sc->sc_ipopts) 3367 (void) m_free(sc->sc_ipopts); 3368 if (sc->sc_route4.ro_rt != NULL) { 3369 rtfree(sc->sc_route4.ro_rt); 3370 sc->sc_route4.ro_rt = NULL; 3371 } 3372 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3373 timeout_add(&sc->sc_timer, 0); 3374 } 3375 3376 struct pool syn_cache_pool; 3377 3378 /* 3379 * We don't estimate RTT with SYNs, so each packet starts with the default 3380 * RTT and each timer step has a fixed timeout value. 3381 */ 3382 #define SYN_CACHE_TIMER_ARM(sc) \ 3383 do { \ 3384 TCPT_RANGESET((sc)->sc_rxtcur, \ 3385 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3386 TCPTV_REXMTMAX); \ 3387 if (!timeout_initialized(&(sc)->sc_timer)) \ 3388 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3389 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3390 } while (/*CONSTCOND*/0) 3391 3392 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3393 3394 void 3395 syn_cache_init() 3396 { 3397 int i; 3398 3399 /* Initialize the hash buckets. */ 3400 for (i = 0; i < tcp_syn_cache_size; i++) 3401 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3402 3403 /* Initialize the syn cache pool. */ 3404 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3405 "syncache", NULL); 3406 } 3407 3408 void 3409 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3410 { 3411 struct syn_cache_head *scp; 3412 struct syn_cache *sc2; 3413 int s; 3414 3415 /* 3416 * If there are no entries in the hash table, reinitialize 3417 * the hash secrets. 
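* Re-keying only while the cache is empty keeps bucket placement
* unpredictable to remote attackers without invalidating the
* stored sc_hash of any live entry.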
3418 */ 3419 if (syn_cache_count == 0) { 3420 syn_hash1 = arc4random(); 3421 syn_hash2 = arc4random(); 3422 } 3423 3424 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3425 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3426 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3427 3428 /* 3429 * Make sure that we don't overflow the per-bucket 3430 * limit or the total cache size limit. 3431 */ 3432 s = splsoftnet(); 3433 if (scp->sch_length >= tcp_syn_bucket_limit) { 3434 tcpstat.tcps_sc_bucketoverflow++; 3435 /* 3436 * The bucket is full. Toss the oldest element in the 3437 * bucket. This will be the first entry in the bucket. 3438 */ 3439 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3440 #ifdef DIAGNOSTIC 3441 /* 3442 * This should never happen; we should always find an 3443 * entry in our bucket. 3444 */ 3445 if (sc2 == NULL) 3446 panic("syn_cache_insert: bucketoverflow: impossible"); 3447 #endif 3448 syn_cache_rm(sc2); 3449 syn_cache_put(sc2); 3450 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3451 struct syn_cache_head *scp2, *sce; 3452 3453 tcpstat.tcps_sc_overflowed++; 3454 /* 3455 * The cache is full. Toss the oldest entry in the 3456 * first non-empty bucket we can find. 3457 * 3458 * XXX We would really like to toss the oldest 3459 * entry in the cache, but we hope that this 3460 * condition doesn't happen very often. 3461 */ 3462 scp2 = scp; 3463 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3464 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3465 for (++scp2; scp2 != scp; scp2++) { 3466 if (scp2 >= sce) 3467 scp2 = &tcp_syn_cache[0]; 3468 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3469 break; 3470 } 3471 #ifdef DIAGNOSTIC 3472 /* 3473 * This should never happen; we should always find a 3474 * non-empty bucket. 3475 */ 3476 if (scp2 == scp) 3477 panic("syn_cache_insert: cacheoverflow: " 3478 "impossible"); 3479 #endif 3480 } 3481 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3482 syn_cache_rm(sc2); 3483 syn_cache_put(sc2); 3484 } 3485 3486 /* 3487 * Initialize the entry's timer. 3488 */ 3489 sc->sc_rxttot = 0; 3490 sc->sc_rxtshift = 0; 3491 SYN_CACHE_TIMER_ARM(sc); 3492 3493 /* Link it from tcpcb entry */ 3494 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3495 3496 /* Put it into the bucket. */ 3497 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3498 scp->sch_length++; 3499 syn_cache_count++; 3500 3501 tcpstat.tcps_sc_added++; 3502 splx(s); 3503 } 3504 3505 /* 3506 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3507 * If we have retransmitted an entry the maximum number of times, expire 3508 * that entry. 3509 */ 3510 void 3511 syn_cache_timer(void *arg) 3512 { 3513 struct syn_cache *sc = arg; 3514 int s; 3515 3516 s = splsoftnet(); 3517 if (sc->sc_flags & SCF_DEAD) { 3518 splx(s); 3519 return; 3520 } 3521 3522 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3523 /* Drop it -- too many retransmissions. */ 3524 goto dropit; 3525 } 3526 3527 /* 3528 * Compute the total amount of time this entry has 3529 * been on a queue. If this entry has been on longer 3530 * than the keep alive timer would allow, expire it. 3531 */ 3532 sc->sc_rxttot += sc->sc_rxtcur; 3533 if (sc->sc_rxttot >= tcptv_keep_init) 3534 goto dropit; 3535 3536 tcpstat.tcps_sc_retransmitted++; 3537 (void) syn_cache_respond(sc, NULL); 3538 3539 /* Advance the timer back-off. 
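* Each pass re-arms via SYN_CACHE_TIMER_ARM with a timeout of
* TCPTV_SRTTDFLT scaled by tcp_backoff[sc_rxtshift], so the gaps
* grow geometrically until TCP_MAXRXTSHIFT or the tcptv_keep_init
* check above retires the entry.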
*/
3540 sc->sc_rxtshift++;
3541 SYN_CACHE_TIMER_ARM(sc);
3542
3543 splx(s);
3544 return;
3545
3546 dropit:
3547 tcpstat.tcps_sc_timed_out++;
3548 syn_cache_rm(sc);
3549 syn_cache_put(sc);
3550 splx(s);
3551 }
3552
3553 void
3554 syn_cache_reaper(void *arg)
3555 {
3556 struct syn_cache *sc = arg;
3557 int s;
3558
3559 s = splsoftnet();
3560 pool_put(&syn_cache_pool, (sc));
3561 splx(s);
3562 return;
3563 }
3564
3565 /*
3566 * Remove the syn cache entries created by the specified tcb entry;
3567 * it makes no sense to keep them
3568 * (if there's no tcb entry, a syn cache entry will never be used).
3569 */
3570 void
3571 syn_cache_cleanup(struct tcpcb *tp)
3572 {
3573 struct syn_cache *sc, *nsc;
3574 int s;
3575
3576 s = splsoftnet();
3577
3578 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3579 nsc = LIST_NEXT(sc, sc_tpq);
3580
3581 #ifdef DIAGNOSTIC
3582 if (sc->sc_tp != tp)
3583 panic("invalid sc_tp in syn_cache_cleanup");
3584 #endif
3585 syn_cache_rm(sc);
3586 syn_cache_put(sc);
3587 }
3588 /* just for safety */
3589 LIST_INIT(&tp->t_sc);
3590
3591 splx(s);
3592 }
3593
3594 /*
3595 * Find an entry in the syn cache.
3596 */
3597 struct syn_cache *
3598 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3599 struct syn_cache_head **headp, u_int rtableid)
3600 {
3601 struct syn_cache *sc;
3602 struct syn_cache_head *scp;
3603 u_int32_t hash;
3604 int s;
3605
3606 SYN_HASHALL(hash, src, dst);
3607
3608 scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3609 *headp = scp;
3610 s = splsoftnet();
3611 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3612 sc = TAILQ_NEXT(sc, sc_bucketq)) {
3613 if (sc->sc_hash != hash)
3614 continue;
3615 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3616 !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
3617 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) {
3618 splx(s);
3619 return (sc);
3620 }
3621 }
3622 splx(s);
3623 return (NULL);
3624 }
3625
3626 /*
3627 * This function gets called when we receive an ACK for a
3628 * socket in the LISTEN state. We look up the connection
3629 * in the syn cache, and if it's there, we pull it out of
3630 * the cache and turn it into a full-blown connection in
3631 * the SYN-RECEIVED state.
3632 *
3633 * The return values may not be immediately obvious, and their effects
3634 * can be subtle, so here they are:
3635 *
3636 * NULL SYN was not found in cache; caller should drop the
3637 * packet and send an RST.
3638 *
3639 * -1 We were unable to create the new connection, and are
3640 * aborting it. An ACK,RST is being sent to the peer
3641 * (unless we got screwy sequence numbers; see below),
3642 * because the 3-way handshake has been completed. Caller
3643 * should not free the mbuf, since we may be using it. If
3644 * we are not, we will free it.
3645 *
3646 * Otherwise, the return value is a pointer to the new socket
3647 * associated with the connection.
3648 */
3649 struct socket *
3650 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3651 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m)
3652 {
3653 struct syn_cache *sc;
3654 struct syn_cache_head *scp;
3655 struct inpcb *inp = NULL;
3656 struct tcpcb *tp = NULL;
3657 struct mbuf *am;
3658 int s;
3659 struct socket *oso;
3660 #if NPF > 0
3661 struct pf_divert *divert = NULL;
3662 #endif
3663
3664 s = splsoftnet();
3665 if ((sc = syn_cache_lookup(src, dst, &scp,
3666 sotoinpcb(so)->inp_rtableid)) == NULL) {
3667 splx(s);
3668 return (NULL);
3669 }
3670
3671 /*
3672 * Verify the sequence and ack numbers.
Try getting the correct 3673 * response again. 3674 */ 3675 if ((th->th_ack != sc->sc_iss + 1) || 3676 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3677 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3678 (void) syn_cache_respond(sc, m); 3679 splx(s); 3680 return ((struct socket *)(-1)); 3681 } 3682 3683 /* Remove this cache entry */ 3684 syn_cache_rm(sc); 3685 splx(s); 3686 3687 /* 3688 * Ok, create the full blown connection, and set things up 3689 * as they would have been set up if we had created the 3690 * connection when the SYN arrived. If we can't create 3691 * the connection, abort it. 3692 */ 3693 oso = so; 3694 so = sonewconn(so, SS_ISCONNECTED); 3695 if (so == NULL) 3696 goto resetandabort; 3697 3698 inp = sotoinpcb(oso); 3699 3700 #ifdef IPSEC 3701 /* 3702 * We need to copy the required security levels 3703 * from the old pcb. Ditto for any other 3704 * IPsec-related information. 3705 */ 3706 { 3707 struct inpcb *newinp = sotoinpcb(so); 3708 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3709 sizeof(inp->inp_seclevel)); 3710 newinp->inp_secrequire = inp->inp_secrequire; 3711 if (inp->inp_ipo != NULL) { 3712 newinp->inp_ipo = inp->inp_ipo; 3713 inp->inp_ipo->ipo_ref_count++; 3714 } 3715 if (inp->inp_ipsec_remotecred != NULL) { 3716 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 3717 inp->inp_ipsec_remotecred->ref_count++; 3718 } 3719 if (inp->inp_ipsec_remoteauth != NULL) { 3720 newinp->inp_ipsec_remoteauth 3721 = inp->inp_ipsec_remoteauth; 3722 inp->inp_ipsec_remoteauth->ref_count++; 3723 } 3724 } 3725 #endif /* IPSEC */ 3726 #ifdef INET6 3727 /* 3728 * inp still has the OLD in_pcb stuff, set the 3729 * v6-related flags on the new guy, too. 3730 */ 3731 { 3732 int flags = inp->inp_flags; 3733 struct inpcb *oldinpcb = inp; 3734 3735 inp = sotoinpcb(so); 3736 inp->inp_flags |= (flags & INP_IPV6); 3737 if ((inp->inp_flags & INP_IPV6) != 0) { 3738 inp->inp_ipv6.ip6_hlim = 3739 oldinpcb->inp_ipv6.ip6_hlim; 3740 } 3741 } 3742 #else /* INET6 */ 3743 inp = sotoinpcb(so); 3744 #endif /* INET6 */ 3745 3746 #if NPF > 0 3747 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED && 3748 (divert = pf_find_divert(m)) != NULL) 3749 inp->inp_rtableid = divert->rdomain; 3750 else 3751 #endif 3752 /* inherit rtable from listening socket */ 3753 inp->inp_rtableid = sc->sc_rtableid; 3754 3755 inp->inp_lport = th->th_dport; 3756 switch (src->sa_family) { 3757 #ifdef INET6 3758 case AF_INET6: 3759 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr; 3760 break; 3761 #endif /* INET6 */ 3762 case AF_INET: 3763 3764 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 3765 inp->inp_options = ip_srcroute(m); 3766 if (inp->inp_options == NULL) { 3767 inp->inp_options = sc->sc_ipopts; 3768 sc->sc_ipopts = NULL; 3769 } 3770 break; 3771 } 3772 in_pcbrehash(inp); 3773 3774 /* 3775 * Give the new socket our cached route reference. 
3776 */
3777 if (src->sa_family == AF_INET)
3778 inp->inp_route = sc->sc_route4; /* struct assignment */
3779 #ifdef INET6
3780 else
3781 inp->inp_route6 = sc->sc_route6;
3782 #endif
3783 sc->sc_route4.ro_rt = NULL;
3784
3785 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
3786 if (am == NULL)
3787 goto resetandabort;
3788 am->m_len = src->sa_len;
3789 bcopy(src, mtod(am, caddr_t), src->sa_len);
3790
3791 switch (src->sa_family) {
3792 case AF_INET:
3793 /* drop IPv4 packet to AF_INET6 socket */
3794 if (inp->inp_flags & INP_IPV6) {
3795 (void) m_free(am);
3796 goto resetandabort;
3797 }
3798 if (in_pcbconnect(inp, am)) {
3799 (void) m_free(am);
3800 goto resetandabort;
3801 }
3802 break;
3803 #ifdef INET6
3804 case AF_INET6:
3805 if (in6_pcbconnect(inp, am)) {
3806 (void) m_free(am);
3807 goto resetandabort;
3808 }
3809 break;
3810 #endif
3811 }
3812 (void) m_free(am);
3813
3814 tp = intotcpcb(inp);
3815 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
3816 if (sc->sc_request_r_scale != 15) {
3817 tp->requested_s_scale = sc->sc_requested_s_scale;
3818 tp->request_r_scale = sc->sc_request_r_scale;
3819 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3820 }
3821 if (sc->sc_flags & SCF_TIMESTAMP)
3822 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3823
3824 tp->t_template = tcp_template(tp);
3825 if (tp->t_template == 0) {
3826 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
3827 so = NULL;
3828 m_freem(m);
3829 goto abort;
3830 }
3831 #ifdef TCP_SACK
3832 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
3833 #endif
3834
3835 tp->ts_modulate = sc->sc_modulate;
3836 tp->ts_recent = sc->sc_timestamp;
3837 tp->iss = sc->sc_iss;
3838 tp->irs = sc->sc_irs;
3839 tcp_sendseqinit(tp);
3840 #if defined (TCP_SACK) || defined(TCP_ECN)
3841 tp->snd_last = tp->snd_una;
3842 #endif /* TCP_SACK */
3843 #if defined(TCP_SACK) && defined(TCP_FACK)
3844 tp->snd_fack = tp->snd_una;
3845 tp->retran_data = 0;
3846 tp->snd_awnd = 0;
3847 #endif /* TCP_FACK */
3848 #ifdef TCP_ECN
3849 if (sc->sc_flags & SCF_ECN_PERMIT) {
3850 tp->t_flags |= TF_ECN_PERMIT;
3851 tcpstat.tcps_ecn_accepts++;
3852 }
3853 #endif
3854 #ifdef TCP_SACK
3855 if (sc->sc_flags & SCF_SACK_PERMIT)
3856 tp->t_flags |= TF_SACK_PERMIT;
3857 #endif
3858 #ifdef TCP_SIGNATURE
3859 if (sc->sc_flags & SCF_SIGNATURE)
3860 tp->t_flags |= TF_SIGNATURE;
3861 #endif
3862 tcp_rcvseqinit(tp);
3863 tp->t_state = TCPS_SYN_RECEIVED;
3864 tp->t_rcvtime = tcp_now;
3865 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
3866 tcpstat.tcps_accepts++;
3867
3868 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
3869 if (sc->sc_peermaxseg)
3870 tcp_mss_update(tp);
3871 /* Reset initial window to 1 segment for retransmit */
3872 if (sc->sc_rxtshift > 0)
3873 tp->snd_cwnd = tp->t_maxseg;
3874 tp->snd_wl1 = sc->sc_irs;
3875 tp->rcv_up = sc->sc_irs + 1;
3876
3877 /*
3878 * This is what would have happened in tcp_output() when
3879 * the SYN,ACK was sent.
3880 */ 3881 tp->snd_up = tp->snd_una; 3882 tp->snd_max = tp->snd_nxt = tp->iss+1; 3883 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3884 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3885 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3886 tp->last_ack_sent = tp->rcv_nxt; 3887 3888 tcpstat.tcps_sc_completed++; 3889 syn_cache_put(sc); 3890 return (so); 3891 3892 resetandabort: 3893 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3894 m->m_pkthdr.ph_rtableid); 3895 m_freem(m); 3896 abort: 3897 if (so != NULL) 3898 (void) soabort(so); 3899 syn_cache_put(sc); 3900 tcpstat.tcps_sc_aborted++; 3901 return ((struct socket *)(-1)); 3902 } 3903 3904 /* 3905 * This function is called when we get a RST for a 3906 * non-existent connection, so that we can see if the 3907 * connection is in the syn cache. If it is, zap it. 3908 */ 3909 3910 void 3911 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3912 u_int rtableid) 3913 { 3914 struct syn_cache *sc; 3915 struct syn_cache_head *scp; 3916 int s = splsoftnet(); 3917 3918 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) { 3919 splx(s); 3920 return; 3921 } 3922 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3923 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3924 splx(s); 3925 return; 3926 } 3927 syn_cache_rm(sc); 3928 splx(s); 3929 tcpstat.tcps_sc_reset++; 3930 syn_cache_put(sc); 3931 } 3932 3933 void 3934 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3935 u_int rtableid) 3936 { 3937 struct syn_cache *sc; 3938 struct syn_cache_head *scp; 3939 int s; 3940 3941 s = splsoftnet(); 3942 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) { 3943 splx(s); 3944 return; 3945 } 3946 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3947 if (ntohl (th->th_seq) != sc->sc_iss) { 3948 splx(s); 3949 return; 3950 } 3951 3952 /* 3953 * If we've retransmitted 3 times and this is our second error, 3954 * we remove the entry. Otherwise, we allow it to continue on. 3955 * This prevents us from incorrectly nuking an entry during a 3956 * spurious network outage. 3957 * 3958 * See tcp_notify(). 3959 */ 3960 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3961 sc->sc_flags |= SCF_UNREACH; 3962 splx(s); 3963 return; 3964 } 3965 3966 syn_cache_rm(sc); 3967 splx(s); 3968 tcpstat.tcps_sc_unreach++; 3969 syn_cache_put(sc); 3970 } 3971 3972 /* 3973 * Given a LISTEN socket and an inbound SYN request, add 3974 * this to the syn cache, and send back a segment: 3975 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3976 * to the source. 3977 * 3978 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3979 * Doing so would require that we hold onto the data and deliver it 3980 * to the application. However, if we are the target of a SYN-flood 3981 * DoS attack, an attacker could send data which would eventually 3982 * consume all available buffer space if it were ACKed. By not ACKing 3983 * the data, we avoid this DoS scenario. 3984 */ 3985 3986 int 3987 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3988 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3989 struct tcp_opt_info *oi, tcp_seq *issp) 3990 { 3991 struct tcpcb tb, *tp; 3992 long win; 3993 struct syn_cache *sc; 3994 struct syn_cache_head *scp; 3995 struct mbuf *ipopts; 3996 3997 tp = sototcpcb(so); 3998 3999 /* 4000 * RFC1122 4.2.3.10, p. 
104: discard bcast/mcast SYN 4001 * 4002 * Note this check is performed in tcp_input() very early on. 4003 */ 4004 4005 /* 4006 * Initialize some local state. 4007 */ 4008 win = sbspace(&so->so_rcv); 4009 if (win > TCP_MAXWIN) 4010 win = TCP_MAXWIN; 4011 4012 bzero(&tb, sizeof(tb)); 4013 #ifdef TCP_SIGNATURE 4014 if (optp || (tp->t_flags & TF_SIGNATURE)) { 4015 #else 4016 if (optp) { 4017 #endif 4018 tb.pf = tp->pf; 4019 #ifdef TCP_SACK 4020 tb.sack_enable = tp->sack_enable; 4021 #endif 4022 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 4023 #ifdef TCP_SIGNATURE 4024 if (tp->t_flags & TF_SIGNATURE) 4025 tb.t_flags |= TF_SIGNATURE; 4026 #endif 4027 tb.t_state = TCPS_LISTEN; 4028 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 4029 sotoinpcb(so)->inp_rtableid)) 4030 return (-1); 4031 } 4032 4033 switch (src->sa_family) { 4034 #ifdef INET 4035 case AF_INET: 4036 /* 4037 * Remember the IP options, if any. 4038 */ 4039 ipopts = ip_srcroute(m); 4040 break; 4041 #endif 4042 default: 4043 ipopts = NULL; 4044 } 4045 4046 /* 4047 * See if we already have an entry for this connection. 4048 * If we do, resend the SYN,ACK. We do not count this 4049 * as a retransmission (XXX though maybe we should). 4050 */ 4051 if ((sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid)) 4052 != NULL) { 4053 tcpstat.tcps_sc_dupesyn++; 4054 if (ipopts) { 4055 /* 4056 * If we were remembering a previous source route, 4057 * forget it and use the new one we've been given. 4058 */ 4059 if (sc->sc_ipopts) 4060 (void) m_free(sc->sc_ipopts); 4061 sc->sc_ipopts = ipopts; 4062 } 4063 sc->sc_timestamp = tb.ts_recent; 4064 if (syn_cache_respond(sc, m) == 0) { 4065 tcpstat.tcps_sndacks++; 4066 tcpstat.tcps_sndtotal++; 4067 } 4068 return (0); 4069 } 4070 4071 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 4072 if (sc == NULL) { 4073 if (ipopts) 4074 (void) m_free(ipopts); 4075 return (-1); 4076 } 4077 4078 /* 4079 * Fill in the cache, and put the necessary IP and TCP 4080 * options into the reply. 4081 */ 4082 bcopy(src, &sc->sc_src, src->sa_len); 4083 bcopy(dst, &sc->sc_dst, dst->sa_len); 4084 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 4085 sc->sc_flags = 0; 4086 sc->sc_ipopts = ipopts; 4087 sc->sc_irs = th->th_seq; 4088 4089 sc->sc_iss = issp ? *issp : arc4random(); 4090 sc->sc_peermaxseg = oi->maxseg; 4091 sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ? 4092 m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family); 4093 sc->sc_win = win; 4094 sc->sc_timestamp = tb.ts_recent; 4095 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 4096 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 4097 sc->sc_flags |= SCF_TIMESTAMP; 4098 sc->sc_modulate = arc4random(); 4099 } 4100 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 4101 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 4102 sc->sc_requested_s_scale = tb.requested_s_scale; 4103 sc->sc_request_r_scale = 0; 4104 /* 4105 * Pick the smallest possible scaling factor that 4106 * will still allow us to scale up to sb_max. 4107 * 4108 * We do this because there are broken firewalls that 4109 * will corrupt the window scale option, leading to 4110 * the other endpoint believing that our advertised 4111 * window is unscaled. At scale factors larger than 4112 * 5 the unscaled window will drop below 1500 bytes, 4113 * leading to serious problems when traversing these 4114 * broken firewalls. 4115 * 4116 * With the default sbmax of 256K, a scale factor 4117 * of 3 will be chosen by this algorithm. 
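* (65535 << 2 = 262140 still falls just short of 256K, so the
* loop below stops at the first shift that covers sb_max.)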
#ifdef TCP_ECN
	/*
	 * If both ECE and CWR flag bits are set, the peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		syn_cache_put(sc);
		tcpstat.tcps_sc_dropped++;
	}

	return (0);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;
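	/*
	 * Worked example of the option sizing above, for a typical peer
	 * that negotiated window scaling, SACK and timestamps but no TCP
	 * signature: optlen = 4 (MSS) + 4 (NOP + window scale) +
	 * 4 (SACK permitted, NOP-padded) + 12 (TCPOLEN_TSTAMP_APPA: two
	 * NOPs plus the 10-byte timestamp option) = 24, so for IPv4
	 * tlen = 20 + 20 + 24 = 64 bytes.  Every term is a multiple
	 * of 4, which keeps the th_off computation below,
	 * (sizeof(struct tcphdr) + optlen) >> 2, exact.
	 */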
	/*
	 * Create the IP+TCP header from scratch.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		th = NULL;
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}
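	/*
	 * Byte-for-byte, the option words emitted above are:
	 *
	 *	MSS:		02 04 <mss hi> <mss lo>
	 *	SACK permitted:	01 01 04 02	(assuming the usual
	 *			TCPOPT_SACK_PERMIT_HDR encoding of
	 *			NOP, NOP, kind 4, len 2)
	 *	window scale:	01 03 03 <shift>	(NOP, kind 3, len 3)
	 *
	 * Each option is NOP-padded to a 4-byte multiple, so the 32-bit
	 * stores through (u_int32_t *)optp stay aligned.
	 */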
	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN,ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in network byte order, for convenience.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0; it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    NULL, NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
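/*
 * A note on the IPv4 checksum sequence in syn_cache_respond() above:
 * ip_len temporarily holds the TCP length (tlen - hlen) while
 * in_cksum() runs over the whole packet.  At that point the only
 * nonzero fields in the freshly zeroed IP header are ip_len, ip_p,
 * ip_src and ip_dst, and since the one's complement sum does not
 * depend on where a 16-bit word sits in the buffer, the IP header
 * contributes exactly the TCP pseudo-header (src, dst, zero,
 * protocol, TCP length).  The real ip_len is stored afterwards, in
 * the "straggling IP bits" switch.
 */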