/*	$OpenBSD: tcp_input.c,v 1.262 2013/06/03 16:57:05 bluhm Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <dev/rndvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct ifnet *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
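
/*
 * Example of the modulo comparisons: with 32-bit wraparound,
 * TSTMP_GEQ(0x00000010, 0xffffff00) is true because the difference
 * 0x00000110 is positive as a signed int, even though the first
 * value is smaller as an unsigned number.  SEQ_LT()/SEQ_GT() rely
 * on the same trick for sequence space.
 */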

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (m && (m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif && \
	    (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;
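
	/*
	 * Example of the preceding-segment trim above: if the queued
	 * segment p covers sequence numbers 100-199 (th_seq 100,
	 * th_reseqlen 100) and the incoming segment starts at 150 with
	 * 100 bytes, then i = 100 + 100 - 150 = 50, so the first 50
	 * bytes of the incoming segment are duplicates; m_adj() drops
	 * them and th_seq advances to 200.  Had i been >= *tlen, the
	 * whole segment would have been a duplicate and dropped.
	 */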

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}
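
/*
 * Example: with rcv_nxt = 1000 and queued segments covering 1000-1499
 * and 1500-1999, tcp_flush_queue() appends both to the socket buffer
 * and advances rcv_nxt to 2000.  A queue holding only 1500-1999 would
 * be left untouched until the 1000-1499 hole is filled by tcp_reass().
 */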

#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, ...)
{
	struct ip *ip;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error, s;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 * See below for AF specific multicast.
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return;	/*EAFNOSUPPORT*/
	}

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat.tcps_rcvshort++;
		return;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		if (IN_MULTICAST(ip->ip_dst.s_addr) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif,
		    m->m_pkthdr.rdomain))
			goto drop;
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbound/unconnected pcb,
		 * an unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat.tcps_inhwcsum++;
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
	} else {
		m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
		tcpstat.tcps_inhwcsum++;
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	if (m->m_pkthdr.pf.statekey)
		inp = m->m_pkthdr.pf.statekey->inp;
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.rdomain);
			break;
		}
#if NPF > 0
		if (m->m_pkthdr.pf.statekey && inp) {
			m->m_pkthdr.pf.statekey->inp = inp;
			inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
		}
#endif
	}
	if (inp == NULL) {
		int	inpl_flags = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_flags = INPLOOKUP_WILDCARD;
		++tcpstat.tcps_pcbhashmiss;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_flags, m);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_flags, m,
			    m->m_pkthdr.rdomain);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset_ratelim;
		}
	}

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		goto drop;

	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
				bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
				break;
#endif
			case AF_INET:
				bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
				bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;
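
			/*
			 * Example of the syn cache flow handled by this
			 * switch: a SYN creates a compressed entry via
			 * syn_cache_add() below; the later ACK of our
			 * SYN,ACK hits the TH_ACK case above, where
			 * syn_cache_get() turns the entry into a
			 * full-blown socket; an RST tears the entry
			 * down via syn_cache_reset().
			 */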
			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept the SYN, we send a
				 * TCP RST, with the deprecated address as
				 * source (instead of dropping the packet).
				 * This is a compromise: it is much better
				 * for the peer to receive an RST, and the
				 * RST will be the final packet of the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * Deciphering RFC2462 5.5.4, it says this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into the (2b) case as we have
				 * no choice in our source address selection -
				 * we must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;

					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
						tp = NULL;
						goto dropwithreset;
					}
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat.tcps_badsyn++;
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1)
					goto drop;
				return;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	if (m->m_pkthdr.pf.statekey && !m->m_pkthdr.pf.statekey->inp &&
	    !inp->inp_pf_sk) {
		m->m_pkthdr.pf.statekey->inp = inp;
		inp->inp_pf_sk = m->m_pkthdr.pf.statekey;
	}
	/* The statekey has finished finding the inp, it is no longer needed. */
	m->m_pkthdr.pf.statekey = NULL;
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	s = splnet();
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		splx(s);
		goto drop;
	}

	/* Latch SA */
	if (inp->inp_tdb_in != tdb) {
		if (tdb) {
			tdb_add_inp(tdb, inp, 1);
			if (inp->inp_ipo == NULL) {
				inp->inp_ipo = ipsec_add_policy(inp, af,
				    IPSP_DIRECTION_OUT);
				if (inp->inp_ipo == NULL) {
					splx(s);
					goto drop;
				}
			}
			if (inp->inp_ipo->ipo_dstid == NULL &&
			    tdb->tdb_srcid != NULL) {
				inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
				tdb->tdb_srcid->ref_count++;
			}
			if (inp->inp_ipsec_remotecred == NULL &&
			    tdb->tdb_remote_cred != NULL) {
				inp->inp_ipsec_remotecred =
				    tdb->tdb_remote_cred;
				tdb->tdb_remote_cred->ref_count++;
			}
			if (inp->inp_ipsec_remoteauth == NULL &&
			    tdb->tdb_remote_auth != NULL) {
				inp->inp_ipsec_remoteauth =
				    tdb->tdb_remote_auth;
				tdb->tdb_remote_auth->ref_count++;
			}
		} else { /* Just reset */
			TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
			    inp_tdb_in_next);
			inp->inp_tdb_in = NULL;
		}
	}
	splx(s);
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.rdomain))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat.tcps_ecn_rcvce++;
	}
#endif
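
	/*
	 * Example of the header prediction below: for the receiver in a
	 * bulk transfer, segments arrive with only TH_ACK set, th_seq
	 * equal to rcv_nxt, an unchanged window, and no retransmission
	 * pending (snd_nxt == snd_max), so each one takes the short
	 * data path.  The sender side of the same transfer sees pure
	 * ACKs that advance snd_una and takes the short ACK path.
	 */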
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(&so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat.tcps_badsyn++;
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat.tcps_ecn_accepts++;
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
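		/*
		 * Example of the ECN negotiation above: an active opener
		 * that sent SYN with ECE|CWR and got back a SYN,ACK with
		 * only ECE set (TH_ACK|TH_ECE) marks the peer as ECN
		 * capable; a SYN with ECE|CWR arriving during a
		 * simultaneous open (TH_ECE|TH_CWR) does the same.
		 * A plain SYN,ACK leaves TF_ECN_PERMIT clear.
		 */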
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			if (inp->inp_pf_sk) {
				inp->inp_pf_sk->inp = NULL;
				inp->inp_pf_sk = NULL;
			}
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEG_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}
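
	/*
	 * Example of the trimming below: with rcv_nxt = 2000, a segment
	 * with th_seq = 1900 and tlen = 300 yields todrop = 100; the
	 * first 100 bytes are a duplicate, so they are dropped from the
	 * head of the segment, th_seq becomes 2000 and tlen 200.  If
	 * todrop covered the whole segment, only an ACK would be sent.
	 */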
	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = tlen;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * Cf fix from Braden, see Stevens p. 870
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
		    ((tiflags & (TH_SYN|TH_FIN)) != 0)))
			tp->ts_recent = opti.ts_val;
		else
			tp->ts_recent = 0;
		tp->ts_recent_age = tcp_now;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat.tcps_drops++;
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}
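
	/*
	 * Note on the RST check above: accepting a RST only when its
	 * sequence number matches last_ack_sent, rcv_nxt, or rcv_nxt + 1
	 * limits blind RST injection; an attacker must hit one of those
	 * exact values rather than anywhere in the receive window.
	 */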
	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat.tcps_connects++;
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat.tcps_cwr_ecn++;
				}
			}
			tcpstat.tcps_ecn_rcvece++;
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat.tcps_ecn_rcvcwr++;
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat.tcps_rcvacktooold++;
					goto drop;
				}
				break;
			}
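			/*
			 * Example of the fast retransmit trigger handled
			 * below: after tcprexmtthresh (3) duplicate ACKs
			 * with an unchanged window, ssthresh becomes
			 * max(2, min(snd_wnd, snd_cwnd) / 2 / t_maxseg)
			 * segments; with snd_wnd = snd_cwnd = 64 kB and a
			 * 1460-byte MSS that is 22 segments, so ssthresh
			 * drops to 32120 bytes and the lost segment is
			 * retransmitted immediately.
			 */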
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#ifdef TCP_SACK
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat.tcps_cwr_frecovery++;
						tcpstat.tcps_sack_recovery_episode++;
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh +
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat.tcps_cwr_frecovery++;
					tcpstat.tcps_sndrexmitfast++;
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						tp->t_flags |= TF_NEEDOUTPUT;
#else
					tp->snd_cwnd += tp->t_maxseg;
					tp->t_flags |= TF_NEEDOUTPUT;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;
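
		/*
		 * Example of the fast-recovery exit above: on leaving
		 * recovery, cwnd is deflated to ssthresh, or to the
		 * amount of data still outstanding (snd_max - th_ack)
		 * when that is smaller, so the sender never bursts
		 * more than what is actually in flight.
		 */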
		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(&so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
			    tp->snd_fack) + tp->retran_data;
		}
#endif
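
		/*
		 * Example of the window opening above: with a 1460-byte
		 * MSS, cwnd below ssthresh grows by 1460 bytes per ACK
		 * (slow start, doubling per RTT); once cwnd exceeds
		 * ssthresh, say cwnd = 29200, each ACK adds only
		 * 1460 * 1460 / 29200 = 73 bytes, roughly one MSS per
		 * window (congestion avoidance).
		 */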
		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;		/* XXX */
			tiflags &= ~TH_URG;	/* XXX */
			goto dodata;		/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
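
		/*
		 * Example of the mark computation above: with 100 bytes
		 * already in so_rcv, rcv_nxt = 5000 and an urgent pointer
		 * placing rcv_up at 5010, so_oobmark becomes
		 * 100 + (5010 - 5000) - 1 = 109, i.e. the urgent byte is
		 * the 110th byte the application will read.
		 */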
2059 * This can happen independent of advancing the URG pointer, 2060 * but if two URG's are pending at once, some out-of-band 2061 * data may creep in... ick. 2062 */ 2063 if (th->th_urp <= (u_int16_t) tlen 2064 #ifdef SO_OOBINLINE 2065 && (so->so_options & SO_OOBINLINE) == 0 2066 #endif 2067 ) 2068 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 2069 } else 2070 /* 2071 * If no out of band data is expected, 2072 * pull receive urgent pointer along 2073 * with the receive window. 2074 */ 2075 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2076 tp->rcv_up = tp->rcv_nxt; 2077 dodata: /* XXX */ 2078 2079 /* 2080 * Process the segment text, merging it into the TCP sequencing queue, 2081 * and arranging for acknowledgment of receipt if necessary. 2082 * This process logically involves adjusting tp->rcv_wnd as data 2083 * is presented to the user (this happens in tcp_usrreq.c, 2084 * case PRU_RCVD). If a FIN has already been received on this 2085 * connection then we just ignore the text. 2086 */ 2087 if ((tlen || (tiflags & TH_FIN)) && 2088 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2089 #ifdef TCP_SACK 2090 tcp_seq laststart = th->th_seq; 2091 tcp_seq lastend = th->th_seq + tlen; 2092 #endif 2093 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 2094 tp->t_state == TCPS_ESTABLISHED) { 2095 TCP_SETUP_ACK(tp, tiflags, m); 2096 tp->rcv_nxt += tlen; 2097 tiflags = th->th_flags & TH_FIN; 2098 tcpstat.tcps_rcvpack++; 2099 tcpstat.tcps_rcvbyte += tlen; 2100 ND6_HINT(tp); 2101 if (so->so_state & SS_CANTRCVMORE) 2102 m_freem(m); 2103 else { 2104 m_adj(m, hdroptlen); 2105 sbappendstream(&so->so_rcv, m); 2106 } 2107 tp->t_flags |= TF_BLOCKOUTPUT; 2108 sorwakeup(so); 2109 tp->t_flags &= ~TF_BLOCKOUTPUT; 2110 } else { 2111 m_adj(m, hdroptlen); 2112 tiflags = tcp_reass(tp, th, m, &tlen); 2113 tp->t_flags |= TF_ACKNOW; 2114 } 2115 #ifdef TCP_SACK 2116 if (tp->sack_enable) 2117 tcp_update_sack_list(tp, laststart, lastend); 2118 #endif 2119 2120 /* 2121 * variable len never referenced again in modern BSD, 2122 * so why bother computing it ?? 2123 */ 2124 #if 0 2125 /* 2126 * Note the amount of data that peer has sent into 2127 * our window, in order to estimate the sender's 2128 * buffer size. 2129 */ 2130 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2131 #endif /* 0 */ 2132 } else { 2133 m_freem(m); 2134 tiflags &= ~TH_FIN; 2135 } 2136 2137 /* 2138 * If FIN is received ACK the FIN and let the user know 2139 * that the connection is closing. Ignore a FIN received before 2140 * the connection is fully established. 2141 */ 2142 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2143 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2144 socantrcvmore(so); 2145 tp->t_flags |= TF_ACKNOW; 2146 tp->rcv_nxt++; 2147 } 2148 switch (tp->t_state) { 2149 2150 /* 2151 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2152 */ 2153 case TCPS_ESTABLISHED: 2154 tp->t_state = TCPS_CLOSE_WAIT; 2155 break; 2156 2157 /* 2158 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2159 * enter the CLOSING state. 2160 */ 2161 case TCPS_FIN_WAIT_1: 2162 tp->t_state = TCPS_CLOSING; 2163 break; 2164 2165 /* 2166 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2167 * starting the time-wait timer, turning off the other 2168 * standard timers. 2169 */ 2170 case TCPS_FIN_WAIT_2: 2171 tp->t_state = TCPS_TIME_WAIT; 2172 tcp_canceltimers(tp); 2173 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2174 soisdisconnected(so); 2175 break; 2176 2177 /* 2178 * In TIME_WAIT state restart the 2 MSL time_wait timer. 
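 * A retransmitted FIN from the peer lands here; each one pushes the 2MSL deadline out another full period.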
2179 */ 2180 case TCPS_TIME_WAIT: 2181 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2182 break; 2183 } 2184 } 2185 if (so->so_options & SO_DEBUG) { 2186 switch (tp->pf) { 2187 #ifdef INET6 2188 case PF_INET6: 2189 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2190 0, tlen); 2191 break; 2192 #endif /* INET6 */ 2193 case PF_INET: 2194 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2195 0, tlen); 2196 break; 2197 } 2198 } 2199 2200 /* 2201 * Return any desired output. 2202 */ 2203 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2204 (void) tcp_output(tp); 2205 return; 2206 2207 badsyn: 2208 /* 2209 * Received a bad SYN. Increment counters and dropwithreset. 2210 */ 2211 tcpstat.tcps_badsyn++; 2212 tp = NULL; 2213 goto dropwithreset; 2214 2215 dropafterack_ratelim: 2216 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2217 tcp_ackdrop_ppslim) == 0) { 2218 /* XXX stat */ 2219 goto drop; 2220 } 2221 /* ...fall into dropafterack... */ 2222 2223 dropafterack: 2224 /* 2225 * Generate an ACK dropping incoming segment if it occupies 2226 * sequence space, where the ACK reflects our state. 2227 */ 2228 if (tiflags & TH_RST) 2229 goto drop; 2230 m_freem(m); 2231 tp->t_flags |= TF_ACKNOW; 2232 (void) tcp_output(tp); 2233 return; 2234 2235 dropwithreset_ratelim: 2236 /* 2237 * We may want to rate-limit RSTs in certain situations, 2238 * particularly if we are sending an RST in response to 2239 * an attempt to connect to or otherwise communicate with 2240 * a port for which we have no socket. 2241 */ 2242 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2243 tcp_rst_ppslim) == 0) { 2244 /* XXX stat */ 2245 goto drop; 2246 } 2247 /* ...fall into dropwithreset... */ 2248 2249 dropwithreset: 2250 /* 2251 * Generate a RST, dropping incoming segment. 2252 * Make ACK acceptable to originator of segment. 2253 * Don't bother to respond to RST. 2254 */ 2255 if (tiflags & TH_RST) 2256 goto drop; 2257 if (tiflags & TH_ACK) { 2258 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2259 TH_RST, m->m_pkthdr.rdomain); 2260 } else { 2261 if (tiflags & TH_SYN) 2262 tlen++; 2263 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2264 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.rdomain); 2265 } 2266 m_freem(m); 2267 return; 2268 2269 drop: 2270 /* 2271 * Drop space held by incoming segment and return. 
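 * Unlike the dropafterack and dropwithreset paths above, nothing is sent in response here; the segment is freed and forgotten.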
2272 */ 2273 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2274 switch (tp->pf) { 2275 #ifdef INET6 2276 case PF_INET6: 2277 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2278 0, tlen); 2279 break; 2280 #endif /* INET6 */ 2281 case PF_INET: 2282 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2283 0, tlen); 2284 break; 2285 } 2286 } 2287 2288 m_freem(m); 2289 return; 2290 } 2291 2292 int 2293 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2294 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2295 u_int rtableid) 2296 { 2297 u_int16_t mss = 0; 2298 int opt, optlen; 2299 #ifdef TCP_SIGNATURE 2300 caddr_t sigp = NULL; 2301 struct tdb *tdb = NULL; 2302 #endif /* TCP_SIGNATURE */ 2303 2304 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2305 opt = cp[0]; 2306 if (opt == TCPOPT_EOL) 2307 break; 2308 if (opt == TCPOPT_NOP) 2309 optlen = 1; 2310 else { 2311 if (cnt < 2) 2312 break; 2313 optlen = cp[1]; 2314 if (optlen < 2 || optlen > cnt) 2315 break; 2316 } 2317 switch (opt) { 2318 2319 default: 2320 continue; 2321 2322 case TCPOPT_MAXSEG: 2323 if (optlen != TCPOLEN_MAXSEG) 2324 continue; 2325 if (!(th->th_flags & TH_SYN)) 2326 continue; 2327 if (TCPS_HAVERCVDSYN(tp->t_state)) 2328 continue; 2329 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 2330 NTOHS(mss); 2331 oi->maxseg = mss; 2332 break; 2333 2334 case TCPOPT_WINDOW: 2335 if (optlen != TCPOLEN_WINDOW) 2336 continue; 2337 if (!(th->th_flags & TH_SYN)) 2338 continue; 2339 if (TCPS_HAVERCVDSYN(tp->t_state)) 2340 continue; 2341 tp->t_flags |= TF_RCVD_SCALE; 2342 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2343 break; 2344 2345 case TCPOPT_TIMESTAMP: 2346 if (optlen != TCPOLEN_TIMESTAMP) 2347 continue; 2348 oi->ts_present = 1; 2349 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2350 NTOHL(oi->ts_val); 2351 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2352 NTOHL(oi->ts_ecr); 2353 2354 if (!(th->th_flags & TH_SYN)) 2355 continue; 2356 if (TCPS_HAVERCVDSYN(tp->t_state)) 2357 continue; 2358 /* 2359 * A timestamp received in a SYN makes 2360 * it ok to send timestamp requests and replies. 
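 * Priming ts_recent with the peer's value here also seeds the PAWS check for the segments that follow.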
2361 */ 2362 tp->t_flags |= TF_RCVD_TSTMP; 2363 tp->ts_recent = oi->ts_val; 2364 tp->ts_recent_age = tcp_now; 2365 break; 2366 2367 #ifdef TCP_SACK 2368 case TCPOPT_SACK_PERMITTED: 2369 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2370 continue; 2371 if (!(th->th_flags & TH_SYN)) 2372 continue; 2373 if (TCPS_HAVERCVDSYN(tp->t_state)) 2374 continue; 2375 /* MUST only be set on SYN */ 2376 tp->t_flags |= TF_SACK_PERMIT; 2377 break; 2378 case TCPOPT_SACK: 2379 tcp_sack_option(tp, th, cp, optlen); 2380 break; 2381 #endif 2382 #ifdef TCP_SIGNATURE 2383 case TCPOPT_SIGNATURE: 2384 if (optlen != TCPOLEN_SIGNATURE) 2385 continue; 2386 2387 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2388 return (-1); 2389 2390 sigp = cp + 2; 2391 break; 2392 #endif /* TCP_SIGNATURE */ 2393 } 2394 } 2395 2396 #ifdef TCP_SIGNATURE 2397 if (tp->t_flags & TF_SIGNATURE) { 2398 union sockaddr_union src, dst; 2399 2400 memset(&src, 0, sizeof(union sockaddr_union)); 2401 memset(&dst, 0, sizeof(union sockaddr_union)); 2402 2403 switch (tp->pf) { 2404 case 0: 2405 #ifdef INET 2406 case AF_INET: 2407 src.sa.sa_len = sizeof(struct sockaddr_in); 2408 src.sa.sa_family = AF_INET; 2409 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2410 dst.sa.sa_len = sizeof(struct sockaddr_in); 2411 dst.sa.sa_family = AF_INET; 2412 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2413 break; 2414 #endif 2415 #ifdef INET6 2416 case AF_INET6: 2417 src.sa.sa_len = sizeof(struct sockaddr_in6); 2418 src.sa.sa_family = AF_INET6; 2419 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2420 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2421 dst.sa.sa_family = AF_INET6; 2422 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2423 break; 2424 #endif /* INET6 */ 2425 } 2426 2427 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2428 0, &src, &dst, IPPROTO_TCP); 2429 2430 /* 2431 * We don't have an SA for this peer, so we turn off 2432 * TF_SIGNATURE on the listen socket 2433 */ 2434 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2435 tp->t_flags &= ~TF_SIGNATURE; 2436 2437 } 2438 2439 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2440 tcpstat.tcps_rcvbadsig++; 2441 return (-1); 2442 } 2443 2444 if (sigp) { 2445 char sig[16]; 2446 2447 if (tdb == NULL) { 2448 tcpstat.tcps_rcvbadsig++; 2449 return (-1); 2450 } 2451 2452 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2453 return (-1); 2454 2455 if (timingsafe_bcmp(sig, sigp, 16)) { 2456 tcpstat.tcps_rcvbadsig++; 2457 return (-1); 2458 } 2459 2460 tcpstat.tcps_rcvgoodsig++; 2461 } 2462 #endif /* TCP_SIGNATURE */ 2463 2464 return (0); 2465 } 2466 2467 #if defined(TCP_SACK) 2468 u_long 2469 tcp_seq_subtract(u_long a, u_long b) 2470 { 2471 return ((long)(a - b)); 2472 } 2473 #endif 2474 2475 2476 #ifdef TCP_SACK 2477 /* 2478 * This function is called upon receipt of new valid data (while not in header 2479 * prediction mode), and it updates the ordered list of sacks. 2480 */ 2481 void 2482 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2483 tcp_seq rcv_lastend) 2484 { 2485 /* 2486 * First reported block MUST be the most recent one. Subsequent 2487 * blocks SHOULD be in the order in which they arrived at the 2488 * receiver. These two conditions make the implementation fully 2489 * compliant with RFC 2018. 
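 * Example: with out-of-order blocks [400,500) and [200,300) already queued ([400,500) reported first as the more recent), a segment filling [350,400) becomes the new first block and coalesces with [400,500), leaving {[350,500), [200,300)}.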
2490 */ 2491 int i, j = 0, count = 0, lastpos = -1; 2492 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2493 2494 /* First clean up current list of sacks */ 2495 for (i = 0; i < tp->rcv_numsacks; i++) { 2496 sack = tp->sackblks[i]; 2497 if (sack.start == 0 && sack.end == 0) { 2498 count++; /* count = number of blocks to be discarded */ 2499 continue; 2500 } 2501 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2502 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2503 count++; 2504 } else { 2505 temp[j].start = tp->sackblks[i].start; 2506 temp[j++].end = tp->sackblks[i].end; 2507 } 2508 } 2509 tp->rcv_numsacks -= count; 2510 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2511 tcp_clean_sackreport(tp); 2512 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2513 /* ==> need first sack block */ 2514 tp->sackblks[0].start = rcv_laststart; 2515 tp->sackblks[0].end = rcv_lastend; 2516 tp->rcv_numsacks = 1; 2517 } 2518 return; 2519 } 2520 /* Otherwise, sack blocks are already present. */ 2521 for (i = 0; i < tp->rcv_numsacks; i++) 2522 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2523 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2524 return; /* sack list remains unchanged */ 2525 /* 2526 * From here, segment just received should be (part of) the 1st sack. 2527 * Go through list, possibly coalescing sack block entries. 2528 */ 2529 firstsack.start = rcv_laststart; 2530 firstsack.end = rcv_lastend; 2531 for (i = 0; i < tp->rcv_numsacks; i++) { 2532 sack = tp->sackblks[i]; 2533 if (SEQ_LT(sack.end, firstsack.start) || 2534 SEQ_GT(sack.start, firstsack.end)) 2535 continue; /* no overlap */ 2536 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2537 /* 2538 * identical block; delete it here since we will 2539 * move it to the front of the list. 2540 */ 2541 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2542 lastpos = i; /* last posn with a zero entry */ 2543 continue; 2544 } 2545 if (SEQ_LEQ(sack.start, firstsack.start)) 2546 firstsack.start = sack.start; /* merge blocks */ 2547 if (SEQ_GEQ(sack.end, firstsack.end)) 2548 firstsack.end = sack.end; /* merge blocks */ 2549 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2550 lastpos = i; /* last posn with a zero entry */ 2551 } 2552 if (lastpos != -1) { /* at least one merge */ 2553 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2554 sack = tp->sackblks[i]; 2555 if (sack.start == 0 && sack.end == 0) 2556 continue; 2557 temp[j++] = sack; 2558 } 2559 tp->rcv_numsacks = j; /* including first blk (added later) */ 2560 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2561 tp->sackblks[i] = temp[i]; 2562 } else { /* no merges -- shift sacks by 1 */ 2563 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2564 tp->rcv_numsacks++; 2565 for (i = tp->rcv_numsacks-1; i > 0; i--) 2566 tp->sackblks[i] = tp->sackblks[i-1]; 2567 } 2568 tp->sackblks[0] = firstsack; 2569 return; 2570 } 2571 2572 /* 2573 * Process the TCP SACK option. tp->snd_holes is an ordered list 2574 * of holes (oldest to newest, in terms of the sequence space). 2575 */ 2576 void 2577 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2578 { 2579 int tmp_olen; 2580 u_char *tmp_cp; 2581 struct sackhole *cur, *p, *temp; 2582 2583 if (!tp->sack_enable) 2584 return; 2585 /* SACK without ACK doesn't make sense. */ 2586 if ((th->th_flags & TH_ACK) == 0) 2587 return; 2588 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
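 * (An ACK below snd_una is stale, and one above snd_max covers data we never sent, so the accompanying SACK blocks cannot be trusted either.)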
*/ 2589 if (SEQ_LT(th->th_ack, tp->snd_una) || 2590 SEQ_GT(th->th_ack, tp->snd_max)) 2591 return; 2592 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2593 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2594 return; 2595 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2596 tmp_cp = cp + 2; 2597 tmp_olen = optlen - 2; 2598 tcpstat.tcps_sack_rcv_opts++; 2599 if (tp->snd_numholes < 0) 2600 tp->snd_numholes = 0; 2601 if (tp->t_maxseg == 0) 2602 panic("tcp_sack_option"); /* Should never happen */ 2603 while (tmp_olen > 0) { 2604 struct sackblk sack; 2605 2606 bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq)); 2607 NTOHL(sack.start); 2608 bcopy(tmp_cp + sizeof(tcp_seq), 2609 (char *) &(sack.end), sizeof(tcp_seq)); 2610 NTOHL(sack.end); 2611 tmp_olen -= TCPOLEN_SACK; 2612 tmp_cp += TCPOLEN_SACK; 2613 if (SEQ_LEQ(sack.end, sack.start)) 2614 continue; /* bad SACK fields */ 2615 if (SEQ_LEQ(sack.end, tp->snd_una)) 2616 continue; /* old block */ 2617 #if defined(TCP_SACK) && defined(TCP_FACK) 2618 /* Updates snd_fack. */ 2619 if (SEQ_GT(sack.end, tp->snd_fack)) 2620 tp->snd_fack = sack.end; 2621 #endif /* TCP_FACK */ 2622 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2623 if (SEQ_LT(sack.start, th->th_ack)) 2624 continue; 2625 } 2626 if (SEQ_GT(sack.end, tp->snd_max)) 2627 continue; 2628 if (tp->snd_holes == NULL) { /* first hole */ 2629 tp->snd_holes = (struct sackhole *) 2630 pool_get(&sackhl_pool, PR_NOWAIT); 2631 if (tp->snd_holes == NULL) { 2632 /* ENOBUFS, so ignore SACKed block for now*/ 2633 goto done; 2634 } 2635 cur = tp->snd_holes; 2636 cur->start = th->th_ack; 2637 cur->end = sack.start; 2638 cur->rxmit = cur->start; 2639 cur->next = NULL; 2640 tp->snd_numholes = 1; 2641 tp->rcv_lastsack = sack.end; 2642 /* 2643 * dups is at least one. If more data has been 2644 * SACKed, it can be greater than one. 
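 * e.g., with t_maxseg = 1460, a first SACK block spanning 4380 bytes (three full segments) gives dups = min(tcprexmtthresh, 3) = 3.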
2645 */ 2646 cur->dups = min(tcprexmtthresh, 2647 ((sack.end - cur->end)/tp->t_maxseg)); 2648 if (cur->dups < 1) 2649 cur->dups = 1; 2650 continue; /* with next sack block */ 2651 } 2652 /* Go thru list of holes: p = previous, cur = current */ 2653 p = cur = tp->snd_holes; 2654 while (cur) { 2655 if (SEQ_LEQ(sack.end, cur->start)) 2656 /* SACKs data before the current hole */ 2657 break; /* no use going through more holes */ 2658 if (SEQ_GEQ(sack.start, cur->end)) { 2659 /* SACKs data beyond the current hole */ 2660 cur->dups++; 2661 if (((sack.end - cur->end)/tp->t_maxseg) >= 2662 tcprexmtthresh) 2663 cur->dups = tcprexmtthresh; 2664 p = cur; 2665 cur = cur->next; 2666 continue; 2667 } 2668 if (SEQ_LEQ(sack.start, cur->start)) { 2669 /* Data acks at least the beginning of hole */ 2670 #if defined(TCP_SACK) && defined(TCP_FACK) 2671 if (SEQ_GT(sack.end, cur->rxmit)) 2672 tp->retran_data -= 2673 tcp_seq_subtract(cur->rxmit, 2674 cur->start); 2675 else 2676 tp->retran_data -= 2677 tcp_seq_subtract(sack.end, 2678 cur->start); 2679 #endif /* TCP_FACK */ 2680 if (SEQ_GEQ(sack.end, cur->end)) { 2681 /* Acks entire hole, so delete hole */ 2682 if (p != cur) { 2683 p->next = cur->next; 2684 pool_put(&sackhl_pool, cur); 2685 cur = p->next; 2686 } else { 2687 cur = cur->next; 2688 pool_put(&sackhl_pool, p); 2689 p = cur; 2690 tp->snd_holes = p; 2691 } 2692 tp->snd_numholes--; 2693 continue; 2694 } 2695 /* otherwise, move start of hole forward */ 2696 cur->start = sack.end; 2697 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2698 p = cur; 2699 cur = cur->next; 2700 continue; 2701 } 2702 /* move end of hole backward */ 2703 if (SEQ_GEQ(sack.end, cur->end)) { 2704 #if defined(TCP_SACK) && defined(TCP_FACK) 2705 if (SEQ_GT(cur->rxmit, sack.start)) 2706 tp->retran_data -= 2707 tcp_seq_subtract(cur->rxmit, 2708 sack.start); 2709 #endif /* TCP_FACK */ 2710 cur->end = sack.start; 2711 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2712 cur->dups++; 2713 if (((sack.end - cur->end)/tp->t_maxseg) >= 2714 tcprexmtthresh) 2715 cur->dups = tcprexmtthresh; 2716 p = cur; 2717 cur = cur->next; 2718 continue; 2719 } 2720 if (SEQ_LT(cur->start, sack.start) && 2721 SEQ_GT(cur->end, sack.end)) { 2722 /* 2723 * ACKs some data in middle of a hole; need to 2724 * split current hole 2725 */ 2726 temp = (struct sackhole *) 2727 pool_get(&sackhl_pool, PR_NOWAIT); 2728 if (temp == NULL) 2729 goto done; /* ENOBUFS */ 2730 #if defined(TCP_SACK) && defined(TCP_FACK) 2731 if (SEQ_GT(cur->rxmit, sack.end)) 2732 tp->retran_data -= 2733 tcp_seq_subtract(sack.end, 2734 sack.start); 2735 else if (SEQ_GT(cur->rxmit, sack.start)) 2736 tp->retran_data -= 2737 tcp_seq_subtract(cur->rxmit, 2738 sack.start); 2739 #endif /* TCP_FACK */ 2740 temp->next = cur->next; 2741 temp->start = sack.end; 2742 temp->end = cur->end; 2743 temp->dups = cur->dups; 2744 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2745 cur->end = sack.start; 2746 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2747 cur->dups++; 2748 if (((sack.end - cur->end)/tp->t_maxseg) >= 2749 tcprexmtthresh) 2750 cur->dups = tcprexmtthresh; 2751 cur->next = temp; 2752 p = temp; 2753 cur = p->next; 2754 tp->snd_numholes++; 2755 } 2756 } 2757 /* At this point, p points to the last hole on the list */ 2758 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2759 /* 2760 * Need to append new hole at end. 2761 * Last hole is p (and it's not NULL). 
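 * The appended hole spans [rcv_lastsack, sack.start), exactly the unSACKed gap between the highest byte SACKed so far and this block.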
2762 */ 2763 temp = (struct sackhole *) 2764 pool_get(&sackhl_pool, PR_NOWAIT); 2765 if (temp == NULL) 2766 goto done; /* ENOBUFS */ 2767 temp->start = tp->rcv_lastsack; 2768 temp->end = sack.start; 2769 temp->dups = min(tcprexmtthresh, 2770 ((sack.end - sack.start)/tp->t_maxseg)); 2771 if (temp->dups < 1) 2772 temp->dups = 1; 2773 temp->rxmit = temp->start; 2774 temp->next = 0; 2775 p->next = temp; 2776 tp->rcv_lastsack = sack.end; 2777 tp->snd_numholes++; 2778 } 2779 } 2780 done: 2781 #if defined(TCP_SACK) && defined(TCP_FACK) 2782 /* 2783 * Update retran_data and snd_awnd. Go through the list of 2784 * holes. Increment retran_data by (hole->rxmit - hole->start). 2785 */ 2786 tp->retran_data = 0; 2787 cur = tp->snd_holes; 2788 while (cur) { 2789 tp->retran_data += cur->rxmit - cur->start; 2790 cur = cur->next; 2791 } 2792 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + 2793 tp->retran_data; 2794 #endif /* TCP_FACK */ 2795 2796 return; 2797 } 2798 2799 /* 2800 * Delete stale (i.e., cumulatively ack'd) holes. Hole is deleted only if 2801 * it is completely acked; otherwise, tcp_sack_option(), called from 2802 * tcp_dooptions(), will fix up the hole. 2803 */ 2804 void 2805 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2806 { 2807 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2808 /* max because this could be an older ack just arrived */ 2809 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2810 th->th_ack : tp->snd_una; 2811 struct sackhole *cur = tp->snd_holes; 2812 struct sackhole *prev; 2813 while (cur) 2814 if (SEQ_LEQ(cur->end, lastack)) { 2815 prev = cur; 2816 cur = cur->next; 2817 pool_put(&sackhl_pool, prev); 2818 tp->snd_numholes--; 2819 } else if (SEQ_LT(cur->start, lastack)) { 2820 cur->start = lastack; 2821 if (SEQ_LT(cur->rxmit, cur->start)) 2822 cur->rxmit = cur->start; 2823 break; 2824 } else 2825 break; 2826 tp->snd_holes = cur; 2827 } 2828 } 2829 2830 /* 2831 * Delete all receiver-side SACK information. 2832 */ 2833 void 2834 tcp_clean_sackreport(struct tcpcb *tp) 2835 { 2836 int i; 2837 2838 tp->rcv_numsacks = 0; 2839 for (i = 0; i < MAX_SACK_BLKS; i++) 2840 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2841 2842 } 2843 2844 /* 2845 * Checks for partial ack. If partial ack arrives, turn off retransmission 2846 * timer, deflate the window, do not clear tp->t_dupacks, and return 1. 2847 * If the ack advances at least to tp->snd_last, return 0. 2848 */ 2849 int 2850 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2851 { 2852 if (SEQ_LT(th->th_ack, tp->snd_last)) { 2853 /* Turn off retx. timer (will start again next segment) */ 2854 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2855 tp->t_rtttime = 0; 2856 #ifndef TCP_FACK 2857 /* 2858 * Partial window deflation. This statement relies on the 2859 * fact that tp->snd_una has not been updated yet. In FACK 2860 * hold snd_cwnd constant during fast recovery. 2861 */ 2862 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2863 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2864 tp->snd_cwnd += tp->t_maxseg; 2865 } else 2866 tp->snd_cwnd = tp->t_maxseg; 2867 #endif 2868 return (1); 2869 } 2870 return (0); 2871 } 2872 #endif /* TCP_SACK */ 2873 2874 /* 2875 * Pull the out of band byte out of a segment so 2876 * it doesn't appear in the user's data queue. 2877 * It is still reflected in the segment length for 2878 * sequencing purposes.
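 * (rcv_nxt therefore advances over the urgent byte even though that byte never reaches the socket buffer.)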
2879 */ 2880 void 2881 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2882 { 2883 int cnt = off + urgent - 1; 2884 2885 while (cnt >= 0) { 2886 if (m->m_len > cnt) { 2887 char *cp = mtod(m, caddr_t) + cnt; 2888 struct tcpcb *tp = sototcpcb(so); 2889 2890 tp->t_iobc = *cp; 2891 tp->t_oobflags |= TCPOOB_HAVEDATA; 2892 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2893 m->m_len--; 2894 return; 2895 } 2896 cnt -= m->m_len; 2897 m = m->m_next; 2898 if (m == 0) 2899 break; 2900 } 2901 panic("tcp_pulloutofband"); 2902 } 2903 2904 /* 2905 * Collect new round-trip time estimate 2906 * and update averages and current timeout. 2907 */ 2908 void 2909 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2910 { 2911 short delta; 2912 short rttmin; 2913 2914 if (rtt < 0) 2915 rtt = 0; 2916 else if (rtt > TCP_RTT_MAX) 2917 rtt = TCP_RTT_MAX; 2918 2919 tcpstat.tcps_rttupdated++; 2920 if (tp->t_srtt != 0) { 2921 /* 2922 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2923 * after the binary point (scaled by 4), whereas 2924 * srtt is stored as fixed point with 5 bits after the 2925 * binary point (i.e., scaled by 32). The following magic 2926 * is equivalent to the smoothing algorithm in rfc793 with 2927 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2928 * point). 2929 */ 2930 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2931 (tp->t_srtt >> TCP_RTT_SHIFT); 2932 if ((tp->t_srtt += delta) <= 0) 2933 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2934 /* 2935 * We accumulate a smoothed rtt variance (actually, a 2936 * smoothed mean difference), then set the retransmit 2937 * timer to smoothed rtt + 4 times the smoothed variance. 2938 * rttvar is stored as fixed point with 4 bits after the 2939 * binary point (scaled by 16). The following is 2940 * equivalent to rfc793 smoothing with an alpha of .75 2941 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2942 * rfc793's wired-in beta. 2943 */ 2944 if (delta < 0) 2945 delta = -delta; 2946 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2947 if ((tp->t_rttvar += delta) <= 0) 2948 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2949 } else { 2950 /* 2951 * No rtt measurement yet - use the unsmoothed rtt. 2952 * Set the variance to half the rtt (so our first 2953 * retransmit happens at 3*rtt). 2954 */ 2955 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2956 tp->t_rttvar = (rtt + 1) << 2957 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2958 } 2959 tp->t_rtttime = 0; 2960 tp->t_rxtshift = 0; 2961 2962 /* 2963 * the retransmit should happen at rtt + 4 * rttvar. 2964 * Because of the way we do the smoothing, srtt and rttvar 2965 * will each average +1/2 tick of bias. When we compute 2966 * the retransmit timer, we want 1/2 tick of rounding and 2967 * 1 extra tick because of +-1/2 tick uncertainty in the 2968 * firing of the timer. The bias will give us exactly the 2969 * 1.5 tick we need. But, because the bias is 2970 * statistical, we have to test that we don't drop below 2971 * the minimum feasible timer (which is 2 ticks). 2972 */ 2973 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2974 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2975 2976 /* 2977 * We received an ack for a packet that wasn't retransmitted; 2978 * it is probably safe to discard any error indications we've 2979 * received recently. This isn't quite right, but close enough 2980 * for now (a route might have failed after we sent a segment, 2981 * and the return path might not be symmetrical). 
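 * Clearing t_softerror below keeps a stale ICMP-derived error from being reported to the user if the connection should time out later.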
2982 */ 2983 tp->t_softerror = 0; 2984 } 2985 2986 /* 2987 * Determine a reasonable value for maxseg size. 2988 * If the route is known, check route for mtu. 2989 * If none, use an mss that can be handled on the outgoing 2990 * interface without forcing IP to fragment; if bigger than 2991 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2992 * to utilize large mbufs. If no route is found, route has no mtu, 2993 * or the destination isn't local, use a default, hopefully conservative 2994 * size (usually 512 or the default IP max size, but no more than the mtu 2995 * of the interface), as we can't discover anything about intervening 2996 * gateways or networks. We also initialize the congestion/slow start 2997 * window to be a single segment if the destination isn't local. 2998 * While looking at the routing entry, we also initialize other path-dependent 2999 * parameters from pre-set or cached values in the routing entry. 3000 * 3001 * Also take into account the space needed for options that we 3002 * send regularly. Make maxseg shorter by that amount to assure 3003 * that we can send maxseg amount of data even when the options 3004 * are present. Store the upper limit of the length of options plus 3005 * data in maxopd. 3006 * 3007 * NOTE: offer == -1 indicates that the maxseg size changed due to 3008 * Path MTU discovery. 3009 */ 3010 int 3011 tcp_mss(struct tcpcb *tp, int offer) 3012 { 3013 struct rtentry *rt; 3014 struct ifnet *ifp; 3015 int mss, mssopt; 3016 int iphlen; 3017 struct inpcb *inp; 3018 3019 inp = tp->t_inpcb; 3020 3021 mssopt = mss = tcp_mssdflt; 3022 3023 rt = in_pcbrtentry(inp); 3024 3025 if (rt == NULL) 3026 goto out; 3027 3028 ifp = rt->rt_ifp; 3029 3030 switch (tp->pf) { 3031 #ifdef INET6 3032 case AF_INET6: 3033 iphlen = sizeof(struct ip6_hdr); 3034 break; 3035 #endif 3036 case AF_INET: 3037 iphlen = sizeof(struct ip); 3038 break; 3039 default: 3040 /* the family does not support path MTU discovery */ 3041 goto out; 3042 } 3043 3044 #ifdef RTV_MTU 3045 /* 3046 * if there's an mtu associated with the route and we support 3047 * path MTU discovery for the underlying protocol family, use it. 3048 */ 3049 if (rt->rt_rmx.rmx_mtu) { 3050 /* 3051 * One may wish to lower MSS to take into account options, 3052 * especially security-related options. 3053 */ 3054 if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) { 3055 /* 3056 * RFC2460 section 5, last paragraph: if path MTU is 3057 * smaller than 1280, use 1280 as packet size and 3058 * attach fragment header. 3059 */ 3060 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 3061 sizeof(struct tcphdr); 3062 } else 3063 mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr); 3064 } else 3065 #endif /* RTV_MTU */ 3066 if (!ifp) 3067 /* 3068 * ifp may be null and rmx_mtu may be zero in certain 3069 * v6 cases (e.g., if ND wasn't able to resolve the 3070 * destination host). 3071 */ 3072 goto out; 3073 else if (ifp->if_flags & IFF_LOOPBACK) 3074 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3075 else if (tp->pf == AF_INET) { 3076 if (ip_mtudisc) 3077 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3078 else if (inp && in_localaddr(inp->inp_faddr, inp->inp_rtableid)) 3079 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3080 } 3081 #ifdef INET6 3082 else if (tp->pf == AF_INET6) { 3083 /* 3084 * for IPv6, path MTU discovery is always turned on, 3085 * or the node must use packet size <= 1280.
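 * For example, on a standard 1500-byte Ethernet link the computation below yields 1500 - 40 - 20 = 1440 bytes of payload per segment.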
3086 */ 3087 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr); 3088 } 3089 #endif /* INET6 */ 3090 3091 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 3092 if (offer != -1) { 3093 #ifndef INET6 3094 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3095 #else 3096 if (tp->pf == AF_INET6) 3097 mssopt = IN6_LINKMTU(ifp) - iphlen - 3098 sizeof(struct tcphdr); 3099 else 3100 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 3101 #endif 3102 3103 mssopt = max(tcp_mssdflt, mssopt); 3104 } 3105 3106 out: 3107 /* 3108 * The current mss, t_maxseg, is initialized to the default value. 3109 * If we compute a smaller value, reduce the current mss. 3110 * If we compute a larger value, return it for use in sending 3111 * a max seg size option, but don't store it for use 3112 * unless we received an offer at least that large from peer. 3113 * 3114 * However, do not accept offers lower than the minimum of 3115 * the interface MTU and 216. 3116 */ 3117 if (offer > 0) 3118 tp->t_peermss = offer; 3119 if (tp->t_peermss) 3120 mss = min(mss, max(tp->t_peermss, 216)); 3121 3122 /* sanity - at least max opt. space */ 3123 mss = max(mss, 64); 3124 3125 /* 3126 * maxopd stores the maximum length of data AND options 3127 * in a segment; maxseg is the amount of data in a normal 3128 * segment. We need to store this value (maxopd) apart 3129 * from maxseg, because now every segment carries options 3130 * and thus we normally have somewhat less data in segments. 3131 */ 3132 tp->t_maxopd = mss; 3133 3134 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3135 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3136 mss -= TCPOLEN_TSTAMP_APPA; 3137 #ifdef TCP_SIGNATURE 3138 if (tp->t_flags & TF_SIGNATURE) 3139 mss -= TCPOLEN_SIGLEN; 3140 #endif 3141 3142 if (offer == -1) { 3143 /* mss changed due to Path MTU discovery */ 3144 tp->t_flags &= ~TF_PMTUD_PEND; 3145 tp->t_pmtud_mtu_sent = 0; 3146 tp->t_pmtud_mss_acked = 0; 3147 if (mss < tp->t_maxseg) { 3148 /* 3149 * Follow suggestion in RFC 2414 to reduce the 3150 * congestion window by the ratio of the old 3151 * segment size to the new segment size. 3152 */ 3153 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3154 mss, mss); 3155 } 3156 } else if (tcp_do_rfc3390 == 2) { 3157 /* increase initial window */ 3158 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 3159 } else if (tcp_do_rfc3390) { 3160 /* increase initial window */ 3161 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3162 } else 3163 tp->snd_cwnd = mss; 3164 3165 tp->t_maxseg = mss; 3166 3167 return (offer != -1 ? mssopt : mss); 3168 } 3169 3170 u_int 3171 tcp_hdrsz(struct tcpcb *tp) 3172 { 3173 u_int hlen; 3174 3175 switch (tp->pf) { 3176 #ifdef INET6 3177 case AF_INET6: 3178 hlen = sizeof(struct ip6_hdr); 3179 break; 3180 #endif 3181 case AF_INET: 3182 hlen = sizeof(struct ip); 3183 break; 3184 default: 3185 hlen = 0; 3186 break; 3187 } 3188 hlen += sizeof(struct tcphdr); 3189 3190 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3191 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3192 hlen += TCPOLEN_TSTAMP_APPA; 3193 #ifdef TCP_SIGNATURE 3194 if (tp->t_flags & TF_SIGNATURE) 3195 hlen += TCPOLEN_SIGLEN; 3196 #endif 3197 return (hlen); 3198 } 3199 3200 /* 3201 * Set connection variables based on the effective MSS. 3202 * We are passed the TCPCB for the actual connection. If we 3203 * are the server, we are called by the compressed state engine 3204 * when the 3-way handshake is complete. 
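 * (on that path via syn_cache_get(), once the peer's MSS is known).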
If we are the client, 3205 * we are called when we receive the SYN,ACK from the server. 3206 * 3207 * NOTE: The t_maxseg value must be initialized in the TCPCB 3208 * before this routine is called! 3209 */ 3210 void 3211 tcp_mss_update(struct tcpcb *tp) 3212 { 3213 int mss; 3214 u_long bufsize; 3215 struct rtentry *rt; 3216 struct socket *so; 3217 3218 so = tp->t_inpcb->inp_socket; 3219 mss = tp->t_maxseg; 3220 3221 rt = in_pcbrtentry(tp->t_inpcb); 3222 3223 if (rt == NULL) 3224 return; 3225 3226 bufsize = so->so_snd.sb_hiwat; 3227 if (bufsize < mss) { 3228 mss = bufsize; 3229 /* Update t_maxseg and t_maxopd */ 3230 tcp_mss(tp, mss); 3231 } else { 3232 bufsize = roundup(bufsize, mss); 3233 if (bufsize > sb_max) 3234 bufsize = sb_max; 3235 (void)sbreserve(&so->so_snd, bufsize); 3236 } 3237 3238 bufsize = so->so_rcv.sb_hiwat; 3239 if (bufsize > mss) { 3240 bufsize = roundup(bufsize, mss); 3241 if (bufsize > sb_max) 3242 bufsize = sb_max; 3243 (void)sbreserve(&so->so_rcv, bufsize); 3244 } 3245 3246 } 3247 3248 #if defined (TCP_SACK) 3249 /* 3250 * Checks for partial ack. If partial ack arrives, force the retransmission 3251 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 3252 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to 3253 * be started again. If the ack advances at least to tp->snd_last, return 0. 3254 */ 3255 int 3256 tcp_newreno(struct tcpcb *tp, struct tcphdr *th) 3257 { 3258 if (SEQ_LT(th->th_ack, tp->snd_last)) { 3259 /* 3260 * snd_una has not been updated and the socket send buffer 3261 * not yet drained of the acked data, so we have to leave 3262 * snd_una as it was to get the correct data offset in 3263 * tcp_output(). 3264 */ 3265 tcp_seq onxt = tp->snd_nxt; 3266 u_long ocwnd = tp->snd_cwnd; 3267 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3268 tp->t_rtttime = 0; 3269 tp->snd_nxt = th->th_ack; 3270 /* 3271 * Set snd_cwnd to one segment beyond acknowledged offset 3272 * (tp->snd_una not yet updated when this function is called) 3273 */ 3274 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3275 (void) tcp_output(tp); 3276 tp->snd_cwnd = ocwnd; 3277 if (SEQ_GT(onxt, tp->snd_nxt)) 3278 tp->snd_nxt = onxt; 3279 /* 3280 * Partial window deflation. Relies on fact that tp->snd_una 3281 * not updated yet. 3282 */ 3283 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3284 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3285 else 3286 tp->snd_cwnd = 0; 3287 tp->snd_cwnd += tp->t_maxseg; 3288 3289 return 1; 3290 } 3291 return 0; 3292 } 3293 #endif /* TCP_SACK */ 3294 3295 int 3296 tcp_mss_adv(struct ifnet *ifp, int af) 3297 { 3298 int mss = 0; 3299 int iphlen; 3300 3301 switch (af) { 3302 case AF_INET: 3303 if (ifp != NULL) 3304 mss = ifp->if_mtu; 3305 iphlen = sizeof(struct ip); 3306 break; 3307 #ifdef INET6 3308 case AF_INET6: 3309 if (ifp != NULL) 3310 mss = IN6_LINKMTU(ifp); 3311 iphlen = sizeof(struct ip6_hdr); 3312 break; 3313 #endif 3314 } 3315 mss = mss - iphlen - sizeof(struct tcphdr); 3316 return (max(mss, tcp_mssdflt)); 3317 } 3318 3319 /* 3320 * TCP compressed state engine. Currently used to hold compressed 3321 * state for SYN_RECEIVED. 
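 * Each embryonic connection costs only a small struct syn_cache here instead of a full inpcb/tcpcb/socket, which is what lets the cache absorb SYN floods cheaply.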
3322 */ 3323 3324 u_long syn_cache_count; 3325 u_int32_t syn_hash1, syn_hash2; 3326 3327 #define SYN_HASH(sa, sp, dp) \ 3328 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3329 ((u_int32_t)(sp)))^syn_hash2))) 3330 #ifndef INET6 3331 #define SYN_HASHALL(hash, src, dst) \ 3332 do { \ 3333 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3334 ((struct sockaddr_in *)(src))->sin_port, \ 3335 ((struct sockaddr_in *)(dst))->sin_port); \ 3336 } while (/*CONSTCOND*/ 0) 3337 #else 3338 #define SYN_HASH6(sa, sp, dp) \ 3339 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3340 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3341 & 0x7fffffff) 3342 3343 #define SYN_HASHALL(hash, src, dst) \ 3344 do { \ 3345 switch ((src)->sa_family) { \ 3346 case AF_INET: \ 3347 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 3348 ((struct sockaddr_in *)(src))->sin_port, \ 3349 ((struct sockaddr_in *)(dst))->sin_port); \ 3350 break; \ 3351 case AF_INET6: \ 3352 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 3353 ((struct sockaddr_in6 *)(src))->sin6_port, \ 3354 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 3355 break; \ 3356 default: \ 3357 hash = 0; \ 3358 } \ 3359 } while (/*CONSTCOND*/0) 3360 #endif /* INET6 */ 3361 3362 void 3363 syn_cache_rm(struct syn_cache *sc) 3364 { 3365 sc->sc_flags |= SCF_DEAD; 3366 TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, 3367 sc, sc_bucketq); 3368 sc->sc_tp = NULL; 3369 LIST_REMOVE(sc, sc_tpq); 3370 tcp_syn_cache[sc->sc_bucketidx].sch_length--; 3371 timeout_del(&sc->sc_timer); 3372 syn_cache_count--; 3373 } 3374 3375 void 3376 syn_cache_put(struct syn_cache *sc) 3377 { 3378 if (sc->sc_ipopts) 3379 (void) m_free(sc->sc_ipopts); 3380 if (sc->sc_route4.ro_rt != NULL) 3381 RTFREE(sc->sc_route4.ro_rt); 3382 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3383 timeout_add(&sc->sc_timer, 0); 3384 } 3385 3386 struct pool syn_cache_pool; 3387 3388 /* 3389 * We don't estimate RTT with SYNs, so each packet starts with the default 3390 * RTT and each timer step has a fixed timeout value. 3391 */ 3392 #define SYN_CACHE_TIMER_ARM(sc) \ 3393 do { \ 3394 TCPT_RANGESET((sc)->sc_rxtcur, \ 3395 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3396 TCPTV_REXMTMAX); \ 3397 if (!timeout_initialized(&(sc)->sc_timer)) \ 3398 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3399 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3400 } while (/*CONSTCOND*/0) 3401 3402 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3403 3404 void 3405 syn_cache_init() 3406 { 3407 int i; 3408 3409 /* Initialize the hash buckets. */ 3410 for (i = 0; i < tcp_syn_cache_size; i++) 3411 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3412 3413 /* Initialize the syn cache pool. */ 3414 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3415 "synpl", NULL); 3416 } 3417 3418 void 3419 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3420 { 3421 struct syn_cache_head *scp; 3422 struct syn_cache *sc2; 3423 int s; 3424 3425 /* 3426 * If there are no entries in the hash table, reinitialize 3427 * the hash secrets. 
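 * Re-keying is only done while the table is empty, so no existing entries need rehashing, and changing the secrets over time keeps remote parties from mapping out the bucket distribution.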
3428 */ 3429 if (syn_cache_count == 0) { 3430 syn_hash1 = arc4random(); 3431 syn_hash2 = arc4random(); 3432 } 3433 3434 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3435 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3436 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3437 3438 /* 3439 * Make sure that we don't overflow the per-bucket 3440 * limit or the total cache size limit. 3441 */ 3442 s = splsoftnet(); 3443 if (scp->sch_length >= tcp_syn_bucket_limit) { 3444 tcpstat.tcps_sc_bucketoverflow++; 3445 /* 3446 * The bucket is full. Toss the oldest element in the 3447 * bucket. This will be the first entry in the bucket. 3448 */ 3449 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3450 #ifdef DIAGNOSTIC 3451 /* 3452 * This should never happen; we should always find an 3453 * entry in our bucket. 3454 */ 3455 if (sc2 == NULL) 3456 panic("syn_cache_insert: bucketoverflow: impossible"); 3457 #endif 3458 syn_cache_rm(sc2); 3459 syn_cache_put(sc2); 3460 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3461 struct syn_cache_head *scp2, *sce; 3462 3463 tcpstat.tcps_sc_overflowed++; 3464 /* 3465 * The cache is full. Toss the oldest entry in the 3466 * first non-empty bucket we can find. 3467 * 3468 * XXX We would really like to toss the oldest 3469 * entry in the cache, but we hope that this 3470 * condition doesn't happen very often. 3471 */ 3472 scp2 = scp; 3473 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3474 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3475 for (++scp2; scp2 != scp; scp2++) { 3476 if (scp2 >= sce) 3477 scp2 = &tcp_syn_cache[0]; 3478 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3479 break; 3480 } 3481 #ifdef DIAGNOSTIC 3482 /* 3483 * This should never happen; we should always find a 3484 * non-empty bucket. 3485 */ 3486 if (scp2 == scp) 3487 panic("syn_cache_insert: cacheoverflow: " 3488 "impossible"); 3489 #endif 3490 } 3491 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3492 syn_cache_rm(sc2); 3493 syn_cache_put(sc2); 3494 } 3495 3496 /* 3497 * Initialize the entry's timer. 3498 */ 3499 sc->sc_rxttot = 0; 3500 sc->sc_rxtshift = 0; 3501 SYN_CACHE_TIMER_ARM(sc); 3502 3503 /* Link it from tcpcb entry */ 3504 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3505 3506 /* Put it into the bucket. */ 3507 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3508 scp->sch_length++; 3509 syn_cache_count++; 3510 3511 tcpstat.tcps_sc_added++; 3512 splx(s); 3513 } 3514 3515 /* 3516 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3517 * If we have retransmitted an entry the maximum number of times, expire 3518 * that entry. 3519 */ 3520 void 3521 syn_cache_timer(void *arg) 3522 { 3523 struct syn_cache *sc = arg; 3524 int s; 3525 3526 s = splsoftnet(); 3527 if (sc->sc_flags & SCF_DEAD) { 3528 splx(s); 3529 return; 3530 } 3531 3532 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3533 /* Drop it -- too many retransmissions. */ 3534 goto dropit; 3535 } 3536 3537 /* 3538 * Compute the total amount of time this entry has 3539 * been on a queue. If this entry has been on longer 3540 * than the keep alive timer would allow, expire it. 3541 */ 3542 sc->sc_rxttot += sc->sc_rxtcur; 3543 if (sc->sc_rxttot >= tcptv_keep_init) 3544 goto dropit; 3545 3546 tcpstat.tcps_sc_retransmitted++; 3547 (void) syn_cache_respond(sc, NULL); 3548 3549 /* Advance the timer back-off. 
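 * The next timeout is TCPTV_SRTTDFLT scaled by tcp_backoff[sc_rxtshift] and clamped to TCPTV_REXMTMAX; see SYN_CACHE_TIMER_ARM above.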
*/ 3550 sc->sc_rxtshift++; 3551 SYN_CACHE_TIMER_ARM(sc); 3552 3553 splx(s); 3554 return; 3555 3556 dropit: 3557 tcpstat.tcps_sc_timed_out++; 3558 syn_cache_rm(sc); 3559 syn_cache_put(sc); 3560 splx(s); 3561 } 3562 3563 void 3564 syn_cache_reaper(void *arg) 3565 { 3566 struct syn_cache *sc = arg; 3567 int s; 3568 3569 s = splsoftnet(); 3570 pool_put(&syn_cache_pool, (sc)); 3571 splx(s); 3572 return; 3573 } 3574 3575 /* 3576 * Remove the syn cache entries created by the specified tcb entry, 3577 * since it makes no sense to keep them 3578 * (if there's no tcb entry, the syn cache entries will never be used) 3579 */ 3580 void 3581 syn_cache_cleanup(struct tcpcb *tp) 3582 { 3583 struct syn_cache *sc, *nsc; 3584 int s; 3585 3586 s = splsoftnet(); 3587 3588 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 3589 nsc = LIST_NEXT(sc, sc_tpq); 3590 3591 #ifdef DIAGNOSTIC 3592 if (sc->sc_tp != tp) 3593 panic("invalid sc_tp in syn_cache_cleanup"); 3594 #endif 3595 syn_cache_rm(sc); 3596 syn_cache_put(sc); 3597 } 3598 /* just for safety */ 3599 LIST_INIT(&tp->t_sc); 3600 3601 splx(s); 3602 } 3603 3604 /* 3605 * Find an entry in the syn cache. 3606 */ 3607 struct syn_cache * 3608 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst, 3609 struct syn_cache_head **headp, u_int rtableid) 3610 { 3611 struct syn_cache *sc; 3612 struct syn_cache_head *scp; 3613 u_int32_t hash; 3614 int s; 3615 3616 SYN_HASHALL(hash, src, dst); 3617 3618 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 3619 *headp = scp; 3620 s = splsoftnet(); 3621 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; 3622 sc = TAILQ_NEXT(sc, sc_bucketq)) { 3623 if (sc->sc_hash != hash) 3624 continue; 3625 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3626 !bcmp(&sc->sc_dst, dst, dst->sa_len) && 3627 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) { 3628 splx(s); 3629 return (sc); 3630 } 3631 } 3632 splx(s); 3633 return (NULL); 3634 } 3635 3636 /* 3637 * This function gets called when we receive an ACK for a 3638 * socket in the LISTEN state. We look up the connection 3639 * in the syn cache, and if it's there, we pull it out of 3640 * the cache and turn it into a full-blown connection in 3641 * the SYN-RECEIVED state. 3642 * 3643 * The return values may not be immediately obvious, and their effects 3644 * can be subtle, so here they are: 3645 * 3646 * NULL SYN was not found in cache; caller should drop the 3647 * packet and send an RST. 3648 * 3649 * -1 We were unable to create the new connection, and are 3650 * aborting it. An ACK,RST is being sent to the peer 3651 * (unless we got screwy sequence numbers; see below), 3652 * because the 3-way handshake has been completed. Caller 3653 * should not free the mbuf, since we may be using it. If 3654 * we are not, we will free it. 3655 * 3656 * Otherwise, the return value is a pointer to the new socket 3657 * associated with the connection. 3658 */ 3659 struct socket * 3660 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3661 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m) 3662 { 3663 struct syn_cache *sc; 3664 struct syn_cache_head *scp; 3665 struct inpcb *inp = NULL; 3666 struct tcpcb *tp = NULL; 3667 struct mbuf *am; 3668 int s; 3669 struct socket *oso; 3670 #if NPF > 0 3671 struct pf_divert *divert = NULL; 3672 #endif 3673 3674 s = splsoftnet(); 3675 if ((sc = syn_cache_lookup(src, dst, &scp, 3676 sotoinpcb(so)->inp_rtableid)) == NULL) { 3677 splx(s); 3678 return (NULL); 3679 } 3680 3681 /* 3682 * Verify the sequence and ack numbers.
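 * (the ACK must be exactly our iss + 1, and the SEQ must fall within (irs, irs + 1 + win]).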
Try getting the correct 3683 * response again. 3684 */ 3685 if ((th->th_ack != sc->sc_iss + 1) || 3686 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3687 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3688 (void) syn_cache_respond(sc, m); 3689 splx(s); 3690 return ((struct socket *)(-1)); 3691 } 3692 3693 /* Remove this cache entry */ 3694 syn_cache_rm(sc); 3695 splx(s); 3696 3697 /* 3698 * Ok, create the full blown connection, and set things up 3699 * as they would have been set up if we had created the 3700 * connection when the SYN arrived. If we can't create 3701 * the connection, abort it. 3702 */ 3703 oso = so; 3704 so = sonewconn(so, SS_ISCONNECTED); 3705 if (so == NULL) 3706 goto resetandabort; 3707 3708 inp = sotoinpcb(oso); 3709 3710 #ifdef IPSEC 3711 /* 3712 * We need to copy the required security levels 3713 * from the old pcb. Ditto for any other 3714 * IPsec-related information. 3715 */ 3716 { 3717 struct inpcb *newinp = sotoinpcb(so); 3718 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 3719 sizeof(inp->inp_seclevel)); 3720 newinp->inp_secrequire = inp->inp_secrequire; 3721 if (inp->inp_ipo != NULL) { 3722 newinp->inp_ipo = inp->inp_ipo; 3723 inp->inp_ipo->ipo_ref_count++; 3724 } 3725 if (inp->inp_ipsec_remotecred != NULL) { 3726 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred; 3727 inp->inp_ipsec_remotecred->ref_count++; 3728 } 3729 if (inp->inp_ipsec_remoteauth != NULL) { 3730 newinp->inp_ipsec_remoteauth 3731 = inp->inp_ipsec_remoteauth; 3732 inp->inp_ipsec_remoteauth->ref_count++; 3733 } 3734 } 3735 #endif /* IPSEC */ 3736 #ifdef INET6 3737 /* 3738 * inp still has the OLD in_pcb stuff, set the 3739 * v6-related flags on the new guy, too. 3740 */ 3741 { 3742 int flags = inp->inp_flags; 3743 struct inpcb *oldinpcb = inp; 3744 3745 inp = sotoinpcb(so); 3746 inp->inp_flags |= (flags & INP_IPV6); 3747 if ((inp->inp_flags & INP_IPV6) != 0) { 3748 inp->inp_ipv6.ip6_hlim = 3749 oldinpcb->inp_ipv6.ip6_hlim; 3750 } 3751 } 3752 #else /* INET6 */ 3753 inp = sotoinpcb(so); 3754 #endif /* INET6 */ 3755 3756 #if NPF > 0 3757 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED && 3758 (divert = pf_find_divert(m)) != NULL) 3759 inp->inp_rtableid = divert->rdomain; 3760 else 3761 #endif 3762 /* inherit rtable from listening socket */ 3763 inp->inp_rtableid = sc->sc_rtableid; 3764 3765 inp->inp_lport = th->th_dport; 3766 switch (src->sa_family) { 3767 #ifdef INET6 3768 case AF_INET6: 3769 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr; 3770 break; 3771 #endif /* INET6 */ 3772 case AF_INET: 3773 3774 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 3775 inp->inp_options = ip_srcroute(); 3776 if (inp->inp_options == NULL) { 3777 inp->inp_options = sc->sc_ipopts; 3778 sc->sc_ipopts = NULL; 3779 } 3780 break; 3781 } 3782 in_pcbrehash(inp); 3783 3784 /* 3785 * Give the new socket our cached route reference. 
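 * The struct assignment below hands the cached rtentry reference to the new socket; sc_route4.ro_rt is then cleared so syn_cache_put() will not release it a second time.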
3786 */ 3787 if (src->sa_family == AF_INET) 3788 inp->inp_route = sc->sc_route4; /* struct assignment */ 3789 #ifdef INET6 3790 else 3791 inp->inp_route6 = sc->sc_route6; 3792 #endif 3793 sc->sc_route4.ro_rt = NULL; 3794 3795 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3796 if (am == NULL) 3797 goto resetandabort; 3798 am->m_len = src->sa_len; 3799 bcopy(src, mtod(am, caddr_t), src->sa_len); 3800 3801 switch (src->sa_family) { 3802 case AF_INET: 3803 /* drop IPv4 packet to AF_INET6 socket */ 3804 if (inp->inp_flags & INP_IPV6) { 3805 (void) m_free(am); 3806 goto resetandabort; 3807 } 3808 if (in_pcbconnect(inp, am)) { 3809 (void) m_free(am); 3810 goto resetandabort; 3811 } 3812 break; 3813 #ifdef INET6 3814 case AF_INET6: 3815 if (in6_pcbconnect(inp, am)) { 3816 (void) m_free(am); 3817 goto resetandabort; 3818 } 3819 break; 3820 #endif 3821 } 3822 (void) m_free(am); 3823 3824 tp = intotcpcb(inp); 3825 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; 3826 if (sc->sc_request_r_scale != 15) { 3827 tp->requested_s_scale = sc->sc_requested_s_scale; 3828 tp->request_r_scale = sc->sc_request_r_scale; 3829 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3830 } 3831 if (sc->sc_flags & SCF_TIMESTAMP) 3832 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3833 3834 tp->t_template = tcp_template(tp); 3835 if (tp->t_template == 0) { 3836 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3837 so = NULL; 3838 m_freem(m); 3839 goto abort; 3840 } 3841 #ifdef TCP_SACK 3842 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3843 #endif 3844 3845 tp->ts_modulate = sc->sc_modulate; 3846 tp->ts_recent = sc->sc_timestamp; 3847 tp->iss = sc->sc_iss; 3848 tp->irs = sc->sc_irs; 3849 tcp_sendseqinit(tp); 3850 #if defined (TCP_SACK) || defined(TCP_ECN) 3851 tp->snd_last = tp->snd_una; 3852 #endif /* TCP_SACK */ 3853 #if defined(TCP_SACK) && defined(TCP_FACK) 3854 tp->snd_fack = tp->snd_una; 3855 tp->retran_data = 0; 3856 tp->snd_awnd = 0; 3857 #endif /* TCP_FACK */ 3858 #ifdef TCP_ECN 3859 if (sc->sc_flags & SCF_ECN_PERMIT) { 3860 tp->t_flags |= TF_ECN_PERMIT; 3861 tcpstat.tcps_ecn_accepts++; 3862 } 3863 #endif 3864 #ifdef TCP_SACK 3865 if (sc->sc_flags & SCF_SACK_PERMIT) 3866 tp->t_flags |= TF_SACK_PERMIT; 3867 #endif 3868 #ifdef TCP_SIGNATURE 3869 if (sc->sc_flags & SCF_SIGNATURE) 3870 tp->t_flags |= TF_SIGNATURE; 3871 #endif 3872 tcp_rcvseqinit(tp); 3873 tp->t_state = TCPS_SYN_RECEIVED; 3874 tp->t_rcvtime = tcp_now; 3875 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3876 tcpstat.tcps_accepts++; 3877 3878 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3879 if (sc->sc_peermaxseg) 3880 tcp_mss_update(tp); 3881 /* Reset initial window to 1 segment for retransmit */ 3882 if (sc->sc_rxtshift > 0) 3883 tp->snd_cwnd = tp->t_maxseg; 3884 tp->snd_wl1 = sc->sc_irs; 3885 tp->rcv_up = sc->sc_irs + 1; 3886 3887 /* 3888 * This is what would have happened in tcp_output() when 3889 * the SYN,ACK was sent.
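 * snd_nxt and snd_max step past the sequence number consumed by our SYN, and the retransmit timer is armed just as tcp_output() would have armed it.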
3890 */ 3891 tp->snd_up = tp->snd_una; 3892 tp->snd_max = tp->snd_nxt = tp->iss+1; 3893 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3894 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3895 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3896 tp->last_ack_sent = tp->rcv_nxt; 3897 3898 tcpstat.tcps_sc_completed++; 3899 syn_cache_put(sc); 3900 return (so); 3901 3902 resetandabort: 3903 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3904 m->m_pkthdr.rdomain); 3905 m_freem(m); 3906 abort: 3907 if (so != NULL) 3908 (void) soabort(so); 3909 syn_cache_put(sc); 3910 tcpstat.tcps_sc_aborted++; 3911 return ((struct socket *)(-1)); 3912 } 3913 3914 /* 3915 * This function is called when we get a RST for a 3916 * non-existent connection, so that we can see if the 3917 * connection is in the syn cache. If it is, zap it. 3918 */ 3919 3920 void 3921 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3922 u_int rtableid) 3923 { 3924 struct syn_cache *sc; 3925 struct syn_cache_head *scp; 3926 int s = splsoftnet(); 3927 3928 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) { 3929 splx(s); 3930 return; 3931 } 3932 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3933 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3934 splx(s); 3935 return; 3936 } 3937 syn_cache_rm(sc); 3938 splx(s); 3939 tcpstat.tcps_sc_reset++; 3940 syn_cache_put(sc); 3941 } 3942 3943 void 3944 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3945 u_int rtableid) 3946 { 3947 struct syn_cache *sc; 3948 struct syn_cache_head *scp; 3949 int s; 3950 3951 s = splsoftnet(); 3952 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) { 3953 splx(s); 3954 return; 3955 } 3956 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3957 if (ntohl (th->th_seq) != sc->sc_iss) { 3958 splx(s); 3959 return; 3960 } 3961 3962 /* 3963 * If we've retransmitted 3 times and this is our second error, 3964 * we remove the entry. Otherwise, we allow it to continue on. 3965 * This prevents us from incorrectly nuking an entry during a 3966 * spurious network outage. 3967 * 3968 * See tcp_notify(). 3969 */ 3970 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3971 sc->sc_flags |= SCF_UNREACH; 3972 splx(s); 3973 return; 3974 } 3975 3976 syn_cache_rm(sc); 3977 splx(s); 3978 tcpstat.tcps_sc_unreach++; 3979 syn_cache_put(sc); 3980 } 3981 3982 /* 3983 * Given a LISTEN socket and an inbound SYN request, add 3984 * this to the syn cache, and send back a segment: 3985 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3986 * to the source. 3987 * 3988 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3989 * Doing so would require that we hold onto the data and deliver it 3990 * to the application. However, if we are the target of a SYN-flood 3991 * DoS attack, an attacker could send data which would eventually 3992 * consume all available buffer space if it were ACKed. By not ACKing 3993 * the data, we avoid this DoS scenario. 3994 */ 3995 3996 int 3997 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3998 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3999 struct tcp_opt_info *oi, tcp_seq *issp) 4000 { 4001 struct tcpcb tb, *tp; 4002 long win; 4003 struct syn_cache *sc; 4004 struct syn_cache_head *scp; 4005 struct mbuf *ipopts; 4006 4007 tp = sototcpcb(so); 4008 4009 /* 4010 * RFC1122 4.2.3.10, p. 
104: discard bcast/mcast SYN 4011 * 4012 * Note this check is performed in tcp_input() very early on. 4013 */ 4014 4015 /* 4016 * Initialize some local state. 4017 */ 4018 win = sbspace(&so->so_rcv); 4019 if (win > TCP_MAXWIN) 4020 win = TCP_MAXWIN; 4021 4022 #ifdef TCP_SIGNATURE 4023 if (optp || (tp->t_flags & TF_SIGNATURE)) { 4024 #else 4025 if (optp) { 4026 #endif 4027 bzero(&tb, sizeof(tb)); 4028 tb.pf = tp->pf; 4029 #ifdef TCP_SACK 4030 tb.sack_enable = tp->sack_enable; 4031 #endif 4032 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 4033 #ifdef TCP_SIGNATURE 4034 if (tp->t_flags & TF_SIGNATURE) 4035 tb.t_flags |= TF_SIGNATURE; 4036 #endif 4037 tb.t_state = TCPS_LISTEN; 4038 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 4039 sotoinpcb(so)->inp_rtableid)) 4040 return (-1); 4041 } else 4042 tb.t_flags = 0; 4043 4044 switch (src->sa_family) { 4045 #ifdef INET 4046 case AF_INET: 4047 /* 4048 * Remember the IP options, if any. 4049 */ 4050 ipopts = ip_srcroute(); 4051 break; 4052 #endif 4053 default: 4054 ipopts = NULL; 4055 } 4056 4057 /* 4058 * See if we already have an entry for this connection. 4059 * If we do, resend the SYN,ACK. We do not count this 4060 * as a retransmission (XXX though maybe we should). 4061 */ 4062 if ((sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid)) 4063 != NULL) { 4064 tcpstat.tcps_sc_dupesyn++; 4065 if (ipopts) { 4066 /* 4067 * If we were remembering a previous source route, 4068 * forget it and use the new one we've been given. 4069 */ 4070 if (sc->sc_ipopts) 4071 (void) m_free(sc->sc_ipopts); 4072 sc->sc_ipopts = ipopts; 4073 } 4074 sc->sc_timestamp = tb.ts_recent; 4075 if (syn_cache_respond(sc, m) == 0) { 4076 tcpstat.tcps_sndacks++; 4077 tcpstat.tcps_sndtotal++; 4078 } 4079 return (0); 4080 } 4081 4082 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 4083 if (sc == NULL) { 4084 if (ipopts) 4085 (void) m_free(ipopts); 4086 return (-1); 4087 } 4088 4089 /* 4090 * Fill in the cache, and put the necessary IP and TCP 4091 * options into the reply. 4092 */ 4093 bcopy(src, &sc->sc_src, src->sa_len); 4094 bcopy(dst, &sc->sc_dst, dst->sa_len); 4095 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 4096 sc->sc_flags = 0; 4097 sc->sc_ipopts = ipopts; 4098 sc->sc_irs = th->th_seq; 4099 4100 sc->sc_iss = issp ? *issp : arc4random(); 4101 sc->sc_peermaxseg = oi->maxseg; 4102 sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ? 4103 m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family); 4104 sc->sc_win = win; 4105 sc->sc_timestamp = tb.ts_recent; 4106 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 4107 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 4108 sc->sc_flags |= SCF_TIMESTAMP; 4109 sc->sc_modulate = arc4random(); 4110 } 4111 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 4112 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 4113 sc->sc_requested_s_scale = tb.requested_s_scale; 4114 sc->sc_request_r_scale = 0; 4115 /* 4116 * Pick the smallest possible scaling factor that 4117 * will still allow us to scale up to sb_max. 4118 * 4119 * We do this because there are broken firewalls that 4120 * will corrupt the window scale option, leading to 4121 * the other endpoint believing that our advertised 4122 * window is unscaled. At scale factors larger than 4123 * 5 the unscaled window will drop below 1500 bytes, 4124 * leading to serious problems when traversing these 4125 * broken firewalls. 4126 * 4127 * With the default sbmax of 256K, a scale factor 4128 * of 3 will be chosen by this algorithm. 
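 * (65535 << 2 = 262140 is still below the 262144-byte default, while 65535 << 3 is not, so the loop below stops at a scale of 3.)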

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);
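
	/*
	 * Illustrative sketch (kept under #if 0 and excluded from the
	 * build): the optlen arithmetic above, evaluated for the common
	 * case where window scaling, SACK and timestamps were all
	 * negotiated and TCP_SIGNATURE is off.  The constants restated
	 * below stand in for <netinet/tcp.h>; note every piece is a
	 * 4-byte-aligned chunk, so no trailing padding is needed.
	 */
#if 0
#include <stdio.h>

#define TCPOLEN_TSTAMP_APPA	12	/* NOP,NOP,TS,10,TSval,TSecr */

int
main(void)
{
	int optlen;

	optlen = 4 +			/* MSS, always sent */
	    4 +				/* NOP + window scale */
	    4 +				/* NOP,NOP + SACK permitted */
	    TCPOLEN_TSTAMP_APPA;	/* RFC 1323 appendix A timestamps */

	/* Prints "optlen 24, th_off 11": a 20-byte header + 24 option
	 * bytes is 44 bytes, i.e. 11 32-bit words in th_off. */
	printf("optlen %d, th_off %d\n", optlen, (20 + optlen) >> 2);
	return (0);
}
#endif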

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.rdomain = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		/* Not reached: the address family was checked above. */
		th = NULL;
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}
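
	/*
	 * Illustrative sketch (kept under #if 0 and excluded from the
	 * build): the 12-byte timestamp option emitted above.  It is
	 * assumed here that SYN_CACHE_TIMESTAMP() yields tcp_now plus
	 * the per-entry random sc_modulate offset, so the peer never
	 * sees our raw tick counter; the local variable names mirror
	 * the syn_cache fields and carry made-up sample values.
	 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_TSTAMP_HDR	0x0101080a	/* NOP,NOP,TS,len 10 */

int
main(void)
{
	uint32_t opt[3];
	uint32_t tcp_now = 12345;		/* sample tick count */
	uint32_t sc_modulate = 0x5ee7f00d;	/* sample random offset */
	uint32_t sc_timestamp = 67890;		/* peer's TSval from its SYN */

	opt[0] = htonl(TCPOPT_TSTAMP_HDR);
	opt[1] = htonl(tcp_now + sc_modulate);	/* our modulated TSval */
	opt[2] = htonl(sc_timestamp);		/* echo the peer's TSecr */
	printf("%08x %08x %08x\n", ntohl(opt[0]), ntohl(opt[1]),
	    ntohl(opt[2]));
	return (0);
}
#endif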

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/*default to PF_INET*/
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad the options list to the next 32-bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* Use IPsec policy and TTL from the listening socket on the SYN-ACK. */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note that ip_len is set in
	 * network byte order here; it was temporarily holding the TCP
	 * segment length for the checksum computation above.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* Leave flowlabel = 0; it is legal and requires no state
		 * management. */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    (struct ip_moptions *)NULL, inp);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    (struct ip6_moptions *)0, NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
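
/*
 * Illustrative sketch (kept under #if 0 and excluded from the build):
 * why the in_cksum() call in syn_cache_respond() above yields a correct
 * TCP checksum.  When it runs, the IP header is still zero except for
 * ip_src, ip_dst, ip_p and an ip_len that temporarily holds the TCP
 * segment length, which are exactly the nonzero words of the RFC 793
 * pseudo-header; summing the whole packet therefore equals summing the
 * pseudo-header plus the segment.  The byte offsets, sample addresses
 * and the simplified checksum routine are restatements for a standalone
 * demo; the TCP bytes are left zero for brevity, but the equality holds
 * for any segment contents shared by both buffers.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>

/* Ones'-complement sum over big-endian 16-bit words, like in_cksum(). */
static uint16_t
cksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)buf[i] << 8 | buf[i + 1];
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int
main(void)
{
	uint8_t pkt[40];	/* IP header + zeroed 20-byte TCP header */
	uint8_t ph[32];		/* pseudo-header + the same TCP header */
	const uint8_t src[4] = { 192, 0, 2, 1 };
	const uint8_t dst[4] = { 192, 0, 2, 2 };
	const uint16_t tcplen = 20;

	/* The IP header as in_cksum() sees it in syn_cache_respond(). */
	memset(pkt, 0, sizeof(pkt));
	pkt[2] = tcplen >> 8;
	pkt[3] = tcplen & 0xff;		/* ip_len = TCP length, briefly */
	pkt[9] = 6;			/* ip_p = IPPROTO_TCP */
	memcpy(pkt + 12, src, 4);
	memcpy(pkt + 16, dst, 4);

	/* The explicit RFC 793 pseudo-header for the same segment. */
	memset(ph, 0, sizeof(ph));
	memcpy(ph + 0, src, 4);
	memcpy(ph + 4, dst, 4);
	ph[9] = 6;			/* zero byte, then protocol */
	ph[10] = tcplen >> 8;
	ph[11] = tcplen & 0xff;

	/* Both print the same value; the zero-filled words add nothing. */
	printf("whole packet:  %04x\n", cksum(pkt, sizeof(pkt)));
	printf("pseudo-header: %04x\n", cksum(ph, sizeof(ph)));
	return (0);
}
#endif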