/*	$OpenBSD: tcp_input.c,v 1.338 2017/02/09 15:19:32 jca Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct	tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC 3168.
 * Implementation note:
 *   snd_last is used to track a recovery phase.
 *   When cwnd is reduced, snd_last is set to snd_max.
 *   While snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif
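/*
 * Illustrative sketch of the snd_last convention above (hypothetical,
 * not part of this file's logic; the real ECE handling lives in the
 * ACK-processing switch of tcp_input() below):
 *
 *	if (ece_received && SEQ_GEQ(tp->snd_una, tp->snd_last)) {
 *		tp->snd_cwnd /= 2;		-- cut cwnd once
 *		tp->snd_last = tp->snd_max;	-- enter recovery
 *	}
 *
 * Further ECEs are then ignored until snd_una passes snd_last.
 */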
/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
	if_put(ifp); \
} while (0)

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *);
void	 syn_cache_timer(void *);
void	 syn_cache_reaper(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *);
struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
		struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}
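/*
 * Worked example of the two functions above (illustrative only).
 * Suppose rcv_nxt == 1000 and segments arrive as 1000:1100,
 * 1200:1300, 1100:1200:
 *   - 1000:1100 matches rcv_nxt and the queue is empty, so
 *     tcp_flush_queue() advances rcv_nxt to 1100 and appends the
 *     data to the socket buffer.
 *   - 1200:1300 is out of order; tcp_reass() links it into t_segq
 *     and the caller ACKs immediately (fast retransmit hint).
 *   - 1100:1200 fills the hole; after insertion th_seq == rcv_nxt,
 *     so tcp_flush_queue() drains both entries and rcv_nxt becomes
 *     1300.
 * Overlap on either side is trimmed with m_adj() before insertion.
 */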
/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
	int af;
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
	 * TCP/IPv4.
	 */
	switch (mtod(m, struct ip *)->ip_v) {
#ifdef INET6
	case 6:
		af = AF_INET6;
		break;
#endif
	case 4:
		af = AF_INET;
		break;
	default:
		m_freem(m);
		return IPPROTO_DONE;
	}

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	switch (af) {
	case AF_INET:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip)) {
			m_freem(m);
			return IPPROTO_DONE;
		}
#endif /* DIAGNOSTIC */
		break;
#ifdef INET6
	case AF_INET6:
#ifdef DIAGNOSTIC
		if (iphlen < sizeof(struct ip6_hdr)) {
			m_freem(m);
			return IPPROTO_DONE;
		}
#endif /* DIAGNOSTIC */
		break;
#endif
	default:
		m_freem(m);
		return IPPROTO_DONE;
	}

	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Locate pcb for segment.
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		int	inpl_reverse = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_reverse = 1;
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == NULL) {
			tcpstat_inc(tcps_noport);
			goto dropwithreset_ratelim;
		}
	}
	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
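	/*
	 * Worked example (illustrative): with snd_scale == 7 a raw
	 * th_win of 512 advertises 512 << 7 == 65536 bytes.  The shift
	 * is skipped on SYN segments because RFC 1323 window scaling
	 * only applies once both sides have agreed to it; a SYN always
	 * carries an unscaled window.
	 */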
	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated address is forbidden, we do
				 * not accept SYN to deprecated interface
				 * address to prevent any new inbound
				 * connection from getting established.
				 * When we do not accept SYN, we send a TCP
				 * RST, with deprecated source address (instead
				 * of dropping it).  We compromise it as it is
				 * much better for peer to send a RST, and
				 * RST will be the final packet for the
				 * exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says like
				 * this:
				 * 1. use of deprecated addr with existing
				 *    communication is okay - "SHOULD continue
				 *    to be used"
				 * 2. use of it with new communication:
				 *   (2a) "SHOULD NOT be used if alternate
				 *        address with sufficient scope is
				 *        available"
				 *   (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat_inc(tcps_rcvnosec);
		goto drop;
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th);	/* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}
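	/*
	 * Worked example for the sanity test above (illustrative): a
	 * forged or corrupted echo with ts_ecr ahead of tcp_now makes
	 * rtt_test negative, and a stale echo older than TCP_RTT_MAX
	 * makes it implausibly large; either way the echo is zeroed so
	 * it cannot poison the RTT estimator or the PAWS checks below.
	 */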
#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(&so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(&so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;
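	/*
	 * Worked example (illustrative): if sbspace() reports 16384
	 * bytes free but we previously advertised up to rcv_adv ==
	 * rcv_nxt + 32768, rcv_wnd stays at 32768 so the window never
	 * appears to shrink from the peer's point of view, which
	 * RFC 1122 strongly discourages.
	 */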
	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the
	 * timestamp or the sequence numbers are above the previous
	 * ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEG_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			goto dropafterack;
		}
	}
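	/*
	 * Worked example for the PAWS check above (illustrative):
	 * TCP_PAWS_IDLE is 24 days of PR_SLOWHZ ticks (24 days * 24h *
	 * 60m * 60s * PR_SLOWHZ).  A segment whose ts_val is below
	 * ts_recent is normally an old duplicate and is dropped with an
	 * ACK, but after 24 idle days the peer's timestamp clock may
	 * simply have wrapped, so ts_recent is invalidated instead of
	 * dropping the segment.
	 */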
	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * Cf fix from Braden, see Stevens p. 870
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
		    ((tiflags & (TH_SYN|TH_FIN)) != 0)))
			tp->ts_recent = opti.ts_val;
		else
			tp->ts_recent = 0;
		tp->ts_recent_age = tcp_now;
	}

	/*
	 * If the RST bit is set examine the state:
	 *	SYN_RECEIVED STATE:
	 *	    If passive open, return to LISTEN state.
	 *	    If active open, inform user that connection was refused.
	 *	ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	    Inform user that connection was reset, and close tcb.
	 *	CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	    Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}
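	/*
	 * Note on the RST acceptance test above (illustrative): only
	 * sequence numbers at the exact resynchronization points
	 * (last_ack_sent, rcv_nxt, rcv_nxt + 1) are honored, rather
	 * than anything inside the receive window.  This narrows the
	 * target for blind RST injection from a whole window to a few
	 * specific values.
	 */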
	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)){
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#ifdef TCP_SACK
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh+
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						tp->t_flags |= TF_NEEDOUTPUT;
#else
					tp->snd_cwnd += tp->t_maxseg;
					tp->t_flags |= TF_NEEDOUTPUT;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
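		/*
		 * Worked example for the increment above (illustrative):
		 * with t_maxseg == 1460 and cwnd == 14600 (ten segments)
		 * in congestion avoidance, incr = 1460 * 1460 / 14600
		 * == 146 bytes per ACK, i.e. roughly one full segment of
		 * growth per round trip.  Below ssthresh each ACK adds a
		 * whole maxseg, doubling cwnd every round trip.
		 */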
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(&so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
				tp->snd_fack) + tp->retran_data;
		}
#endif

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
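		/*
		 * Worked example for the oobmark arithmetic above
		 * (illustrative): with 100 bytes already queued in
		 * so_rcv, rcv_nxt == 1000 and an urgent pointer placing
		 * rcv_up at 1010, so_oobmark = 100 + (1010 - 1000) - 1
		 * == 109, the offset of the last urgent octet in the
		 * receive buffer as seen by soreceive().
		 */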
		 * This can happen independent of advancing the URG pointer,
		 * but if two URG's are pending at once, some out-of-band
		 * data may creep in... ick.
		 */
		if (th->th_urp <= (u_int16_t) tlen &&
		    (so->so_options & SO_OOBINLINE) == 0)
			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((tlen || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
#ifdef TCP_SACK
		tcp_seq laststart = th->th_seq;
		tcp_seq lastend = th->th_seq + tlen;
#endif
		if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
		    tp->t_state == TCPS_ESTABLISHED) {
			TCP_SETUP_ACK(tp, tiflags, m);
			tp->rcv_nxt += tlen;
			tiflags = th->th_flags & TH_FIN;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				m_adj(m, hdroptlen);
				sbappendstream(&so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		} else {
			m_adj(m, hdroptlen);
			tiflags = tcp_reass(tp, th, m, &tlen);
			tp->t_flags |= TF_ACKNOW;
		}
#ifdef TCP_SACK
		if (tp->sack_enable)
			tcp_update_sack_list(tp, laststart, lastend);
#endif

		/*
		 * variable len never referenced again in modern BSD,
		 * so why bother computing it ??
		 */
#if 0
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know
	 * that the connection is closing.  Ignore a FIN received before
	 * the connection is fully established.
	 */
	if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
			socantrcvmore(so);
			tp->t_flags |= TF_ACKNOW;
			tp->rcv_nxt++;
		}
		switch (tp->t_state) {

		/*
		 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
		 */
		case TCPS_ESTABLISHED:
			tp->t_state = TCPS_CLOSE_WAIT;
			break;

		/*
		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
		 * enter the CLOSING state.
		 */
		case TCPS_FIN_WAIT_1:
			tp->t_state = TCPS_CLOSING;
			break;

		/*
		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
		 * starting the time-wait timer, turning off the other
		 * standard timers.
		 */
		case TCPS_FIN_WAIT_2:
			tp->t_state = TCPS_TIME_WAIT;
			tcp_canceltimers(tp);
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			soisdisconnected(so);
			break;

		/*
		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			break;
		}
	}
	if (so->so_options & SO_DEBUG) {
		switch (tp->pf) {
#ifdef INET6
		case PF_INET6:
			tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6,
			    0, tlen);
			break;
#endif /* INET6 */
		case PF_INET:
			tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti,
			    0, tlen);
			break;
		}
	}

	/*
	 * Return any desired output.
	 */
	if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
		(void) tcp_output(tp);
	return IPPROTO_DONE;

badsyn:
	/*
	 * Received a bad SYN.  Increment counters and dropwithreset.
	 */
	tcpstat_inc(tcps_badsyn);
	tp = NULL;
	goto dropwithreset;

dropafterack_ratelim:
	if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
	    tcp_ackdrop_ppslim) == 0) {
		/* XXX stat */
		goto drop;
	}
	/* ...fall into dropafterack... */

dropafterack:
	/*
	 * Generate an ACK dropping incoming segment if it occupies
	 * sequence space, where the ACK reflects our state.
	 */
	if (tiflags & TH_RST)
		goto drop;
	m_freem(m);
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	return IPPROTO_DONE;

dropwithreset_ratelim:
	/*
	 * We may want to rate-limit RSTs in certain situations,
	 * particularly if we are sending an RST in response to
	 * an attempt to connect to or otherwise communicate with
	 * a port for which we have no socket.
	 */
	if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
	    tcp_rst_ppslim) == 0) {
		/* XXX stat */
		goto drop;
	}
	/* ...fall into dropwithreset... */

dropwithreset:
	/*
	 * Generate a RST, dropping incoming segment.
	 * Make ACK acceptable to originator of segment.
	 * Don't bother to respond to RST.
	 */
	if (tiflags & TH_RST)
		goto drop;
	if (tiflags & TH_ACK) {
		tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack,
		    TH_RST, m->m_pkthdr.ph_rtableid);
	} else {
		if (tiflags & TH_SYN)
			tlen++;
		tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen,
		    (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid);
	}
	m_freem(m);
	return IPPROTO_DONE;

drop:
	/*
	 * Drop space held by incoming segment and return.
	 */
	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
		switch (tp->pf) {
#ifdef INET6
		case PF_INET6:
			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6,
			    0, tlen);
			break;
#endif /* INET6 */
		case PF_INET:
			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti,
			    0, tlen);
			break;
		}
	}

	m_freem(m);
	return IPPROTO_DONE;
}

int
tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
    struct mbuf *m, int iphlen, struct tcp_opt_info *oi,
    u_int rtableid)
{
	u_int16_t mss = 0;
	int opt, optlen;
#ifdef TCP_SIGNATURE
	caddr_t sigp = NULL;
	struct tdb *tdb = NULL;
#endif /* TCP_SIGNATURE */

	for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			memcpy(&mss, cp + 2, sizeof(mss));
			mss = ntohs(mss);
			oi->maxseg = mss;
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			oi->ts_present = 1;
			memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val));
			oi->ts_val = ntohl(oi->ts_val);
			memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr));
			oi->ts_ecr = ntohl(oi->ts_ecr);

			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
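			 * (Illustrative example: a SYN carrying TSval 100
			 * sets ts_recent to 100 below; we echo that value
			 * in the TSecr of our replies, and PAWS compares
			 * later TSval values against ts_recent.)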
			 */
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = oi->ts_val;
			tp->ts_recent_age = tcp_now;
			break;

#ifdef TCP_SACK
		case TCPOPT_SACK_PERMITTED:
			if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/* MUST only be set on SYN */
			tp->t_flags |= TF_SACK_PERMIT;
			break;
		case TCPOPT_SACK:
			tcp_sack_option(tp, th, cp, optlen);
			break;
#endif
#ifdef TCP_SIGNATURE
		case TCPOPT_SIGNATURE:
			if (optlen != TCPOLEN_SIGNATURE)
				continue;

			if (sigp && timingsafe_bcmp(sigp, cp + 2, 16))
				return (-1);

			sigp = cp + 2;
			break;
#endif /* TCP_SIGNATURE */
		}
	}

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		union sockaddr_union src, dst;

		memset(&src, 0, sizeof(union sockaddr_union));
		memset(&dst, 0, sizeof(union sockaddr_union));

		switch (tp->pf) {
		case 0:
		case AF_INET:
			src.sa.sa_len = sizeof(struct sockaddr_in);
			src.sa.sa_family = AF_INET;
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sa.sa_len = sizeof(struct sockaddr_in);
			dst.sa.sa_family = AF_INET;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sa.sa_len = sizeof(struct sockaddr_in6);
			src.sa.sa_family = AF_INET6;
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sa.sa_len = sizeof(struct sockaddr_in6);
			dst.sa.sa_family = AF_INET6;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(rtableid),
		    0, &src, &dst, IPPROTO_TCP);

		/*
		 * We don't have an SA for this peer, so we turn off
		 * TF_SIGNATURE on the listen socket
		 */
		if (tdb == NULL && tp->t_state == TCPS_LISTEN)
			tp->t_flags &= ~TF_SIGNATURE;

	}

	if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
		tcpstat_inc(tcps_rcvbadsig);
		return (-1);
	}

	if (sigp) {
		char sig[16];

		if (tdb == NULL) {
			tcpstat_inc(tcps_rcvbadsig);
			return (-1);
		}

		if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0)
			return (-1);

		if (timingsafe_bcmp(sig, sigp, 16)) {
			tcpstat_inc(tcps_rcvbadsig);
			return (-1);
		}

		tcpstat_inc(tcps_rcvgoodsig);
	}
#endif /* TCP_SIGNATURE */

	return (0);
}

#if defined(TCP_SACK)
u_long
tcp_seq_subtract(u_long a, u_long b)
{
	return ((long)(a - b));
}
#endif


#ifdef TCP_SACK
/*
 * This function is called upon receipt of new valid data (while not in header
 * prediction mode), and it updates the ordered list of sacks.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart,
    tcp_seq rcv_lastend)
{
	/*
	 * First reported block MUST be the most recent one.  Subsequent
	 * blocks SHOULD be in the order in which they arrived at the
	 * receiver.  These two conditions make the implementation fully
	 * compliant with RFC 2018.
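	 *
	 * Worked example (illustrative): if out-of-order data first
	 * creates block A and later block B, the next report lists B
	 * before A; should new data then extend A, the merged A moves
	 * back to the front as the most recently changed block.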
	 */
	int i, j = 0, count = 0, lastpos = -1;
	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];

	/* First clean up current list of sacks */
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (sack.start == 0 && sack.end == 0) {
			count++; /* count = number of blocks to be discarded */
			continue;
		}
		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			count++;
		} else {
			temp[j].start = tp->sackblks[i].start;
			temp[j++].end = tp->sackblks[i].end;
		}
	}
	tp->rcv_numsacks -= count;
	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
		tcp_clean_sackreport(tp);
		if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) {
			/* ==> need first sack block */
			tp->sackblks[0].start = rcv_laststart;
			tp->sackblks[0].end = rcv_lastend;
			tp->rcv_numsacks = 1;
		}
		return;
	}
	/* Otherwise, sack blocks are already present. */
	for (i = 0; i < tp->rcv_numsacks; i++)
		tp->sackblks[i] = temp[i]; /* first copy back sack list */
	if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend))
		return; /* sack list remains unchanged */
	/*
	 * From here, segment just received should be (part of) the 1st sack.
	 * Go through list, possibly coalescing sack block entries.
	 */
	firstsack.start = rcv_laststart;
	firstsack.end = rcv_lastend;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (SEQ_LT(sack.end, firstsack.start) ||
		    SEQ_GT(sack.start, firstsack.end))
			continue; /* no overlap */
		if (sack.start == firstsack.start && sack.end == firstsack.end){
			/*
			 * identical block; delete it here since we will
			 * move it to the front of the list.
			 */
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			lastpos = i; /* last posn with a zero entry */
			continue;
		}
		if (SEQ_LEQ(sack.start, firstsack.start))
			firstsack.start = sack.start; /* merge blocks */
		if (SEQ_GEQ(sack.end, firstsack.end))
			firstsack.end = sack.end; /* merge blocks */
		tp->sackblks[i].start = tp->sackblks[i].end = 0;
		lastpos = i; /* last posn with a zero entry */
	}
	if (lastpos != -1) { /* at least one merge */
		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
			sack = tp->sackblks[i];
			if (sack.start == 0 && sack.end == 0)
				continue;
			temp[j++] = sack;
		}
		tp->rcv_numsacks = j; /* including first blk (added later) */
		for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
			tp->sackblks[i] = temp[i];
	} else { /* no merges -- shift sacks by 1 */
		if (tp->rcv_numsacks < MAX_SACK_BLKS)
			tp->rcv_numsacks++;
		for (i = tp->rcv_numsacks-1; i > 0; i--)
			tp->sackblks[i] = tp->sackblks[i-1];
	}
	tp->sackblks[0] = firstsack;
	return;
}

/*
 * Process the TCP SACK option.  tp->snd_holes is an ordered list
 * of holes (oldest to newest, in terms of the sequence space).
 */
void
tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
{
	int tmp_olen;
	u_char *tmp_cp;
	struct sackhole *cur, *p, *temp;

	if (!tp->sack_enable)
		return;
	/* SACK without ACK doesn't make sense. */
	if ((th->th_flags & TH_ACK) == 0)
		return;
	/* Make sure the ACK on this segment is in [snd_una, snd_max]. */
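	/*
	 * (An ACK outside that range is stale or bogus, so any SACK
	 * blocks riding on it cannot be trusted either.)
	 */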
	if (SEQ_LT(th->th_ack, tp->snd_una) ||
	    SEQ_GT(th->th_ack, tp->snd_max))
		return;
	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
		return;
	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
	tmp_cp = cp + 2;
	tmp_olen = optlen - 2;
	tcpstat_inc(tcps_sack_rcv_opts);
	if (tp->snd_numholes < 0)
		tp->snd_numholes = 0;
	if (tp->t_maxseg == 0)
		panic("tcp_sack_option"); /* Should never happen */
	while (tmp_olen > 0) {
		struct sackblk sack;

		memcpy(&sack.start, tmp_cp, sizeof(tcp_seq));
		sack.start = ntohl(sack.start);
		memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq));
		sack.end = ntohl(sack.end);
		tmp_olen -= TCPOLEN_SACK;
		tmp_cp += TCPOLEN_SACK;
		if (SEQ_LEQ(sack.end, sack.start))
			continue; /* bad SACK fields */
		if (SEQ_LEQ(sack.end, tp->snd_una))
			continue; /* old block */
#if defined(TCP_SACK) && defined(TCP_FACK)
		/* Updates snd_fack. */
		if (SEQ_GT(sack.end, tp->snd_fack))
			tp->snd_fack = sack.end;
#endif /* TCP_FACK */
		if (SEQ_GT(th->th_ack, tp->snd_una)) {
			if (SEQ_LT(sack.start, th->th_ack))
				continue;
		}
		if (SEQ_GT(sack.end, tp->snd_max))
			continue;
		if (tp->snd_holes == NULL) { /* first hole */
			tp->snd_holes = (struct sackhole *)
			    pool_get(&sackhl_pool, PR_NOWAIT);
			if (tp->snd_holes == NULL) {
				/* ENOBUFS, so ignore SACKed block for now */
				goto done;
			}
			cur = tp->snd_holes;
			cur->start = th->th_ack;
			cur->end = sack.start;
			cur->rxmit = cur->start;
			cur->next = NULL;
			tp->snd_numholes = 1;
			tp->rcv_lastsack = sack.end;
			/*
			 * dups is at least one.  If more data has been
			 * SACKed, it can be greater than one.
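			 * (Worked example, illustrative: with t_maxseg =
			 * 1460 and a SACK block covering 4380 bytes past
			 * the hole, min(3, 4380 / 1460) = 3 duplicates
			 * are credited with the default tcprexmtthresh.)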
			 */
			cur->dups = min(tcprexmtthresh,
			    ((sack.end - cur->end)/tp->t_maxseg));
			if (cur->dups < 1)
				cur->dups = 1;
			continue; /* with next sack block */
		}
		/* Go thru list of holes:  p = previous,  cur = current */
		p = cur = tp->snd_holes;
		while (cur) {
			if (SEQ_LEQ(sack.end, cur->start))
				/* SACKs data before the current hole */
				break; /* no use going through more holes */
			if (SEQ_GEQ(sack.start, cur->end)) {
				/* SACKs data beyond the current hole */
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LEQ(sack.start, cur->start)) {
				/* Data acks at least the beginning of hole */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(sack.end, cur->rxmit))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    cur->start);
				else
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    cur->start);
#endif /* TCP_FACK */
				if (SEQ_GEQ(sack.end, cur->end)) {
					/* Acks entire hole, so delete hole */
					if (p != cur) {
						p->next = cur->next;
						pool_put(&sackhl_pool, cur);
						cur = p->next;
					} else {
						cur = cur->next;
						pool_put(&sackhl_pool, p);
						p = cur;
						tp->snd_holes = p;
					}
					tp->snd_numholes--;
					continue;
				}
				/* otherwise, move start of hole forward */
				cur->start = sack.end;
				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
				p = cur;
				cur = cur->next;
				continue;
			}
			/* move end of hole backward */
			if (SEQ_GEQ(sack.end, cur->end)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				cur->end = sack.start;
				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LT(cur->start, sack.start) &&
			    SEQ_GT(cur->end, sack.end)) {
				/*
				 * ACKs some data in middle of a hole; need to
				 * split current hole
				 */
				temp = (struct sackhole *)
				    pool_get(&sackhl_pool, PR_NOWAIT);
				if (temp == NULL)
					goto done; /* ENOBUFS */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.end))
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    sack.start);
				else if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				temp->next = cur->next;
				temp->start = sack.end;
				temp->end = cur->end;
				temp->dups = cur->dups;
				temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
				cur->end = sack.start;
				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				cur->next = temp;
				p = temp;
				cur = p->next;
				tp->snd_numholes++;
			}
		}
		/* At this point, p points to the last hole on the list */
		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
			/*
			 * Need to append new hole at end.
			 * Last hole is p (and it's not NULL).
			 */
			temp = (struct sackhole *)
			    pool_get(&sackhl_pool, PR_NOWAIT);
			if (temp == NULL)
				goto done; /* ENOBUFS */
			temp->start = tp->rcv_lastsack;
			temp->end = sack.start;
			temp->dups = min(tcprexmtthresh,
			    ((sack.end - sack.start)/tp->t_maxseg));
			if (temp->dups < 1)
				temp->dups = 1;
			temp->rxmit = temp->start;
			temp->next = 0;
			p->next = temp;
			tp->rcv_lastsack = sack.end;
			tp->snd_numholes++;
		}
	}
done:
#if defined(TCP_SACK) && defined(TCP_FACK)
	/*
	 * Update retran_data and snd_awnd.  Go through the list of
	 * holes.  Increment retran_data by (hole->rxmit - hole->start).
	 */
	tp->retran_data = 0;
	cur = tp->snd_holes;
	while (cur) {
		tp->retran_data += cur->rxmit - cur->start;
		cur = cur->next;
	}
	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
	    tp->retran_data;
#endif /* TCP_FACK */

	return;
}

/*
 * Delete stale (i.e., cumulatively ack'd) holes.  Hole is deleted only if
 * it is completely acked; otherwise, tcp_sack_option(), called from
 * tcp_dooptions(), will fix up the hole.
 */
void
tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
{
	if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
		/* max because this could be an older ack just arrived */
		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
		    th->th_ack : tp->snd_una;
		struct sackhole *cur = tp->snd_holes;
		struct sackhole *prev;
		while (cur)
			if (SEQ_LEQ(cur->end, lastack)) {
				prev = cur;
				cur = cur->next;
				pool_put(&sackhl_pool, prev);
				tp->snd_numholes--;
			} else if (SEQ_LT(cur->start, lastack)) {
				cur->start = lastack;
				if (SEQ_LT(cur->rxmit, cur->start))
					cur->rxmit = cur->start;
				break;
			} else
				break;
		tp->snd_holes = cur;
	}
}

/*
 * Delete all receiver-side SACK information.
 */
void
tcp_clean_sackreport(struct tcpcb *tp)
{
	int i;

	tp->rcv_numsacks = 0;
	for (i = 0; i < MAX_SACK_BLKS; i++)
		tp->sackblks[i].start = tp->sackblks[i].end = 0;

}

/*
 * Checks for partial ack.  If partial ack arrives, turn off retransmission
 * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
 * If the ack advances at least to tp->snd_last, return 0.
 */
int
tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
{
	if (SEQ_LT(th->th_ack, tp->snd_last)) {
		/* Turn off retx. timer (will start again next segment) */
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
#ifndef TCP_FACK
		/*
		 * Partial window deflation.  This statement relies on the
		 * fact that tp->snd_una has not been updated yet.  In FACK
		 * hold snd_cwnd constant during fast recovery.
		 */
		if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
			tp->snd_cwnd -= th->th_ack - tp->snd_una;
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->snd_cwnd = tp->t_maxseg;
#endif
		return (1);
	}
	return (0);
}
#endif /* TCP_SACK */

/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
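 *
 * (Worked example, illustrative: with `off' covering the dropped
 * headers and th_urp = 1, the byte removed below is the first data
 * octet; it is parked in t_iobc and the remaining mbuf data is
 * shifted down to close the gap.)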
 */
void
tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off)
{
	int cnt = off + urgent - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			memmove(cp, cp + 1, m->m_len - cnt - 1);
			m->m_len--;
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == NULL)
			break;
	}
	panic("tcp_pulloutofband");
}

/*
 * Collect new round-trip time estimate
 * and update averages and current timeout.
 */
void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
	short delta;
	short rttmin;

	if (rtt < 0)
		rtt = 0;
	else if (rtt > TCP_RTT_MAX)
		rtt = TCP_RTT_MAX;

	tcpstat_inc(tcps_rttupdated);
	if (tp->t_srtt != 0) {
		/*
		 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits
		 * after the binary point (scaled by 4), whereas
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 32).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).
		 */
		delta = (rtt << TCP_RTT_BASE_SHIFT) -
		    (tp->t_srtt >> TCP_RTT_SHIFT);
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT;
		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
		tp->t_rttvar = (rtt + 1) <<
		    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1);
	}
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX);
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
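	 * (For instance, an ICMP unreachable noted by tcp_notify()
	 * while a route was flapping would otherwise linger in
	 * t_softerror.)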
	 */
	tp->t_softerror = 0;
}

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing
 * interface without forcing IP to fragment; if bigger than
 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 * to utilize large mbufs.  If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * Also take into account the space needed for options that we
 * send regularly.  Make maxseg shorter by that amount to assure
 * that we can send maxseg amount of data even when the options
 * are present.  Store the upper limit of the length of options plus
 * data in maxopd.
 *
 * NOTE: offer == -1 indicates that the maxseg size changed due to
 * Path MTU discovery.
 */
int
tcp_mss(struct tcpcb *tp, int offer)
{
	struct rtentry *rt;
	struct ifnet *ifp = NULL;
	int mss, mssopt;
	int iphlen;
	struct inpcb *inp;

	inp = tp->t_inpcb;

	mssopt = mss = tcp_mssdflt;

	rt = in_pcbrtentry(inp);

	if (rt == NULL)
		goto out;

	ifp = if_get(rt->rt_ifidx);
	if (ifp == NULL)
		goto out;

	switch (tp->pf) {
#ifdef INET6
	case AF_INET6:
		iphlen = sizeof(struct ip6_hdr);
		break;
#endif
	case AF_INET:
		iphlen = sizeof(struct ip);
		break;
	default:
		/* the family does not support path MTU discovery */
		goto out;
	}

	/*
	 * if there's an mtu associated with the route and we support
	 * path MTU discovery for the underlying protocol family, use it.
	 */
	if (rt->rt_rmx.rmx_mtu) {
		/*
		 * One may wish to lower MSS to take into account options,
		 * especially security-related options.
		 */
		if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
			/*
			 * RFC2460 section 5, last paragraph: if path MTU is
			 * smaller than 1280, use 1280 as packet size and
			 * attach fragment header.
			 */
			mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
			    sizeof(struct tcphdr);
		} else {
			mss = rt->rt_rmx.rmx_mtu - iphlen -
			    sizeof(struct tcphdr);
		}
	} else if (ifp->if_flags & IFF_LOOPBACK) {
		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	} else if (tp->pf == AF_INET) {
		if (ip_mtudisc)
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	}
#ifdef INET6
	else if (tp->pf == AF_INET6) {
		/*
		 * for IPv6, path MTU discovery is always turned on,
		 * or the node must use packet size <= 1280.
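		 * (Example, illustrative: a 1500-byte Ethernet MTU
		 * yields 1500 - 40 - 20 = 1440 here, versus
		 * 1500 - 20 - 20 = 1460 in the IPv4 case above.)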
		 */
		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	}
#endif /* INET6 */

	/* Calculate the value that we offer in TCPOPT_MAXSEG */
	if (offer != -1) {
		mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
		mssopt = max(tcp_mssdflt, mssopt);
	}
out:
	if_put(ifp);
	/*
	 * The current mss, t_maxseg, is initialized to the default value.
	 * If we compute a smaller value, reduce the current mss.
	 * If we compute a larger value, return it for use in sending
	 * a max seg size option, but don't store it for use
	 * unless we received an offer at least that large from peer.
	 *
	 * However, do not accept offers lower than the minimum of
	 * the interface MTU and 216.
	 */
	if (offer > 0)
		tp->t_peermss = offer;
	if (tp->t_peermss)
		mss = min(mss, max(tp->t_peermss, 216));

	/* sanity - at least max opt. space */
	mss = max(mss, 64);

	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
		mss -= TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE)
		mss -= TCPOLEN_SIGLEN;
#endif

	if (offer == -1) {
		/* mss changed due to Path MTU discovery */
		tp->t_flags &= ~TF_PMTUD_PEND;
		tp->t_pmtud_mtu_sent = 0;
		tp->t_pmtud_mss_acked = 0;
		if (mss < tp->t_maxseg) {
			/*
			 * Follow suggestion in RFC 2414 to reduce the
			 * congestion window by the ratio of the old
			 * segment size to the new segment size.
			 */
			tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
			    mss, mss);
		}
	} else if (tcp_do_rfc3390 == 2) {
		/* increase initial window */
		tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600));
	} else if (tcp_do_rfc3390) {
		/* increase initial window */
		tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
	} else
		tp->snd_cwnd = mss;

	tp->t_maxseg = mss;

	return (offer != -1 ? mssopt : mss);
}

u_int
tcp_hdrsz(struct tcpcb *tp)
{
	u_int hlen;

	switch (tp->pf) {
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
	default:
		hlen = 0;
		break;
	}
	hlen += sizeof(struct tcphdr);

	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
		hlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE)
		hlen += TCPOLEN_SIGLEN;
#endif
	return (hlen);
}

/*
 * Set connection variables based on the effective MSS.
 * We are passed the TCPCB for the actual connection.  If we
 * are the server, we are called by the compressed state engine
 * when the 3-way handshake is complete.  If we are the client,
 * we are called when we receive the SYN,ACK from the server.
 *
 * NOTE: The t_maxseg value must be initialized in the TCPCB
 * before this routine is called!
 */
void
tcp_mss_update(struct tcpcb *tp)
{
	int mss;
	u_long bufsize;
	struct rtentry *rt;
	struct socket *so;

	so = tp->t_inpcb->inp_socket;
	mss = tp->t_maxseg;

	rt = in_pcbrtentry(tp->t_inpcb);

	if (rt == NULL)
		return;

	bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss) {
		mss = bufsize;
		/* Update t_maxseg and t_maxopd */
		tcp_mss(tp, mss);
	} else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_snd, bufsize);
	}

	bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_rcv, bufsize);
	}

}

#if defined (TCP_SACK)
/*
 * Checks for partial ack.  If partial ack arrives, force the retransmission
 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
 * 1.  By setting snd_nxt to th_ack, this forces retransmission timer to
 * be started again.  If the ack advances at least to tp->snd_last, return 0.
 */
int
tcp_newreno(struct tcpcb *tp, struct tcphdr *th)
{
	if (SEQ_LT(th->th_ack, tp->snd_last)) {
		/*
		 * snd_una has not been updated and the socket send buffer
		 * not yet drained of the acked data, so we have to leave
		 * snd_una as it was to get the correct data offset in
		 * tcp_output().
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long ocwnd = tp->snd_cwnd;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
		tp->snd_nxt = th->th_ack;
		/*
		 * Set snd_cwnd to one segment beyond acknowledged offset
		 * (tp->snd_una not yet updated when this function is called)
		 */
		tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
		(void) tcp_output(tp);
		tp->snd_cwnd = ocwnd;
		if (SEQ_GT(onxt, tp->snd_nxt))
			tp->snd_nxt = onxt;
		/*
		 * Partial window deflation.  Relies on fact that tp->snd_una
		 * not updated yet.
		 */
		if (tp->snd_cwnd > th->th_ack - tp->snd_una)
			tp->snd_cwnd -= th->th_ack - tp->snd_una;
		else
			tp->snd_cwnd = 0;
		tp->snd_cwnd += tp->t_maxseg;

		return 1;
	}
	return 0;
}
#endif /* TCP_SACK */

int
tcp_mss_adv(struct mbuf *m, int af)
{
	int mss = 0;
	int iphlen;
	struct ifnet *ifp = NULL;

	if (m && (m->m_flags & M_PKTHDR))
		ifp = if_get(m->m_pkthdr.ph_ifidx);

	switch (af) {
	case AF_INET:
		if (ifp != NULL)
			mss = ifp->if_mtu;
		iphlen = sizeof(struct ip);
		break;
#ifdef INET6
	case AF_INET6:
		if (ifp != NULL)
			mss = ifp->if_mtu;
		iphlen = sizeof(struct ip6_hdr);
		break;
#endif
	default:
		unhandled_af(af);
	}
	if_put(ifp);
	mss = mss - iphlen - sizeof(struct tcphdr);
	return (max(mss, tcp_mssdflt));
}

/*
 * TCP compressed state engine.  Currently used to hold compressed
 * state for SYN_RECEIVED.
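 *
 * (Two cache sets are kept, see tcp_syn_cache[2] below: once the
 * active set drains, its hash secrets are regenerated, which limits
 * how long an attacker gets to probe any one hash function.)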
 */

/* syn hash parameters */
int	tcp_syn_hash_size = TCP_SYN_HASH_SIZE;
int	tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
int	tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
int	tcp_syn_use_limit = 100000;

struct syn_cache_set tcp_syn_cache[2];
int tcp_syn_cache_active;

#define SYN_HASH(sa, sp, dp, rand) \
	(((sa)->s_addr ^ (rand)[0]) *				\
	(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4]))
#ifndef INET6
#define	SYN_HASHALL(hash, src, dst, rand) \
do {									\
	hash = SYN_HASH(&satosin(src)->sin_addr,			\
		satosin(src)->sin_port,					\
		satosin(dst)->sin_port, (rand));			\
} while (/*CONSTCOND*/ 0)
#else
#define SYN_HASH6(sa, sp, dp, rand) \
	(((sa)->s6_addr32[0] ^ (rand)[0]) *			\
	((sa)->s6_addr32[1] ^ (rand)[1]) *			\
	((sa)->s6_addr32[2] ^ (rand)[2]) *			\
	((sa)->s6_addr32[3] ^ (rand)[3]) *			\
	(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4]))

#define SYN_HASHALL(hash, src, dst, rand) \
do {									\
	switch ((src)->sa_family) {					\
	case AF_INET:							\
		hash = SYN_HASH(&satosin(src)->sin_addr,		\
			satosin(src)->sin_port,				\
			satosin(dst)->sin_port, (rand));		\
		break;							\
	case AF_INET6:							\
		hash = SYN_HASH6(&satosin6(src)->sin6_addr,		\
			satosin6(src)->sin6_port,			\
			satosin6(dst)->sin6_port, (rand));		\
		break;							\
	default:							\
		hash = 0;						\
	}								\
} while (/*CONSTCOND*/0)
#endif /* INET6 */

void
syn_cache_rm(struct syn_cache *sc)
{
	sc->sc_flags |= SCF_DEAD;
	TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
	sc->sc_tp = NULL;
	LIST_REMOVE(sc, sc_tpq);
	sc->sc_buckethead->sch_length--;
	timeout_del(&sc->sc_timer);
	sc->sc_set->scs_count--;
}

void
syn_cache_put(struct syn_cache *sc)
{
	m_free(sc->sc_ipopts);
	if (sc->sc_route4.ro_rt != NULL) {
		rtfree(sc->sc_route4.ro_rt);
		sc->sc_route4.ro_rt = NULL;
	}
	timeout_set(&sc->sc_timer, syn_cache_reaper, sc);
	timeout_add(&sc->sc_timer, 0);
}

struct pool syn_cache_pool;

/*
 * We don't estimate RTT with SYNs, so each packet starts with the default
 * RTT and each timer step has a fixed timeout value.
 */
#define	SYN_CACHE_TIMER_ARM(sc)						\
do {									\
	TCPT_RANGESET((sc)->sc_rxtcur,					\
	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN,	\
	    TCPTV_REXMTMAX);						\
	if (!timeout_initialized(&(sc)->sc_timer))			\
		timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \
	timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
} while (/*CONSTCOND*/0)

#define	SYN_CACHE_TIMESTAMP(sc)	tcp_now + (sc)->sc_modulate

void
syn_cache_init(void)
{
	int i;

	/* Initialize the hash buckets. */
	tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size,
	    sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO);
	tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size,
	    sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO);
	tcp_syn_cache[0].scs_size = tcp_syn_hash_size;
	tcp_syn_cache[1].scs_size = tcp_syn_hash_size;
	for (i = 0; i < tcp_syn_hash_size; i++) {
		TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket);
		TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket);
	}

	/* Initialize the syn cache pool. */
	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET,
	    0, "syncache", NULL);
}

void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
	struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active];
	struct syn_cache_head *scp;
	struct syn_cache *sc2;
	int i;

	NET_ASSERT_LOCKED();

	/*
	 * If there are no entries in the hash table, reinitialize
	 * the hash secrets.  To avoid useless cache swaps and
	 * reinitialization, use it until the limit is reached.
	 * An empty cache is also the opportunity to resize the hash.
	 */
	if (set->scs_count == 0 && set->scs_use <= 0) {
		set->scs_use = tcp_syn_use_limit;
		if (set->scs_size != tcp_syn_hash_size) {
			scp = mallocarray(tcp_syn_hash_size, sizeof(struct
			    syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO);
			if (scp == NULL) {
				/* Try again next time. */
				set->scs_use = 0;
			} else {
				free(set->scs_buckethead, M_SYNCACHE,
				    set->scs_size *
				    sizeof(struct syn_cache_head));
				set->scs_buckethead = scp;
				set->scs_size = tcp_syn_hash_size;
				for (i = 0; i < tcp_syn_hash_size; i++)
					TAILQ_INIT(&scp[i].sch_bucket);
			}
		}
		arc4random_buf(set->scs_random, sizeof(set->scs_random));
		tcpstat_inc(tcps_sc_seedrandom);
	}

	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa,
	    set->scs_random);
	scp = &set->scs_buckethead[sc->sc_hash % set->scs_size];
	sc->sc_buckethead = scp;

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	if (scp->sch_length >= tcp_syn_bucket_limit) {
		tcpstat_inc(tcps_sc_bucketoverflow);
		/*
		 * Someone might attack our bucket hash function.  Reseed
		 * with random as soon as the passive syn cache gets empty.
		 */
		set->scs_use = 0;
		/*
		 * The bucket is full.  Toss the oldest element in the
		 * bucket.  This will be the first entry in the bucket.
		 */
		sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
		/*
		 * This should never happen; we should always find an
		 * entry in our bucket.
		 */
		if (sc2 == NULL)
			panic("%s: bucketoverflow: impossible", __func__);
#endif
		syn_cache_rm(sc2);
		syn_cache_put(sc2);
	} else if (set->scs_count >= tcp_syn_cache_limit) {
		struct syn_cache_head *scp2, *sce;

		tcpstat_inc(tcps_sc_overflowed);
		/*
		 * The cache is full.  Toss the oldest entry in the
		 * first non-empty bucket we can find.
		 *
		 * XXX We would really like to toss the oldest
		 * entry in the cache, but we hope that this
		 * condition doesn't happen very often.
		 */
		scp2 = scp;
		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
			sce = &set->scs_buckethead[set->scs_size];
			for (++scp2; scp2 != scp; scp2++) {
				if (scp2 >= sce)
					scp2 = &set->scs_buckethead[0];
				if (! TAILQ_EMPTY(&scp2->sch_bucket))
					break;
			}
#ifdef DIAGNOSTIC
			/*
			 * This should never happen; we should always find a
			 * non-empty bucket.
			 */
			if (scp2 == scp)
				panic("%s: cacheoverflow: impossible",
				    __func__);
#endif
		}
		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
		syn_cache_rm(sc2);
		syn_cache_put(sc2);
	}

	/*
	 * Initialize the entry's timer.
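	 * (No RTT estimate exists for an embryonic connection, so the
	 * timeout starts from the default RTT and backs off through
	 * tcp_backoff[] on each retransmission; see
	 * SYN_CACHE_TIMER_ARM() above.)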
	 */
	sc->sc_rxttot = 0;
	sc->sc_rxtshift = 0;
	SYN_CACHE_TIMER_ARM(sc);

	/* Link it from tcpcb entry */
	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);

	/* Put it into the bucket. */
	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
	scp->sch_length++;
	sc->sc_set = set;
	set->scs_count++;
	set->scs_use--;

	tcpstat_inc(tcps_sc_added);

	/*
	 * If the active cache has exceeded its use limit and
	 * the passive syn cache is empty, exchange their roles.
	 */
	if (set->scs_use <= 0 &&
	    tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0)
		tcp_syn_cache_active = !tcp_syn_cache_active;
}

/*
 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 * If we have retransmitted an entry the maximum number of times, expire
 * that entry.
 */
void
syn_cache_timer(void *arg)
{
	struct syn_cache *sc = arg;
	int s;

	NET_LOCK(s);
	if (sc->sc_flags & SCF_DEAD)
		goto out;

	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
		/* Drop it -- too many retransmissions. */
		goto dropit;
	}

	/*
	 * Compute the total amount of time this entry has
	 * been on a queue.  If this entry has been on longer
	 * than the keep alive timer would allow, expire it.
	 */
	sc->sc_rxttot += sc->sc_rxtcur;
	if (sc->sc_rxttot >= tcptv_keep_init)
		goto dropit;

	tcpstat_inc(tcps_sc_retransmitted);
	(void) syn_cache_respond(sc, NULL);

	/* Advance the timer back-off. */
	sc->sc_rxtshift++;
	SYN_CACHE_TIMER_ARM(sc);

out:
	NET_UNLOCK(s);
	return;

dropit:
	tcpstat_inc(tcps_sc_timed_out);
	syn_cache_rm(sc);
	syn_cache_put(sc);
	NET_UNLOCK(s);
}

void
syn_cache_reaper(void *arg)
{
	struct syn_cache *sc = arg;

	pool_put(&syn_cache_pool, (sc));
	return;
}

/*
 * Remove syn cache created by the specified tcb entry,
 * because it makes no sense to keep them
 * (if there's no tcb entry, syn cache entry will never be used)
 */
void
syn_cache_cleanup(struct tcpcb *tp)
{
	struct syn_cache *sc, *nsc;

	NET_ASSERT_LOCKED();

	LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
#ifdef DIAGNOSTIC
		if (sc->sc_tp != tp)
			panic("invalid sc_tp in syn_cache_cleanup");
#endif
		syn_cache_rm(sc);
		syn_cache_put(sc);
	}
	/* just for safety */
	LIST_INIT(&tp->t_sc);
}

/*
 * Find an entry in the syn cache.
 */
struct syn_cache *
syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
    struct syn_cache_head **headp, u_int rtableid)
{
	struct syn_cache_set *sets[2];
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	u_int32_t hash;
	int i;

	NET_ASSERT_LOCKED();

	/* Check the active cache first, the passive cache is likely empty. */
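	/*
	 * (Both sets must still be searched: entries created before
	 * the last role swap live in the now-passive set until they
	 * complete or expire.)
	 */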
	sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
	sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
	for (i = 0; i < 2; i++) {
		if (sets[i]->scs_count == 0)
			continue;
		SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
		scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
		*headp = scp;
		TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
			if (sc->sc_hash != hash)
				continue;
			if (!bcmp(&sc->sc_src, src, src->sa_len) &&
			    !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
			    rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
				return (sc);
		}
	}
	return (NULL);
}

/*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state.  We look up the connection
 * in the syn cache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 *
 * The return values may not be immediately obvious, and their effects
 * can be subtle, so here they are:
 *
 *	NULL	SYN was not found in cache; caller should drop the
 *		packet and send an RST.
 *
 *	-1	We were unable to create the new connection, and are
 *		aborting it.  An ACK,RST is being sent to the peer
 *		(unless we got screwy sequence numbers; see below),
 *		because the 3-way handshake has been completed.  Caller
 *		should not free the mbuf, since we may be using it.  If
 *		we are not, we will free it.
 *
 *	Otherwise, the return value is a pointer to the new socket
 *	associated with the connection.
 */
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int hlen, u_int tlen, struct socket *so, struct mbuf *m)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp, *oldinp;
	struct tcpcb *tp = NULL;
	struct mbuf *am;
	struct socket *oso;
#if NPF > 0
	struct pf_divert *divert = NULL;
#endif

	NET_ASSERT_LOCKED();

	sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
	if (sc == NULL)
		return (NULL);

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		(void) syn_cache_respond(sc, m);
		return ((struct socket *)(-1));
	}

	/* Remove this cache entry */
	syn_cache_rm(sc);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	oso = so;
	so = sonewconn(so, SS_ISCONNECTED);
	if (so == NULL)
		goto resetandabort;

	oldinp = sotoinpcb(oso);
	inp = sotoinpcb(so);

#ifdef IPSEC
	/*
	 * We need to copy the required security levels
	 * from the old pcb.  Ditto for any other
	 * IPsec-related information.
	 */
	memcpy(inp->inp_seclevel, oldinp->inp_seclevel,
	    sizeof(oldinp->inp_seclevel));
#endif /* IPSEC */
#ifdef INET6
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.
	 */
	inp->inp_flags |= (oldinp->inp_flags & INP_IPV6);
	if (inp->inp_flags & INP_IPV6) {
		inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim;
		inp->inp_hops = oldinp->inp_hops;
	} else
#endif /* INET6 */
	{
		inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl;
	}

#if NPF > 0
	if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED &&
	    (divert = pf_find_divert(m)) != NULL)
		inp->inp_rtableid = divert->rdomain;
	else
#endif
	/* inherit rtable from listening socket */
	inp->inp_rtableid = sc->sc_rtableid;

	inp->inp_lport = th->th_dport;
	switch (src->sa_family) {
#ifdef INET6
	case AF_INET6:
		inp->inp_laddr6 = satosin6(dst)->sin6_addr;
		break;
#endif /* INET6 */
	case AF_INET:
		inp->inp_laddr = satosin(dst)->sin_addr;
		inp->inp_options = ip_srcroute(m);
		if (inp->inp_options == NULL) {
			inp->inp_options = sc->sc_ipopts;
			sc->sc_ipopts = NULL;
		}
		break;
	}
	in_pcbrehash(inp);

	/*
	 * Give the new socket our cached route reference.
	 */
	if (src->sa_family == AF_INET)
		inp->inp_route = sc->sc_route4;		/* struct assignment */
#ifdef INET6
	else
		inp->inp_route6 = sc->sc_route6;
#endif
	sc->sc_route4.ro_rt = NULL;

	am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
	if (am == NULL)
		goto resetandabort;
	am->m_len = src->sa_len;
	memcpy(mtod(am, caddr_t), src, src->sa_len);

	switch (src->sa_family) {
	case AF_INET:
		/* drop IPv4 packet to AF_INET6 socket */
		if (inp->inp_flags & INP_IPV6) {
			(void) m_free(am);
			goto resetandabort;
		}
		if (in_pcbconnect(inp, am)) {
			(void) m_free(am);
			goto resetandabort;
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (in6_pcbconnect(inp, am)) {
			(void) m_free(am);
			goto resetandabort;
		}
		break;
#endif
	}
	(void) m_free(am);

	tp = intotcpcb(inp);
	tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY);
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		so = NULL;
		m_freem(m);
		goto abort;
	}
#ifdef TCP_SACK
	tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
#endif

	tp->ts_modulate = sc->sc_modulate;
	tp->ts_recent = sc->sc_timestamp;
	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
#if defined (TCP_SACK) || defined(TCP_ECN)
	tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
	tp->snd_fack = tp->snd_una;
	tp->retran_data = 0;
	tp->snd_awnd = 0;
#endif /* TCP_FACK */
#ifdef TCP_ECN
	if (sc->sc_flags & SCF_ECN_PERMIT) {
		tp->t_flags |= TF_ECN_PERMIT;
		tcpstat_inc(tcps_ecn_accepts);
	}
#endif
#ifdef TCP_SACK
	if (sc->sc_flags & SCF_SACK_PERMIT)
		tp->t_flags |= TF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE)
		tp->t_flags |= TF_SIGNATURE;
#endif
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	tp->t_rcvtime = tcp_now;
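	/*
	 * The keep timer is loaded with tcptv_keep_init here: an
	 * embryonic connection that never completes the handshake
	 * is timed out and dropped.
	 */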
	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
	tcpstat_inc(tcps_accepts);

	tcp_mss(tp, sc->sc_peermaxseg);	 /* sets t_maxseg */
	if (sc->sc_peermaxseg)
		tcp_mss_update(tp);
	/* Reset initial window to 1 segment for retransmit */
	if (sc->sc_rxtshift > 0)
		tp->snd_cwnd = tp->t_maxseg;
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what would have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;

	tcpstat_inc(tcps_sc_completed);
	syn_cache_put(sc);
	return (so);

resetandabort:
	tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
	    m->m_pkthdr.ph_rtableid);
	m_freem(m);
abort:
	if (so != NULL)
		(void) soabort(so);
	syn_cache_put(sc);
	tcpstat_inc(tcps_sc_aborted);
	return ((struct socket *)(-1));
}

/*
 * This function is called when we get a RST for a
 * non-existent connection, so that we can see if the
 * connection is in the syn cache.  If it is, zap it.
 */

void
syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int rtableid)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;

	NET_ASSERT_LOCKED();

	if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL)
		return;
	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1))
		return;
	syn_cache_rm(sc);
	tcpstat_inc(tcps_sc_reset);
	syn_cache_put(sc);
}

void
syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int rtableid)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;

	NET_ASSERT_LOCKED();

	if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL)
		return;
	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
	if (ntohl(th->th_seq) != sc->sc_iss) {
		return;
	}

	/*
	 * If we've retransmitted 3 times and this is our second error,
	 * we remove the entry.  Otherwise, we allow it to continue on.
	 * This prevents us from incorrectly nuking an entry during a
	 * spurious network outage.
	 *
	 * See tcp_notify().
	 */
	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
		sc->sc_flags |= SCF_UNREACH;
		return;
	}

	syn_cache_rm(sc);
	tcpstat_inc(tcps_sc_unreach);
	syn_cache_put(sc);
}

/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */

int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
    struct tcp_opt_info *oi, tcp_seq *issp)
{
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;

	tp = sototcpcb(so);

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 *
	 * Note this check is performed in tcp_input() very early on.
	 */

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

	bzero(&tb, sizeof(tb));
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE)) {
#else
	if (optp) {
#endif
		tb.pf = tp->pf;
#ifdef TCP_SACK
		tb.sack_enable = tp->sack_enable;
#endif
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		if (tp->t_flags & TF_SIGNATURE)
			tb.t_flags |= TF_SIGNATURE;
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
		    sotoinpcb(so)->inp_rtableid))
			return (-1);
	}

	switch (src->sa_family) {
	case AF_INET:
		/*
		 * Remember the IP options, if any.
		 */
		ipopts = ip_srcroute(m);
		break;
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
	if (sc != NULL) {
		tcpstat_inc(tcps_sc_dupesyn);
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		sc->sc_timestamp = tb.ts_recent;
		if (syn_cache_respond(sc, m) == 0) {
			tcpstat_inc(tcps_sndacks);
			tcpstat_inc(tcps_sndtotal);
		}
		return (0);
	}

	sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
	if (sc == NULL) {
		m_free(ipopts);
		return (-1);
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	memcpy(&sc->sc_src, src, src->sa_len);
	memcpy(&sc->sc_dst, dst, dst->sa_len);
	sc->sc_rtableid = sotoinpcb(so)->inp_rtableid;
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;

	sc->sc_iss = issp ? *issp : arc4random();
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timestamp = tb.ts_recent;
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
		sc->sc_flags |= SCF_TIMESTAMP;
		sc->sc_modulate = arc4random();
	}
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		/*
		 * Pick the smallest possible scaling factor that
		 * will still allow us to scale up to sb_max.
		 *
		 * We do this because there are broken firewalls that
		 * will corrupt the window scale option, leading to
		 * the other endpoint believing that our advertised
		 * window is unscaled.
		 * At scale factors larger than 5 the unscaled window
		 * will drop below 1500 bytes, leading to serious problems
		 * when traversing these broken firewalls.
		 *
		 * With the default sb_max of 256K, a scale factor
		 * of 3 will be chosen by this algorithm.  Those who
		 * choose a larger sb_max should watch out
		 * for the compatibility problems mentioned above.
		 *
		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
		 * or <SYN,ACK>) segment itself is never scaled.
		 */
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
			sc->sc_request_r_scale++;
	} else {
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
#ifdef TCP_ECN
	/*
	 * If both ECE and CWR flag bits are set, the peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat_inc(tcps_sndacks);
		tcpstat_inc(tcps_sndtotal);
	} else {
		syn_cache_put(sc);
		tcpstat_inc(tcps_sc_dropped);
	}

	return (0);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	default:
		m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/*
	 * Fixup the mbuf.
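	 * max_linkhdr bytes are reserved in front so the link layer can
	 * later prepend its header without allocating another mbuf, and
	 * the tlen bytes of IP and TCP headers are zeroed before being
	 * filled in below.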
	 */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		unhandled_af(sc->sc_src.sa.sa_family);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/*
		 * Form timestamp option as shown in appendix A of RFC 1323.
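		 * The 12 option bytes (TCPOLEN_TSTAMP_APPA) are laid out
		 * as <NOP><NOP><kind=8><len=10><TSval><TSecr>: TSval is
		 * our timestamp clock modulated by sc_modulate, TSecr
		 * echoes the peer's timestamp from the SYN.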
		 */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			return (EINVAL);
		}
		optp += 16;	/* the MD5 digest (RFC 2385) is 16 bytes */

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in network byte order.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(inp);

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}