/*	$OpenBSD: tcp_input.c,v 1.347 2017/08/11 21:24:20 mpi Exp $	*/
/*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#if NPF > 0
#include <net/pfvar.h>
#endif

struct tcpiphdr tcp_saveti;

int	tcp_mss_adv(struct mbuf *, int);
int	tcp_flush_queue(struct tcpcb *);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

struct  tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define	SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define	SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
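 *
 *   e.g. if snd_una = 1000 and snd_max = 5000 when the first ECE
 *   arrives, cwnd is halved once and snd_last is set to 5000;
 *   further congestion signals are ignored until snd_una passes
 *   5000, so each window of data is cut at most once.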
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled or when the packet is coming from a loopback
 * interface.
 */
#define	TCP_SETUP_ACK(tp, tiflags, m) \
do { \
	struct ifnet *ifp = NULL; \
	if (m && (m->m_flags & M_PKTHDR)) \
		ifp = if_get(m->m_pkthdr.ph_ifidx); \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
	    (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
	if_put(ifp); \
} while (0)

void	 syn_cache_put(struct syn_cache *);
void	 syn_cache_rm(struct syn_cache *);
int	 syn_cache_respond(struct syn_cache *, struct mbuf *);
void	 syn_cache_timer(void *);
void	 syn_cache_reaper(void *);
void	 syn_cache_insert(struct syn_cache *, struct tcpcb *);
void	 syn_cache_reset(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, u_int);
int	 syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
		unsigned int, struct socket *, struct mbuf *, u_char *, int,
		struct tcp_opt_info *, tcp_seq *);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
		struct tcphdr *, unsigned int, unsigned int, struct socket *,
		struct mbuf *);
struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
		struct syn_cache_head **, u_int);

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct tcpqent *p, *q, *nq, *tiqe;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat_inc(tcps_rcvmemdrop);
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
				    *tlen);
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

	if (th->th_seq != tp->rcv_nxt)
		return (0);

	return (tcp_flush_queue(tp));
}

int
tcp_flush_queue(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct tcpqent *q, *nq;
	int flags;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	tp->t_flags |= TF_BLOCKOUTPUT;
	sorwakeup(so);
	tp->t_flags &= ~TF_BLOCKOUTPUT;
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int iphlen = *offp;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	u_int8_t *optp = NULL;
	int optlen = 0;
	int tlen, off;
	struct tcpcb *tp = NULL;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked;
	int hdroptlen = 0;
	short ostate = 0;
	tcp_seq iss, *reuse = NULL;
	u_long tiwin;
	struct tcp_opt_info opti;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
	struct m_tag *mtag;
	struct tdb_ident *tdbi;
	struct tdb *tdb;
	int error;
#endif /* IPSEC */
#ifdef TCP_ECN
	u_char iptos;
#endif

	tcpstat_inc(tcps_rcvtotal);

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 */
	if (m->m_flags & (M_BCAST|M_MCAST))
		goto drop;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
	if (!th) {
		tcpstat_inc(tcps_rcvshort);
		return IPPROTO_DONE;
	}

	tlen = m->m_pkthdr.len - iphlen;
	switch (af) {
	case AF_INET:
		ip = mtod(m, struct ip *);
#ifdef TCP_ECN
		/* save ip_tos before clearing it for checksum */
		iptos = ip->ip_tos;
#endif
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif

		/*
		 * Be proactive about unspecified IPv6 addresses in the
		 * source.  As we use all-zero to indicate an unbound or
		 * unconnected pcb, an unspecified IPv6 source address can
		 * be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/* Discard packets to multicast */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}
		break;
#endif
	default:
		unhandled_af(af);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
		int sum;

		if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
		tcpstat_inc(tcps_inswcsum);
		switch (af) {
		case AF_INET:
			sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
			break;
#ifdef INET6
		case AF_INET6:
			sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
			    tlen);
			break;
#endif
		}
		if (sum != 0) {
			tcpstat_inc(tcps_rcvbadsum);
			goto drop;
		}
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat_inc(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;
	if (off > sizeof(struct tcphdr)) {
		IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
		if (!th) {
			tcpstat_inc(tcps_rcvshort);
			return IPPROTO_DONE;
		}
		optlen = off - sizeof(struct tcphdr);
		optp = (u_int8_t *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		    (optlen > TCPOLEN_TSTAMP_APPA &&
		    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Locate pcb for segment.
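	 * An exact match on the connected 4-tuple is tried first; on a
	 * miss we fall back to a listening socket bound to the local
	 * address and port (the in_pcbhashlookup()/in_pcblookup_listen()
	 * calls below).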
	 */
#if NPF > 0
	inp = pf_inp_lookup(m);
#endif
findpcb:
	if (inp == NULL) {
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src,
			    th->th_sport, &ip6->ip6_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif
		case AF_INET:
			inp = in_pcbhashlookup(&tcbtable, ip->ip_src,
			    th->th_sport, ip->ip_dst, th->th_dport,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
	}
	if (inp == NULL) {
		int	inpl_reverse = 0;
		if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
			inpl_reverse = 1;
		tcpstat_inc(tcps_pcbhashmiss);
		switch (af) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &ip6->ip6_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
#endif /* INET6 */
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    ip->ip_dst, th->th_dport, inpl_reverse, m,
			    m->m_pkthdr.ph_rtableid);
			break;
		}
		/*
		 * If the state is CLOSED (i.e., TCB does not exist) then
		 * all data in the incoming segment is discarded.
		 * If the TCB exists but is in CLOSED state, it is embryonic,
		 * but should either do a listen or a connect soon.
		 */
		if (inp == NULL) {
			tcpstat_inc(tcps_noport);
			goto dropwithreset_ratelim;
		}
	}
	KASSERT(sotoinpcb(inp->inp_socket) == inp);
	KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);

	/* Check the minimum TTL for socket. */
	switch (af) {
	case AF_INET:
		if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
			goto drop;
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_ip6_minhlim &&
		    inp->inp_ip6_minhlim > ip6->ip6_hlim)
			goto drop;
		break;
#endif
	}

	tp = intotcpcb(inp);
	if (tp == NULL)
		goto dropwithreset_ratelim;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
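	/*
	 * Per RFC 1323, the window field of a segment that carries SYN
	 * is never scaled, so the raw value is used until both sides
	 * have agreed on a shift.
	 */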
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			switch (af) {
#ifdef INET6
			case AF_INET6:
				memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
				memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
				break;
#endif
			case AF_INET:
				memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
				memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
				break;
			}
		}
		if (so->so_options & SO_ACCEPTCONN) {
			switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {

			case TH_SYN|TH_ACK|TH_RST:
			case TH_SYN|TH_RST:
			case TH_ACK|TH_RST:
			case TH_RST:
				syn_cache_reset(&src.sa, &dst.sa, th,
				    inp->inp_rtableid);
				goto drop;

			case TH_SYN|TH_ACK:
				/*
				 * Received a SYN,ACK.  This should
				 * never happen while we are in
				 * LISTEN.  Send an RST.
				 */
				goto badsyn;

			case TH_ACK:
				so = syn_cache_get(&src.sa, &dst.sa,
				    th, iphlen, tlen, so, m);
				if (so == NULL) {
					/*
					 * We don't have a SYN for
					 * this ACK; send an RST.
					 */
					goto badsyn;
				} else if (so == (struct socket *)(-1)) {
					/*
					 * We were unable to create
					 * the connection.  If the
					 * 3-way handshake was
					 * completed, an RST has
					 * been sent to the peer.
					 * Since the mbuf might be
					 * in use for the reply,
					 * do not free it.
					 */
					m = *mp = NULL;
					goto drop;
				} else {
					/*
					 * We have created a
					 * full-blown connection.
					 */
					tp = NULL;
					inp = sotoinpcb(so);
					tp = intotcpcb(inp);
					if (tp == NULL)
						goto badsyn;	/*XXX*/

				}
				break;

			default:
				/*
				 * None of RST, SYN or ACK was set.
				 * This is an invalid packet for a
				 * TCB in LISTEN state.  Send a RST.
				 */
				goto badsyn;

			case TH_SYN:
				/*
				 * Received a SYN.
				 */
#ifdef INET6
				/*
				 * If deprecated addresses are forbidden,
				 * we do not accept a SYN to a deprecated
				 * interface address, to prevent any new
				 * inbound connection from getting
				 * established.
				 * When we do not accept the SYN, we send a
				 * TCP RST with the deprecated source address
				 * (instead of dropping it).  We compromise
				 * because it is much better for the peer to
				 * get an RST, and the RST will be the final
				 * packet of the exchange.
				 *
				 * If we do not forbid deprecated addresses, we
				 * accept the SYN packet.  RFC2462 does not
				 * suggest dropping SYN in this case.
				 * If we decipher RFC2462 5.5.4, it says the
				 * following:
				 *   1. use of deprecated addr with existing
				 *      communication is okay - "SHOULD continue
				 *      to be used"
				 *   2. use of it with new communication:
				 *     (2a) "SHOULD NOT be used if alternate
				 *          address with sufficient scope is
				 *          available"
				 *     (2b) nothing mentioned otherwise.
				 * Here we fall into (2b) case as we have no
				 * choice in our source address selection - we
				 * must obey the peer.
				 *
				 * The wording in RFC2462 is confusing, and
				 * there are multiple descriptions of
				 * deprecated address handling - worse, they
				 * are not exactly the same.  I believe 5.5.4
				 * is the best one, so we follow 5.5.4.
				 */
				if (ip6 && !ip6_use_deprecated) {
					struct in6_ifaddr *ia6;
					struct ifnet *ifp =
					    if_get(m->m_pkthdr.ph_ifidx);

					if (ifp &&
					    (ia6 = in6ifa_ifpwithaddr(ifp,
					    &ip6->ip6_dst)) &&
					    (ia6->ia6_flags &
					    IN6_IFF_DEPRECATED)) {
						tp = NULL;
						if_put(ifp);
						goto dropwithreset;
					}
					if_put(ifp);
				}
#endif

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_dport == th->th_sport) {
					switch (af) {
#ifdef INET6
					case AF_INET6:
						if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
						    &ip6->ip6_dst)) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
#endif /* INET6 */
					case AF_INET:
						if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
							tcpstat_inc(tcps_badsyn);
							goto drop;
						}
						break;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen > so->so_qlimit ||
				    syn_cache_add(&src.sa, &dst.sa, th, iphlen,
				    so, m, optp, optlen, &opti, reuse) == -1) {
					tcpstat_inc(tcps_dropsyn);
					goto drop;
				}
				return IPPROTO_DONE;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

#if NPF > 0
	pf_inp_link(m, inp);
#endif

#ifdef IPSEC
	/* Find most recent IPsec tag */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag != NULL) {
		tdbi = (struct tdb_ident *)(mtag + 1);
		tdb = gettdb(tdbi->rdomain, tdbi->spi,
		    &tdbi->dst, tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
	    tdb, inp, 0);
	if (error) {
		tcpstat_inc(tcps_rcvnosec);
		goto drop;
	}
#endif /* IPSEC */

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = tcp_now;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef TCP_SACK
	if (tp->sack_enable)
		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
#endif /* TCP_SACK */

	/*
	 * Process options.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
		if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
		    m->m_pkthdr.ph_rtableid))
			goto drop;

	if (opti.ts_present && opti.ts_ecr) {
		int rtt_test;

		/* subtract out the tcp timestamp modulator */
		opti.ts_ecr -= tp->ts_modulate;

		/* make sure ts_ecr is sensible */
		rtt_test = tcp_now - opti.ts_ecr;
		if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
			opti.ts_ecr = 0;
	}

#ifdef TCP_ECN
	/* if congestion experienced, set ECE bit in subsequent packets. */
	if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
		tp->t_flags |= TF_RCVD_CE;
		tcpstat_inc(tcps_ecn_rcvce);
	}
#endif
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * Fix from Braden, see Stevens p. 870
		 */
		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks == 0) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				tcpstat_inc(tcps_predack);
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
				else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp,
					    tcp_now - tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
				    acked);
				ND6_HINT(tp);
				sbdrop(so, &so->so_snd, acked);

				/*
				 * If we had a pending ICMP message that
				 * refers to data that have just been
				 * acknowledged, disregard the recorded ICMP
				 * message.
				 */
				if ((tp->t_flags & TF_PMTUD_PEND) &&
				    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
					tp->t_flags &= ~TF_PMTUD_PEND;

				/*
				 * Keep track of the largest chunk of data
				 * acknowledged since last PMTU update
				 */
				if (tp->t_pmtud_mss_acked < acked)
					tp->t_pmtud_mss_acked = acked;

				tp->snd_una = th->th_ack;
#if defined(TCP_SACK) || defined(TCP_ECN)
				/*
				 * We want snd_last to track snd_una so
				 * as to avoid sequence wraparound problems
				 * for very large transfers.
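				 * (Sequence comparisons are modulo 2^32:
				 * SEQ_GT() subtracts and tests the sign of
				 * the 32-bit difference, which only works
				 * while the two values are within 2^31 of
				 * each other.)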
				 */
#ifdef TCP_ECN
				if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
					tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
				tp->snd_fack = tp->snd_una;
				tp->retran_data = 0;
#endif /* TCP_FACK */
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

				tcp_update_sndspace(tp);
				if (sb_notify(so, &so->so_snd)) {
					tp->t_flags |= TF_BLOCKOUTPUT;
					sowwakeup(so);
					tp->t_flags &= ~TF_BLOCKOUTPUT;
				}
				if (so->so_snd.sb_cc ||
				    tp->t_flags & TF_NEEDOUTPUT)
					(void) tcp_output(tp);
				return IPPROTO_DONE;
			}
		} else if (th->th_ack == tp->snd_una &&
		    TAILQ_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(so, &so->so_rcv)) {
			/*
			 * This is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
#ifdef TCP_SACK
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
#endif /* TCP_SACK */
			tcpstat_inc(tcps_preddat);
			tp->rcv_nxt += tlen;
			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
			ND6_HINT(tp);

			TCP_SETUP_ACK(tp, tiflags, m);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			if (so->so_state & SS_CANTRCVMORE)
				m_freem(m);
			else {
				if (opti.ts_present && opti.ts_ecr) {
					if (tp->rfbuf_ts < opti.ts_ecr &&
					    opti.ts_ecr - tp->rfbuf_ts < hz) {
						tcp_update_rcvspace(tp);
						/* Start over with next RTT. */
						tp->rfbuf_cnt = 0;
						tp->rfbuf_ts = 0;
					} else
						tp->rfbuf_cnt += tlen;
				}
				m_adj(m, iphlen + off);
				sbappendstream(so, &so->so_rcv, m);
			}
			tp->t_flags |= TF_BLOCKOUTPUT;
			sorwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
				(void) tcp_output(tp);
			return IPPROTO_DONE;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = iphlen + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(so, &so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_cnt = 0;
	tp->rfbuf_ts = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains SYN/ACK, send an RST.
	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST
	 */

	case TCPS_SYN_RECEIVED:
		if (tiflags & TH_ACK) {
			if (tiflags & TH_SYN) {
				tcpstat_inc(tcps_badsyn);
				goto dropwithreset;
			}
			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
			    SEQ_GT(th->th_ack, tp->snd_max))
				goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		    SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = th->th_seq;
		tcp_mss(tp, opti.maxseg);
		/* Reset initial window to 1 segment for retransmit */
		if (tp->t_rxtshift > 0)
			tp->snd_cwnd = tp->t_maxseg;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
		/*
		 * If we've sent a SACK_PERMITTED option, and the peer
		 * also replied with one, then TF_SACK_PERMIT should have
		 * been set in tcp_dooptions().  If it was not, disable SACKs.
		 */
		if (tp->sack_enable)
			tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#endif
#ifdef TCP_ECN
		/*
		 * if ECE is set but CWR is not set for SYN-ACK, or
		 * both ECE and CWR are set for simultaneous open,
		 * peer is ECN capable.
		 */
		if (tcp_do_ecn) {
			switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
			case TH_ACK|TH_ECE:
			case TH_ECE|TH_CWR:
				tp->t_flags |= TF_ECN_PERMIT;
				tiflags &= ~(TH_ECE|TH_CWR);
				tcpstat_inc(tcps_ecn_accepts);
			}
		}
#endif

		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat_inc(tcps_connects);
			soisconnected(so);
			tp->t_state = TCPS_ESTABLISHED;
			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_flush_queue(tp);

			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtttime)
				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
			/*
			 * Since new data was acked (the SYN), open the
			 * congestion window by one MSS.  We do this
			 * here, because we won't go through the normal
			 * ACK processing below.  And since this is the
			 * start of the connection, we know we are in
			 * the exponential phase of slow-start.
			 */
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

#if 0
trimthenstep6:
#endif
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
			    todrop);
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;
	/*
	 * If a new connection request is received while in TIME_WAIT,
	 * drop the old connection and start over if the timestamp or
	 * the sequence numbers are above the previous ones.
	 */
	case TCPS_TIME_WAIT:
		if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
		    ((opti.ts_present &&
		    TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
		    SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
			/*
			 * The socket will be recreated but the new state
			 * has already been linked to the socket.  Remove the
			 * link between old socket and new state.
			 */
			pf_inp_unlink(inp);
#endif
			/*
			 * Advance the iss by at least 32768, but
			 * clear the msb in order to make sure
			 * that SEQ_LT(snd_nxt, iss).
			 */
			iss = tp->snd_nxt +
			    ((arc4random() & 0x7fffffff) | 0x8000);
			reuse = &iss;
			tp = tcp_close(tp);
			inp = NULL;
			goto findpcb;
		}
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than opti.ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
			tcpstat_inc(tcps_pawsdrop);
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
		} else {
			tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
			    todrop);
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat_inc(tcps_rcvafterclose);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat_inc(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			tcpstat_add(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat_inc(tcps_rcvwinprobe);
			} else
				goto dropafterack;
		} else
			tcpstat_add(tcps_rcvbyteafterwin, todrop);
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp if it's more recent.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) {
		if (th->th_seq != tp->last_ack_sent &&
		    th->th_seq != tp->rcv_nxt &&
		    th->th_seq != (tp->rcv_nxt + 1))
			goto drop;

		switch (tp->t_state) {
		case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
			/* if ECN is enabled, fall back to non-ecn at rexmit */
			if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
				goto drop;
#endif
			so->so_error = ECONNREFUSED;
			goto close;

		case TCPS_ESTABLISHED:
		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSE_WAIT:
			so->so_error = ECONNRESET;
		close:
			tp->t_state = TCPS_CLOSED;
			tcpstat_inc(tcps_drops);
			tp = tcp_close(tp);
			goto drop;
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			tp = tcp_close(tp);
			goto drop;
		}
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we ACK and drop the packet.
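	 * (Answering with an ACK instead of dropping silently lets a
	 * peer with stale state notice the mismatch; the reply path is
	 * rate limited via dropafterack_ratelim.)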
	 */
	if (tiflags & TH_SYN)
		goto dropafterack_ratelim;

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	 * ESTABLISHED state and continue processing.
	 * The ACK was checked above.
	 */
	case TCPS_SYN_RECEIVED:
		tcpstat_inc(tcps_connects);
		soisconnected(so);
		tp->t_state = TCPS_ESTABLISHED;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
			tiwin = th->th_win << tp->snd_scale;
		}
		tcp_flush_queue(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
#ifdef TCP_ECN
		/*
		 * if we receive ECE and are not already in recovery phase,
		 * reduce cwnd by half but don't slow-start.
		 * advance snd_last to snd_max not to reduce cwnd again
		 * until all outstanding packets are acked.
		 */
		if (tcp_do_ecn && (tiflags & TH_ECE)) {
			if ((tp->t_flags & TF_ECN_PERMIT) &&
			    SEQ_GEQ(tp->snd_una, tp->snd_last)) {
				u_int win;

				win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
				if (win > 1) {
					tp->snd_ssthresh = win / 2 * tp->t_maxseg;
					tp->snd_cwnd = tp->snd_ssthresh;
					tp->snd_last = tp->snd_max;
					tp->t_flags |= TF_SEND_CWR;
					tcpstat_inc(tcps_cwr_ecn);
				}
			}
			tcpstat_inc(tcps_ecn_rcvece);
		}
		/*
		 * if we receive CWR, we know that the peer has reduced
		 * its congestion window.  stop sending ecn-echo.
		 */
		if ((tiflags & TH_CWR)) {
			tp->t_flags &= ~TF_RCVD_CE;
			tcpstat_inc(tcps_ecn_rcvcwr);
		}
#endif /* TCP_ECN */

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			/*
			 * Duplicate/old ACK processing.
			 * Increments t_dupacks:
			 *	Pure duplicate (same seq/ack/window, no data)
			 * Doesn't affect t_dupacks:
			 *	Data packets.
			 *	Normal window updates (window opens)
			 * Resets t_dupacks:
			 *	New data ACKed.
			 *	Window shrinks
			 *	Old ACK
			 */
			if (tlen) {
				/* Drop very old ACKs unless th_seq matches */
				if (th->th_seq != tp->rcv_nxt &&
				    SEQ_LT(th->th_ack,
				    tp->snd_una - tp->max_sndwnd)) {
					tcpstat_inc(tcps_rcvacktooold);
					goto drop;
				}
				break;
			}
			/*
			 * If we get an old ACK, there is probably packet
			 * reordering going on.  Be conservative and reset
			 * t_dupacks so that we are less aggressive in
			 * doing a fast retransmit.
			 */
			if (th->th_ack != tp->snd_una) {
				tp->t_dupacks = 0;
				break;
			}
			if (tiwin == tp->snd_wnd) {
				tcpstat_inc(tcps_rcvdupack);
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
				/*
				 * In FACK, can enter fast rec. if the receiver
				 * reports a reass. queue longer than 3 segs.
				 */
				else if (++tp->t_dupacks == tcprexmtthresh ||
				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
				    tp->t_maxseg + tp->snd_una)) &&
				    SEQ_GT(tp->snd_una, tp->snd_last))) {
#else
				else if (++tp->t_dupacks == tcprexmtthresh) {
#endif /* TCP_FACK */
					tcp_seq onxt = tp->snd_nxt;
					u_long win =
					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_maxseg;

#if defined(TCP_SACK) || defined(TCP_ECN)
					if (SEQ_LT(th->th_ack, tp->snd_last)) {
						/*
						 * False fast retx after
						 * timeout.  Do not cut window.
						 */
						tp->t_dupacks = 0;
						goto drop;
					}
#endif
					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
#ifdef TCP_SACK
					tp->snd_last = tp->snd_max;
					if (tp->sack_enable) {
						TCP_TIMER_DISARM(tp, TCPT_REXMT);
						tp->t_rtttime = 0;
#ifdef TCP_ECN
						tp->t_flags |= TF_SEND_CWR;
#endif
						tcpstat_inc(tcps_cwr_frecovery);
						tcpstat_inc(tcps_sack_recovery_episode);
#if defined(TCP_SACK) && defined(TCP_FACK)
						tp->t_dupacks = tcprexmtthresh;
						(void) tcp_output(tp);
						/*
						 * During FR, snd_cwnd is held
						 * constant for FACK.
						 */
						tp->snd_cwnd = tp->snd_ssthresh;
#else
						/*
						 * tcp_output() will send
						 * oldest SACK-eligible rtx.
						 */
						(void) tcp_output(tp);
						tp->snd_cwnd = tp->snd_ssthresh +
						    tp->t_maxseg * tp->t_dupacks;
#endif /* TCP_FACK */
						goto drop;
					}
#endif /* TCP_SACK */
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtttime = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
					tp->t_flags |= TF_SEND_CWR;
#endif
					tcpstat_inc(tcps_cwr_frecovery);
					tcpstat_inc(tcps_sndrexmitfast);
					(void) tcp_output(tp);

					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/*
					 * while (awnd < cwnd)
					 *         sendsomething();
					 */
					if (tp->sack_enable) {
						if (tp->snd_awnd < tp->snd_cwnd)
							tcp_output(tp);
						goto drop;
					}
#endif /* TCP_FACK */
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else if (tiwin < tp->snd_wnd) {
				/*
				 * The window was retracted!  Previous dup
				 * ACKs may have been due to packets arriving
				 * after the shrunken window, not a missing
				 * packet, so play it safe and reset t_dupacks
				 */
				tp->t_dupacks = 0;
			}
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
#if defined(TCP_SACK)
		if (tp->sack_enable) {
			if (tp->t_dupacks >= tcprexmtthresh) {
				/* Check for a partial ACK */
				if (tcp_sack_partialack(tp, th)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
					/* Force call to tcp_output */
					if (tp->snd_awnd < tp->snd_cwnd)
						tp->t_flags |= TF_NEEDOUTPUT;
#else
					tp->snd_cwnd += tp->t_maxseg;
					tp->t_flags |= TF_NEEDOUTPUT;
#endif /* TCP_FACK */
				} else {
					/* Out of fast recovery */
					tp->snd_cwnd = tp->snd_ssthresh;
					if (tcp_seq_subtract(tp->snd_max,
					    th->th_ack) < tp->snd_ssthresh)
						tp->snd_cwnd =
						    tcp_seq_subtract(tp->snd_max,
						    th->th_ack);
					tp->t_dupacks = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
					if (SEQ_GT(th->th_ack, tp->snd_fack))
						tp->snd_fack = th->th_ack;
#endif /* TCP_FACK */
				}
			}
		} else {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    !tcp_newreno(tp, th)) {
				/* Out of fast recovery */
				tp->snd_cwnd = tp->snd_ssthresh;
				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
				    tp->snd_ssthresh)
					tp->snd_cwnd =
					    tcp_seq_subtract(tp->snd_max,
					    th->th_ack);
				tp->t_dupacks = 0;
			}
		}
		if (tp->t_dupacks < tcprexmtthresh)
			tp->t_dupacks = 0;
#else /* else no TCP_SACK */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
#endif
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat_inc(tcps_rcvacktoomuch);
			goto dropafterack_ratelim;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
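		 *
		 * In outline, tcp_xmit_timer() applies the classic
		 * Van Jacobson smoothing (fixed point in the kernel):
		 *	srtt   += (rtt - srtt) / 8;
		 *	rttvar += (|rtt - srtt| - rttvar) / 4;
		 *	rto     = srtt + 4 * rttvar;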
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			tp->t_flags |= TF_NEEDOUTPUT;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
#if defined (TCP_SACK)
		if (tp->t_dupacks < tcprexmtthresh)
#endif
			tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN << tp->snd_scale);
		}
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(so, &so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}

		tcp_update_sndspace(tp);
		if (sb_notify(so, &so->so_snd)) {
			tp->t_flags |= TF_BLOCKOUTPUT;
			sowwakeup(so);
			tp->t_flags &= ~TF_BLOCKOUTPUT;
		}

		/*
		 * If we had a pending ICMP message that referred to data
		 * that have just been acknowledged, disregard the recorded
		 * ICMP message.
		 */
		if ((tp->t_flags & TF_PMTUD_PEND) &&
		    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
			tp->t_flags &= ~TF_PMTUD_PEND;

		/*
		 * Keep track of the largest chunk of data acknowledged
		 * since last PMTU update
		 */
		if (tp->t_pmtud_mss_acked < acked)
			tp->t_pmtud_mss_acked = acked;

		tp->snd_una = th->th_ack;
#ifdef TCP_ECN
		/* sync snd_last with snd_una */
		if (SEQ_GT(tp->snd_una, tp->snd_last))
			tp->snd_last = tp->snd_una;
#endif
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
			tp->snd_fack = tp->snd_una;
			/*
			 * Update snd_awnd for partial ACK
			 * without any SACK blocks.
			 */
			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
			    tp->snd_fack) + tp->retran_data;
		}
#endif

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
	    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tlen == 0 &&
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
			tcpstat_inc(tcps_rcvwinupd);
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		tp->t_flags |= TF_NEEDOUTPUT;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && th->th_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
			th->th_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
			tp->rcv_up = th->th_seq + th->th_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.
1958 * This can happen independent of advancing the URG pointer, 1959 * but if two URG's are pending at once, some out-of-band 1960 * data may creep in... ick. 1961 */ 1962 if (th->th_urp <= (u_int16_t) tlen && 1963 (so->so_options & SO_OOBINLINE) == 0) 1964 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 1965 } else 1966 /* 1967 * If no out of band data is expected, 1968 * pull receive urgent pointer along 1969 * with the receive window. 1970 */ 1971 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1972 tp->rcv_up = tp->rcv_nxt; 1973 dodata: /* XXX */ 1974 1975 /* 1976 * Process the segment text, merging it into the TCP sequencing queue, 1977 * and arranging for acknowledgment of receipt if necessary. 1978 * This process logically involves adjusting tp->rcv_wnd as data 1979 * is presented to the user (this happens in tcp_usrreq.c, 1980 * case PRU_RCVD). If a FIN has already been received on this 1981 * connection then we just ignore the text. 1982 */ 1983 if ((tlen || (tiflags & TH_FIN)) && 1984 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1985 #ifdef TCP_SACK 1986 tcp_seq laststart = th->th_seq; 1987 tcp_seq lastend = th->th_seq + tlen; 1988 #endif 1989 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 1990 tp->t_state == TCPS_ESTABLISHED) { 1991 TCP_SETUP_ACK(tp, tiflags, m); 1992 tp->rcv_nxt += tlen; 1993 tiflags = th->th_flags & TH_FIN; 1994 tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen); 1995 ND6_HINT(tp); 1996 if (so->so_state & SS_CANTRCVMORE) 1997 m_freem(m); 1998 else { 1999 m_adj(m, hdroptlen); 2000 sbappendstream(so, &so->so_rcv, m); 2001 } 2002 tp->t_flags |= TF_BLOCKOUTPUT; 2003 sorwakeup(so); 2004 tp->t_flags &= ~TF_BLOCKOUTPUT; 2005 } else { 2006 m_adj(m, hdroptlen); 2007 tiflags = tcp_reass(tp, th, m, &tlen); 2008 tp->t_flags |= TF_ACKNOW; 2009 } 2010 #ifdef TCP_SACK 2011 if (tp->sack_enable) 2012 tcp_update_sack_list(tp, laststart, lastend); 2013 #endif 2014 2015 /* 2016 * variable len never referenced again in modern BSD, 2017 * so why bother computing it ?? 2018 */ 2019 #if 0 2020 /* 2021 * Note the amount of data that peer has sent into 2022 * our window, in order to estimate the sender's 2023 * buffer size. 2024 */ 2025 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2026 #endif /* 0 */ 2027 } else { 2028 m_freem(m); 2029 tiflags &= ~TH_FIN; 2030 } 2031 2032 /* 2033 * If FIN is received ACK the FIN and let the user know 2034 * that the connection is closing. Ignore a FIN received before 2035 * the connection is fully established. 2036 */ 2037 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2038 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2039 socantrcvmore(so); 2040 tp->t_flags |= TF_ACKNOW; 2041 tp->rcv_nxt++; 2042 } 2043 switch (tp->t_state) { 2044 2045 /* 2046 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2047 */ 2048 case TCPS_ESTABLISHED: 2049 tp->t_state = TCPS_CLOSE_WAIT; 2050 break; 2051 2052 /* 2053 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2054 * enter the CLOSING state. 2055 */ 2056 case TCPS_FIN_WAIT_1: 2057 tp->t_state = TCPS_CLOSING; 2058 break; 2059 2060 /* 2061 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2062 * starting the time-wait timer, turning off the other 2063 * standard timers. 2064 */ 2065 case TCPS_FIN_WAIT_2: 2066 tp->t_state = TCPS_TIME_WAIT; 2067 tcp_canceltimers(tp); 2068 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2069 soisdisconnected(so); 2070 break; 2071 2072 /* 2073 * In TIME_WAIT state restart the 2 MSL time_wait timer. 
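 * Note that each retransmitted FIN from the peer restarts the full 2*MSL wait, so in effect the connection is only reaped once the peer has been silent for that long.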
2074 */ 2075 case TCPS_TIME_WAIT: 2076 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2077 break; 2078 } 2079 } 2080 if (so->so_options & SO_DEBUG) { 2081 switch (tp->pf) { 2082 #ifdef INET6 2083 case PF_INET6: 2084 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2085 0, tlen); 2086 break; 2087 #endif /* INET6 */ 2088 case PF_INET: 2089 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2090 0, tlen); 2091 break; 2092 } 2093 } 2094 2095 /* 2096 * Return any desired output. 2097 */ 2098 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2099 (void) tcp_output(tp); 2100 return IPPROTO_DONE; 2101 2102 badsyn: 2103 /* 2104 * Received a bad SYN. Increment counters and dropwithreset. 2105 */ 2106 tcpstat_inc(tcps_badsyn); 2107 tp = NULL; 2108 goto dropwithreset; 2109 2110 dropafterack_ratelim: 2111 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2112 tcp_ackdrop_ppslim) == 0) { 2113 /* XXX stat */ 2114 goto drop; 2115 } 2116 /* ...fall into dropafterack... */ 2117 2118 dropafterack: 2119 /* 2120 * Generate an ACK dropping incoming segment if it occupies 2121 * sequence space, where the ACK reflects our state. 2122 */ 2123 if (tiflags & TH_RST) 2124 goto drop; 2125 m_freem(m); 2126 tp->t_flags |= TF_ACKNOW; 2127 (void) tcp_output(tp); 2128 return IPPROTO_DONE; 2129 2130 dropwithreset_ratelim: 2131 /* 2132 * We may want to rate-limit RSTs in certain situations, 2133 * particularly if we are sending an RST in response to 2134 * an attempt to connect to or otherwise communicate with 2135 * a port for which we have no socket. 2136 */ 2137 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2138 tcp_rst_ppslim) == 0) { 2139 /* XXX stat */ 2140 goto drop; 2141 } 2142 /* ...fall into dropwithreset... */ 2143 2144 dropwithreset: 2145 /* 2146 * Generate a RST, dropping incoming segment. 2147 * Make ACK acceptable to originator of segment. 2148 * Don't bother to respond to RST. 2149 */ 2150 if (tiflags & TH_RST) 2151 goto drop; 2152 if (tiflags & TH_ACK) { 2153 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2154 TH_RST, m->m_pkthdr.ph_rtableid); 2155 } else { 2156 if (tiflags & TH_SYN) 2157 tlen++; 2158 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2159 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid); 2160 } 2161 m_freem(m); 2162 return IPPROTO_DONE; 2163 2164 drop: 2165 /* 2166 * Drop space held by incoming segment and return. 
2167 */ 2168 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { 2169 switch (tp->pf) { 2170 #ifdef INET6 2171 case PF_INET6: 2172 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6, 2173 0, tlen); 2174 break; 2175 #endif /* INET6 */ 2176 case PF_INET: 2177 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti, 2178 0, tlen); 2179 break; 2180 } 2181 } 2182 2183 m_freem(m); 2184 return IPPROTO_DONE; 2185 } 2186 2187 int 2188 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2189 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2190 u_int rtableid) 2191 { 2192 u_int16_t mss = 0; 2193 int opt, optlen; 2194 #ifdef TCP_SIGNATURE 2195 caddr_t sigp = NULL; 2196 struct tdb *tdb = NULL; 2197 #endif /* TCP_SIGNATURE */ 2198 2199 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2200 opt = cp[0]; 2201 if (opt == TCPOPT_EOL) 2202 break; 2203 if (opt == TCPOPT_NOP) 2204 optlen = 1; 2205 else { 2206 if (cnt < 2) 2207 break; 2208 optlen = cp[1]; 2209 if (optlen < 2 || optlen > cnt) 2210 break; 2211 } 2212 switch (opt) { 2213 2214 default: 2215 continue; 2216 2217 case TCPOPT_MAXSEG: 2218 if (optlen != TCPOLEN_MAXSEG) 2219 continue; 2220 if (!(th->th_flags & TH_SYN)) 2221 continue; 2222 if (TCPS_HAVERCVDSYN(tp->t_state)) 2223 continue; 2224 memcpy(&mss, cp + 2, sizeof(mss)); 2225 mss = ntohs(mss); 2226 oi->maxseg = mss; 2227 break; 2228 2229 case TCPOPT_WINDOW: 2230 if (optlen != TCPOLEN_WINDOW) 2231 continue; 2232 if (!(th->th_flags & TH_SYN)) 2233 continue; 2234 if (TCPS_HAVERCVDSYN(tp->t_state)) 2235 continue; 2236 tp->t_flags |= TF_RCVD_SCALE; 2237 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2238 break; 2239 2240 case TCPOPT_TIMESTAMP: 2241 if (optlen != TCPOLEN_TIMESTAMP) 2242 continue; 2243 oi->ts_present = 1; 2244 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2245 oi->ts_val = ntohl(oi->ts_val); 2246 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2247 oi->ts_ecr = ntohl(oi->ts_ecr); 2248 2249 if (!(th->th_flags & TH_SYN)) 2250 continue; 2251 if (TCPS_HAVERCVDSYN(tp->t_state)) 2252 continue; 2253 /* 2254 * A timestamp received in a SYN makes 2255 * it ok to send timestamp requests and replies. 
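 * (This is the RFC 1323 negotiation rule: the option is used for the rest of the connection only if both SYNs carried it; here we record that the peer's SYN did.)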
2256 */ 2257 tp->t_flags |= TF_RCVD_TSTMP; 2258 tp->ts_recent = oi->ts_val; 2259 tp->ts_recent_age = tcp_now; 2260 break; 2261 2262 #ifdef TCP_SACK 2263 case TCPOPT_SACK_PERMITTED: 2264 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2265 continue; 2266 if (!(th->th_flags & TH_SYN)) 2267 continue; 2268 if (TCPS_HAVERCVDSYN(tp->t_state)) 2269 continue; 2270 /* MUST only be set on SYN */ 2271 tp->t_flags |= TF_SACK_PERMIT; 2272 break; 2273 case TCPOPT_SACK: 2274 tcp_sack_option(tp, th, cp, optlen); 2275 break; 2276 #endif 2277 #ifdef TCP_SIGNATURE 2278 case TCPOPT_SIGNATURE: 2279 if (optlen != TCPOLEN_SIGNATURE) 2280 continue; 2281 2282 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2283 return (-1); 2284 2285 sigp = cp + 2; 2286 break; 2287 #endif /* TCP_SIGNATURE */ 2288 } 2289 } 2290 2291 #ifdef TCP_SIGNATURE 2292 if (tp->t_flags & TF_SIGNATURE) { 2293 union sockaddr_union src, dst; 2294 2295 memset(&src, 0, sizeof(union sockaddr_union)); 2296 memset(&dst, 0, sizeof(union sockaddr_union)); 2297 2298 switch (tp->pf) { 2299 case 0: 2300 case AF_INET: 2301 src.sa.sa_len = sizeof(struct sockaddr_in); 2302 src.sa.sa_family = AF_INET; 2303 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2304 dst.sa.sa_len = sizeof(struct sockaddr_in); 2305 dst.sa.sa_family = AF_INET; 2306 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2307 break; 2308 #ifdef INET6 2309 case AF_INET6: 2310 src.sa.sa_len = sizeof(struct sockaddr_in6); 2311 src.sa.sa_family = AF_INET6; 2312 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2313 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2314 dst.sa.sa_family = AF_INET6; 2315 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2316 break; 2317 #endif /* INET6 */ 2318 } 2319 2320 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2321 0, &src, &dst, IPPROTO_TCP); 2322 2323 /* 2324 * We don't have an SA for this peer, so we turn off 2325 * TF_SIGNATURE on the listen socket 2326 */ 2327 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2328 tp->t_flags &= ~TF_SIGNATURE; 2329 2330 } 2331 2332 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2333 tcpstat_inc(tcps_rcvbadsig); 2334 return (-1); 2335 } 2336 2337 if (sigp) { 2338 char sig[16]; 2339 2340 if (tdb == NULL) { 2341 tcpstat_inc(tcps_rcvbadsig); 2342 return (-1); 2343 } 2344 2345 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2346 return (-1); 2347 2348 if (timingsafe_bcmp(sig, sigp, 16)) { 2349 tcpstat_inc(tcps_rcvbadsig); 2350 return (-1); 2351 } 2352 2353 tcpstat_inc(tcps_rcvgoodsig); 2354 } 2355 #endif /* TCP_SIGNATURE */ 2356 2357 return (0); 2358 } 2359 2360 #if defined(TCP_SACK) 2361 u_long 2362 tcp_seq_subtract(u_long a, u_long b) 2363 { 2364 return ((long)(a - b)); 2365 } 2366 #endif 2367 2368 2369 #ifdef TCP_SACK 2370 /* 2371 * This function is called upon receipt of new valid data (while not in header 2372 * prediction mode), and it updates the ordered list of sacks. 2373 */ 2374 void 2375 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2376 tcp_seq rcv_lastend) 2377 { 2378 /* 2379 * First reported block MUST be the most recent one. Subsequent 2380 * blocks SHOULD be in the order in which they arrived at the 2381 * receiver. These two conditions make the implementation fully 2382 * compliant with RFC 2018. 
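 * For example, with rcv_nxt = 100, if a segment for [200,300) arrives and then one for [400,500), the next report should list [400,500) first and [200,300) second, the most recently received block leading.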
2383 */ 2384 int i, j = 0, count = 0, lastpos = -1; 2385 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2386 2387 /* First clean up current list of sacks */ 2388 for (i = 0; i < tp->rcv_numsacks; i++) { 2389 sack = tp->sackblks[i]; 2390 if (sack.start == 0 && sack.end == 0) { 2391 count++; /* count = number of blocks to be discarded */ 2392 continue; 2393 } 2394 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2395 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2396 count++; 2397 } else { 2398 temp[j].start = tp->sackblks[i].start; 2399 temp[j++].end = tp->sackblks[i].end; 2400 } 2401 } 2402 tp->rcv_numsacks -= count; 2403 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2404 tcp_clean_sackreport(tp); 2405 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2406 /* ==> need first sack block */ 2407 tp->sackblks[0].start = rcv_laststart; 2408 tp->sackblks[0].end = rcv_lastend; 2409 tp->rcv_numsacks = 1; 2410 } 2411 return; 2412 } 2413 /* Otherwise, sack blocks are already present. */ 2414 for (i = 0; i < tp->rcv_numsacks; i++) 2415 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2416 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2417 return; /* sack list remains unchanged */ 2418 /* 2419 * From here, segment just received should be (part of) the 1st sack. 2420 * Go through list, possibly coalescing sack block entries. 2421 */ 2422 firstsack.start = rcv_laststart; 2423 firstsack.end = rcv_lastend; 2424 for (i = 0; i < tp->rcv_numsacks; i++) { 2425 sack = tp->sackblks[i]; 2426 if (SEQ_LT(sack.end, firstsack.start) || 2427 SEQ_GT(sack.start, firstsack.end)) 2428 continue; /* no overlap */ 2429 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2430 /* 2431 * identical block; delete it here since we will 2432 * move it to the front of the list. 2433 */ 2434 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2435 lastpos = i; /* last posn with a zero entry */ 2436 continue; 2437 } 2438 if (SEQ_LEQ(sack.start, firstsack.start)) 2439 firstsack.start = sack.start; /* merge blocks */ 2440 if (SEQ_GEQ(sack.end, firstsack.end)) 2441 firstsack.end = sack.end; /* merge blocks */ 2442 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2443 lastpos = i; /* last posn with a zero entry */ 2444 } 2445 if (lastpos != -1) { /* at least one merge */ 2446 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2447 sack = tp->sackblks[i]; 2448 if (sack.start == 0 && sack.end == 0) 2449 continue; 2450 temp[j++] = sack; 2451 } 2452 tp->rcv_numsacks = j; /* including first blk (added later) */ 2453 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2454 tp->sackblks[i] = temp[i]; 2455 } else { /* no merges -- shift sacks by 1 */ 2456 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2457 tp->rcv_numsacks++; 2458 for (i = tp->rcv_numsacks-1; i > 0; i--) 2459 tp->sackblks[i] = tp->sackblks[i-1]; 2460 } 2461 tp->sackblks[0] = firstsack; 2462 return; 2463 } 2464 2465 /* 2466 * Process the TCP SACK option. tp->snd_holes is an ordered list 2467 * of holes (oldest to newest, in terms of the sequence space). 2468 */ 2469 void 2470 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2471 { 2472 int tmp_olen; 2473 u_char *tmp_cp; 2474 struct sackhole *cur, *p, *temp; 2475 2476 if (!tp->sack_enable) 2477 return; 2478 /* SACK without ACK doesn't make sense. */ 2479 if ((th->th_flags & TH_ACK) == 0) 2480 return; 2481 /* Make sure the ACK on this segment is in [snd_una, snd_max]. 
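 * An ACK below snd_una is a stale duplicate and one above snd_max would cover data we never sent, so any SACK blocks riding on such a segment are ignored.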
*/ 2482 if (SEQ_LT(th->th_ack, tp->snd_una) || 2483 SEQ_GT(th->th_ack, tp->snd_max)) 2484 return; 2485 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2486 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2487 return; 2488 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2489 tmp_cp = cp + 2; 2490 tmp_olen = optlen - 2; 2491 tcpstat_inc(tcps_sack_rcv_opts); 2492 if (tp->snd_numholes < 0) 2493 tp->snd_numholes = 0; 2494 if (tp->t_maxseg == 0) 2495 panic("tcp_sack_option"); /* Should never happen */ 2496 while (tmp_olen > 0) { 2497 struct sackblk sack; 2498 2499 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2500 sack.start = ntohl(sack.start); 2501 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2502 sack.end = ntohl(sack.end); 2503 tmp_olen -= TCPOLEN_SACK; 2504 tmp_cp += TCPOLEN_SACK; 2505 if (SEQ_LEQ(sack.end, sack.start)) 2506 continue; /* bad SACK fields */ 2507 if (SEQ_LEQ(sack.end, tp->snd_una)) 2508 continue; /* old block */ 2509 #if defined(TCP_SACK) && defined(TCP_FACK) 2510 /* Updates snd_fack. */ 2511 if (SEQ_GT(sack.end, tp->snd_fack)) 2512 tp->snd_fack = sack.end; 2513 #endif /* TCP_FACK */ 2514 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2515 if (SEQ_LT(sack.start, th->th_ack)) 2516 continue; 2517 } 2518 if (SEQ_GT(sack.end, tp->snd_max)) 2519 continue; 2520 if (tp->snd_holes == NULL) { /* first hole */ 2521 tp->snd_holes = (struct sackhole *) 2522 pool_get(&sackhl_pool, PR_NOWAIT); 2523 if (tp->snd_holes == NULL) { 2524 /* ENOBUFS, so ignore SACKed block for now*/ 2525 goto done; 2526 } 2527 cur = tp->snd_holes; 2528 cur->start = th->th_ack; 2529 cur->end = sack.start; 2530 cur->rxmit = cur->start; 2531 cur->next = NULL; 2532 tp->snd_numholes = 1; 2533 tp->rcv_lastsack = sack.end; 2534 /* 2535 * dups is at least one. If more data has been 2536 * SACKed, it can be greater than one. 
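 * E.g., with t_maxseg = 1460 and a SACK block ending 4380 bytes past the hole, (sack.end - cur->end) / t_maxseg is 3, so dups starts at min(tcprexmtthresh, 3) = 3, enough to make the hole immediately eligible for retransmission.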
2537 */ 2538 cur->dups = min(tcprexmtthresh, 2539 ((sack.end - cur->end)/tp->t_maxseg)); 2540 if (cur->dups < 1) 2541 cur->dups = 1; 2542 continue; /* with next sack block */ 2543 } 2544 /* Go thru list of holes: p = previous, cur = current */ 2545 p = cur = tp->snd_holes; 2546 while (cur) { 2547 if (SEQ_LEQ(sack.end, cur->start)) 2548 /* SACKs data before the current hole */ 2549 break; /* no use going through more holes */ 2550 if (SEQ_GEQ(sack.start, cur->end)) { 2551 /* SACKs data beyond the current hole */ 2552 cur->dups++; 2553 if (((sack.end - cur->end)/tp->t_maxseg) >= 2554 tcprexmtthresh) 2555 cur->dups = tcprexmtthresh; 2556 p = cur; 2557 cur = cur->next; 2558 continue; 2559 } 2560 if (SEQ_LEQ(sack.start, cur->start)) { 2561 /* Data acks at least the beginning of hole */ 2562 #if defined(TCP_SACK) && defined(TCP_FACK) 2563 if (SEQ_GT(sack.end, cur->rxmit)) 2564 tp->retran_data -= 2565 tcp_seq_subtract(cur->rxmit, 2566 cur->start); 2567 else 2568 tp->retran_data -= 2569 tcp_seq_subtract(sack.end, 2570 cur->start); 2571 #endif /* TCP_FACK */ 2572 if (SEQ_GEQ(sack.end, cur->end)) { 2573 /* Acks entire hole, so delete hole */ 2574 if (p != cur) { 2575 p->next = cur->next; 2576 pool_put(&sackhl_pool, cur); 2577 cur = p->next; 2578 } else { 2579 cur = cur->next; 2580 pool_put(&sackhl_pool, p); 2581 p = cur; 2582 tp->snd_holes = p; 2583 } 2584 tp->snd_numholes--; 2585 continue; 2586 } 2587 /* otherwise, move start of hole forward */ 2588 cur->start = sack.end; 2589 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2590 p = cur; 2591 cur = cur->next; 2592 continue; 2593 } 2594 /* move end of hole backward */ 2595 if (SEQ_GEQ(sack.end, cur->end)) { 2596 #if defined(TCP_SACK) && defined(TCP_FACK) 2597 if (SEQ_GT(cur->rxmit, sack.start)) 2598 tp->retran_data -= 2599 tcp_seq_subtract(cur->rxmit, 2600 sack.start); 2601 #endif /* TCP_FACK */ 2602 cur->end = sack.start; 2603 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2604 cur->dups++; 2605 if (((sack.end - cur->end)/tp->t_maxseg) >= 2606 tcprexmtthresh) 2607 cur->dups = tcprexmtthresh; 2608 p = cur; 2609 cur = cur->next; 2610 continue; 2611 } 2612 if (SEQ_LT(cur->start, sack.start) && 2613 SEQ_GT(cur->end, sack.end)) { 2614 /* 2615 * ACKs some data in middle of a hole; need to 2616 * split current hole 2617 */ 2618 temp = (struct sackhole *) 2619 pool_get(&sackhl_pool, PR_NOWAIT); 2620 if (temp == NULL) 2621 goto done; /* ENOBUFS */ 2622 #if defined(TCP_SACK) && defined(TCP_FACK) 2623 if (SEQ_GT(cur->rxmit, sack.end)) 2624 tp->retran_data -= 2625 tcp_seq_subtract(sack.end, 2626 sack.start); 2627 else if (SEQ_GT(cur->rxmit, sack.start)) 2628 tp->retran_data -= 2629 tcp_seq_subtract(cur->rxmit, 2630 sack.start); 2631 #endif /* TCP_FACK */ 2632 temp->next = cur->next; 2633 temp->start = sack.end; 2634 temp->end = cur->end; 2635 temp->dups = cur->dups; 2636 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2637 cur->end = sack.start; 2638 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2639 cur->dups++; 2640 if (((sack.end - cur->end)/tp->t_maxseg) >= 2641 tcprexmtthresh) 2642 cur->dups = tcprexmtthresh; 2643 cur->next = temp; 2644 p = temp; 2645 cur = p->next; 2646 tp->snd_numholes++; 2647 } 2648 } 2649 /* At this point, p points to the last hole on the list */ 2650 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2651 /* 2652 * Need to append new hole at end. 2653 * Last hole is p (and it's not NULL). 
2654 */ 2655 temp = (struct sackhole *) 2656 pool_get(&sackhl_pool, PR_NOWAIT); 2657 if (temp == NULL) 2658 goto done; /* ENOBUFS */ 2659 temp->start = tp->rcv_lastsack; 2660 temp->end = sack.start; 2661 temp->dups = min(tcprexmtthresh, 2662 ((sack.end - sack.start)/tp->t_maxseg)); 2663 if (temp->dups < 1) 2664 temp->dups = 1; 2665 temp->rxmit = temp->start; 2666 temp->next = 0; 2667 p->next = temp; 2668 tp->rcv_lastsack = sack.end; 2669 tp->snd_numholes++; 2670 } 2671 } 2672 done: 2673 #if defined(TCP_SACK) && defined(TCP_FACK) 2674 /* 2675 * Update retran_data and snd_awnd. Go through the list of 2676 * holes. Increment retran_data by (hole->rxmit - hole->start). 2677 */ 2678 tp->retran_data = 0; 2679 cur = tp->snd_holes; 2680 while (cur) { 2681 tp->retran_data += cur->rxmit - cur->start; 2682 cur = cur->next; 2683 } 2684 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + 2685 tp->retran_data; 2686 #endif /* TCP_FACK */ 2687 2688 return; 2689 } 2690 2691 /* 2692 * Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if 2693 * it is completely acked; otherwise, tcp_sack_option(), called from 2694 * tcp_dooptions(), will fix up the hole. 2695 */ 2696 void 2697 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2698 { 2699 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2700 /* max because this could be an older ack just arrived */ 2701 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2702 th->th_ack : tp->snd_una; 2703 struct sackhole *cur = tp->snd_holes; 2704 struct sackhole *prev; 2705 while (cur) 2706 if (SEQ_LEQ(cur->end, lastack)) { 2707 prev = cur; 2708 cur = cur->next; 2709 pool_put(&sackhl_pool, prev); 2710 tp->snd_numholes--; 2711 } else if (SEQ_LT(cur->start, lastack)) { 2712 cur->start = lastack; 2713 if (SEQ_LT(cur->rxmit, cur->start)) 2714 cur->rxmit = cur->start; 2715 break; 2716 } else 2717 break; 2718 tp->snd_holes = cur; 2719 } 2720 } 2721 2722 /* 2723 * Delete all receiver-side SACK information. 2724 */ 2725 void 2726 tcp_clean_sackreport(struct tcpcb *tp) 2727 { 2728 int i; 2729 2730 tp->rcv_numsacks = 0; 2731 for (i = 0; i < MAX_SACK_BLKS; i++) 2732 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2733 2734 } 2735 2736 /* 2737 * Checks for partial ack. If partial ack arrives, turn off retransmission 2738 * timer, deflate the window, do not clear tp->t_dupacks, and return 1. 2739 * If the ack advances at least to tp->snd_last, return 0. 2740 */ 2741 int 2742 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2743 { 2744 if (SEQ_LT(th->th_ack, tp->snd_last)) { 2745 /* Turn off retx. timer (will start again next segment) */ 2746 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2747 tp->t_rtttime = 0; 2748 #ifndef TCP_FACK 2749 /* 2750 * Partial window deflation. This statement relies on the 2751 * fact that tp->snd_una has not been updated yet. In FACK 2752 * hold snd_cwnd constant during fast recovery. 2753 */ 2754 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2755 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2756 tp->snd_cwnd += tp->t_maxseg; 2757 } else 2758 tp->snd_cwnd = tp->t_maxseg; 2759 #endif 2760 return (1); 2761 } 2762 return (0); 2763 } 2764 #endif /* TCP_SACK */ 2765 2766 /* 2767 * Pull the out-of-band byte out of a segment so 2768 * it doesn't appear in the user's data queue. 2769 * It is still reflected in the segment length for 2770 * sequencing purposes.
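 * E.g., a 100-byte segment whose urgent byte is its last octet still advances rcv_nxt by 100; only 99 bytes remain for the receive queue once the OOB byte is pulled out below.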
2771 */ 2772 void 2773 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2774 { 2775 int cnt = off + urgent - 1; 2776 2777 while (cnt >= 0) { 2778 if (m->m_len > cnt) { 2779 char *cp = mtod(m, caddr_t) + cnt; 2780 struct tcpcb *tp = sototcpcb(so); 2781 2782 tp->t_iobc = *cp; 2783 tp->t_oobflags |= TCPOOB_HAVEDATA; 2784 memmove(cp, cp + 1, m->m_len - cnt - 1); 2785 m->m_len--; 2786 return; 2787 } 2788 cnt -= m->m_len; 2789 m = m->m_next; 2790 if (m == NULL) 2791 break; 2792 } 2793 panic("tcp_pulloutofband"); 2794 } 2795 2796 /* 2797 * Collect new round-trip time estimate 2798 * and update averages and current timeout. 2799 */ 2800 void 2801 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2802 { 2803 short delta; 2804 short rttmin; 2805 2806 if (rtt < 0) 2807 rtt = 0; 2808 else if (rtt > TCP_RTT_MAX) 2809 rtt = TCP_RTT_MAX; 2810 2811 tcpstat_inc(tcps_rttupdated); 2812 if (tp->t_srtt != 0) { 2813 /* 2814 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2815 * after the binary point (scaled by 4), whereas 2816 * srtt is stored as fixed point with 5 bits after the 2817 * binary point (i.e., scaled by 32). The following magic 2818 * is equivalent to the smoothing algorithm in rfc793 with 2819 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2820 * point). 2821 */ 2822 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2823 (tp->t_srtt >> TCP_RTT_SHIFT); 2824 if ((tp->t_srtt += delta) <= 0) 2825 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2826 /* 2827 * We accumulate a smoothed rtt variance (actually, a 2828 * smoothed mean difference), then set the retransmit 2829 * timer to smoothed rtt + 4 times the smoothed variance. 2830 * rttvar is stored as fixed point with 4 bits after the 2831 * binary point (scaled by 16). The following is 2832 * equivalent to rfc793 smoothing with an alpha of .75 2833 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2834 * rfc793's wired-in beta. 2835 */ 2836 if (delta < 0) 2837 delta = -delta; 2838 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2839 if ((tp->t_rttvar += delta) <= 0) 2840 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2841 } else { 2842 /* 2843 * No rtt measurement yet - use the unsmoothed rtt. 2844 * Set the variance to half the rtt (so our first 2845 * retransmit happens at 3*rtt). 2846 */ 2847 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2848 tp->t_rttvar = (rtt + 1) << 2849 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2850 } 2851 tp->t_rtttime = 0; 2852 tp->t_rxtshift = 0; 2853 2854 /* 2855 * the retransmit should happen at rtt + 4 * rttvar. 2856 * Because of the way we do the smoothing, srtt and rttvar 2857 * will each average +1/2 tick of bias. When we compute 2858 * the retransmit timer, we want 1/2 tick of rounding and 2859 * 1 extra tick because of +-1/2 tick uncertainty in the 2860 * firing of the timer. The bias will give us exactly the 2861 * 1.5 tick we need. But, because the bias is 2862 * statistical, we have to test that we don't drop below 2863 * the minimum feasible timer (which is 2 ticks). 2864 */ 2865 rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX); 2866 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2867 2868 /* 2869 * We received an ack for a packet that wasn't retransmitted; 2870 * it is probably safe to discard any error indications we've 2871 * received recently. This isn't quite right, but close enough 2872 * for now (a route might have failed after we sent a segment, 2873 * and the return path might not be symmetrical). 
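 * (tcp_notify() parks such transient errors in t_softerror, and they only reach the user if the connection later times out, so clearing the field here merely forgets errors this ACK suggests were spurious.)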
2874 */ 2875 tp->t_softerror = 0; 2876 } 2877 2878 /* 2879 * Determine a reasonable value for maxseg size. 2880 * If the route is known, check route for mtu. 2881 * If none, use an mss that can be handled on the outgoing 2882 * interface without forcing IP to fragment; if bigger than 2883 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2884 * to utilize large mbufs. If no route is found, route has no mtu, 2885 * or the destination isn't local, use a default, hopefully conservative 2886 * size (usually 512 or the default IP max size, but no more than the mtu 2887 * of the interface), as we can't discover anything about intervening 2888 * gateways or networks. We also initialize the congestion/slow start 2889 * window to be a single segment if the destination isn't local. 2890 * While looking at the routing entry, we also initialize other path-dependent 2891 * parameters from pre-set or cached values in the routing entry. 2892 * 2893 * Also take into account the space needed for options that we 2894 * send regularly. Make maxseg shorter by that amount to assure 2895 * that we can send maxseg amount of data even when the options 2896 * are present. Store the upper limit of the length of options plus 2897 * data in maxopd. 2898 * 2899 * NOTE: offer == -1 indicates that the maxseg size changed due to 2900 * Path MTU discovery. 2901 */ 2902 int 2903 tcp_mss(struct tcpcb *tp, int offer) 2904 { 2905 struct rtentry *rt; 2906 struct ifnet *ifp = NULL; 2907 int mss, mssopt; 2908 int iphlen; 2909 struct inpcb *inp; 2910 2911 inp = tp->t_inpcb; 2912 2913 mssopt = mss = tcp_mssdflt; 2914 2915 rt = in_pcbrtentry(inp); 2916 2917 if (rt == NULL) 2918 goto out; 2919 2920 ifp = if_get(rt->rt_ifidx); 2921 if (ifp == NULL) 2922 goto out; 2923 2924 switch (tp->pf) { 2925 #ifdef INET6 2926 case AF_INET6: 2927 iphlen = sizeof(struct ip6_hdr); 2928 break; 2929 #endif 2930 case AF_INET: 2931 iphlen = sizeof(struct ip); 2932 break; 2933 default: 2934 /* the family does not support path MTU discovery */ 2935 goto out; 2936 } 2937 2938 /* 2939 * if there's an mtu associated with the route and we support 2940 * path MTU discovery for the underlying protocol family, use it. 2941 */ 2942 if (rt->rt_mtu) { 2943 /* 2944 * One may wish to lower MSS to take into account options, 2945 * especially security-related options. 2946 */ 2947 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2948 /* 2949 * RFC2460 section 5, last paragraph: if path MTU is 2950 * smaller than 1280, use 1280 as packet size and 2951 * attach fragment header. 2952 */ 2953 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2954 sizeof(struct tcphdr); 2955 } else { 2956 mss = rt->rt_mtu - iphlen - 2957 sizeof(struct tcphdr); 2958 } 2959 } else if (ifp->if_flags & IFF_LOOPBACK) { 2960 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2961 } else if (tp->pf == AF_INET) { 2962 if (ip_mtudisc) 2963 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2964 } 2965 #ifdef INET6 2966 else if (tp->pf == AF_INET6) { 2967 /* 2968 * for IPv6, path MTU discovery is always turned on, 2969 * or the node must use packet size <= 1280. 2970 */ 2971 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2972 } 2973 #endif /* INET6 */ 2974 2975 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2976 if (offer != -1) { 2977 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2978 mssopt = max(tcp_mssdflt, mssopt); 2979 } 2980 out: 2981 if_put(ifp); 2982 /* 2983 * The current mss, t_maxseg, is initialized to the default value. 
2984 * If we compute a smaller value, reduce the current mss. 2985 * If we compute a larger value, return it for use in sending 2986 * a max seg size option, but don't store it for use 2987 * unless we received an offer at least that large from peer. 2988 * 2989 * However, do not accept offers lower than the minimum of 2990 * the interface MTU and 216. 2991 */ 2992 if (offer > 0) 2993 tp->t_peermss = offer; 2994 if (tp->t_peermss) 2995 mss = min(mss, max(tp->t_peermss, 216)); 2996 2997 /* sanity - at least max opt. space */ 2998 mss = max(mss, 64); 2999 3000 /* 3001 * maxopd stores the maximum length of data AND options 3002 * in a segment; maxseg is the amount of data in a normal 3003 * segment. We need to store this value (maxopd) apart 3004 * from maxseg, because now every segment carries options 3005 * and thus we normally have somewhat less data in segments. 3006 */ 3007 tp->t_maxopd = mss; 3008 3009 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3010 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3011 mss -= TCPOLEN_TSTAMP_APPA; 3012 #ifdef TCP_SIGNATURE 3013 if (tp->t_flags & TF_SIGNATURE) 3014 mss -= TCPOLEN_SIGLEN; 3015 #endif 3016 3017 if (offer == -1) { 3018 /* mss changed due to Path MTU discovery */ 3019 tp->t_flags &= ~TF_PMTUD_PEND; 3020 tp->t_pmtud_mtu_sent = 0; 3021 tp->t_pmtud_mss_acked = 0; 3022 if (mss < tp->t_maxseg) { 3023 /* 3024 * Follow suggestion in RFC 2414 to reduce the 3025 * congestion window by the ratio of the old 3026 * segment size to the new segment size. 3027 */ 3028 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 3029 mss, mss); 3030 } 3031 } else if (tcp_do_rfc3390 == 2) { 3032 /* increase initial window */ 3033 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 3034 } else if (tcp_do_rfc3390) { 3035 /* increase initial window */ 3036 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 3037 } else 3038 tp->snd_cwnd = mss; 3039 3040 tp->t_maxseg = mss; 3041 3042 return (offer != -1 ? mssopt : mss); 3043 } 3044 3045 u_int 3046 tcp_hdrsz(struct tcpcb *tp) 3047 { 3048 u_int hlen; 3049 3050 switch (tp->pf) { 3051 #ifdef INET6 3052 case AF_INET6: 3053 hlen = sizeof(struct ip6_hdr); 3054 break; 3055 #endif 3056 case AF_INET: 3057 hlen = sizeof(struct ip); 3058 break; 3059 default: 3060 hlen = 0; 3061 break; 3062 } 3063 hlen += sizeof(struct tcphdr); 3064 3065 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3066 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 3067 hlen += TCPOLEN_TSTAMP_APPA; 3068 #ifdef TCP_SIGNATURE 3069 if (tp->t_flags & TF_SIGNATURE) 3070 hlen += TCPOLEN_SIGLEN; 3071 #endif 3072 return (hlen); 3073 } 3074 3075 /* 3076 * Set connection variables based on the effective MSS. 3077 * We are passed the TCPCB for the actual connection. If we 3078 * are the server, we are called by the compressed state engine 3079 * when the 3-way handshake is complete. If we are the client, 3080 * we are called when we receive the SYN,ACK from the server. 3081 * 3082 * NOTE: The t_maxseg value must be initialized in the TCPCB 3083 * before this routine is called! 
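 * Apart from clamping the MSS to the send buffer size, the work here is rounding both socket buffers up to a multiple of the MSS (capped at sb_max) so that an integral number of full-sized segments fits in each.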
3084 */ 3085 void 3086 tcp_mss_update(struct tcpcb *tp) 3087 { 3088 int mss; 3089 u_long bufsize; 3090 struct rtentry *rt; 3091 struct socket *so; 3092 3093 so = tp->t_inpcb->inp_socket; 3094 mss = tp->t_maxseg; 3095 3096 rt = in_pcbrtentry(tp->t_inpcb); 3097 3098 if (rt == NULL) 3099 return; 3100 3101 bufsize = so->so_snd.sb_hiwat; 3102 if (bufsize < mss) { 3103 mss = bufsize; 3104 /* Update t_maxseg and t_maxopd */ 3105 tcp_mss(tp, mss); 3106 } else { 3107 bufsize = roundup(bufsize, mss); 3108 if (bufsize > sb_max) 3109 bufsize = sb_max; 3110 (void)sbreserve(so, &so->so_snd, bufsize); 3111 } 3112 3113 bufsize = so->so_rcv.sb_hiwat; 3114 if (bufsize > mss) { 3115 bufsize = roundup(bufsize, mss); 3116 if (bufsize > sb_max) 3117 bufsize = sb_max; 3118 (void)sbreserve(so, &so->so_rcv, bufsize); 3119 } 3120 3121 } 3122 3123 #if defined (TCP_SACK) 3124 /* 3125 * Checks for partial ack. If partial ack arrives, force the retransmission 3126 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 3127 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to 3128 * be started again. If the ack advances at least to tp->snd_last, return 0. 3129 */ 3130 int 3131 tcp_newreno(struct tcpcb *tp, struct tcphdr *th) 3132 { 3133 if (SEQ_LT(th->th_ack, tp->snd_last)) { 3134 /* 3135 * snd_una has not been updated and the socket send buffer 3136 * not yet drained of the acked data, so we have to leave 3137 * snd_una as it was to get the correct data offset in 3138 * tcp_output(). 3139 */ 3140 tcp_seq onxt = tp->snd_nxt; 3141 u_long ocwnd = tp->snd_cwnd; 3142 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3143 tp->t_rtttime = 0; 3144 tp->snd_nxt = th->th_ack; 3145 /* 3146 * Set snd_cwnd to one segment beyond acknowledged offset 3147 * (tp->snd_una not yet updated when this function is called) 3148 */ 3149 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3150 (void) tcp_output(tp); 3151 tp->snd_cwnd = ocwnd; 3152 if (SEQ_GT(onxt, tp->snd_nxt)) 3153 tp->snd_nxt = onxt; 3154 /* 3155 * Partial window deflation. Relies on fact that tp->snd_una 3156 * not updated yet. 3157 */ 3158 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3159 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3160 else 3161 tp->snd_cwnd = 0; 3162 tp->snd_cwnd += tp->t_maxseg; 3163 3164 return 1; 3165 } 3166 return 0; 3167 } 3168 #endif /* TCP_SACK */ 3169 3170 int 3171 tcp_mss_adv(struct mbuf *m, int af) 3172 { 3173 int mss = 0; 3174 int iphlen; 3175 struct ifnet *ifp = NULL; 3176 3177 if (m && (m->m_flags & M_PKTHDR)) 3178 ifp = if_get(m->m_pkthdr.ph_ifidx); 3179 3180 switch (af) { 3181 case AF_INET: 3182 if (ifp != NULL) 3183 mss = ifp->if_mtu; 3184 iphlen = sizeof(struct ip); 3185 break; 3186 #ifdef INET6 3187 case AF_INET6: 3188 if (ifp != NULL) 3189 mss = ifp->if_mtu; 3190 iphlen = sizeof(struct ip6_hdr); 3191 break; 3192 #endif 3193 default: 3194 unhandled_af(af); 3195 } 3196 if_put(ifp); 3197 mss = mss - iphlen - sizeof(struct tcphdr); 3198 return (max(mss, tcp_mssdflt)); 3199 } 3200 3201 /* 3202 * TCP compressed state engine. Currently used to hold compressed 3203 * state for SYN_RECEIVED. 
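 * Instead of allocating a socket and tcpcb per embryonic connection, each pending handshake lives in a small syn_cache entry; only when the final ACK of the handshake arrives does syn_cache_get() inflate it into a real connection.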
3204 */ 3205 3206 /* syn hash parameters */ 3207 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; 3208 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 3209 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 3210 int tcp_syn_use_limit = 100000; 3211 3212 struct syn_cache_set tcp_syn_cache[2]; 3213 int tcp_syn_cache_active; 3214 3215 #define SYN_HASH(sa, sp, dp, rand) \ 3216 (((sa)->s_addr ^ (rand)[0]) * \ 3217 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3218 #ifndef INET6 3219 #define SYN_HASHALL(hash, src, dst, rand) \ 3220 do { \ 3221 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3222 satosin(src)->sin_port, \ 3223 satosin(dst)->sin_port, (rand)); \ 3224 } while (/*CONSTCOND*/ 0) 3225 #else 3226 #define SYN_HASH6(sa, sp, dp, rand) \ 3227 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3228 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3229 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3230 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3231 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3232 3233 #define SYN_HASHALL(hash, src, dst, rand) \ 3234 do { \ 3235 switch ((src)->sa_family) { \ 3236 case AF_INET: \ 3237 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3238 satosin(src)->sin_port, \ 3239 satosin(dst)->sin_port, (rand)); \ 3240 break; \ 3241 case AF_INET6: \ 3242 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3243 satosin6(src)->sin6_port, \ 3244 satosin6(dst)->sin6_port, (rand)); \ 3245 break; \ 3246 default: \ 3247 hash = 0; \ 3248 } \ 3249 } while (/*CONSTCOND*/0) 3250 #endif /* INET6 */ 3251 3252 void 3253 syn_cache_rm(struct syn_cache *sc) 3254 { 3255 sc->sc_flags |= SCF_DEAD; 3256 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3257 sc->sc_tp = NULL; 3258 LIST_REMOVE(sc, sc_tpq); 3259 sc->sc_buckethead->sch_length--; 3260 timeout_del(&sc->sc_timer); 3261 sc->sc_set->scs_count--; 3262 } 3263 3264 void 3265 syn_cache_put(struct syn_cache *sc) 3266 { 3267 m_free(sc->sc_ipopts); 3268 if (sc->sc_route4.ro_rt != NULL) { 3269 rtfree(sc->sc_route4.ro_rt); 3270 sc->sc_route4.ro_rt = NULL; 3271 } 3272 timeout_set(&sc->sc_timer, syn_cache_reaper, sc); 3273 timeout_add(&sc->sc_timer, 0); 3274 } 3275 3276 struct pool syn_cache_pool; 3277 3278 /* 3279 * We don't estimate RTT with SYNs, so each packet starts with the default 3280 * RTT and each timer step has a fixed timeout value. 3281 */ 3282 #define SYN_CACHE_TIMER_ARM(sc) \ 3283 do { \ 3284 TCPT_RANGESET((sc)->sc_rxtcur, \ 3285 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3286 TCPTV_REXMTMAX); \ 3287 if (!timeout_initialized(&(sc)->sc_timer)) \ 3288 timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \ 3289 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \ 3290 } while (/*CONSTCOND*/0) 3291 3292 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate 3293 3294 void 3295 syn_cache_init(void) 3296 { 3297 int i; 3298 3299 /* Initialize the hash buckets. */ 3300 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3301 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3302 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3303 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3304 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3305 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3306 for (i = 0; i < tcp_syn_hash_size; i++) { 3307 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3308 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3309 } 3310 3311 /* Initialize the syn cache pool. 
*/ 3312 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3313 0, "syncache", NULL); 3314 } 3315 3316 void 3317 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3318 { 3319 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active]; 3320 struct syn_cache_head *scp; 3321 struct syn_cache *sc2; 3322 int i; 3323 3324 NET_ASSERT_LOCKED(); 3325 3326 /* 3327 * If there are no entries in the hash table, reinitialize 3328 * the hash secrets. To avoid useless cache swaps and 3329 * reinitialization, use it until the limit is reached. 3330 * An empty cache is also the opportunity to resize the hash. 3331 */ 3332 if (set->scs_count == 0 && set->scs_use <= 0) { 3333 set->scs_use = tcp_syn_use_limit; 3334 if (set->scs_size != tcp_syn_hash_size) { 3335 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3336 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3337 if (scp == NULL) { 3338 /* Try again next time. */ 3339 set->scs_use = 0; 3340 } else { 3341 free(set->scs_buckethead, M_SYNCACHE, 3342 set->scs_size * 3343 sizeof(struct syn_cache_head)); 3344 set->scs_buckethead = scp; 3345 set->scs_size = tcp_syn_hash_size; 3346 for (i = 0; i < tcp_syn_hash_size; i++) 3347 TAILQ_INIT(&scp[i].sch_bucket); 3348 } 3349 } 3350 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3351 tcpstat_inc(tcps_sc_seedrandom); 3352 } 3353 3354 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3355 set->scs_random); 3356 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3357 sc->sc_buckethead = scp; 3358 3359 /* 3360 * Make sure that we don't overflow the per-bucket 3361 * limit or the total cache size limit. 3362 */ 3363 if (scp->sch_length >= tcp_syn_bucket_limit) { 3364 tcpstat_inc(tcps_sc_bucketoverflow); 3365 /* 3366 * Someone might attack our bucket hash function. Reseed 3367 * with random as soon as the passive syn cache gets empty. 3368 */ 3369 set->scs_use = 0; 3370 /* 3371 * The bucket is full. Toss the oldest element in the 3372 * bucket. This will be the first entry in the bucket. 3373 */ 3374 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3375 #ifdef DIAGNOSTIC 3376 /* 3377 * This should never happen; we should always find an 3378 * entry in our bucket. 3379 */ 3380 if (sc2 == NULL) 3381 panic("%s: bucketoverflow: impossible", __func__); 3382 #endif 3383 syn_cache_rm(sc2); 3384 syn_cache_put(sc2); 3385 } else if (set->scs_count >= tcp_syn_cache_limit) { 3386 struct syn_cache_head *scp2, *sce; 3387 3388 tcpstat_inc(tcps_sc_overflowed); 3389 /* 3390 * The cache is full. Toss the oldest entry in the 3391 * first non-empty bucket we can find. 3392 * 3393 * XXX We would really like to toss the oldest 3394 * entry in the cache, but we hope that this 3395 * condition doesn't happen very often. 3396 */ 3397 scp2 = scp; 3398 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3399 sce = &set->scs_buckethead[set->scs_size]; 3400 for (++scp2; scp2 != scp; scp2++) { 3401 if (scp2 >= sce) 3402 scp2 = &set->scs_buckethead[0]; 3403 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3404 break; 3405 } 3406 #ifdef DIAGNOSTIC 3407 /* 3408 * This should never happen; we should always find a 3409 * non-empty bucket. 3410 */ 3411 if (scp2 == scp) 3412 panic("%s: cacheoverflow: impossible", 3413 __func__); 3414 #endif 3415 } 3416 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3417 syn_cache_rm(sc2); 3418 syn_cache_put(sc2); 3419 } 3420 3421 /* 3422 * Initialize the entry's timer.
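 * SYN_CACHE_TIMER_ARM() above derives sc_rxtcur from TCPTV_SRTTDFLT scaled by tcp_backoff[sc_rxtshift] and clamped to [TCPTV_MIN, TCPTV_REXMTMAX], so the retransmit interval backs off with each timeout.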
3423 */ 3424 sc->sc_rxttot = 0; 3425 sc->sc_rxtshift = 0; 3426 SYN_CACHE_TIMER_ARM(sc); 3427 3428 /* Link it from tcpcb entry */ 3429 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3430 3431 /* Put it into the bucket. */ 3432 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3433 scp->sch_length++; 3434 sc->sc_set = set; 3435 set->scs_count++; 3436 set->scs_use--; 3437 3438 tcpstat_inc(tcps_sc_added); 3439 3440 /* 3441 * If the active cache has exceeded its use limit and 3442 * the passive syn cache is empty, exchange their roles. 3443 */ 3444 if (set->scs_use <= 0 && 3445 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3446 tcp_syn_cache_active = !tcp_syn_cache_active; 3447 } 3448 3449 /* 3450 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3451 * If we have retransmitted an entry the maximum number of times, expire 3452 * that entry. 3453 */ 3454 void 3455 syn_cache_timer(void *arg) 3456 { 3457 struct syn_cache *sc = arg; 3458 3459 NET_LOCK(); 3460 if (sc->sc_flags & SCF_DEAD) 3461 goto out; 3462 3463 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3464 /* Drop it -- too many retransmissions. */ 3465 goto dropit; 3466 } 3467 3468 /* 3469 * Compute the total amount of time this entry has 3470 * been on a queue. If this entry has been on longer 3471 * than the keep alive timer would allow, expire it. 3472 */ 3473 sc->sc_rxttot += sc->sc_rxtcur; 3474 if (sc->sc_rxttot >= tcptv_keep_init) 3475 goto dropit; 3476 3477 tcpstat_inc(tcps_sc_retransmitted); 3478 (void) syn_cache_respond(sc, NULL); 3479 3480 /* Advance the timer back-off. */ 3481 sc->sc_rxtshift++; 3482 SYN_CACHE_TIMER_ARM(sc); 3483 3484 out: 3485 NET_UNLOCK(); 3486 return; 3487 3488 dropit: 3489 tcpstat_inc(tcps_sc_timed_out); 3490 syn_cache_rm(sc); 3491 syn_cache_put(sc); 3492 NET_UNLOCK(); 3493 } 3494 3495 void 3496 syn_cache_reaper(void *arg) 3497 { 3498 struct syn_cache *sc = arg; 3499 3500 pool_put(&syn_cache_pool, (sc)); 3501 return; 3502 } 3503 3504 /* 3505 * Remove the syn cache entries created by the specified tcb entry, 3506 * since it does not make sense to keep them 3507 * (if there's no tcb entry, a syn cache entry will never be used). 3508 */ 3509 void 3510 syn_cache_cleanup(struct tcpcb *tp) 3511 { 3512 struct syn_cache *sc, *nsc; 3513 3514 NET_ASSERT_LOCKED(); 3515 3516 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) { 3517 #ifdef DIAGNOSTIC 3518 if (sc->sc_tp != tp) 3519 panic("invalid sc_tp in syn_cache_cleanup"); 3520 #endif 3521 syn_cache_rm(sc); 3522 syn_cache_put(sc); 3523 } 3524 /* just for safety */ 3525 LIST_INIT(&tp->t_sc); 3526 } 3527 3528 /* 3529 * Find an entry in the syn cache. 3530 */ 3531 struct syn_cache * 3532 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst, 3533 struct syn_cache_head **headp, u_int rtableid) 3534 { 3535 struct syn_cache_set *sets[2]; 3536 struct syn_cache *sc; 3537 struct syn_cache_head *scp; 3538 u_int32_t hash; 3539 int i; 3540 3541 NET_ASSERT_LOCKED(); 3542 3543 /* Check the active cache first, the passive cache is likely empty.
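 * (After a role swap in syn_cache_insert(), the now-passive set takes no new entries and drains as its entries complete or expire, so it is empty most of the time.)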
*/ 3544 sets[0] = &tcp_syn_cache[tcp_syn_cache_active]; 3545 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active]; 3546 for (i = 0; i < 2; i++) { 3547 if (sets[i]->scs_count == 0) 3548 continue; 3549 SYN_HASHALL(hash, src, dst, sets[i]->scs_random); 3550 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size]; 3551 *headp = scp; 3552 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) { 3553 if (sc->sc_hash != hash) 3554 continue; 3555 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3556 !bcmp(&sc->sc_dst, dst, dst->sa_len) && 3557 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) 3558 return (sc); 3559 } 3560 } 3561 return (NULL); 3562 } 3563 3564 /* 3565 * This function gets called when we receive an ACK for a 3566 * socket in the LISTEN state. We look up the connection 3567 * in the syn cache, and if it's there, we pull it out of 3568 * the cache and turn it into a full-blown connection in 3569 * the SYN-RECEIVED state. 3570 * 3571 * The return values may not be immediately obvious, and their effects 3572 * can be subtle, so here they are: 3573 * 3574 * NULL SYN was not found in cache; caller should drop the 3575 * packet and send an RST. 3576 * 3577 * -1 We were unable to create the new connection, and are 3578 * aborting it. An ACK,RST is being sent to the peer 3579 * (unless we got screwy sequence numbers; see below), 3580 * because the 3-way handshake has been completed. Caller 3581 * should not free the mbuf, since we may be using it. If 3582 * we are not, we will free it. 3583 * 3584 * Otherwise, the return value is a pointer to the new socket 3585 * associated with the connection. 3586 */ 3587 struct socket * 3588 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3589 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m) 3590 { 3591 struct syn_cache *sc; 3592 struct syn_cache_head *scp; 3593 struct inpcb *inp, *oldinp; 3594 struct tcpcb *tp = NULL; 3595 struct mbuf *am; 3596 struct socket *oso; 3597 #if NPF > 0 3598 struct pf_divert *divert = NULL; 3599 #endif 3600 3601 NET_ASSERT_LOCKED(); 3602 3603 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3604 if (sc == NULL) 3605 return (NULL); 3606 3607 /* 3608 * Verify the sequence and ack numbers. Try getting the correct 3609 * response again. 3610 */ 3611 if ((th->th_ack != sc->sc_iss + 1) || 3612 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3613 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3614 (void) syn_cache_respond(sc, m); 3615 return ((struct socket *)(-1)); 3616 } 3617 3618 /* Remove this cache entry */ 3619 syn_cache_rm(sc); 3620 3621 /* 3622 * Ok, create the full blown connection, and set things up 3623 * as they would have been set up if we had created the 3624 * connection when the SYN arrived. If we can't create 3625 * the connection, abort it. 3626 */ 3627 oso = so; 3628 so = sonewconn(so, SS_ISCONNECTED); 3629 if (so == NULL) 3630 goto resetandabort; 3631 3632 oldinp = sotoinpcb(oso); 3633 inp = sotoinpcb(so); 3634 3635 #ifdef IPSEC 3636 /* 3637 * We need to copy the required security levels 3638 * from the old pcb. Ditto for any other 3639 * IPsec-related information. 3640 */ 3641 memcpy(inp->inp_seclevel, oldinp->inp_seclevel, 3642 sizeof(oldinp->inp_seclevel)); 3643 #endif /* IPSEC */ 3644 #ifdef INET6 3645 /* 3646 * inp still has the OLD in_pcb stuff, set the 3647 * v6-related flags on the new guy, too.
3648 */ 3649 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3650 if (inp->inp_flags & INP_IPV6) { 3651 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3652 inp->inp_hops = oldinp->inp_hops; 3653 } else 3654 #endif /* INET6 */ 3655 { 3656 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3657 } 3658 3659 #if NPF > 0 3660 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED && 3661 (divert = pf_find_divert(m)) != NULL) 3662 inp->inp_rtableid = divert->rdomain; 3663 else 3664 #endif 3665 /* inherit rtable from listening socket */ 3666 inp->inp_rtableid = sc->sc_rtableid; 3667 3668 inp->inp_lport = th->th_dport; 3669 switch (src->sa_family) { 3670 #ifdef INET6 3671 case AF_INET6: 3672 inp->inp_laddr6 = satosin6(dst)->sin6_addr; 3673 break; 3674 #endif /* INET6 */ 3675 case AF_INET: 3676 inp->inp_laddr = satosin(dst)->sin_addr; 3677 inp->inp_options = ip_srcroute(m); 3678 if (inp->inp_options == NULL) { 3679 inp->inp_options = sc->sc_ipopts; 3680 sc->sc_ipopts = NULL; 3681 } 3682 break; 3683 } 3684 in_pcbrehash(inp); 3685 3686 /* 3687 * Give the new socket our cached route reference. 3688 */ 3689 if (src->sa_family == AF_INET) 3690 inp->inp_route = sc->sc_route4; /* struct assignment */ 3691 #ifdef INET6 3692 else 3693 inp->inp_route6 = sc->sc_route6; 3694 #endif 3695 sc->sc_route4.ro_rt = NULL; 3696 3697 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3698 if (am == NULL) 3699 goto resetandabort; 3700 am->m_len = src->sa_len; 3701 memcpy(mtod(am, caddr_t), src, src->sa_len); 3702 3703 switch (src->sa_family) { 3704 case AF_INET: 3705 /* drop IPv4 packet to AF_INET6 socket */ 3706 if (inp->inp_flags & INP_IPV6) { 3707 (void) m_free(am); 3708 goto resetandabort; 3709 } 3710 if (in_pcbconnect(inp, am)) { 3711 (void) m_free(am); 3712 goto resetandabort; 3713 } 3714 break; 3715 #ifdef INET6 3716 case AF_INET6: 3717 if (in6_pcbconnect(inp, am)) { 3718 (void) m_free(am); 3719 goto resetandabort; 3720 } 3721 break; 3722 #endif 3723 } 3724 (void) m_free(am); 3725 3726 tp = intotcpcb(inp); 3727 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3728 if (sc->sc_request_r_scale != 15) { 3729 tp->requested_s_scale = sc->sc_requested_s_scale; 3730 tp->request_r_scale = sc->sc_request_r_scale; 3731 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3732 } 3733 if (sc->sc_flags & SCF_TIMESTAMP) 3734 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3735 3736 tp->t_template = tcp_template(tp); 3737 if (tp->t_template == 0) { 3738 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3739 so = NULL; 3740 m_freem(m); 3741 goto abort; 3742 } 3743 #ifdef TCP_SACK 3744 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; 3745 #endif 3746 3747 tp->ts_modulate = sc->sc_modulate; 3748 tp->ts_recent = sc->sc_timestamp; 3749 tp->iss = sc->sc_iss; 3750 tp->irs = sc->sc_irs; 3751 tcp_sendseqinit(tp); 3752 #if defined (TCP_SACK) || defined(TCP_ECN) 3753 tp->snd_last = tp->snd_una; 3754 #endif /* TCP_SACK */ 3755 #if defined(TCP_SACK) && defined(TCP_FACK) 3756 tp->snd_fack = tp->snd_una; 3757 tp->retran_data = 0; 3758 tp->snd_awnd = 0; 3759 #endif /* TCP_FACK */ 3760 #ifdef TCP_ECN 3761 if (sc->sc_flags & SCF_ECN_PERMIT) { 3762 tp->t_flags |= TF_ECN_PERMIT; 3763 tcpstat_inc(tcps_ecn_accepts); 3764 } 3765 #endif 3766 #ifdef TCP_SACK 3767 if (sc->sc_flags & SCF_SACK_PERMIT) 3768 tp->t_flags |= TF_SACK_PERMIT; 3769 #endif 3770 #ifdef TCP_SIGNATURE 3771 if (sc->sc_flags & SCF_SIGNATURE) 3772 tp->t_flags |= TF_SIGNATURE; 3773 #endif 3774 tcp_rcvseqinit(tp); 3775 tp->t_state = TCPS_SYN_RECEIVED; 3776 tp->t_rcvtime = tcp_now; 3777 
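/* Bound how long the embryonic connection may sit in SYN_RECEIVED waiting for the handshake to finish. */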
TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3778 tcpstat_inc(tcps_accepts); 3779 3780 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3781 if (sc->sc_peermaxseg) 3782 tcp_mss_update(tp); 3783 /* Reset initial window to 1 segment for retransmit */ 3784 if (sc->sc_rxtshift > 0) 3785 tp->snd_cwnd = tp->t_maxseg; 3786 tp->snd_wl1 = sc->sc_irs; 3787 tp->rcv_up = sc->sc_irs + 1; 3788 3789 /* 3790 * This is what would have happened in tcp_output() when 3791 * the SYN,ACK was sent. 3792 */ 3793 tp->snd_up = tp->snd_una; 3794 tp->snd_max = tp->snd_nxt = tp->iss+1; 3795 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3796 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3797 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3798 tp->last_ack_sent = tp->rcv_nxt; 3799 3800 tcpstat_inc(tcps_sc_completed); 3801 syn_cache_put(sc); 3802 return (so); 3803 3804 resetandabort: 3805 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3806 m->m_pkthdr.ph_rtableid); 3807 m_freem(m); 3808 abort: 3809 if (so != NULL) 3810 (void) soabort(so); 3811 syn_cache_put(sc); 3812 tcpstat_inc(tcps_sc_aborted); 3813 return ((struct socket *)(-1)); 3814 } 3815 3816 /* 3817 * This function is called when we get a RST for a 3818 * non-existent connection, so that we can see if the 3819 * connection is in the syn cache. If it is, zap it. 3820 */ 3821 3822 void 3823 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3824 u_int rtableid) 3825 { 3826 struct syn_cache *sc; 3827 struct syn_cache_head *scp; 3828 3829 NET_ASSERT_LOCKED(); 3830 3831 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3832 return; 3833 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3834 SEQ_GT(th->th_seq, sc->sc_irs + 1)) 3835 return; 3836 syn_cache_rm(sc); 3837 tcpstat_inc(tcps_sc_reset); 3838 syn_cache_put(sc); 3839 } 3840 3841 void 3842 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3843 u_int rtableid) 3844 { 3845 struct syn_cache *sc; 3846 struct syn_cache_head *scp; 3847 3848 NET_ASSERT_LOCKED(); 3849 3850 if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL) 3851 return; 3852 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3853 if (ntohl (th->th_seq) != sc->sc_iss) { 3854 return; 3855 } 3856 3857 /* 3858 * If we've retransmitted 3 times and this is our second error, 3859 * we remove the entry. Otherwise, we allow it to continue on. 3860 * This prevents us from incorrectly nuking an entry during a 3861 * spurious network outage. 3862 * 3863 * See tcp_notify(). 3864 */ 3865 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3866 sc->sc_flags |= SCF_UNREACH; 3867 return; 3868 } 3869 3870 syn_cache_rm(sc); 3871 tcpstat_inc(tcps_sc_unreach); 3872 syn_cache_put(sc); 3873 } 3874 3875 /* 3876 * Given a LISTEN socket and an inbound SYN request, add 3877 * this to the syn cache, and send back a segment: 3878 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3879 * to the source. 3880 * 3881 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3882 * Doing so would require that we hold onto the data and deliver it 3883 * to the application. However, if we are the target of a SYN-flood 3884 * DoS attack, an attacker could send data which would eventually 3885 * consume all available buffer space if it were ACKed. By not ACKing 3886 * the data, we avoid this DoS scenario.
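 * (Nothing is lost by this: a legitimate peer retransmits any data that was never ACKed once the connection completes.)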
 */
int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
    struct tcp_opt_info *oi, tcp_seq *issp)
{
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;

	tp = sototcpcb(so);

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 *
	 * Note this check is performed in tcp_input() very early on.
	 */

	/*
	 * Initialize some local state.
	 */
	win = sbspace(so, &so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

	bzero(&tb, sizeof(tb));
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE)) {
#else
	if (optp) {
#endif
		tb.pf = tp->pf;
#ifdef TCP_SACK
		tb.sack_enable = tp->sack_enable;
#endif
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		if (tp->t_flags & TF_SIGNATURE)
			tb.t_flags |= TF_SIGNATURE;
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
		    sotoinpcb(so)->inp_rtableid))
			return (-1);
	}

	switch (src->sa_family) {
	case AF_INET:
		/*
		 * Remember the IP options, if any.
		 */
		ipopts = ip_srcroute(m);
		break;
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
	if (sc != NULL) {
		tcpstat_inc(tcps_sc_dupesyn);
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		sc->sc_timestamp = tb.ts_recent;
		if (syn_cache_respond(sc, m) == 0) {
			tcpstat_inc(tcps_sndacks);
			tcpstat_inc(tcps_sndtotal);
		}
		return (0);
	}

	sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
	if (sc == NULL) {
		m_free(ipopts);
		return (-1);
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	memcpy(&sc->sc_src, src, src->sa_len);
	memcpy(&sc->sc_dst, dst, dst->sa_len);
	sc->sc_rtableid = sotoinpcb(so)->inp_rtableid;
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;

	sc->sc_iss = issp ? *issp : arc4random();
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timestamp = tb.ts_recent;
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
		sc->sc_flags |= SCF_TIMESTAMP;
		sc->sc_modulate = arc4random();
	}
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		/*
		 * Pick the smallest possible scaling factor that
		 * will still allow us to scale up to sb_max.
		 *
		 * We do this because there are broken firewalls that
		 * will corrupt the window scale option, leading to
		 * the other endpoint believing that our advertised
		 * window is unscaled.
		 * At scale factors larger than 5 the unscaled window
		 * will drop below 1500 bytes, leading to serious
		 * problems when traversing these broken firewalls.
		 *
		 * With the default sbmax of 256K, a scale factor
		 * of 3 will be chosen by this algorithm.  Those who
		 * choose a larger sbmax should watch out
		 * for the compatibility problems mentioned above.
		 *
		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
		 * or <SYN,ACK>) segment itself is never scaled.
		 */
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
			sc->sc_request_r_scale++;
	} else {
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
#ifdef TCP_ECN
	/*
	 * If both ECE and CWR flag bits are set, the peer is ECN capable.
	 */
	if (tcp_do_ecn &&
	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
		sc->sc_flags |= SCF_ECN_PERMIT;
#endif
#ifdef TCP_SACK
	/*
	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
	 */
	if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
		sc->sc_flags |= SCF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	if (syn_cache_respond(sc, m) == 0) {
		syn_cache_insert(sc, tp);
		tcpstat_inc(tcps_sndacks);
		tcpstat_inc(tcps_sndtotal);
	} else {
		syn_cache_put(sc);
		tcpstat_inc(tcps_sc_dropped);
	}

	return (0);
}

int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
{
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	default:
		m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.
	 */
	m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);
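	/*
	 * At this point m is a single header mbuf, backed by a
	 * cluster whenever max_linkhdr + tlen does not fit in MHLEN,
	 * so the entire IP+TCP reply can be laid out contiguously
	 * via mtod() below.
	 */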
	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
	memset(mtod(m, u_char *), 0, tlen);

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		unhandled_af(sc->sc_src.sa.sa_family);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
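		/*
		 * The option occupies TCPOLEN_TSTAMP_APPA (12) bytes:
		 * two NOPs for alignment, then kind 8, length 10,
		 * followed by our TSval and the echoed peer timestamp:
		 *
		 *	+-----+-----+--------+--------+
		 *	| NOP | NOP | kind=8 | len=10 |
		 *	+-----+-----+--------+--------+
		 *	| TSval (SYN_CACHE_TIMESTAMP) |
		 *	+-----------------------------+
		 *	| TSecr (sc->sc_timestamp)    |
		 *	+-----------------------------+
		 */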
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:				/* default to PF_INET */
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
		    0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/*
		 * Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  Note that ip_len is
	 * stored in network byte order here, as ip_output() expects.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		if (inp != NULL)
			ip->ip_tos = inp->inp_ip.ip_tos;
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0; it is legal and requires no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(inp);

		error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
		    NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
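
/*
 * Rough sketch of how the syn cache entry points above fit together;
 * the function names refer to their definitions in this file:
 *
 *	SYN on a LISTEN socket	-> syn_cache_add()	creates the entry and
 *							sends the SYN,ACK via
 *							syn_cache_respond()
 *	ACK of our SYN,ACK	-> syn_cache_get()	completes the handshake
 *							and returns the socket
 *	RST for a cached entry	-> syn_cache_reset()	removes the entry
 *	ICMP unreachable	-> syn_cache_unreach()	removes the entry after
 *							repeated errors
 */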