1 /* $NetBSD: tcp_input.c,v 1.105 2000/03/01 12:49:37 itojun Exp $ */ 2 3 /* 4 %%% portions-copyright-nrl-95 5 Portions of this software are Copyright 1995-1998 by Randall Atkinson, 6 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights 7 Reserved. All rights under this copyright have been assigned to the US 8 Naval Research Laboratory (NRL). The NRL Copyright Notice and License 9 Agreement Version 1.1 (January 17, 1995) applies to these portions of the 10 software. 11 You should have received a copy of the license with this software. If you 12 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>. 13 14 */ 15 16 /* 17 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 18 * All rights reserved. 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 1. Redistributions of source code must retain the above copyright 24 * notice, this list of conditions and the following disclaimer. 25 * 2. Redistributions in binary form must reproduce the above copyright 26 * notice, this list of conditions and the following disclaimer in the 27 * documentation and/or other materials provided with the distribution. 28 * 3. Neither the name of the project nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 45 /*- 46 * Copyright (c) 1997, 1998, 1999 The NetBSD Foundation, Inc. 47 * All rights reserved. 48 * 49 * This code is derived from software contributed to The NetBSD Foundation 50 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 51 * Facility, NASA Ames Research Center. 52 * 53 * Redistribution and use in source and binary forms, with or without 54 * modification, are permitted provided that the following conditions 55 * are met: 56 * 1. Redistributions of source code must retain the above copyright 57 * notice, this list of conditions and the following disclaimer. 58 * 2. Redistributions in binary form must reproduce the above copyright 59 * notice, this list of conditions and the following disclaimer in the 60 * documentation and/or other materials provided with the distribution. 61 * 3. All advertising materials mentioning features or use of this software 62 * must display the following acknowledgement: 63 * This product includes software developed by the NetBSD 64 * Foundation, Inc. and its contributors. 65 * 4. Neither the name of The NetBSD Foundation nor the names of its 66 * contributors may be used to endorse or promote products derived 67 * from this software without specific prior written permission. 68 * 69 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 70 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 71 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 72 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 73 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 74 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 75 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 76 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 77 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 78 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 79 * POSSIBILITY OF SUCH DAMAGE. 80 */ 81 82 /* 83 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 84 * The Regents of the University of California. All rights reserved. 85 * 86 * Redistribution and use in source and binary forms, with or without 87 * modification, are permitted provided that the following conditions 88 * are met: 89 * 1. Redistributions of source code must retain the above copyright 90 * notice, this list of conditions and the following disclaimer. 91 * 2. Redistributions in binary form must reproduce the above copyright 92 * notice, this list of conditions and the following disclaimer in the 93 * documentation and/or other materials provided with the distribution. 94 * 3. All advertising materials mentioning features or use of this software 95 * must display the following acknowledgement: 96 * This product includes software developed by the University of 97 * California, Berkeley and its contributors. 98 * 4. Neither the name of the University nor the names of its contributors 99 * may be used to endorse or promote products derived from this software 100 * without specific prior written permission. 101 * 102 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 103 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 104 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 105 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 106 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 107 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 108 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 109 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 110 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 111 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 112 * SUCH DAMAGE. 113 * 114 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 115 */ 116 117 /* 118 * TODO list for SYN cache stuff: 119 * 120 * Find room for a "state" field, which is needed to keep a 121 * compressed state for TIME_WAIT TCBs. It's been noted already 122 * that this is fairly important for very high-volume web and 123 * mail servers, which use a large number of short-lived 124 * connections. 125 */ 126 127 #include "opt_inet.h" 128 #include "opt_ipsec.h" 129 130 #include <sys/param.h> 131 #include <sys/systm.h> 132 #include <sys/malloc.h> 133 #include <sys/mbuf.h> 134 #include <sys/protosw.h> 135 #include <sys/socket.h> 136 #include <sys/socketvar.h> 137 #include <sys/errno.h> 138 #include <sys/syslog.h> 139 #include <sys/pool.h> 140 #include <sys/domain.h> 141 142 #include <net/if.h> 143 #include <net/route.h> 144 #include <net/if_types.h> 145 146 #include <netinet/in.h> 147 #include <netinet/in_systm.h> 148 #include <netinet/ip.h> 149 #include <netinet/in_pcb.h> 150 #include <netinet/ip_var.h> 151 152 #ifdef INET6 153 #ifndef INET 154 #include <netinet/in.h> 155 #endif 156 #include <netinet/ip6.h> 157 #include <netinet6/in6_pcb.h> 158 #include <netinet6/ip6_var.h> 159 #include <netinet6/in6_var.h> 160 #include <netinet/icmp6.h> 161 #include <netinet6/nd6.h> 162 #endif 163 164 #ifdef PULLDOWN_TEST 165 #ifndef INET6 166 /* always need ip6.h for IP6_EXTHDR_GET */ 167 #include <netinet/ip6.h> 168 #endif 169 #endif 170 171 #include <netinet/tcp.h> 172 #include <netinet/tcp_fsm.h> 173 #include <netinet/tcp_seq.h> 174 #include <netinet/tcp_timer.h> 175 #include <netinet/tcp_var.h> 176 #include <netinet/tcpip.h> 177 #include <netinet/tcp_debug.h> 178 179 #include <machine/stdarg.h> 180 181 #ifdef IPSEC 182 #include <netinet6/ipsec.h> 183 #include <netkey/key.h> 184 #include <netkey/key_debug.h> 185 #endif /*IPSEC*/ 186 #ifdef INET6 187 #include "faith.h" 188 #endif 189 190 int tcprexmtthresh = 3; 191 int tcp_log_refused; 192 193 struct timeval tcp_rst_ratelim_last; 194 195 #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) 196 197 /* for modulo comparisons of timestamps */ 198 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 199 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 200 201 /* 202 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 203 */ 204 #ifdef INET6 205 #define ND6_HINT(tp) \ 206 do { \ 207 if (tp && tp->t_in6pcb && tp->t_family == AF_INET6 \ 208 && tp->t_in6pcb->in6p_route.ro_rt) { \ 209 nd6_nud_hint(tp->t_in6pcb->in6p_route.ro_rt, NULL); \ 210 } \ 211 } while (0) 212 #else 213 #define ND6_HINT(tp) 214 #endif 215 216 /* 217 * Macro to compute ACK transmission behavior. Delay the ACK unless 218 * we have already delayed an ACK (must send an ACK every two segments). 219 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 220 * option is enabled. 221 */ 222 #define TCP_SETUP_ACK(tp, th) \ 223 do { \ 224 if ((tp)->t_flags & TF_DELACK || \ 225 (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \ 226 tp->t_flags |= TF_ACKNOW; \ 227 else \ 228 TCP_SET_DELACK(tp); \ 229 } while (0) 230 231 /* 232 * Convert TCP protocol fields to host order for easier processing. 233 */ 234 #define TCP_FIELDS_TO_HOST(th) \ 235 do { \ 236 NTOHL((th)->th_seq); \ 237 NTOHL((th)->th_ack); \ 238 NTOHS((th)->th_win); \ 239 NTOHS((th)->th_urp); \ 240 } while (0) 241 242 int 243 tcp_reass(tp, th, m, tlen) 244 register struct tcpcb *tp; 245 register struct tcphdr *th; 246 struct mbuf *m; 247 int *tlen; 248 { 249 register struct ipqent *p, *q, *nq, *tiqe = NULL; 250 struct socket *so = NULL; 251 int pkt_flags; 252 tcp_seq pkt_seq; 253 unsigned pkt_len; 254 u_long rcvpartdupbyte = 0; 255 u_long rcvoobyte; 256 257 if (tp->t_inpcb) 258 so = tp->t_inpcb->inp_socket; 259 #ifdef INET6 260 else if (tp->t_in6pcb) 261 so = tp->t_in6pcb->in6p_socket; 262 #endif 263 264 TCP_REASS_LOCK_CHECK(tp); 265 266 /* 267 * Call with th==0 after become established to 268 * force pre-ESTABLISHED data up to user socket. 269 */ 270 if (th == 0) 271 goto present; 272 273 rcvoobyte = *tlen; 274 /* 275 * Copy these to local variables because the tcpiphdr 276 * gets munged while we are collapsing mbufs. 277 */ 278 pkt_seq = th->th_seq; 279 pkt_len = *tlen; 280 pkt_flags = th->th_flags; 281 /* 282 * Find a segment which begins after this one does. 283 */ 284 for (p = NULL, q = tp->segq.lh_first; q != NULL; q = nq) { 285 nq = q->ipqe_q.le_next; 286 /* 287 * If the received segment is just right after this 288 * fragment, merge the two together and then check 289 * for further overlaps. 290 */ 291 if (q->ipqe_seq + q->ipqe_len == pkt_seq) { 292 #ifdef TCPREASS_DEBUG 293 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n", 294 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 295 q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len); 296 #endif 297 pkt_len += q->ipqe_len; 298 pkt_flags |= q->ipqe_flags; 299 pkt_seq = q->ipqe_seq; 300 m_cat(q->ipqe_m, m); 301 m = q->ipqe_m; 302 goto free_ipqe; 303 } 304 /* 305 * If the received segment is completely past this 306 * fragment, we need to go the next fragment. 307 */ 308 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 309 p = q; 310 continue; 311 } 312 /* 313 * If the fragment is past the received segment, 314 * it (or any following) can't be concatenated. 315 */ 316 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) 317 break; 318 /* 319 * We've received all the data in this segment before. 320 * mark it as a duplicate and return. 321 */ 322 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) && 323 SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 324 tcpstat.tcps_rcvduppack++; 325 tcpstat.tcps_rcvdupbyte += pkt_len; 326 m_freem(m); 327 if (tiqe != NULL) 328 pool_put(&ipqent_pool, tiqe); 329 return (0); 330 } 331 /* 332 * Received segment completely overlaps this fragment 333 * so we drop the fragment (this keeps the temporal 334 * ordering of segments correct). 335 */ 336 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) && 337 SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 338 rcvpartdupbyte += q->ipqe_len; 339 m_freem(q->ipqe_m); 340 goto free_ipqe; 341 } 342 /* 343 * RX'ed segment extends past the end of the 344 * fragment. Drop the overlapping bytes. Then 345 * merge the fragment and segment then treat as 346 * a longer received packet. 347 */ 348 if (SEQ_LT(q->ipqe_seq, pkt_seq) 349 && SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 350 int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq; 351 #ifdef TCPREASS_DEBUG 352 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n", 353 tp, overlap, 354 pkt_seq, pkt_seq + pkt_len, pkt_len); 355 #endif 356 m_adj(m, overlap); 357 rcvpartdupbyte += overlap; 358 m_cat(q->ipqe_m, m); 359 m = q->ipqe_m; 360 pkt_seq = q->ipqe_seq; 361 pkt_len += q->ipqe_len - overlap; 362 rcvoobyte -= overlap; 363 goto free_ipqe; 364 } 365 /* 366 * RX'ed segment extends past the front of the 367 * fragment. Drop the overlapping bytes on the 368 * received packet. The packet will then be 369 * contatentated with this fragment a bit later. 370 */ 371 if (SEQ_GT(q->ipqe_seq, pkt_seq) 372 && SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) { 373 int overlap = pkt_seq + pkt_len - q->ipqe_seq; 374 #ifdef TCPREASS_DEBUG 375 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n", 376 tp, overlap, 377 pkt_seq, pkt_seq + pkt_len, pkt_len); 378 #endif 379 m_adj(m, -overlap); 380 pkt_len -= overlap; 381 rcvpartdupbyte += overlap; 382 rcvoobyte -= overlap; 383 } 384 /* 385 * If the received segment immediates precedes this 386 * fragment then tack the fragment onto this segment 387 * and reinsert the data. 388 */ 389 if (q->ipqe_seq == pkt_seq + pkt_len) { 390 #ifdef TCPREASS_DEBUG 391 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n", 392 tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len, 393 pkt_seq, pkt_seq + pkt_len, pkt_len); 394 #endif 395 pkt_len += q->ipqe_len; 396 pkt_flags |= q->ipqe_flags; 397 m_cat(m, q->ipqe_m); 398 LIST_REMOVE(q, ipqe_q); 399 LIST_REMOVE(q, ipqe_timeq); 400 if (tiqe == NULL) { 401 tiqe = q; 402 } else { 403 pool_put(&ipqent_pool, q); 404 } 405 break; 406 } 407 /* 408 * If the fragment is before the segment, remember it. 409 * When this loop is terminated, p will contain the 410 * pointer to fragment that is right before the received 411 * segment. 412 */ 413 if (SEQ_LEQ(q->ipqe_seq, pkt_seq)) 414 p = q; 415 416 continue; 417 418 /* 419 * This is a common operation. It also will allow 420 * to save doing a malloc/free in most instances. 421 */ 422 free_ipqe: 423 LIST_REMOVE(q, ipqe_q); 424 LIST_REMOVE(q, ipqe_timeq); 425 if (tiqe == NULL) { 426 tiqe = q; 427 } else { 428 pool_put(&ipqent_pool, q); 429 } 430 } 431 432 /* 433 * Allocate a new queue entry since the received segment did not 434 * collapse onto any other out-of-order block; thus we are allocating 435 * a new block. If it had collapsed, tiqe would not be NULL and 436 * we would be reusing it. 437 * XXX If we can't, just drop the packet. XXX 438 */ 439 if (tiqe == NULL) { 440 tiqe = pool_get(&ipqent_pool, PR_NOWAIT); 441 if (tiqe == NULL) { 442 tcpstat.tcps_rcvmemdrop++; 443 m_freem(m); 444 return (0); 445 } 446 } 447 448 /* 449 * Update the counters. 450 */ 451 tcpstat.tcps_rcvoopack++; 452 tcpstat.tcps_rcvoobyte += rcvoobyte; 453 if (rcvpartdupbyte) { 454 tcpstat.tcps_rcvpartduppack++; 455 tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte; 456 } 457 458 /* 459 * Insert the new fragment queue entry into both queues. 460 */ 461 tiqe->ipqe_m = m; 462 tiqe->ipqe_seq = pkt_seq; 463 tiqe->ipqe_len = pkt_len; 464 tiqe->ipqe_flags = pkt_flags; 465 if (p == NULL) { 466 LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q); 467 #ifdef TCPREASS_DEBUG 468 if (tiqe->ipqe_seq != tp->rcv_nxt) 469 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n", 470 tp, pkt_seq, pkt_seq + pkt_len, pkt_len); 471 #endif 472 } else { 473 LIST_INSERT_AFTER(p, tiqe, ipqe_q); 474 #ifdef TCPREASS_DEBUG 475 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n", 476 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 477 p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len); 478 #endif 479 } 480 481 LIST_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq); 482 483 present: 484 /* 485 * Present data to user, advancing rcv_nxt through 486 * completed sequence space. 487 */ 488 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 489 return (0); 490 q = tp->segq.lh_first; 491 if (q == NULL || q->ipqe_seq != tp->rcv_nxt) 492 return (0); 493 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len) 494 return (0); 495 496 tp->rcv_nxt += q->ipqe_len; 497 pkt_flags = q->ipqe_flags & TH_FIN; 498 ND6_HINT(tp); 499 500 LIST_REMOVE(q, ipqe_q); 501 LIST_REMOVE(q, ipqe_timeq); 502 if (so->so_state & SS_CANTRCVMORE) 503 m_freem(q->ipqe_m); 504 else 505 sbappend(&so->so_rcv, q->ipqe_m); 506 pool_put(&ipqent_pool, q); 507 sorwakeup(so); 508 return (pkt_flags); 509 } 510 511 #if defined(INET6) && !defined(TCP6) 512 int 513 tcp6_input(mp, offp, proto) 514 struct mbuf **mp; 515 int *offp, proto; 516 { 517 struct mbuf *m = *mp; 518 519 #if defined(NFAITH) && 0 < NFAITH 520 if (m->m_pkthdr.rcvif) { 521 if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) { 522 /* XXX send icmp6 host/port unreach? */ 523 m_freem(m); 524 return IPPROTO_DONE; 525 } 526 } 527 #endif 528 529 /* 530 * draft-itojun-ipv6-tcp-to-anycast 531 * better place to put this in? 532 */ 533 if (m->m_flags & M_ANYCAST6) { 534 struct ip6_hdr *ip6; 535 if (m->m_len < sizeof(struct ip6_hdr)) { 536 if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { 537 tcpstat.tcps_rcvshort++; 538 return IPPROTO_DONE; 539 } 540 } 541 ip6 = mtod(m, struct ip6_hdr *); 542 icmp6_error(m, ICMP6_DST_UNREACH, 543 ICMP6_DST_UNREACH_ADDR, 544 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 545 return IPPROTO_DONE; 546 } 547 548 tcp_input(m, *offp, proto); 549 return IPPROTO_DONE; 550 } 551 #endif 552 553 /* 554 * TCP input routine, follows pages 65-76 of the 555 * protocol specification dated September, 1981 very closely. 556 */ 557 void 558 #if __STDC__ 559 tcp_input(struct mbuf *m, ...) 560 #else 561 tcp_input(m, va_alist) 562 register struct mbuf *m; 563 #endif 564 { 565 int proto; 566 register struct tcphdr *th; 567 struct ip *ip; 568 register struct inpcb *inp; 569 #ifdef INET6 570 struct ip6_hdr *ip6; 571 register struct in6pcb *in6p; 572 #endif 573 caddr_t optp = NULL; 574 int optlen = 0; 575 int len, tlen, toff, hdroptlen = 0; 576 register struct tcpcb *tp = 0; 577 register int tiflags; 578 struct socket *so = NULL; 579 int todrop, acked, ourfinisacked, needoutput = 0; 580 short ostate = 0; 581 int iss = 0; 582 u_long tiwin; 583 struct tcp_opt_info opti; 584 int off, iphlen; 585 va_list ap; 586 int af; /* af on the wire */ 587 struct mbuf *tcp_saveti = NULL; 588 589 va_start(ap, m); 590 toff = va_arg(ap, int); 591 proto = va_arg(ap, int); 592 va_end(ap); 593 594 tcpstat.tcps_rcvtotal++; 595 596 bzero(&opti, sizeof(opti)); 597 opti.ts_present = 0; 598 opti.maxseg = 0; 599 600 /* 601 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN. 602 * 603 * TCP is, by definition, unicast, so we reject all 604 * multicast outright. 605 * 606 * Note, there are additional src/dst address checks in 607 * the AF-specific code below. 608 */ 609 if (m->m_flags & (M_BCAST|M_MCAST)) { 610 /* XXX stat */ 611 goto drop; 612 } 613 #ifdef INET6 614 if (m->m_flags & M_ANYCAST6) { 615 /* XXX stat */ 616 goto drop; 617 } 618 #endif 619 620 /* 621 * Get IP and TCP header together in first mbuf. 622 * Note: IP leaves IP header in first mbuf. 623 */ 624 ip = mtod(m, struct ip *); 625 #ifdef INET6 626 ip6 = NULL; 627 #endif 628 switch (ip->ip_v) { 629 case 4: 630 af = AF_INET; 631 iphlen = sizeof(struct ip); 632 #ifndef PULLDOWN_TEST 633 /* would like to get rid of this... */ 634 if (toff > sizeof (struct ip)) { 635 ip_stripoptions(m, (struct mbuf *)0); 636 toff = sizeof(struct ip); 637 } 638 if (m->m_len < toff + sizeof (struct tcphdr)) { 639 if ((m = m_pullup(m, toff + sizeof (struct tcphdr))) == 0) { 640 tcpstat.tcps_rcvshort++; 641 return; 642 } 643 } 644 ip = mtod(m, struct ip *); 645 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 646 #else 647 ip = mtod(m, struct ip *); 648 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 649 sizeof(struct tcphdr)); 650 if (th == NULL) { 651 tcpstat.tcps_rcvshort++; 652 return; 653 } 654 #endif 655 656 /* 657 * Make sure destination address is not multicast. 658 * Source address checked in ip_input(). 659 */ 660 if (IN_MULTICAST(ip->ip_dst.s_addr)) { 661 /* XXX stat */ 662 goto drop; 663 } 664 665 /* We do the checksum after PCB lookup... */ 666 len = ip->ip_len; 667 tlen = len - toff; 668 break; 669 #ifdef INET6 670 case 6: 671 ip = NULL; 672 iphlen = sizeof(struct ip6_hdr); 673 af = AF_INET6; 674 #ifndef PULLDOWN_TEST 675 if (m->m_len < toff + sizeof(struct tcphdr)) { 676 m = m_pullup(m, toff + sizeof(struct tcphdr)); /*XXX*/ 677 if (m == NULL) { 678 tcpstat.tcps_rcvshort++; 679 return; 680 } 681 } 682 ip6 = mtod(m, struct ip6_hdr *); 683 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 684 #else 685 ip6 = mtod(m, struct ip6_hdr *); 686 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 687 sizeof(struct tcphdr)); 688 if (th == NULL) { 689 tcpstat.tcps_rcvshort++; 690 return; 691 } 692 #endif 693 694 /* Be proactive about malicious use of IPv4 mapped address */ 695 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 696 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 697 /* XXX stat */ 698 goto drop; 699 } 700 701 /* 702 * Make sure destination address is not multicast. 703 * Source address checked in ip6_input(). 704 */ 705 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 706 /* XXX stat */ 707 goto drop; 708 } 709 710 /* We do the checksum after PCB lookup... */ 711 len = m->m_pkthdr.len; 712 tlen = len - toff; 713 break; 714 #endif 715 default: 716 m_freem(m); 717 return; 718 } 719 720 /* 721 * Check that TCP offset makes sense, 722 * pull out TCP options and adjust length. XXX 723 */ 724 off = th->th_off << 2; 725 if (off < sizeof (struct tcphdr) || off > tlen) { 726 tcpstat.tcps_rcvbadoff++; 727 goto drop; 728 } 729 tlen -= off; 730 731 /* 732 * tcp_input() has been modified to use tlen to mean the TCP data 733 * length throughout the function. Other functions can use 734 * m->m_pkthdr.len as the basis for calculating the TCP data length. 735 * rja 736 */ 737 738 if (off > sizeof (struct tcphdr)) { 739 #ifndef PULLDOWN_TEST 740 if (m->m_len < toff + off) { 741 if ((m = m_pullup(m, toff + off)) == 0) { 742 tcpstat.tcps_rcvshort++; 743 return; 744 } 745 switch (af) { 746 case AF_INET: 747 ip = mtod(m, struct ip *); 748 break; 749 #ifdef INET6 750 case AF_INET6: 751 ip6 = mtod(m, struct ip6_hdr *); 752 break; 753 #endif 754 } 755 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 756 } 757 #else 758 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off); 759 if (th == NULL) { 760 tcpstat.tcps_rcvshort++; 761 return; 762 } 763 /* 764 * NOTE: ip/ip6 will not be affected by m_pulldown() 765 * (as they're before toff) and we don't need to update those. 766 */ 767 #endif 768 optlen = off - sizeof (struct tcphdr); 769 optp = ((caddr_t)th) + sizeof(struct tcphdr); 770 /* 771 * Do quick retrieval of timestamp options ("options 772 * prediction?"). If timestamp is the only option and it's 773 * formatted as recommended in RFC 1323 appendix A, we 774 * quickly get the values now and not bother calling 775 * tcp_dooptions(), etc. 776 */ 777 if ((optlen == TCPOLEN_TSTAMP_APPA || 778 (optlen > TCPOLEN_TSTAMP_APPA && 779 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 780 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 781 (th->th_flags & TH_SYN) == 0) { 782 opti.ts_present = 1; 783 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 784 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 785 optp = NULL; /* we've parsed the options */ 786 } 787 } 788 tiflags = th->th_flags; 789 790 /* 791 * Locate pcb for segment. 792 */ 793 findpcb: 794 inp = NULL; 795 #ifdef INET6 796 in6p = NULL; 797 #endif 798 switch (af) { 799 case AF_INET: 800 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport, 801 ip->ip_dst, th->th_dport); 802 if (inp == 0) { 803 ++tcpstat.tcps_pcbhashmiss; 804 inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport); 805 } 806 #if defined(INET6) && !defined(TCP6) 807 if (inp == 0) { 808 struct in6_addr s, d; 809 810 /* mapped addr case */ 811 bzero(&s, sizeof(s)); 812 s.s6_addr16[5] = htons(0xffff); 813 bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src)); 814 bzero(&d, sizeof(d)); 815 d.s6_addr16[5] = htons(0xffff); 816 bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst)); 817 in6p = in6_pcblookup_connect(&tcb6, &s, th->th_sport, 818 &d, th->th_dport, 0); 819 if (in6p == 0) { 820 ++tcpstat.tcps_pcbhashmiss; 821 in6p = in6_pcblookup_bind(&tcb6, &d, 822 th->th_dport, 0); 823 } 824 } 825 #endif 826 #ifndef INET6 827 if (inp == 0) 828 #else 829 if (inp == 0 && in6p == 0) 830 #endif 831 { 832 ++tcpstat.tcps_noport; 833 if (tcp_log_refused && (tiflags & TH_SYN)) { 834 #ifndef INET6 835 char src[4*sizeof "123"]; 836 char dst[4*sizeof "123"]; 837 #else 838 char src[INET6_ADDRSTRLEN]; 839 char dst[INET6_ADDRSTRLEN]; 840 #endif 841 if (ip) { 842 strcpy(src, inet_ntoa(ip->ip_src)); 843 strcpy(dst, inet_ntoa(ip->ip_dst)); 844 } 845 #ifdef INET6 846 else if (ip6) { 847 strcpy(src, ip6_sprintf(&ip6->ip6_src)); 848 strcpy(dst, ip6_sprintf(&ip6->ip6_dst)); 849 } 850 #endif 851 else { 852 strcpy(src, "(unknown)"); 853 strcpy(dst, "(unknown)"); 854 } 855 log(LOG_INFO, 856 "Connection attempt to TCP %s:%d from %s:%d\n", 857 dst, ntohs(th->th_dport), 858 src, ntohs(th->th_sport)); 859 } 860 TCP_FIELDS_TO_HOST(th); 861 goto dropwithreset_ratelim; 862 } 863 #ifdef IPSEC 864 if (inp && ipsec4_in_reject(m, inp)) { 865 ipsecstat.in_polvio++; 866 goto drop; 867 } 868 #ifdef INET6 869 else if (in6p && ipsec4_in_reject_so(m, in6p->in6p_socket)) { 870 ipsecstat.in_polvio++; 871 goto drop; 872 } 873 #endif 874 #endif /*IPSEC*/ 875 break; 876 #if defined(INET6) && !defined(TCP6) 877 case AF_INET6: 878 { 879 int faith; 880 881 #if defined(NFAITH) && NFAITH > 0 882 if (m->m_pkthdr.rcvif 883 && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { 884 faith = 1; 885 } else 886 faith = 0; 887 #else 888 faith = 0; 889 #endif 890 in6p = in6_pcblookup_connect(&tcb6, &ip6->ip6_src, th->th_sport, 891 &ip6->ip6_dst, th->th_dport, faith); 892 if (in6p == NULL) { 893 ++tcpstat.tcps_pcbhashmiss; 894 in6p = in6_pcblookup_bind(&tcb6, &ip6->ip6_dst, 895 th->th_dport, faith); 896 } 897 if (in6p == NULL) { 898 ++tcpstat.tcps_noport; 899 TCP_FIELDS_TO_HOST(th); 900 goto dropwithreset_ratelim; 901 } 902 #ifdef IPSEC 903 if (ipsec6_in_reject(m, in6p)) { 904 ipsec6stat.in_polvio++; 905 goto drop; 906 } 907 #endif /*IPSEC*/ 908 break; 909 } 910 #endif 911 } 912 913 /* 914 * If the state is CLOSED (i.e., TCB does not exist) then 915 * all data in the incoming segment is discarded. 916 * If the TCB exists but is in CLOSED state, it is embryonic, 917 * but should either do a listen or a connect soon. 918 */ 919 tp = NULL; 920 so = NULL; 921 if (inp) { 922 tp = intotcpcb(inp); 923 so = inp->inp_socket; 924 } 925 #ifdef INET6 926 else if (in6p) { 927 tp = in6totcpcb(in6p); 928 so = in6p->in6p_socket; 929 } 930 #endif 931 if (tp == 0) { 932 TCP_FIELDS_TO_HOST(th); 933 goto dropwithreset_ratelim; 934 } 935 if (tp->t_state == TCPS_CLOSED) 936 goto drop; 937 938 /* 939 * Checksum extended TCP header and data. 940 */ 941 switch (af) { 942 case AF_INET: 943 #ifndef PULLDOWN_TEST 944 { 945 struct ipovly *ipov; 946 ipov = (struct ipovly *)ip; 947 bzero(ipov->ih_x1, sizeof ipov->ih_x1); 948 ipov->ih_len = htons(tlen + off); 949 950 if (in_cksum(m, len) != 0) { 951 tcpstat.tcps_rcvbadsum++; 952 goto drop; 953 } 954 } 955 #else 956 if (in4_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) { 957 tcpstat.tcps_rcvbadsum++; 958 goto drop; 959 } 960 #endif 961 break; 962 963 #ifdef INET6 964 case AF_INET6: 965 if (in6_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) { 966 tcpstat.tcps_rcvbadsum++; 967 goto drop; 968 } 969 break; 970 #endif 971 } 972 973 TCP_FIELDS_TO_HOST(th); 974 975 /* Unscale the window into a 32-bit value. */ 976 if ((tiflags & TH_SYN) == 0) 977 tiwin = th->th_win << tp->snd_scale; 978 else 979 tiwin = th->th_win; 980 981 #ifdef INET6 982 /* save packet options if user wanted */ 983 if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) { 984 if (in6p->in6p_options) { 985 m_freem(in6p->in6p_options); 986 in6p->in6p_options = 0; 987 } 988 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m); 989 } 990 #endif 991 992 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 993 union syn_cache_sa src; 994 union syn_cache_sa dst; 995 996 bzero(&src, sizeof(src)); 997 bzero(&dst, sizeof(dst)); 998 switch (af) { 999 case AF_INET: 1000 src.sin.sin_len = sizeof(struct sockaddr_in); 1001 src.sin.sin_family = AF_INET; 1002 src.sin.sin_addr = ip->ip_src; 1003 src.sin.sin_port = th->th_sport; 1004 1005 dst.sin.sin_len = sizeof(struct sockaddr_in); 1006 dst.sin.sin_family = AF_INET; 1007 dst.sin.sin_addr = ip->ip_dst; 1008 dst.sin.sin_port = th->th_dport; 1009 break; 1010 #ifdef INET6 1011 case AF_INET6: 1012 src.sin6.sin6_len = sizeof(struct sockaddr_in6); 1013 src.sin6.sin6_family = AF_INET6; 1014 src.sin6.sin6_addr = ip6->ip6_src; 1015 src.sin6.sin6_port = th->th_sport; 1016 1017 dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 1018 dst.sin6.sin6_family = AF_INET6; 1019 dst.sin6.sin6_addr = ip6->ip6_dst; 1020 dst.sin6.sin6_port = th->th_dport; 1021 break; 1022 #endif /* INET6 */ 1023 default: 1024 goto badsyn; /*sanity*/ 1025 } 1026 1027 if (so->so_options & SO_DEBUG) { 1028 ostate = tp->t_state; 1029 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT); 1030 if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) { 1031 m_freem(tcp_saveti); 1032 tcp_saveti = NULL; 1033 } else { 1034 tcp_saveti->m_len += sizeof(struct tcphdr); 1035 bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen, 1036 sizeof(struct tcphdr)); 1037 } 1038 if (tcp_saveti) { 1039 /* 1040 * need to recover version # field, which was 1041 * overwritten on ip_cksum computation. 1042 */ 1043 struct ip *sip; 1044 sip = mtod(tcp_saveti, struct ip *); 1045 switch (af) { 1046 case AF_INET: 1047 sip->ip_v = 4; 1048 break; 1049 #ifdef INET6 1050 case AF_INET6: 1051 sip->ip_v = 6; 1052 break; 1053 #endif 1054 } 1055 } 1056 } 1057 if (so->so_options & SO_ACCEPTCONN) { 1058 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 1059 if (tiflags & TH_RST) { 1060 syn_cache_reset(&src.sa, &dst.sa, th); 1061 } else if ((tiflags & (TH_ACK|TH_SYN)) == 1062 (TH_ACK|TH_SYN)) { 1063 /* 1064 * Received a SYN,ACK. This should 1065 * never happen while we are in 1066 * LISTEN. Send an RST. 1067 */ 1068 goto badsyn; 1069 } else if (tiflags & TH_ACK) { 1070 so = syn_cache_get(&src.sa, &dst.sa, 1071 th, toff, tlen, so, m); 1072 if (so == NULL) { 1073 /* 1074 * We don't have a SYN for 1075 * this ACK; send an RST. 1076 */ 1077 goto badsyn; 1078 } else if (so == 1079 (struct socket *)(-1)) { 1080 /* 1081 * We were unable to create 1082 * the connection. If the 1083 * 3-way handshake was 1084 * completed, and RST has 1085 * been sent to the peer. 1086 * Since the mbuf might be 1087 * in use for the reply, 1088 * do not free it. 1089 */ 1090 m = NULL; 1091 } else { 1092 /* 1093 * We have created a 1094 * full-blown connection. 1095 */ 1096 tp = NULL; 1097 inp = NULL; 1098 #ifdef INET6 1099 in6p = NULL; 1100 #endif 1101 switch (so->so_proto->pr_domain->dom_family) { 1102 case AF_INET: 1103 inp = sotoinpcb(so); 1104 tp = intotcpcb(inp); 1105 break; 1106 #ifdef INET6 1107 case AF_INET6: 1108 in6p = sotoin6pcb(so); 1109 tp = in6totcpcb(in6p); 1110 break; 1111 #endif 1112 } 1113 if (tp == NULL) 1114 goto badsyn; /*XXX*/ 1115 tiwin <<= tp->snd_scale; 1116 goto after_listen; 1117 } 1118 } else { 1119 /* 1120 * None of RST, SYN or ACK was set. 1121 * This is an invalid packet for a 1122 * TCB in LISTEN state. Send a RST. 1123 */ 1124 goto badsyn; 1125 } 1126 } else { 1127 /* 1128 * Received a SYN. 1129 */ 1130 1131 /* 1132 * LISTEN socket received a SYN 1133 * from itself? This can't possibly 1134 * be valid; drop the packet. 1135 */ 1136 if (th->th_sport == th->th_dport) { 1137 int i; 1138 1139 switch (af) { 1140 case AF_INET: 1141 i = in_hosteq(ip->ip_src, ip->ip_dst); 1142 break; 1143 #ifdef INET6 1144 case AF_INET6: 1145 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst); 1146 break; 1147 #endif 1148 default: 1149 i = 1; 1150 } 1151 if (i) { 1152 tcpstat.tcps_badsyn++; 1153 goto drop; 1154 } 1155 } 1156 1157 /* 1158 * SYN looks ok; create compressed TCP 1159 * state for it. 1160 */ 1161 if (so->so_qlen <= so->so_qlimit && 1162 syn_cache_add(&src.sa, &dst.sa, th, tlen, 1163 so, m, optp, optlen, &opti)) 1164 m = NULL; 1165 } 1166 goto drop; 1167 } 1168 } 1169 1170 after_listen: 1171 #ifdef DIAGNOSTIC 1172 /* 1173 * Should not happen now that all embryonic connections 1174 * are handled with compressed state. 1175 */ 1176 if (tp->t_state == TCPS_LISTEN) 1177 panic("tcp_input: TCPS_LISTEN"); 1178 #endif 1179 1180 /* 1181 * Segment received on connection. 1182 * Reset idle time and keep-alive timer. 1183 */ 1184 tp->t_idle = 0; 1185 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1186 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1187 1188 /* 1189 * Process options. 1190 */ 1191 if (optp) 1192 tcp_dooptions(tp, optp, optlen, th, &opti); 1193 1194 /* 1195 * Header prediction: check for the two common cases 1196 * of a uni-directional data xfer. If the packet has 1197 * no control flags, is in-sequence, the window didn't 1198 * change and we're not retransmitting, it's a 1199 * candidate. If the length is zero and the ack moved 1200 * forward, we're the sender side of the xfer. Just 1201 * free the data acked & wake any higher level process 1202 * that was blocked waiting for space. If the length 1203 * is non-zero and the ack didn't move, we're the 1204 * receiver side. If we're getting packets in-order 1205 * (the reassembly queue is empty), add the data to 1206 * the socket buffer and note that we need a delayed ack. 1207 */ 1208 if (tp->t_state == TCPS_ESTABLISHED && 1209 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1210 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1211 th->th_seq == tp->rcv_nxt && 1212 tiwin && tiwin == tp->snd_wnd && 1213 tp->snd_nxt == tp->snd_max) { 1214 1215 /* 1216 * If last ACK falls within this segment's sequence numbers, 1217 * record the timestamp. 1218 */ 1219 if (opti.ts_present && 1220 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1221 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen)) { 1222 tp->ts_recent_age = tcp_now; 1223 tp->ts_recent = opti.ts_val; 1224 } 1225 1226 if (tlen == 0) { 1227 if (SEQ_GT(th->th_ack, tp->snd_una) && 1228 SEQ_LEQ(th->th_ack, tp->snd_max) && 1229 tp->snd_cwnd >= tp->snd_wnd && 1230 tp->t_dupacks < tcprexmtthresh) { 1231 /* 1232 * this is a pure ack for outstanding data. 1233 */ 1234 ++tcpstat.tcps_predack; 1235 if (opti.ts_present && opti.ts_ecr) 1236 tcp_xmit_timer(tp, 1237 tcp_now - opti.ts_ecr + 1); 1238 else if (tp->t_rtt && 1239 SEQ_GT(th->th_ack, tp->t_rtseq)) 1240 tcp_xmit_timer(tp, tp->t_rtt); 1241 acked = th->th_ack - tp->snd_una; 1242 tcpstat.tcps_rcvackpack++; 1243 tcpstat.tcps_rcvackbyte += acked; 1244 ND6_HINT(tp); 1245 sbdrop(&so->so_snd, acked); 1246 /* 1247 * We want snd_recover to track snd_una to 1248 * avoid sequence wraparound problems for 1249 * very large transfers. 1250 */ 1251 tp->snd_una = tp->snd_recover = th->th_ack; 1252 m_freem(m); 1253 1254 /* 1255 * If all outstanding data are acked, stop 1256 * retransmit timer, otherwise restart timer 1257 * using current (possibly backed-off) value. 1258 * If process is waiting for space, 1259 * wakeup/selwakeup/signal. If data 1260 * are ready to send, let tcp_output 1261 * decide between more output or persist. 1262 */ 1263 if (tp->snd_una == tp->snd_max) 1264 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1265 else if (TCP_TIMER_ISARMED(tp, 1266 TCPT_PERSIST) == 0) 1267 TCP_TIMER_ARM(tp, TCPT_REXMT, 1268 tp->t_rxtcur); 1269 1270 sowwakeup(so); 1271 if (so->so_snd.sb_cc) 1272 (void) tcp_output(tp); 1273 if (tcp_saveti) 1274 m_freem(tcp_saveti); 1275 return; 1276 } 1277 } else if (th->th_ack == tp->snd_una && 1278 tp->segq.lh_first == NULL && 1279 tlen <= sbspace(&so->so_rcv)) { 1280 /* 1281 * this is a pure, in-sequence data packet 1282 * with nothing on the reassembly queue and 1283 * we have enough buffer space to take it. 1284 */ 1285 ++tcpstat.tcps_preddat; 1286 tp->rcv_nxt += tlen; 1287 tcpstat.tcps_rcvpack++; 1288 tcpstat.tcps_rcvbyte += tlen; 1289 ND6_HINT(tp); 1290 /* 1291 * Drop TCP, IP headers and TCP options then add data 1292 * to socket buffer. 1293 */ 1294 m_adj(m, toff + off); 1295 sbappend(&so->so_rcv, m); 1296 sorwakeup(so); 1297 TCP_SETUP_ACK(tp, th); 1298 if (tp->t_flags & TF_ACKNOW) 1299 (void) tcp_output(tp); 1300 if (tcp_saveti) 1301 m_freem(tcp_saveti); 1302 return; 1303 } 1304 } 1305 1306 /* 1307 * Compute mbuf offset to TCP data segment. 1308 */ 1309 hdroptlen = toff + off; 1310 1311 /* 1312 * Calculate amount of space in receive window, 1313 * and then do TCP input processing. 1314 * Receive window is amount of space in rcv queue, 1315 * but not less than advertised window. 1316 */ 1317 { int win; 1318 1319 win = sbspace(&so->so_rcv); 1320 if (win < 0) 1321 win = 0; 1322 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1323 } 1324 1325 switch (tp->t_state) { 1326 1327 /* 1328 * If the state is SYN_SENT: 1329 * if seg contains an ACK, but not for our SYN, drop the input. 1330 * if seg contains a RST, then drop the connection. 1331 * if seg does not contain SYN, then drop it. 1332 * Otherwise this is an acceptable SYN segment 1333 * initialize tp->rcv_nxt and tp->irs 1334 * if seg contains ack then advance tp->snd_una 1335 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1336 * arrange for segment to be acked (eventually) 1337 * continue processing rest of data/controls, beginning with URG 1338 */ 1339 case TCPS_SYN_SENT: 1340 if ((tiflags & TH_ACK) && 1341 (SEQ_LEQ(th->th_ack, tp->iss) || 1342 SEQ_GT(th->th_ack, tp->snd_max))) 1343 goto dropwithreset; 1344 if (tiflags & TH_RST) { 1345 if (tiflags & TH_ACK) 1346 tp = tcp_drop(tp, ECONNREFUSED); 1347 goto drop; 1348 } 1349 if ((tiflags & TH_SYN) == 0) 1350 goto drop; 1351 if (tiflags & TH_ACK) { 1352 tp->snd_una = tp->snd_recover = th->th_ack; 1353 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1354 tp->snd_nxt = tp->snd_una; 1355 } 1356 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1357 tp->irs = th->th_seq; 1358 tcp_rcvseqinit(tp); 1359 tp->t_flags |= TF_ACKNOW; 1360 tcp_mss_from_peer(tp, opti.maxseg); 1361 1362 /* 1363 * Initialize the initial congestion window. If we 1364 * had to retransmit the SYN, we must initialize cwnd 1365 * to 1 segment (i.e. the Loss Window). 1366 */ 1367 if (tp->t_flags & TF_SYN_REXMT) 1368 tp->snd_cwnd = tp->t_peermss; 1369 else 1370 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, 1371 tp->t_peermss); 1372 1373 tcp_rmx_rtt(tp); 1374 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { 1375 tcpstat.tcps_connects++; 1376 soisconnected(so); 1377 tcp_established(tp); 1378 /* Do window scaling on this connection? */ 1379 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1380 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1381 tp->snd_scale = tp->requested_s_scale; 1382 tp->rcv_scale = tp->request_r_scale; 1383 } 1384 TCP_REASS_LOCK(tp); 1385 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1386 TCP_REASS_UNLOCK(tp); 1387 /* 1388 * if we didn't have to retransmit the SYN, 1389 * use its rtt as our initial srtt & rtt var. 1390 */ 1391 if (tp->t_rtt) 1392 tcp_xmit_timer(tp, tp->t_rtt); 1393 } else 1394 tp->t_state = TCPS_SYN_RECEIVED; 1395 1396 /* 1397 * Advance th->th_seq to correspond to first data byte. 1398 * If data, trim to stay within window, 1399 * dropping FIN if necessary. 1400 */ 1401 th->th_seq++; 1402 if (tlen > tp->rcv_wnd) { 1403 todrop = tlen - tp->rcv_wnd; 1404 m_adj(m, -todrop); 1405 tlen = tp->rcv_wnd; 1406 tiflags &= ~TH_FIN; 1407 tcpstat.tcps_rcvpackafterwin++; 1408 tcpstat.tcps_rcvbyteafterwin += todrop; 1409 } 1410 tp->snd_wl1 = th->th_seq - 1; 1411 tp->rcv_up = th->th_seq; 1412 goto step6; 1413 1414 /* 1415 * If the state is SYN_RECEIVED: 1416 * If seg contains an ACK, but not for our SYN, drop the input 1417 * and generate an RST. See page 36, rfc793 1418 */ 1419 case TCPS_SYN_RECEIVED: 1420 if ((tiflags & TH_ACK) && 1421 (SEQ_LEQ(th->th_ack, tp->iss) || 1422 SEQ_GT(th->th_ack, tp->snd_max))) 1423 goto dropwithreset; 1424 break; 1425 } 1426 1427 /* 1428 * States other than LISTEN or SYN_SENT. 1429 * First check timestamp, if present. 1430 * Then check that at least some bytes of segment are within 1431 * receive window. If segment begins before rcv_nxt, 1432 * drop leading data (and SYN); if nothing left, just ack. 1433 * 1434 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1435 * and it's less than ts_recent, drop it. 1436 */ 1437 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1438 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1439 1440 /* Check to see if ts_recent is over 24 days old. */ 1441 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1442 /* 1443 * Invalidate ts_recent. If this segment updates 1444 * ts_recent, the age will be reset later and ts_recent 1445 * will get a valid value. If it does not, setting 1446 * ts_recent to zero will at least satisfy the 1447 * requirement that zero be placed in the timestamp 1448 * echo reply when ts_recent isn't valid. The 1449 * age isn't reset until we get a valid ts_recent 1450 * because we don't want out-of-order segments to be 1451 * dropped when ts_recent is old. 1452 */ 1453 tp->ts_recent = 0; 1454 } else { 1455 tcpstat.tcps_rcvduppack++; 1456 tcpstat.tcps_rcvdupbyte += tlen; 1457 tcpstat.tcps_pawsdrop++; 1458 goto dropafterack; 1459 } 1460 } 1461 1462 todrop = tp->rcv_nxt - th->th_seq; 1463 if (todrop > 0) { 1464 if (tiflags & TH_SYN) { 1465 tiflags &= ~TH_SYN; 1466 th->th_seq++; 1467 if (th->th_urp > 1) 1468 th->th_urp--; 1469 else { 1470 tiflags &= ~TH_URG; 1471 th->th_urp = 0; 1472 } 1473 todrop--; 1474 } 1475 if (todrop > tlen || 1476 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1477 /* 1478 * Any valid FIN must be to the left of the window. 1479 * At this point the FIN must be a duplicate or 1480 * out of sequence; drop it. 1481 */ 1482 tiflags &= ~TH_FIN; 1483 /* 1484 * Send an ACK to resynchronize and drop any data. 1485 * But keep on processing for RST or ACK. 1486 */ 1487 tp->t_flags |= TF_ACKNOW; 1488 todrop = tlen; 1489 tcpstat.tcps_rcvdupbyte += todrop; 1490 tcpstat.tcps_rcvduppack++; 1491 } else { 1492 tcpstat.tcps_rcvpartduppack++; 1493 tcpstat.tcps_rcvpartdupbyte += todrop; 1494 } 1495 hdroptlen += todrop; /*drop from head afterwards*/ 1496 th->th_seq += todrop; 1497 tlen -= todrop; 1498 if (th->th_urp > todrop) 1499 th->th_urp -= todrop; 1500 else { 1501 tiflags &= ~TH_URG; 1502 th->th_urp = 0; 1503 } 1504 } 1505 1506 /* 1507 * If new data are received on a connection after the 1508 * user processes are gone, then RST the other end. 1509 */ 1510 if ((so->so_state & SS_NOFDREF) && 1511 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1512 tp = tcp_close(tp); 1513 tcpstat.tcps_rcvafterclose++; 1514 goto dropwithreset; 1515 } 1516 1517 /* 1518 * If segment ends after window, drop trailing data 1519 * (and PUSH and FIN); if nothing left, just ACK. 1520 */ 1521 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1522 if (todrop > 0) { 1523 tcpstat.tcps_rcvpackafterwin++; 1524 if (todrop >= tlen) { 1525 tcpstat.tcps_rcvbyteafterwin += tlen; 1526 /* 1527 * If a new connection request is received 1528 * while in TIME_WAIT, drop the old connection 1529 * and start over if the sequence numbers 1530 * are above the previous ones. 1531 */ 1532 if (tiflags & TH_SYN && 1533 tp->t_state == TCPS_TIME_WAIT && 1534 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1535 iss = tcp_new_iss(tp, sizeof(struct tcpcb), 1536 tp->snd_nxt); 1537 tp = tcp_close(tp); 1538 goto findpcb; 1539 } 1540 /* 1541 * If window is closed can only take segments at 1542 * window edge, and have to drop data and PUSH from 1543 * incoming segments. Continue processing, but 1544 * remember to ack. Otherwise, drop segment 1545 * and ack. 1546 */ 1547 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1548 tp->t_flags |= TF_ACKNOW; 1549 tcpstat.tcps_rcvwinprobe++; 1550 } else 1551 goto dropafterack; 1552 } else 1553 tcpstat.tcps_rcvbyteafterwin += todrop; 1554 m_adj(m, -todrop); 1555 tlen -= todrop; 1556 tiflags &= ~(TH_PUSH|TH_FIN); 1557 } 1558 1559 /* 1560 * If last ACK falls within this segment's sequence numbers, 1561 * and the timestamp is newer, record it. 1562 */ 1563 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1564 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1565 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen + 1566 ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 1567 tp->ts_recent_age = tcp_now; 1568 tp->ts_recent = opti.ts_val; 1569 } 1570 1571 /* 1572 * If the RST bit is set examine the state: 1573 * SYN_RECEIVED STATE: 1574 * If passive open, return to LISTEN state. 1575 * If active open, inform user that connection was refused. 1576 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1577 * Inform user that connection was reset, and close tcb. 1578 * CLOSING, LAST_ACK, TIME_WAIT STATES 1579 * Close the tcb. 1580 */ 1581 if (tiflags&TH_RST) switch (tp->t_state) { 1582 1583 case TCPS_SYN_RECEIVED: 1584 so->so_error = ECONNREFUSED; 1585 goto close; 1586 1587 case TCPS_ESTABLISHED: 1588 case TCPS_FIN_WAIT_1: 1589 case TCPS_FIN_WAIT_2: 1590 case TCPS_CLOSE_WAIT: 1591 so->so_error = ECONNRESET; 1592 close: 1593 tp->t_state = TCPS_CLOSED; 1594 tcpstat.tcps_drops++; 1595 tp = tcp_close(tp); 1596 goto drop; 1597 1598 case TCPS_CLOSING: 1599 case TCPS_LAST_ACK: 1600 case TCPS_TIME_WAIT: 1601 tp = tcp_close(tp); 1602 goto drop; 1603 } 1604 1605 /* 1606 * If a SYN is in the window, then this is an 1607 * error and we send an RST and drop the connection. 1608 */ 1609 if (tiflags & TH_SYN) { 1610 tp = tcp_drop(tp, ECONNRESET); 1611 goto dropwithreset; 1612 } 1613 1614 /* 1615 * If the ACK bit is off we drop the segment and return. 1616 */ 1617 if ((tiflags & TH_ACK) == 0) { 1618 if (tp->t_flags & TF_ACKNOW) 1619 goto dropafterack; 1620 else 1621 goto drop; 1622 } 1623 1624 /* 1625 * Ack processing. 1626 */ 1627 switch (tp->t_state) { 1628 1629 /* 1630 * In SYN_RECEIVED state if the ack ACKs our SYN then enter 1631 * ESTABLISHED state and continue processing, otherwise 1632 * send an RST. 1633 */ 1634 case TCPS_SYN_RECEIVED: 1635 if (SEQ_GT(tp->snd_una, th->th_ack) || 1636 SEQ_GT(th->th_ack, tp->snd_max)) 1637 goto dropwithreset; 1638 tcpstat.tcps_connects++; 1639 soisconnected(so); 1640 tcp_established(tp); 1641 /* Do window scaling? */ 1642 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1643 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1644 tp->snd_scale = tp->requested_s_scale; 1645 tp->rcv_scale = tp->request_r_scale; 1646 } 1647 TCP_REASS_LOCK(tp); 1648 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1649 TCP_REASS_UNLOCK(tp); 1650 tp->snd_wl1 = th->th_seq - 1; 1651 /* fall into ... */ 1652 1653 /* 1654 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1655 * ACKs. If the ack is in the range 1656 * tp->snd_una < th->th_ack <= tp->snd_max 1657 * then advance tp->snd_una to th->th_ack and drop 1658 * data from the retransmission queue. If this ACK reflects 1659 * more up to date window information we update our window information. 1660 */ 1661 case TCPS_ESTABLISHED: 1662 case TCPS_FIN_WAIT_1: 1663 case TCPS_FIN_WAIT_2: 1664 case TCPS_CLOSE_WAIT: 1665 case TCPS_CLOSING: 1666 case TCPS_LAST_ACK: 1667 case TCPS_TIME_WAIT: 1668 1669 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1670 if (tlen == 0 && tiwin == tp->snd_wnd) { 1671 tcpstat.tcps_rcvdupack++; 1672 /* 1673 * If we have outstanding data (other than 1674 * a window probe), this is a completely 1675 * duplicate ack (ie, window info didn't 1676 * change), the ack is the biggest we've 1677 * seen and we've seen exactly our rexmt 1678 * threshhold of them, assume a packet 1679 * has been dropped and retransmit it. 1680 * Kludge snd_nxt & the congestion 1681 * window so we send only this one 1682 * packet. 1683 * 1684 * We know we're losing at the current 1685 * window size so do congestion avoidance 1686 * (set ssthresh to half the current window 1687 * and pull our congestion window back to 1688 * the new ssthresh). 1689 * 1690 * Dup acks mean that packets have left the 1691 * network (they're now cached at the receiver) 1692 * so bump cwnd by the amount in the receiver 1693 * to keep a constant cwnd packets in the 1694 * network. 1695 */ 1696 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 || 1697 th->th_ack != tp->snd_una) 1698 tp->t_dupacks = 0; 1699 else if (++tp->t_dupacks == tcprexmtthresh) { 1700 tcp_seq onxt = tp->snd_nxt; 1701 u_int win = 1702 min(tp->snd_wnd, tp->snd_cwnd) / 1703 2 / tp->t_segsz; 1704 if (tcp_do_newreno && SEQ_LT(th->th_ack, 1705 tp->snd_recover)) { 1706 /* 1707 * False fast retransmit after 1708 * timeout. Do not cut window. 1709 */ 1710 tp->snd_cwnd += tp->t_segsz; 1711 tp->t_dupacks = 0; 1712 (void) tcp_output(tp); 1713 goto drop; 1714 } 1715 1716 if (win < 2) 1717 win = 2; 1718 tp->snd_ssthresh = win * tp->t_segsz; 1719 tp->snd_recover = tp->snd_max; 1720 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1721 tp->t_rtt = 0; 1722 tp->snd_nxt = th->th_ack; 1723 tp->snd_cwnd = tp->t_segsz; 1724 (void) tcp_output(tp); 1725 tp->snd_cwnd = tp->snd_ssthresh + 1726 tp->t_segsz * tp->t_dupacks; 1727 if (SEQ_GT(onxt, tp->snd_nxt)) 1728 tp->snd_nxt = onxt; 1729 goto drop; 1730 } else if (tp->t_dupacks > tcprexmtthresh) { 1731 tp->snd_cwnd += tp->t_segsz; 1732 (void) tcp_output(tp); 1733 goto drop; 1734 } 1735 } else 1736 tp->t_dupacks = 0; 1737 break; 1738 } 1739 /* 1740 * If the congestion window was inflated to account 1741 * for the other side's cached packets, retract it. 1742 */ 1743 if (tcp_do_newreno == 0) { 1744 if (tp->t_dupacks >= tcprexmtthresh && 1745 tp->snd_cwnd > tp->snd_ssthresh) 1746 tp->snd_cwnd = tp->snd_ssthresh; 1747 tp->t_dupacks = 0; 1748 } else if (tp->t_dupacks >= tcprexmtthresh && 1749 tcp_newreno(tp, th) == 0) { 1750 tp->snd_cwnd = tp->snd_ssthresh; 1751 /* 1752 * Window inflation should have left us with approx. 1753 * snd_ssthresh outstanding data. But in case we 1754 * would be inclined to send a burst, better to do 1755 * it via the slow start mechanism. 1756 */ 1757 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 1758 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 1759 + tp->t_segsz; 1760 tp->t_dupacks = 0; 1761 } 1762 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1763 tcpstat.tcps_rcvacktoomuch++; 1764 goto dropafterack; 1765 } 1766 acked = th->th_ack - tp->snd_una; 1767 tcpstat.tcps_rcvackpack++; 1768 tcpstat.tcps_rcvackbyte += acked; 1769 1770 /* 1771 * If we have a timestamp reply, update smoothed 1772 * round trip time. If no timestamp is present but 1773 * transmit timer is running and timed sequence 1774 * number was acked, update smoothed round trip time. 1775 * Since we now have an rtt measurement, cancel the 1776 * timer backoff (cf., Phil Karn's retransmit alg.). 1777 * Recompute the initial retransmit timer. 1778 */ 1779 if (opti.ts_present && opti.ts_ecr) 1780 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr + 1); 1781 else if (tp->t_rtt && SEQ_GT(th->th_ack, tp->t_rtseq)) 1782 tcp_xmit_timer(tp,tp->t_rtt); 1783 1784 /* 1785 * If all outstanding data is acked, stop retransmit 1786 * timer and remember to restart (more output or persist). 1787 * If there is more data to be acked, restart retransmit 1788 * timer, using current (possibly backed-off) value. 1789 */ 1790 if (th->th_ack == tp->snd_max) { 1791 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1792 needoutput = 1; 1793 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1794 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1795 /* 1796 * When new data is acked, open the congestion window. 1797 * If the window gives us less than ssthresh packets 1798 * in flight, open exponentially (segsz per packet). 1799 * Otherwise open linearly: segsz per window 1800 * (segsz^2 / cwnd per packet), plus a constant 1801 * fraction of a packet (segsz/8) to help larger windows 1802 * open quickly enough. 1803 */ 1804 { 1805 register u_int cw = tp->snd_cwnd; 1806 register u_int incr = tp->t_segsz; 1807 1808 if (cw > tp->snd_ssthresh) 1809 incr = incr * incr / cw; 1810 if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover)) 1811 tp->snd_cwnd = min(cw + incr, 1812 TCP_MAXWIN << tp->snd_scale); 1813 } 1814 ND6_HINT(tp); 1815 if (acked > so->so_snd.sb_cc) { 1816 tp->snd_wnd -= so->so_snd.sb_cc; 1817 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1818 ourfinisacked = 1; 1819 } else { 1820 sbdrop(&so->so_snd, acked); 1821 tp->snd_wnd -= acked; 1822 ourfinisacked = 0; 1823 } 1824 sowwakeup(so); 1825 /* 1826 * We want snd_recover to track snd_una to 1827 * avoid sequence wraparound problems for 1828 * very large transfers. 1829 */ 1830 tp->snd_una = tp->snd_recover = th->th_ack; 1831 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1832 tp->snd_nxt = tp->snd_una; 1833 1834 switch (tp->t_state) { 1835 1836 /* 1837 * In FIN_WAIT_1 STATE in addition to the processing 1838 * for the ESTABLISHED state if our FIN is now acknowledged 1839 * then enter FIN_WAIT_2. 1840 */ 1841 case TCPS_FIN_WAIT_1: 1842 if (ourfinisacked) { 1843 /* 1844 * If we can't receive any more 1845 * data, then closing user can proceed. 1846 * Starting the timer is contrary to the 1847 * specification, but if we don't get a FIN 1848 * we'll hang forever. 1849 */ 1850 if (so->so_state & SS_CANTRCVMORE) { 1851 soisdisconnected(so); 1852 if (tcp_maxidle > 0) 1853 TCP_TIMER_ARM(tp, TCPT_2MSL, 1854 tcp_maxidle); 1855 } 1856 tp->t_state = TCPS_FIN_WAIT_2; 1857 } 1858 break; 1859 1860 /* 1861 * In CLOSING STATE in addition to the processing for 1862 * the ESTABLISHED state if the ACK acknowledges our FIN 1863 * then enter the TIME-WAIT state, otherwise ignore 1864 * the segment. 1865 */ 1866 case TCPS_CLOSING: 1867 if (ourfinisacked) { 1868 tp->t_state = TCPS_TIME_WAIT; 1869 tcp_canceltimers(tp); 1870 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1871 soisdisconnected(so); 1872 } 1873 break; 1874 1875 /* 1876 * In LAST_ACK, we may still be waiting for data to drain 1877 * and/or to be acked, as well as for the ack of our FIN. 1878 * If our FIN is now acknowledged, delete the TCB, 1879 * enter the closed state and return. 1880 */ 1881 case TCPS_LAST_ACK: 1882 if (ourfinisacked) { 1883 tp = tcp_close(tp); 1884 goto drop; 1885 } 1886 break; 1887 1888 /* 1889 * In TIME_WAIT state the only thing that should arrive 1890 * is a retransmission of the remote FIN. Acknowledge 1891 * it and restart the finack timer. 1892 */ 1893 case TCPS_TIME_WAIT: 1894 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1895 goto dropafterack; 1896 } 1897 } 1898 1899 step6: 1900 /* 1901 * Update window information. 1902 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1903 */ 1904 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 1905 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) || 1906 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) { 1907 /* keep track of pure window updates */ 1908 if (tlen == 0 && 1909 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1910 tcpstat.tcps_rcvwinupd++; 1911 tp->snd_wnd = tiwin; 1912 tp->snd_wl1 = th->th_seq; 1913 tp->snd_wl2 = th->th_ack; 1914 if (tp->snd_wnd > tp->max_sndwnd) 1915 tp->max_sndwnd = tp->snd_wnd; 1916 needoutput = 1; 1917 } 1918 1919 /* 1920 * Process segments with URG. 1921 */ 1922 if ((tiflags & TH_URG) && th->th_urp && 1923 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1924 /* 1925 * This is a kludge, but if we receive and accept 1926 * random urgent pointers, we'll crash in 1927 * soreceive. It's hard to imagine someone 1928 * actually wanting to send this much urgent data. 1929 */ 1930 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1931 th->th_urp = 0; /* XXX */ 1932 tiflags &= ~TH_URG; /* XXX */ 1933 goto dodata; /* XXX */ 1934 } 1935 /* 1936 * If this segment advances the known urgent pointer, 1937 * then mark the data stream. This should not happen 1938 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1939 * a FIN has been received from the remote side. 1940 * In these states we ignore the URG. 1941 * 1942 * According to RFC961 (Assigned Protocols), 1943 * the urgent pointer points to the last octet 1944 * of urgent data. We continue, however, 1945 * to consider it to indicate the first octet 1946 * of data past the urgent section as the original 1947 * spec states (in one of two places). 1948 */ 1949 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1950 tp->rcv_up = th->th_seq + th->th_urp; 1951 so->so_oobmark = so->so_rcv.sb_cc + 1952 (tp->rcv_up - tp->rcv_nxt) - 1; 1953 if (so->so_oobmark == 0) 1954 so->so_state |= SS_RCVATMARK; 1955 sohasoutofband(so); 1956 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1957 } 1958 /* 1959 * Remove out of band data so doesn't get presented to user. 1960 * This can happen independent of advancing the URG pointer, 1961 * but if two URG's are pending at once, some out-of-band 1962 * data may creep in... ick. 1963 */ 1964 if (th->th_urp <= (u_int16_t) tlen 1965 #ifdef SO_OOBINLINE 1966 && (so->so_options & SO_OOBINLINE) == 0 1967 #endif 1968 ) 1969 tcp_pulloutofband(so, th, m, hdroptlen); 1970 } else 1971 /* 1972 * If no out of band data is expected, 1973 * pull receive urgent pointer along 1974 * with the receive window. 1975 */ 1976 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1977 tp->rcv_up = tp->rcv_nxt; 1978 dodata: /* XXX */ 1979 1980 /* 1981 * Process the segment text, merging it into the TCP sequencing queue, 1982 * and arranging for acknowledgement of receipt if necessary. 1983 * This process logically involves adjusting tp->rcv_wnd as data 1984 * is presented to the user (this happens in tcp_usrreq.c, 1985 * case PRU_RCVD). If a FIN has already been received on this 1986 * connection then we just ignore the text. 1987 */ 1988 if ((tlen || (tiflags & TH_FIN)) && 1989 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1990 /* 1991 * Insert segment ti into reassembly queue of tcp with 1992 * control block tp. Return TH_FIN if reassembly now includes 1993 * a segment with FIN. The macro form does the common case 1994 * inline (segment is the next to be received on an 1995 * established connection, and the queue is empty), 1996 * avoiding linkage into and removal from the queue and 1997 * repetition of various conversions. 1998 * Set DELACK for segments received in order, but ack 1999 * immediately when segments are out of order 2000 * (so fast retransmit can work). 2001 */ 2002 /* NOTE: this was TCP_REASS() macro, but used only once */ 2003 TCP_REASS_LOCK(tp); 2004 if (th->th_seq == tp->rcv_nxt && 2005 tp->segq.lh_first == NULL && 2006 tp->t_state == TCPS_ESTABLISHED) { 2007 TCP_SETUP_ACK(tp, th); 2008 tp->rcv_nxt += tlen; 2009 tiflags = th->th_flags & TH_FIN; 2010 tcpstat.tcps_rcvpack++; 2011 tcpstat.tcps_rcvbyte += tlen; 2012 ND6_HINT(tp); 2013 m_adj(m, hdroptlen); 2014 sbappend(&(so)->so_rcv, m); 2015 sorwakeup(so); 2016 } else { 2017 m_adj(m, hdroptlen); 2018 tiflags = tcp_reass(tp, th, m, &tlen); 2019 tp->t_flags |= TF_ACKNOW; 2020 } 2021 TCP_REASS_UNLOCK(tp); 2022 2023 /* 2024 * Note the amount of data that peer has sent into 2025 * our window, in order to estimate the sender's 2026 * buffer size. 2027 */ 2028 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2029 } else { 2030 m_freem(m); 2031 m = NULL; 2032 tiflags &= ~TH_FIN; 2033 } 2034 2035 /* 2036 * If FIN is received ACK the FIN and let the user know 2037 * that the connection is closing. Ignore a FIN received before 2038 * the connection is fully established. 2039 */ 2040 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2041 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2042 socantrcvmore(so); 2043 tp->t_flags |= TF_ACKNOW; 2044 tp->rcv_nxt++; 2045 } 2046 switch (tp->t_state) { 2047 2048 /* 2049 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2050 */ 2051 case TCPS_ESTABLISHED: 2052 tp->t_state = TCPS_CLOSE_WAIT; 2053 break; 2054 2055 /* 2056 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2057 * enter the CLOSING state. 2058 */ 2059 case TCPS_FIN_WAIT_1: 2060 tp->t_state = TCPS_CLOSING; 2061 break; 2062 2063 /* 2064 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2065 * starting the time-wait timer, turning off the other 2066 * standard timers. 2067 */ 2068 case TCPS_FIN_WAIT_2: 2069 tp->t_state = TCPS_TIME_WAIT; 2070 tcp_canceltimers(tp); 2071 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2072 soisdisconnected(so); 2073 break; 2074 2075 /* 2076 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2077 */ 2078 case TCPS_TIME_WAIT: 2079 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2080 break; 2081 } 2082 } 2083 if (so->so_options & SO_DEBUG) { 2084 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); 2085 } 2086 2087 /* 2088 * Return any desired output. 2089 */ 2090 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2091 (void) tcp_output(tp); 2092 if (tcp_saveti) 2093 m_freem(tcp_saveti); 2094 return; 2095 2096 badsyn: 2097 /* 2098 * Received a bad SYN. Increment counters and dropwithreset. 2099 */ 2100 tcpstat.tcps_badsyn++; 2101 tp = NULL; 2102 goto dropwithreset; 2103 2104 dropafterack: 2105 /* 2106 * Generate an ACK dropping incoming segment if it occupies 2107 * sequence space, where the ACK reflects our state. 2108 */ 2109 if (tiflags & TH_RST) 2110 goto drop; 2111 m_freem(m); 2112 tp->t_flags |= TF_ACKNOW; 2113 (void) tcp_output(tp); 2114 if (tcp_saveti) 2115 m_freem(tcp_saveti); 2116 return; 2117 2118 dropwithreset_ratelim: 2119 /* 2120 * We may want to rate-limit RSTs in certain situations, 2121 * particularly if we are sending an RST in response to 2122 * an attempt to connect to or otherwise communicate with 2123 * a port for which we have no socket. 2124 */ 2125 if (ratecheck(&tcp_rst_ratelim_last, &tcp_rst_ratelim) == 0) { 2126 /* XXX stat */ 2127 goto drop; 2128 } 2129 /* ...fall into dropwithreset... */ 2130 2131 dropwithreset: 2132 /* 2133 * Generate a RST, dropping incoming segment. 2134 * Make ACK acceptable to originator of segment. 2135 */ 2136 if (tiflags & TH_RST) 2137 goto drop; 2138 { 2139 /* 2140 * need to recover version # field, which was overwritten on 2141 * ip_cksum computation. 2142 */ 2143 struct ip *sip; 2144 sip = mtod(m, struct ip *); 2145 switch (af) { 2146 case AF_INET: 2147 sip->ip_v = 4; 2148 break; 2149 #ifdef INET6 2150 case AF_INET6: 2151 sip->ip_v = 6; 2152 break; 2153 #endif 2154 } 2155 } 2156 if (tiflags & TH_ACK) 2157 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 2158 else { 2159 if (tiflags & TH_SYN) 2160 tlen++; 2161 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, 2162 TH_RST|TH_ACK); 2163 } 2164 if (tcp_saveti) 2165 m_freem(tcp_saveti); 2166 return; 2167 2168 drop: 2169 /* 2170 * Drop space held by incoming segment and return. 2171 */ 2172 if (tp) { 2173 if (tp->t_inpcb) 2174 so = tp->t_inpcb->inp_socket; 2175 #ifdef INET6 2176 else if (tp->t_in6pcb) 2177 so = tp->t_in6pcb->in6p_socket; 2178 #endif 2179 else 2180 so = NULL; 2181 if (so && (so->so_options & SO_DEBUG) != 0) 2182 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); 2183 } 2184 if (tcp_saveti) 2185 m_freem(tcp_saveti); 2186 m_freem(m); 2187 return; 2188 } 2189 2190 void 2191 tcp_dooptions(tp, cp, cnt, th, oi) 2192 struct tcpcb *tp; 2193 u_char *cp; 2194 int cnt; 2195 struct tcphdr *th; 2196 struct tcp_opt_info *oi; 2197 { 2198 u_int16_t mss; 2199 int opt, optlen; 2200 2201 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2202 opt = cp[0]; 2203 if (opt == TCPOPT_EOL) 2204 break; 2205 if (opt == TCPOPT_NOP) 2206 optlen = 1; 2207 else { 2208 optlen = cp[1]; 2209 if (optlen <= 0) 2210 break; 2211 } 2212 switch (opt) { 2213 2214 default: 2215 continue; 2216 2217 case TCPOPT_MAXSEG: 2218 if (optlen != TCPOLEN_MAXSEG) 2219 continue; 2220 if (!(th->th_flags & TH_SYN)) 2221 continue; 2222 bcopy(cp + 2, &mss, sizeof(mss)); 2223 oi->maxseg = ntohs(mss); 2224 break; 2225 2226 case TCPOPT_WINDOW: 2227 if (optlen != TCPOLEN_WINDOW) 2228 continue; 2229 if (!(th->th_flags & TH_SYN)) 2230 continue; 2231 tp->t_flags |= TF_RCVD_SCALE; 2232 tp->requested_s_scale = cp[2]; 2233 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { 2234 #if 0 /*XXX*/ 2235 char *p; 2236 2237 if (ip) 2238 p = ntohl(ip->ip_src); 2239 #ifdef INET6 2240 else if (ip6) 2241 p = ip6_sprintf(&ip6->ip6_src); 2242 #endif 2243 else 2244 p = "(unknown)"; 2245 log(LOG_ERR, "TCP: invalid wscale %d from %s, " 2246 "assuming %d\n", 2247 tp->requested_s_scale, p, 2248 TCP_MAX_WINSHIFT); 2249 #else 2250 log(LOG_ERR, "TCP: invalid wscale %d, " 2251 "assuming %d\n", 2252 tp->requested_s_scale, 2253 TCP_MAX_WINSHIFT); 2254 #endif 2255 tp->requested_s_scale = TCP_MAX_WINSHIFT; 2256 } 2257 break; 2258 2259 case TCPOPT_TIMESTAMP: 2260 if (optlen != TCPOLEN_TIMESTAMP) 2261 continue; 2262 oi->ts_present = 1; 2263 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2264 NTOHL(oi->ts_val); 2265 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2266 NTOHL(oi->ts_ecr); 2267 2268 /* 2269 * A timestamp received in a SYN makes 2270 * it ok to send timestamp requests and replies. 2271 */ 2272 if (th->th_flags & TH_SYN) { 2273 tp->t_flags |= TF_RCVD_TSTMP; 2274 tp->ts_recent = oi->ts_val; 2275 tp->ts_recent_age = tcp_now; 2276 } 2277 break; 2278 case TCPOPT_SACK_PERMITTED: 2279 if (optlen != TCPOLEN_SACK_PERMITTED) 2280 continue; 2281 if (!(th->th_flags & TH_SYN)) 2282 continue; 2283 tp->t_flags &= ~TF_CANT_TXSACK; 2284 break; 2285 2286 case TCPOPT_SACK: 2287 if (tp->t_flags & TF_IGNR_RXSACK) 2288 continue; 2289 if (optlen % 8 != 2 || optlen < 10) 2290 continue; 2291 cp += 2; 2292 optlen -= 2; 2293 for (; optlen > 0; cp -= 8, optlen -= 8) { 2294 tcp_seq lwe, rwe; 2295 bcopy((char *)cp, (char *) &lwe, sizeof(lwe)); 2296 NTOHL(lwe); 2297 bcopy((char *)cp, (char *) &rwe, sizeof(rwe)); 2298 NTOHL(rwe); 2299 /* tcp_mark_sacked(tp, lwe, rwe); */ 2300 } 2301 break; 2302 } 2303 } 2304 } 2305 2306 /* 2307 * Pull out of band byte out of a segment so 2308 * it doesn't appear in the user's data queue. 2309 * It is still reflected in the segment length for 2310 * sequencing purposes. 2311 */ 2312 void 2313 tcp_pulloutofband(so, th, m, off) 2314 struct socket *so; 2315 struct tcphdr *th; 2316 register struct mbuf *m; 2317 int off; 2318 { 2319 int cnt = off + th->th_urp - 1; 2320 2321 while (cnt >= 0) { 2322 if (m->m_len > cnt) { 2323 char *cp = mtod(m, caddr_t) + cnt; 2324 struct tcpcb *tp = sototcpcb(so); 2325 2326 tp->t_iobc = *cp; 2327 tp->t_oobflags |= TCPOOB_HAVEDATA; 2328 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2329 m->m_len--; 2330 return; 2331 } 2332 cnt -= m->m_len; 2333 m = m->m_next; 2334 if (m == 0) 2335 break; 2336 } 2337 panic("tcp_pulloutofband"); 2338 } 2339 2340 /* 2341 * Collect new round-trip time estimate 2342 * and update averages and current timeout. 2343 */ 2344 void 2345 tcp_xmit_timer(tp, rtt) 2346 register struct tcpcb *tp; 2347 short rtt; 2348 { 2349 register short delta; 2350 short rttmin; 2351 2352 tcpstat.tcps_rttupdated++; 2353 --rtt; 2354 if (tp->t_srtt != 0) { 2355 /* 2356 * srtt is stored as fixed point with 3 bits after the 2357 * binary point (i.e., scaled by 8). The following magic 2358 * is equivalent to the smoothing algorithm in rfc793 with 2359 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2360 * point). Adjust rtt to origin 0. 2361 */ 2362 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 2363 if ((tp->t_srtt += delta) <= 0) 2364 tp->t_srtt = 1 << 2; 2365 /* 2366 * We accumulate a smoothed rtt variance (actually, a 2367 * smoothed mean difference), then set the retransmit 2368 * timer to smoothed rtt + 4 times the smoothed variance. 2369 * rttvar is stored as fixed point with 2 bits after the 2370 * binary point (scaled by 4). The following is 2371 * equivalent to rfc793 smoothing with an alpha of .75 2372 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2373 * rfc793's wired-in beta. 2374 */ 2375 if (delta < 0) 2376 delta = -delta; 2377 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2378 if ((tp->t_rttvar += delta) <= 0) 2379 tp->t_rttvar = 1 << 2; 2380 } else { 2381 /* 2382 * No rtt measurement yet - use the unsmoothed rtt. 2383 * Set the variance to half the rtt (so our first 2384 * retransmit happens at 3*rtt). 2385 */ 2386 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 2387 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 2388 } 2389 tp->t_rtt = 0; 2390 tp->t_rxtshift = 0; 2391 2392 /* 2393 * the retransmit should happen at rtt + 4 * rttvar. 2394 * Because of the way we do the smoothing, srtt and rttvar 2395 * will each average +1/2 tick of bias. When we compute 2396 * the retransmit timer, we want 1/2 tick of rounding and 2397 * 1 extra tick because of +-1/2 tick uncertainty in the 2398 * firing of the timer. The bias will give us exactly the 2399 * 1.5 tick we need. But, because the bias is 2400 * statistical, we have to test that we don't drop below 2401 * the minimum feasible timer (which is 2 ticks). 2402 */ 2403 if (tp->t_rttmin > rtt + 2) 2404 rttmin = tp->t_rttmin; 2405 else 2406 rttmin = rtt + 2; 2407 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2408 2409 /* 2410 * We received an ack for a packet that wasn't retransmitted; 2411 * it is probably safe to discard any error indications we've 2412 * received recently. This isn't quite right, but close enough 2413 * for now (a route might have failed after we sent a segment, 2414 * and the return path might not be symmetrical). 2415 */ 2416 tp->t_softerror = 0; 2417 } 2418 2419 /* 2420 * Checks for partial ack. If partial ack arrives, force the retransmission 2421 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 2422 * 1. By setting snd_nxt to th_ack, this forces retransmission timer to 2423 * be started again. If the ack advances at least to tp->snd_recover, return 0. 2424 */ 2425 int 2426 tcp_newreno(tp, th) 2427 struct tcpcb *tp; 2428 struct tcphdr *th; 2429 { 2430 tcp_seq onxt = tp->snd_nxt; 2431 u_long ocwnd = tp->snd_cwnd; 2432 2433 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2434 /* 2435 * snd_una has not yet been updated and the socket's send 2436 * buffer has not yet drained off the ACK'd data, so we 2437 * have to leave snd_una as it was to get the correct data 2438 * offset in tcp_output(). 2439 */ 2440 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2441 tp->t_rtt = 0; 2442 tp->snd_nxt = th->th_ack; 2443 /* 2444 * Set snd_cwnd to one segment beyond ACK'd offset. snd_una 2445 * is not yet updated when we're called. 2446 */ 2447 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); 2448 (void) tcp_output(tp); 2449 tp->snd_cwnd = ocwnd; 2450 if (SEQ_GT(onxt, tp->snd_nxt)) 2451 tp->snd_nxt = onxt; 2452 /* 2453 * Partial window deflation. Relies on fact that tp->snd_una 2454 * not updated yet. 2455 */ 2456 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz); 2457 return 1; 2458 } 2459 return 0; 2460 } 2461 2462 2463 /* 2464 * TCP compressed state engine. Currently used to hold compressed 2465 * state for SYN_RECEIVED. 2466 */ 2467 2468 u_long syn_cache_count; 2469 u_int32_t syn_hash1, syn_hash2; 2470 2471 #define SYN_HASH(sa, sp, dp) \ 2472 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 2473 ((u_int32_t)(sp)))^syn_hash2))) 2474 #ifndef INET6 2475 #define SYN_HASHALL(hash, src, dst) \ 2476 do { \ 2477 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2478 ((struct sockaddr_in *)(src))->sin_port, \ 2479 ((struct sockaddr_in *)(dst))->sin_port); \ 2480 } while (0) 2481 #else 2482 #define SYN_HASH6(sa, sp, dp) \ 2483 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 2484 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 2485 & 0x7fffffff) 2486 2487 #define SYN_HASHALL(hash, src, dst) \ 2488 do { \ 2489 switch ((src)->sa_family) { \ 2490 case AF_INET: \ 2491 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2492 ((struct sockaddr_in *)(src))->sin_port, \ 2493 ((struct sockaddr_in *)(dst))->sin_port); \ 2494 break; \ 2495 case AF_INET6: \ 2496 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 2497 ((struct sockaddr_in6 *)(src))->sin6_port, \ 2498 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 2499 break; \ 2500 default: \ 2501 hash = 0; \ 2502 } \ 2503 } while (0) 2504 #endif /* INET6 */ 2505 2506 #define SYN_CACHE_RM(sc) \ 2507 do { \ 2508 LIST_REMOVE((sc), sc_bucketq); \ 2509 (sc)->sc_tp = NULL; \ 2510 LIST_REMOVE((sc), sc_tpq); \ 2511 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 2512 TAILQ_REMOVE(&tcp_syn_cache_timeq[(sc)->sc_rxtshift], (sc), sc_timeq); \ 2513 syn_cache_count--; \ 2514 } while (0) 2515 2516 #define SYN_CACHE_PUT(sc) \ 2517 do { \ 2518 if ((sc)->sc_ipopts) \ 2519 (void) m_free((sc)->sc_ipopts); \ 2520 if ((sc)->sc_route4.ro_rt != NULL) \ 2521 RTFREE((sc)->sc_route4.ro_rt); \ 2522 pool_put(&syn_cache_pool, (sc)); \ 2523 } while (0) 2524 2525 struct pool syn_cache_pool; 2526 2527 /* 2528 * We don't estimate RTT with SYNs, so each packet starts with the default 2529 * RTT and each timer queue has a fixed timeout value. This allows us to 2530 * optimize the timer queues somewhat. 2531 */ 2532 #define SYN_CACHE_TIMER_ARM(sc) \ 2533 do { \ 2534 TCPT_RANGESET((sc)->sc_rxtcur, \ 2535 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 2536 TCPTV_REXMTMAX); \ 2537 PRT_SLOW_ARM((sc)->sc_rexmt, (sc)->sc_rxtcur); \ 2538 } while (0) 2539 2540 TAILQ_HEAD(, syn_cache) tcp_syn_cache_timeq[TCP_MAXRXTSHIFT + 1]; 2541 2542 void 2543 syn_cache_init() 2544 { 2545 int i; 2546 2547 /* Initialize the hash buckets. */ 2548 for (i = 0; i < tcp_syn_cache_size; i++) 2549 LIST_INIT(&tcp_syn_cache[i].sch_bucket); 2550 2551 /* Initialize the timer queues. */ 2552 for (i = 0; i <= TCP_MAXRXTSHIFT; i++) 2553 TAILQ_INIT(&tcp_syn_cache_timeq[i]); 2554 2555 /* Initialize the syn cache pool. */ 2556 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 2557 "synpl", 0, NULL, NULL, M_PCB); 2558 } 2559 2560 void 2561 syn_cache_insert(sc, tp) 2562 struct syn_cache *sc; 2563 struct tcpcb *tp; 2564 { 2565 struct syn_cache_head *scp; 2566 struct syn_cache *sc2; 2567 int s, i; 2568 2569 /* 2570 * If there are no entries in the hash table, reinitialize 2571 * the hash secrets. 2572 */ 2573 if (syn_cache_count == 0) { 2574 struct timeval tv; 2575 microtime(&tv); 2576 syn_hash1 = random() ^ (u_long)≻ 2577 syn_hash2 = random() ^ tv.tv_usec; 2578 } 2579 2580 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 2581 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 2582 scp = &tcp_syn_cache[sc->sc_bucketidx]; 2583 2584 /* 2585 * Make sure that we don't overflow the per-bucket 2586 * limit or the total cache size limit. 2587 */ 2588 s = splsoftnet(); 2589 if (scp->sch_length >= tcp_syn_bucket_limit) { 2590 tcpstat.tcps_sc_bucketoverflow++; 2591 /* 2592 * The bucket is full. Toss the oldest element in the 2593 * bucket. This will be the entry with our bucket 2594 * index closest to the front of the timer queue with 2595 * the largest timeout value. 2596 * 2597 * Note: This timer queue traversal may be expensive, so 2598 * we hope that this doesn't happen very often. It is 2599 * much more likely that we'll overflow the entire 2600 * cache, which is much easier to handle; see below. 2601 */ 2602 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) { 2603 for (sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2604 sc2 != NULL; 2605 sc2 = TAILQ_NEXT(sc2, sc_timeq)) { 2606 if (sc2->sc_bucketidx == sc->sc_bucketidx) { 2607 SYN_CACHE_RM(sc2); 2608 SYN_CACHE_PUT(sc2); 2609 goto insert; /* 2 level break */ 2610 } 2611 } 2612 } 2613 #ifdef DIAGNOSTIC 2614 /* 2615 * This should never happen; we should always find an 2616 * entry in our bucket. 2617 */ 2618 panic("syn_cache_insert: bucketoverflow: impossible"); 2619 #endif 2620 } else if (syn_cache_count >= tcp_syn_cache_limit) { 2621 tcpstat.tcps_sc_overflowed++; 2622 /* 2623 * The cache is full. Toss the oldest entry in the 2624 * entire cache. This is the front entry in the 2625 * first non-empty timer queue with the largest 2626 * timeout value. 2627 */ 2628 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) { 2629 sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2630 if (sc2 == NULL) 2631 continue; 2632 SYN_CACHE_RM(sc2); 2633 SYN_CACHE_PUT(sc2); 2634 goto insert; /* symmetry with above */ 2635 } 2636 #ifdef DIAGNOSTIC 2637 /* 2638 * This should never happen; we should always find an 2639 * entry in the cache. 2640 */ 2641 panic("syn_cache_insert: cache overflow: impossible"); 2642 #endif 2643 } 2644 2645 insert: 2646 /* 2647 * Initialize the entry's timer. 2648 */ 2649 sc->sc_rxttot = 0; 2650 sc->sc_rxtshift = 0; 2651 SYN_CACHE_TIMER_ARM(sc); 2652 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], sc, sc_timeq); 2653 2654 /* Link it from tcpcb entry */ 2655 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 2656 2657 /* Put it into the bucket. */ 2658 LIST_INSERT_HEAD(&scp->sch_bucket, sc, sc_bucketq); 2659 scp->sch_length++; 2660 syn_cache_count++; 2661 2662 tcpstat.tcps_sc_added++; 2663 splx(s); 2664 } 2665 2666 /* 2667 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 2668 * If we have retransmitted an entry the maximum number of times, expire 2669 * that entry. 2670 */ 2671 void 2672 syn_cache_timer() 2673 { 2674 struct syn_cache *sc, *nsc; 2675 int i, s; 2676 2677 s = splsoftnet(); 2678 2679 /* 2680 * First, get all the entries that need to be retransmitted, or 2681 * must be expired due to exceeding the initial keepalive time. 2682 */ 2683 for (i = 0; i < TCP_MAXRXTSHIFT; i++) { 2684 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2685 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt); 2686 sc = nsc) { 2687 nsc = TAILQ_NEXT(sc, sc_timeq); 2688 2689 /* 2690 * Compute the total amount of time this entry has 2691 * been on a queue. If this entry has been on longer 2692 * than the keep alive timer would allow, expire it. 2693 */ 2694 sc->sc_rxttot += sc->sc_rxtcur; 2695 if (sc->sc_rxttot >= TCPTV_KEEP_INIT) { 2696 tcpstat.tcps_sc_timed_out++; 2697 SYN_CACHE_RM(sc); 2698 SYN_CACHE_PUT(sc); 2699 continue; 2700 } 2701 2702 tcpstat.tcps_sc_retransmitted++; 2703 (void) syn_cache_respond(sc, NULL); 2704 2705 /* Advance this entry onto the next timer queue. */ 2706 TAILQ_REMOVE(&tcp_syn_cache_timeq[i], sc, sc_timeq); 2707 sc->sc_rxtshift = i + 1; 2708 SYN_CACHE_TIMER_ARM(sc); 2709 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], 2710 sc, sc_timeq); 2711 } 2712 } 2713 2714 /* 2715 * Now get all the entries that are expired due to too many 2716 * retransmissions. 2717 */ 2718 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[TCP_MAXRXTSHIFT]); 2719 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt); 2720 sc = nsc) { 2721 nsc = TAILQ_NEXT(sc, sc_timeq); 2722 tcpstat.tcps_sc_timed_out++; 2723 SYN_CACHE_RM(sc); 2724 SYN_CACHE_PUT(sc); 2725 } 2726 splx(s); 2727 } 2728 2729 /* 2730 * Remove syn cache created by the specified tcb entry, 2731 * because this does not make sense to keep them 2732 * (if there's no tcb entry, syn cache entry will never be used) 2733 */ 2734 void 2735 syn_cache_cleanup(tp) 2736 struct tcpcb *tp; 2737 { 2738 struct syn_cache *sc, *nsc; 2739 int s; 2740 2741 s = splsoftnet(); 2742 2743 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 2744 nsc = LIST_NEXT(sc, sc_tpq); 2745 2746 #ifdef DIAGNOSTIC 2747 if (sc->sc_tp != tp) 2748 panic("invalid sc_tp in syn_cache_cleanup"); 2749 #endif 2750 SYN_CACHE_RM(sc); 2751 SYN_CACHE_PUT(sc); 2752 } 2753 /* just for safety */ 2754 LIST_INIT(&tp->t_sc); 2755 2756 splx(s); 2757 } 2758 2759 /* 2760 * Find an entry in the syn cache. 2761 */ 2762 struct syn_cache * 2763 syn_cache_lookup(src, dst, headp) 2764 struct sockaddr *src; 2765 struct sockaddr *dst; 2766 struct syn_cache_head **headp; 2767 { 2768 struct syn_cache *sc; 2769 struct syn_cache_head *scp; 2770 u_int32_t hash; 2771 int s; 2772 2773 SYN_HASHALL(hash, src, dst); 2774 2775 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 2776 *headp = scp; 2777 s = splsoftnet(); 2778 for (sc = LIST_FIRST(&scp->sch_bucket); sc != NULL; 2779 sc = LIST_NEXT(sc, sc_bucketq)) { 2780 if (sc->sc_hash != hash) 2781 continue; 2782 if (!bcmp(&sc->sc_src, src, src->sa_len) && 2783 !bcmp(&sc->sc_dst, dst, dst->sa_len)) { 2784 splx(s); 2785 return (sc); 2786 } 2787 } 2788 splx(s); 2789 return (NULL); 2790 } 2791 2792 /* 2793 * This function gets called when we receive an ACK for a 2794 * socket in the LISTEN state. We look up the connection 2795 * in the syn cache, and if its there, we pull it out of 2796 * the cache and turn it into a full-blown connection in 2797 * the SYN-RECEIVED state. 2798 * 2799 * The return values may not be immediately obvious, and their effects 2800 * can be subtle, so here they are: 2801 * 2802 * NULL SYN was not found in cache; caller should drop the 2803 * packet and send an RST. 2804 * 2805 * -1 We were unable to create the new connection, and are 2806 * aborting it. An ACK,RST is being sent to the peer 2807 * (unless we got screwey sequence numbners; see below), 2808 * because the 3-way handshake has been completed. Caller 2809 * should not free the mbuf, since we may be using it. If 2810 * we are not, we will free it. 2811 * 2812 * Otherwise, the return value is a pointer to the new socket 2813 * associated with the connection. 2814 */ 2815 struct socket * 2816 syn_cache_get(src, dst, th, hlen, tlen, so, m) 2817 struct sockaddr *src; 2818 struct sockaddr *dst; 2819 struct tcphdr *th; 2820 unsigned int hlen, tlen; 2821 struct socket *so; 2822 struct mbuf *m; 2823 { 2824 struct syn_cache *sc; 2825 struct syn_cache_head *scp; 2826 register struct inpcb *inp = NULL; 2827 #ifdef INET6 2828 register struct in6pcb *in6p = NULL; 2829 #endif 2830 register struct tcpcb *tp = 0; 2831 struct mbuf *am; 2832 int s; 2833 struct socket *oso; 2834 2835 s = splsoftnet(); 2836 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 2837 splx(s); 2838 return (NULL); 2839 } 2840 2841 /* 2842 * Verify the sequence and ack numbers. Try getting the correct 2843 * response again. 2844 */ 2845 if ((th->th_ack != sc->sc_iss + 1) || 2846 SEQ_LEQ(th->th_seq, sc->sc_irs) || 2847 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 2848 (void) syn_cache_respond(sc, m); 2849 splx(s); 2850 return ((struct socket *)(-1)); 2851 } 2852 2853 /* Remove this cache entry */ 2854 SYN_CACHE_RM(sc); 2855 splx(s); 2856 2857 /* 2858 * Ok, create the full blown connection, and set things up 2859 * as they would have been set up if we had created the 2860 * connection when the SYN arrived. If we can't create 2861 * the connection, abort it. 2862 */ 2863 /* 2864 * inp still has the OLD in_pcb stuff, set the 2865 * v6-related flags on the new guy, too. This is 2866 * done particularly for the case where an AF_INET6 2867 * socket is bound only to a port, and a v4 connection 2868 * comes in on that port. 2869 * we also copy the flowinfo from the original pcb 2870 * to the new one. 2871 */ 2872 { 2873 struct inpcb *parentinpcb; 2874 2875 parentinpcb = (struct inpcb *)so->so_pcb; 2876 2877 oso = so; 2878 so = sonewconn(so, SS_ISCONNECTED); 2879 if (so == NULL) 2880 goto resetandabort; 2881 2882 switch (so->so_proto->pr_domain->dom_family) { 2883 case AF_INET: 2884 inp = sotoinpcb(so); 2885 break; 2886 #ifdef INET6 2887 case AF_INET6: 2888 in6p = sotoin6pcb(so); 2889 #if 0 /*def INET6*/ 2890 inp->inp_flags |= (parentinpcb->inp_flags & 2891 (INP_IPV6 | INP_IPV6_UNDEC | INP_IPV6_MAPPED)); 2892 if ((inp->inp_flags & INP_IPV6) && 2893 !(inp->inp_flags & INP_IPV6_MAPPED)) { 2894 inp->inp_ipv6.ip6_hlim = parentinpcb->inp_ipv6.ip6_hlim; 2895 inp->inp_ipv6.ip6_vfc = parentinpcb->inp_ipv6.ip6_vfc; 2896 } 2897 #endif 2898 break; 2899 #endif 2900 } 2901 } 2902 switch (src->sa_family) { 2903 case AF_INET: 2904 if (inp) { 2905 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 2906 inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; 2907 inp->inp_options = ip_srcroute(); 2908 in_pcbstate(inp, INP_BOUND); 2909 if (inp->inp_options == NULL) { 2910 inp->inp_options = sc->sc_ipopts; 2911 sc->sc_ipopts = NULL; 2912 } 2913 } 2914 #ifdef INET6 2915 else if (in6p) { 2916 /* IPv4 packet to AF_INET6 socket */ 2917 bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr)); 2918 in6p->in6p_laddr.s6_addr16[5] = htons(0xffff); 2919 bcopy(&((struct sockaddr_in *)dst)->sin_addr, 2920 &in6p->in6p_laddr.s6_addr32[3], 2921 sizeof(((struct sockaddr_in *)dst)->sin_addr)); 2922 in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port; 2923 in6totcpcb(in6p)->t_family = AF_INET; 2924 } 2925 #endif 2926 break; 2927 #ifdef INET6 2928 case AF_INET6: 2929 if (in6p) { 2930 in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr; 2931 in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port; 2932 #if 0 2933 in6p->in6p_flowinfo = ip6->ip6_flow & IPV6_FLOWINFO_MASK; 2934 /*inp->inp_options = ip6_srcroute();*/ /* soon. */ 2935 #endif 2936 } 2937 break; 2938 #endif 2939 } 2940 #ifdef INET6 2941 if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) { 2942 struct in6pcb *oin6p = sotoin6pcb(oso); 2943 /* inherit socket options from the listening socket */ 2944 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS); 2945 if (in6p->in6p_flags & IN6P_CONTROLOPTS) { 2946 m_freem(in6p->in6p_options); 2947 in6p->in6p_options = 0; 2948 } 2949 ip6_savecontrol(in6p, &in6p->in6p_options, 2950 mtod(m, struct ip6_hdr *), m); 2951 } 2952 #endif 2953 2954 #ifdef IPSEC 2955 /* 2956 * we make a copy of policy, instead of sharing the policy, 2957 * for better behavior in terms of SA lookup and dead SA removal. 2958 */ 2959 if (inp) { 2960 /* copy old policy into new socket's */ 2961 if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) 2962 printf("tcp_input: could not copy policy\n"); 2963 } 2964 #ifdef INET6 2965 else if (in6p) { 2966 /* copy old policy into new socket's */ 2967 if (ipsec_copy_policy(sotoin6pcb(oso)->in6p_sp, in6p->in6p_sp)) 2968 printf("tcp_input: could not copy policy\n"); 2969 } 2970 #endif 2971 #endif 2972 2973 /* 2974 * Give the new socket our cached route reference. 2975 */ 2976 if (inp) 2977 inp->inp_route = sc->sc_route4; /* struct assignment */ 2978 #ifdef INET6 2979 else 2980 in6p->in6p_route = sc->sc_route6; 2981 #endif 2982 sc->sc_route4.ro_rt = NULL; 2983 2984 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 2985 if (am == NULL) 2986 goto resetandabort; 2987 am->m_len = src->sa_len; 2988 bcopy(src, mtod(am, caddr_t), src->sa_len); 2989 if (inp) { 2990 if (in_pcbconnect(inp, am)) { 2991 (void) m_free(am); 2992 goto resetandabort; 2993 } 2994 } 2995 #ifdef INET6 2996 else if (in6p) { 2997 if (src->sa_family == AF_INET) { 2998 /* IPv4 packet to AF_INET6 socket */ 2999 struct sockaddr_in6 *sin6; 3000 sin6 = mtod(am, struct sockaddr_in6 *); 3001 am->m_len = sizeof(*sin6); 3002 bzero(sin6, sizeof(*sin6)); 3003 sin6->sin6_family = AF_INET6; 3004 sin6->sin6_len = sizeof(*sin6); 3005 sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port; 3006 sin6->sin6_addr.s6_addr16[5] = htons(0xffff); 3007 bcopy(&((struct sockaddr_in *)src)->sin_addr, 3008 &sin6->sin6_addr.s6_addr32[3], 3009 sizeof(sin6->sin6_addr.s6_addr32[3])); 3010 } 3011 if (in6_pcbconnect(in6p, am)) { 3012 (void) m_free(am); 3013 goto resetandabort; 3014 } 3015 } 3016 #endif 3017 else { 3018 (void) m_free(am); 3019 goto resetandabort; 3020 } 3021 (void) m_free(am); 3022 3023 if (inp) 3024 tp = intotcpcb(inp); 3025 #ifdef INET6 3026 else if (in6p) 3027 tp = in6totcpcb(in6p); 3028 #endif 3029 else 3030 tp = NULL; 3031 if (sc->sc_request_r_scale != 15) { 3032 tp->requested_s_scale = sc->sc_requested_s_scale; 3033 tp->request_r_scale = sc->sc_request_r_scale; 3034 tp->snd_scale = sc->sc_requested_s_scale; 3035 tp->rcv_scale = sc->sc_request_r_scale; 3036 tp->t_flags |= TF_RCVD_SCALE; 3037 } 3038 if (sc->sc_flags & SCF_TIMESTAMP) 3039 tp->t_flags |= TF_RCVD_TSTMP; 3040 3041 tp->t_template = tcp_template(tp); 3042 if (tp->t_template == 0) { 3043 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3044 so = NULL; 3045 m_freem(m); 3046 goto abort; 3047 } 3048 3049 tp->iss = sc->sc_iss; 3050 tp->irs = sc->sc_irs; 3051 tcp_sendseqinit(tp); 3052 tcp_rcvseqinit(tp); 3053 tp->t_state = TCPS_SYN_RECEIVED; 3054 TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT); 3055 tcpstat.tcps_accepts++; 3056 3057 /* Initialize tp->t_ourmss before we deal with the peer's! */ 3058 tp->t_ourmss = sc->sc_ourmaxseg; 3059 tcp_mss_from_peer(tp, sc->sc_peermaxseg); 3060 3061 /* 3062 * Initialize the initial congestion window. If we 3063 * had to retransmit the SYN,ACK, we must initialize cwnd 3064 * to 1 segment (i.e. the Loss Window). 3065 */ 3066 if (sc->sc_rxtshift) 3067 tp->snd_cwnd = tp->t_peermss; 3068 else 3069 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, tp->t_peermss); 3070 3071 tcp_rmx_rtt(tp); 3072 tp->snd_wl1 = sc->sc_irs; 3073 tp->rcv_up = sc->sc_irs + 1; 3074 3075 /* 3076 * This is what whould have happened in tcp_ouput() when 3077 * the SYN,ACK was sent. 3078 */ 3079 tp->snd_up = tp->snd_una; 3080 tp->snd_max = tp->snd_nxt = tp->iss+1; 3081 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3082 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3083 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3084 tp->last_ack_sent = tp->rcv_nxt; 3085 3086 tcpstat.tcps_sc_completed++; 3087 SYN_CACHE_PUT(sc); 3088 return (so); 3089 3090 resetandabort: 3091 (void) tcp_respond(NULL, m, m, th, 3092 th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK); 3093 abort: 3094 if (so != NULL) 3095 (void) soabort(so); 3096 SYN_CACHE_PUT(sc); 3097 tcpstat.tcps_sc_aborted++; 3098 return ((struct socket *)(-1)); 3099 } 3100 3101 /* 3102 * This function is called when we get a RST for a 3103 * non-existant connection, so that we can see if the 3104 * connection is in the syn cache. If it is, zap it. 3105 */ 3106 3107 void 3108 syn_cache_reset(src, dst, th) 3109 struct sockaddr *src; 3110 struct sockaddr *dst; 3111 struct tcphdr *th; 3112 { 3113 struct syn_cache *sc; 3114 struct syn_cache_head *scp; 3115 int s = splsoftnet(); 3116 3117 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3118 splx(s); 3119 return; 3120 } 3121 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3122 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3123 splx(s); 3124 return; 3125 } 3126 SYN_CACHE_RM(sc); 3127 splx(s); 3128 tcpstat.tcps_sc_reset++; 3129 SYN_CACHE_PUT(sc); 3130 } 3131 3132 void 3133 syn_cache_unreach(src, dst, th) 3134 struct sockaddr *src; 3135 struct sockaddr *dst; 3136 struct tcphdr *th; 3137 { 3138 struct syn_cache *sc; 3139 struct syn_cache_head *scp; 3140 int s; 3141 3142 s = splsoftnet(); 3143 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3144 splx(s); 3145 return; 3146 } 3147 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3148 if (ntohl (th->th_seq) != sc->sc_iss) { 3149 splx(s); 3150 return; 3151 } 3152 3153 /* 3154 * If we've rertransmitted 3 times and this is our second error, 3155 * we remove the entry. Otherwise, we allow it to continue on. 3156 * This prevents us from incorrectly nuking an entry during a 3157 * spurious network outage. 3158 * 3159 * See tcp_notify(). 3160 */ 3161 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3162 sc->sc_flags |= SCF_UNREACH; 3163 splx(s); 3164 return; 3165 } 3166 3167 SYN_CACHE_RM(sc); 3168 splx(s); 3169 tcpstat.tcps_sc_unreach++; 3170 SYN_CACHE_PUT(sc); 3171 } 3172 3173 /* 3174 * Given a LISTEN socket and an inbound SYN request, add 3175 * this to the syn cache, and send back a segment: 3176 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3177 * to the source. 3178 * 3179 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3180 * Doing so would require that we hold onto the data and deliver it 3181 * to the application. However, if we are the target of a SYN-flood 3182 * DoS attack, an attacker could send data which would eventually 3183 * consume all available buffer space if it were ACKed. By not ACKing 3184 * the data, we avoid this DoS scenario. 3185 */ 3186 3187 int 3188 syn_cache_add(src, dst, th, hlen, so, m, optp, optlen, oi) 3189 struct sockaddr *src; 3190 struct sockaddr *dst; 3191 struct tcphdr *th; 3192 unsigned int hlen; 3193 struct socket *so; 3194 struct mbuf *m; 3195 u_char *optp; 3196 int optlen; 3197 struct tcp_opt_info *oi; 3198 { 3199 struct tcpcb tb, *tp; 3200 long win; 3201 struct syn_cache *sc; 3202 struct syn_cache_head *scp; 3203 struct mbuf *ipopts; 3204 3205 tp = sototcpcb(so); 3206 3207 /* 3208 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3209 * 3210 * Note this check is performed in tcp_input() very early on. 3211 */ 3212 3213 /* 3214 * Initialize some local state. 3215 */ 3216 win = sbspace(&so->so_rcv); 3217 if (win > TCP_MAXWIN) 3218 win = TCP_MAXWIN; 3219 3220 if (src->sa_family == AF_INET) { 3221 /* 3222 * Remember the IP options, if any. 3223 */ 3224 ipopts = ip_srcroute(); 3225 } else 3226 ipopts = NULL; 3227 3228 if (optp) { 3229 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3230 tcp_dooptions(&tb, optp, optlen, th, oi); 3231 } else 3232 tb.t_flags = 0; 3233 3234 /* 3235 * See if we already have an entry for this connection. 3236 * If we do, resend the SYN,ACK. We do not count this 3237 * as a retransmission (XXX though maybe we should). 3238 */ 3239 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 3240 tcpstat.tcps_sc_dupesyn++; 3241 if (ipopts) { 3242 /* 3243 * If we were remembering a previous source route, 3244 * forget it and use the new one we've been given. 3245 */ 3246 if (sc->sc_ipopts) 3247 (void) m_free(sc->sc_ipopts); 3248 sc->sc_ipopts = ipopts; 3249 } 3250 sc->sc_timestamp = tb.ts_recent; 3251 if (syn_cache_respond(sc, m) == 0) { 3252 tcpstat.tcps_sndacks++; 3253 tcpstat.tcps_sndtotal++; 3254 } 3255 return (1); 3256 } 3257 3258 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 3259 if (sc == NULL) { 3260 if (ipopts) 3261 (void) m_free(ipopts); 3262 return (0); 3263 } 3264 3265 /* 3266 * Fill in the cache, and put the necessary IP and TCP 3267 * options into the reply. 3268 */ 3269 bzero(sc, sizeof(struct syn_cache)); 3270 bcopy(src, &sc->sc_src, src->sa_len); 3271 bcopy(dst, &sc->sc_dst, dst->sa_len); 3272 sc->sc_flags = 0; 3273 sc->sc_ipopts = ipopts; 3274 sc->sc_irs = th->th_seq; 3275 sc->sc_iss = tcp_new_iss(sc, sizeof(struct syn_cache), 0); 3276 sc->sc_peermaxseg = oi->maxseg; 3277 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? 3278 m->m_pkthdr.rcvif : NULL, 3279 sc->sc_src.sa.sa_family); 3280 sc->sc_win = win; 3281 sc->sc_timestamp = tb.ts_recent; 3282 if (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP)) 3283 sc->sc_flags |= SCF_TIMESTAMP; 3284 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3285 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3286 sc->sc_requested_s_scale = tb.requested_s_scale; 3287 sc->sc_request_r_scale = 0; 3288 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3289 TCP_MAXWIN << sc->sc_request_r_scale < 3290 so->so_rcv.sb_hiwat) 3291 sc->sc_request_r_scale++; 3292 } else { 3293 sc->sc_requested_s_scale = 15; 3294 sc->sc_request_r_scale = 15; 3295 } 3296 sc->sc_tp = tp; 3297 if (syn_cache_respond(sc, m) == 0) { 3298 syn_cache_insert(sc, tp); 3299 tcpstat.tcps_sndacks++; 3300 tcpstat.tcps_sndtotal++; 3301 } else { 3302 SYN_CACHE_PUT(sc); 3303 tcpstat.tcps_sc_dropped++; 3304 } 3305 return (1); 3306 } 3307 3308 int 3309 syn_cache_respond(sc, m) 3310 struct syn_cache *sc; 3311 struct mbuf *m; 3312 { 3313 struct route *ro; 3314 struct rtentry *rt; 3315 u_int8_t *optp; 3316 int optlen, error; 3317 u_int16_t tlen; 3318 struct ip *ip = NULL; 3319 #ifdef INET6 3320 struct ip6_hdr *ip6 = NULL; 3321 #endif 3322 struct tcphdr *th; 3323 u_int hlen; 3324 3325 switch (sc->sc_src.sa.sa_family) { 3326 case AF_INET: 3327 hlen = sizeof(struct ip); 3328 ro = &sc->sc_route4; 3329 break; 3330 #ifdef INET6 3331 case AF_INET6: 3332 hlen = sizeof(struct ip6_hdr); 3333 ro = (struct route *)&sc->sc_route6; 3334 break; 3335 #endif 3336 default: 3337 if (m) 3338 m_freem(m); 3339 return EAFNOSUPPORT; 3340 } 3341 3342 /* Compute the size of the TCP options. */ 3343 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3344 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3345 3346 tlen = hlen + sizeof(struct tcphdr) + optlen; 3347 3348 /* 3349 * Create the IP+TCP header from scratch. Reuse the received mbuf 3350 * if possible. 3351 */ 3352 if (m != NULL) { 3353 m_freem(m->m_next); 3354 m->m_next = NULL; 3355 MRESETDATA(m); 3356 } else { 3357 MGETHDR(m, M_DONTWAIT, MT_DATA); 3358 if (m == NULL) 3359 return (ENOBUFS); 3360 } 3361 3362 /* Fixup the mbuf. */ 3363 m->m_data += max_linkhdr; 3364 m->m_len = m->m_pkthdr.len = tlen; 3365 #ifdef IPSEC 3366 if (sc->sc_tp) { 3367 struct tcpcb *tp; 3368 struct socket *so; 3369 3370 tp = sc->sc_tp; 3371 if (tp->t_inpcb) 3372 so = tp->t_inpcb->inp_socket; 3373 #ifdef INET6 3374 else if (tp->t_in6pcb) 3375 so = tp->t_in6pcb->in6p_socket; 3376 #endif 3377 else 3378 so = NULL; 3379 /* use IPsec policy on listening socket, on SYN ACK */ 3380 ipsec_setsocket(m, so); 3381 } 3382 #endif 3383 memset(mtod(m, u_char *), 0, tlen); 3384 3385 switch (sc->sc_src.sa.sa_family) { 3386 case AF_INET: 3387 ip = mtod(m, struct ip *); 3388 ip->ip_dst = sc->sc_src.sin.sin_addr; 3389 ip->ip_src = sc->sc_dst.sin.sin_addr; 3390 ip->ip_p = IPPROTO_TCP; 3391 th = (struct tcphdr *)(ip + 1); 3392 th->th_dport = sc->sc_src.sin.sin_port; 3393 th->th_sport = sc->sc_dst.sin.sin_port; 3394 break; 3395 #ifdef INET6 3396 case AF_INET6: 3397 ip6 = mtod(m, struct ip6_hdr *); 3398 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3399 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3400 ip6->ip6_nxt = IPPROTO_TCP; 3401 /* ip6_plen will be updated in ip6_output() */ 3402 th = (struct tcphdr *)(ip6 + 1); 3403 th->th_dport = sc->sc_src.sin6.sin6_port; 3404 th->th_sport = sc->sc_dst.sin6.sin6_port; 3405 break; 3406 #endif 3407 default: 3408 th = NULL; 3409 } 3410 3411 th->th_seq = htonl(sc->sc_iss); 3412 th->th_ack = htonl(sc->sc_irs + 1); 3413 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 3414 th->th_flags = TH_SYN|TH_ACK; 3415 th->th_win = htons(sc->sc_win); 3416 /* th_sum already 0 */ 3417 /* th_urp already 0 */ 3418 3419 /* Tack on the TCP options. */ 3420 optp = (u_int8_t *)(th + 1); 3421 *optp++ = TCPOPT_MAXSEG; 3422 *optp++ = 4; 3423 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 3424 *optp++ = sc->sc_ourmaxseg & 0xff; 3425 3426 if (sc->sc_request_r_scale != 15) { 3427 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 3428 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 3429 sc->sc_request_r_scale); 3430 optp += 4; 3431 } 3432 3433 if (sc->sc_flags & SCF_TIMESTAMP) { 3434 u_int32_t *lp = (u_int32_t *)(optp); 3435 /* Form timestamp option as shown in appendix A of RFC 1323. */ 3436 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 3437 *lp++ = htonl(tcp_now); 3438 *lp = htonl(sc->sc_timestamp); 3439 optp += TCPOLEN_TSTAMP_APPA; 3440 } 3441 3442 /* Compute the packet's checksum. */ 3443 switch (sc->sc_src.sa.sa_family) { 3444 case AF_INET: 3445 ip->ip_len = htons(tlen - hlen); 3446 th->th_sum = 0; 3447 th->th_sum = in_cksum(m, tlen); 3448 break; 3449 #ifdef INET6 3450 case AF_INET6: 3451 ip6->ip6_plen = htons(tlen - hlen); 3452 th->th_sum = 0; 3453 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 3454 break; 3455 #endif 3456 } 3457 3458 /* 3459 * Fill in some straggling IP bits. Note the stack expects 3460 * ip_len to be in host order, for convenience. 3461 */ 3462 switch (sc->sc_src.sa.sa_family) { 3463 case AF_INET: 3464 ip->ip_len = tlen; 3465 ip->ip_ttl = ip_defttl; 3466 /* XXX tos? */ 3467 break; 3468 #ifdef INET6 3469 case AF_INET6: 3470 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 3471 ip6->ip6_vfc |= IPV6_VERSION; 3472 ip6->ip6_plen = htons(tlen - hlen); 3473 /* ip6_hlim will be initialized afterwards */ 3474 /* XXX flowlabel? */ 3475 break; 3476 #endif 3477 } 3478 3479 /* 3480 * If we're doing Path MTU discovery, we need to set DF unless 3481 * the route's MTU is locked. If we don't yet know the route, 3482 * look it up now. We will copy this reference to the inpcb 3483 * when we finish creating the connection. 3484 */ 3485 if ((rt = ro->ro_rt) == NULL || (rt->rt_flags & RTF_UP) == 0) { 3486 if (ro->ro_rt != NULL) { 3487 RTFREE(ro->ro_rt); 3488 ro->ro_rt = NULL; 3489 } 3490 bcopy(&sc->sc_src, &ro->ro_dst, sc->sc_src.sa.sa_len); 3491 rtalloc(ro); 3492 if ((rt = ro->ro_rt) == NULL) { 3493 m_freem(m); 3494 switch (sc->sc_src.sa.sa_family) { 3495 case AF_INET: 3496 ipstat.ips_noroute++; 3497 break; 3498 #ifdef INET6 3499 case AF_INET6: 3500 ip6stat.ip6s_noroute++; 3501 break; 3502 #endif 3503 } 3504 return (EHOSTUNREACH); 3505 } 3506 } 3507 3508 switch (sc->sc_src.sa.sa_family) { 3509 case AF_INET: 3510 if (ip_mtudisc != 0 && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0) 3511 ip->ip_off |= IP_DF; 3512 3513 /* ...and send it off! */ 3514 error = ip_output(m, sc->sc_ipopts, ro, 0, NULL); 3515 break; 3516 #ifdef INET6 3517 case AF_INET6: 3518 ip6->ip6_hlim = in6_selecthlim(NULL, 3519 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL); 3520 3521 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 3522 0, NULL, NULL); 3523 break; 3524 #endif 3525 default: 3526 error = EAFNOSUPPORT; 3527 break; 3528 } 3529 return (error); 3530 } 3531