1 /* $NetBSD: tcp_input.c,v 1.123 2001/03/20 20:07:51 thorpej Exp $ */ 2 3 /* 4 %%% portions-copyright-nrl-95 5 Portions of this software are Copyright 1995-1998 by Randall Atkinson, 6 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights 7 Reserved. All rights under this copyright have been assigned to the US 8 Naval Research Laboratory (NRL). The NRL Copyright Notice and License 9 Agreement Version 1.1 (January 17, 1995) applies to these portions of the 10 software. 11 You should have received a copy of the license with this software. If you 12 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>. 13 14 */ 15 16 /* 17 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 18 * All rights reserved. 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 1. Redistributions of source code must retain the above copyright 24 * notice, this list of conditions and the following disclaimer. 25 * 2. Redistributions in binary form must reproduce the above copyright 26 * notice, this list of conditions and the following disclaimer in the 27 * documentation and/or other materials provided with the distribution. 28 * 3. Neither the name of the project nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 45 /*- 46 * Copyright (c) 1997, 1998, 1999, 2001 The NetBSD Foundation, Inc. 47 * All rights reserved. 48 * 49 * This code is derived from software contributed to The NetBSD Foundation 50 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 51 * Facility, NASA Ames Research Center. 52 * 53 * Redistribution and use in source and binary forms, with or without 54 * modification, are permitted provided that the following conditions 55 * are met: 56 * 1. Redistributions of source code must retain the above copyright 57 * notice, this list of conditions and the following disclaimer. 58 * 2. Redistributions in binary form must reproduce the above copyright 59 * notice, this list of conditions and the following disclaimer in the 60 * documentation and/or other materials provided with the distribution. 61 * 3. All advertising materials mentioning features or use of this software 62 * must display the following acknowledgement: 63 * This product includes software developed by the NetBSD 64 * Foundation, Inc. and its contributors. 65 * 4. Neither the name of The NetBSD Foundation nor the names of its 66 * contributors may be used to endorse or promote products derived 67 * from this software without specific prior written permission. 68 * 69 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 70 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 71 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 72 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 73 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 74 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 75 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 76 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 77 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 78 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 79 * POSSIBILITY OF SUCH DAMAGE. 80 */ 81 82 /* 83 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 84 * The Regents of the University of California. All rights reserved. 85 * 86 * Redistribution and use in source and binary forms, with or without 87 * modification, are permitted provided that the following conditions 88 * are met: 89 * 1. Redistributions of source code must retain the above copyright 90 * notice, this list of conditions and the following disclaimer. 91 * 2. Redistributions in binary form must reproduce the above copyright 92 * notice, this list of conditions and the following disclaimer in the 93 * documentation and/or other materials provided with the distribution. 94 * 3. All advertising materials mentioning features or use of this software 95 * must display the following acknowledgement: 96 * This product includes software developed by the University of 97 * California, Berkeley and its contributors. 98 * 4. Neither the name of the University nor the names of its contributors 99 * may be used to endorse or promote products derived from this software 100 * without specific prior written permission. 101 * 102 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 103 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 104 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 105 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 106 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 107 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 108 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 109 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 110 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 111 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 112 * SUCH DAMAGE. 113 * 114 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 115 */ 116 117 /* 118 * TODO list for SYN cache stuff: 119 * 120 * Find room for a "state" field, which is needed to keep a 121 * compressed state for TIME_WAIT TCBs. It's been noted already 122 * that this is fairly important for very high-volume web and 123 * mail servers, which use a large number of short-lived 124 * connections. 125 */ 126 127 #include "opt_inet.h" 128 #include "opt_ipsec.h" 129 130 #include <sys/param.h> 131 #include <sys/systm.h> 132 #include <sys/malloc.h> 133 #include <sys/mbuf.h> 134 #include <sys/protosw.h> 135 #include <sys/socket.h> 136 #include <sys/socketvar.h> 137 #include <sys/errno.h> 138 #include <sys/syslog.h> 139 #include <sys/pool.h> 140 #include <sys/domain.h> 141 142 #include <net/if.h> 143 #include <net/route.h> 144 #include <net/if_types.h> 145 146 #include <netinet/in.h> 147 #include <netinet/in_systm.h> 148 #include <netinet/ip.h> 149 #include <netinet/in_pcb.h> 150 #include <netinet/ip_var.h> 151 152 #ifdef INET6 153 #ifndef INET 154 #include <netinet/in.h> 155 #endif 156 #include <netinet/ip6.h> 157 #include <netinet6/ip6_var.h> 158 #include <netinet6/in6_pcb.h> 159 #include <netinet6/ip6_var.h> 160 #include <netinet6/in6_var.h> 161 #include <netinet/icmp6.h> 162 #include <netinet6/nd6.h> 163 #endif 164 165 #ifdef PULLDOWN_TEST 166 #ifndef INET6 167 /* always need ip6.h for IP6_EXTHDR_GET */ 168 #include <netinet/ip6.h> 169 #endif 170 #endif 171 172 #include <netinet/tcp.h> 173 #include <netinet/tcp_fsm.h> 174 #include <netinet/tcp_seq.h> 175 #include <netinet/tcp_timer.h> 176 #include <netinet/tcp_var.h> 177 #include <netinet/tcpip.h> 178 #include <netinet/tcp_debug.h> 179 180 #include <machine/stdarg.h> 181 182 #ifdef IPSEC 183 #include <netinet6/ipsec.h> 184 #include <netkey/key.h> 185 #endif /*IPSEC*/ 186 #ifdef INET6 187 #include "faith.h" 188 #endif 189 190 int tcprexmtthresh = 3; 191 int tcp_log_refused; 192 193 static int tcp_rst_ppslim_count = 0; 194 static struct timeval tcp_rst_ppslim_last; 195 196 #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) 197 198 /* for modulo comparisons of timestamps */ 199 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 200 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 201 202 /* 203 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 204 */ 205 #ifdef INET6 206 #define ND6_HINT(tp) \ 207 do { \ 208 if (tp && tp->t_in6pcb && tp->t_family == AF_INET6 \ 209 && tp->t_in6pcb->in6p_route.ro_rt) { \ 210 nd6_nud_hint(tp->t_in6pcb->in6p_route.ro_rt, NULL, 0); \ 211 } \ 212 } while (0) 213 #else 214 #define ND6_HINT(tp) 215 #endif 216 217 /* 218 * Macro to compute ACK transmission behavior. Delay the ACK unless 219 * we have already delayed an ACK (must send an ACK every two segments). 220 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 221 * option is enabled. 222 */ 223 #define TCP_SETUP_ACK(tp, th) \ 224 do { \ 225 if ((tp)->t_flags & TF_DELACK || \ 226 (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \ 227 tp->t_flags |= TF_ACKNOW; \ 228 else \ 229 TCP_SET_DELACK(tp); \ 230 } while (0) 231 232 /* 233 * Convert TCP protocol fields to host order for easier processing. 234 */ 235 #define TCP_FIELDS_TO_HOST(th) \ 236 do { \ 237 NTOHL((th)->th_seq); \ 238 NTOHL((th)->th_ack); \ 239 NTOHS((th)->th_win); \ 240 NTOHS((th)->th_urp); \ 241 } while (0) 242 243 int 244 tcp_reass(tp, th, m, tlen) 245 struct tcpcb *tp; 246 struct tcphdr *th; 247 struct mbuf *m; 248 int *tlen; 249 { 250 struct ipqent *p, *q, *nq, *tiqe = NULL; 251 struct socket *so = NULL; 252 int pkt_flags; 253 tcp_seq pkt_seq; 254 unsigned pkt_len; 255 u_long rcvpartdupbyte = 0; 256 u_long rcvoobyte; 257 258 if (tp->t_inpcb) 259 so = tp->t_inpcb->inp_socket; 260 #ifdef INET6 261 else if (tp->t_in6pcb) 262 so = tp->t_in6pcb->in6p_socket; 263 #endif 264 265 TCP_REASS_LOCK_CHECK(tp); 266 267 /* 268 * Call with th==0 after become established to 269 * force pre-ESTABLISHED data up to user socket. 270 */ 271 if (th == 0) 272 goto present; 273 274 rcvoobyte = *tlen; 275 /* 276 * Copy these to local variables because the tcpiphdr 277 * gets munged while we are collapsing mbufs. 278 */ 279 pkt_seq = th->th_seq; 280 pkt_len = *tlen; 281 pkt_flags = th->th_flags; 282 /* 283 * Find a segment which begins after this one does. 284 */ 285 for (p = NULL, q = tp->segq.lh_first; q != NULL; q = nq) { 286 nq = q->ipqe_q.le_next; 287 /* 288 * If the received segment is just right after this 289 * fragment, merge the two together and then check 290 * for further overlaps. 291 */ 292 if (q->ipqe_seq + q->ipqe_len == pkt_seq) { 293 #ifdef TCPREASS_DEBUG 294 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n", 295 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 296 q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len); 297 #endif 298 pkt_len += q->ipqe_len; 299 pkt_flags |= q->ipqe_flags; 300 pkt_seq = q->ipqe_seq; 301 m_cat(q->ipqe_m, m); 302 m = q->ipqe_m; 303 goto free_ipqe; 304 } 305 /* 306 * If the received segment is completely past this 307 * fragment, we need to go the next fragment. 308 */ 309 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 310 p = q; 311 continue; 312 } 313 /* 314 * If the fragment is past the received segment, 315 * it (or any following) can't be concatenated. 316 */ 317 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) 318 break; 319 /* 320 * We've received all the data in this segment before. 321 * mark it as a duplicate and return. 322 */ 323 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) && 324 SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 325 tcpstat.tcps_rcvduppack++; 326 tcpstat.tcps_rcvdupbyte += pkt_len; 327 m_freem(m); 328 if (tiqe != NULL) 329 pool_put(&ipqent_pool, tiqe); 330 return (0); 331 } 332 /* 333 * Received segment completely overlaps this fragment 334 * so we drop the fragment (this keeps the temporal 335 * ordering of segments correct). 336 */ 337 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) && 338 SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 339 rcvpartdupbyte += q->ipqe_len; 340 m_freem(q->ipqe_m); 341 goto free_ipqe; 342 } 343 /* 344 * RX'ed segment extends past the end of the 345 * fragment. Drop the overlapping bytes. Then 346 * merge the fragment and segment then treat as 347 * a longer received packet. 348 */ 349 if (SEQ_LT(q->ipqe_seq, pkt_seq) 350 && SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 351 int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq; 352 #ifdef TCPREASS_DEBUG 353 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n", 354 tp, overlap, 355 pkt_seq, pkt_seq + pkt_len, pkt_len); 356 #endif 357 m_adj(m, overlap); 358 rcvpartdupbyte += overlap; 359 m_cat(q->ipqe_m, m); 360 m = q->ipqe_m; 361 pkt_seq = q->ipqe_seq; 362 pkt_len += q->ipqe_len - overlap; 363 rcvoobyte -= overlap; 364 goto free_ipqe; 365 } 366 /* 367 * RX'ed segment extends past the front of the 368 * fragment. Drop the overlapping bytes on the 369 * received packet. The packet will then be 370 * contatentated with this fragment a bit later. 371 */ 372 if (SEQ_GT(q->ipqe_seq, pkt_seq) 373 && SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) { 374 int overlap = pkt_seq + pkt_len - q->ipqe_seq; 375 #ifdef TCPREASS_DEBUG 376 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n", 377 tp, overlap, 378 pkt_seq, pkt_seq + pkt_len, pkt_len); 379 #endif 380 m_adj(m, -overlap); 381 pkt_len -= overlap; 382 rcvpartdupbyte += overlap; 383 rcvoobyte -= overlap; 384 } 385 /* 386 * If the received segment immediates precedes this 387 * fragment then tack the fragment onto this segment 388 * and reinsert the data. 389 */ 390 if (q->ipqe_seq == pkt_seq + pkt_len) { 391 #ifdef TCPREASS_DEBUG 392 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n", 393 tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len, 394 pkt_seq, pkt_seq + pkt_len, pkt_len); 395 #endif 396 pkt_len += q->ipqe_len; 397 pkt_flags |= q->ipqe_flags; 398 m_cat(m, q->ipqe_m); 399 LIST_REMOVE(q, ipqe_q); 400 LIST_REMOVE(q, ipqe_timeq); 401 if (tiqe == NULL) { 402 tiqe = q; 403 } else { 404 pool_put(&ipqent_pool, q); 405 } 406 break; 407 } 408 /* 409 * If the fragment is before the segment, remember it. 410 * When this loop is terminated, p will contain the 411 * pointer to fragment that is right before the received 412 * segment. 413 */ 414 if (SEQ_LEQ(q->ipqe_seq, pkt_seq)) 415 p = q; 416 417 continue; 418 419 /* 420 * This is a common operation. It also will allow 421 * to save doing a malloc/free in most instances. 422 */ 423 free_ipqe: 424 LIST_REMOVE(q, ipqe_q); 425 LIST_REMOVE(q, ipqe_timeq); 426 if (tiqe == NULL) { 427 tiqe = q; 428 } else { 429 pool_put(&ipqent_pool, q); 430 } 431 } 432 433 /* 434 * Allocate a new queue entry since the received segment did not 435 * collapse onto any other out-of-order block; thus we are allocating 436 * a new block. If it had collapsed, tiqe would not be NULL and 437 * we would be reusing it. 438 * XXX If we can't, just drop the packet. XXX 439 */ 440 if (tiqe == NULL) { 441 tiqe = pool_get(&ipqent_pool, PR_NOWAIT); 442 if (tiqe == NULL) { 443 tcpstat.tcps_rcvmemdrop++; 444 m_freem(m); 445 return (0); 446 } 447 } 448 449 /* 450 * Update the counters. 451 */ 452 tcpstat.tcps_rcvoopack++; 453 tcpstat.tcps_rcvoobyte += rcvoobyte; 454 if (rcvpartdupbyte) { 455 tcpstat.tcps_rcvpartduppack++; 456 tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte; 457 } 458 459 /* 460 * Insert the new fragment queue entry into both queues. 461 */ 462 tiqe->ipqe_m = m; 463 tiqe->ipqe_seq = pkt_seq; 464 tiqe->ipqe_len = pkt_len; 465 tiqe->ipqe_flags = pkt_flags; 466 if (p == NULL) { 467 LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q); 468 #ifdef TCPREASS_DEBUG 469 if (tiqe->ipqe_seq != tp->rcv_nxt) 470 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n", 471 tp, pkt_seq, pkt_seq + pkt_len, pkt_len); 472 #endif 473 } else { 474 LIST_INSERT_AFTER(p, tiqe, ipqe_q); 475 #ifdef TCPREASS_DEBUG 476 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n", 477 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 478 p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len); 479 #endif 480 } 481 482 LIST_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq); 483 484 present: 485 /* 486 * Present data to user, advancing rcv_nxt through 487 * completed sequence space. 488 */ 489 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 490 return (0); 491 q = tp->segq.lh_first; 492 if (q == NULL || q->ipqe_seq != tp->rcv_nxt) 493 return (0); 494 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len) 495 return (0); 496 497 tp->rcv_nxt += q->ipqe_len; 498 pkt_flags = q->ipqe_flags & TH_FIN; 499 ND6_HINT(tp); 500 501 LIST_REMOVE(q, ipqe_q); 502 LIST_REMOVE(q, ipqe_timeq); 503 if (so->so_state & SS_CANTRCVMORE) 504 m_freem(q->ipqe_m); 505 else 506 sbappend(&so->so_rcv, q->ipqe_m); 507 pool_put(&ipqent_pool, q); 508 sorwakeup(so); 509 return (pkt_flags); 510 } 511 512 #ifdef INET6 513 int 514 tcp6_input(mp, offp, proto) 515 struct mbuf **mp; 516 int *offp, proto; 517 { 518 struct mbuf *m = *mp; 519 520 /* 521 * draft-itojun-ipv6-tcp-to-anycast 522 * better place to put this in? 523 */ 524 if (m->m_flags & M_ANYCAST6) { 525 struct ip6_hdr *ip6; 526 if (m->m_len < sizeof(struct ip6_hdr)) { 527 if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { 528 tcpstat.tcps_rcvshort++; 529 return IPPROTO_DONE; 530 } 531 } 532 ip6 = mtod(m, struct ip6_hdr *); 533 icmp6_error(m, ICMP6_DST_UNREACH, 534 ICMP6_DST_UNREACH_ADDR, 535 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 536 return IPPROTO_DONE; 537 } 538 539 tcp_input(m, *offp, proto); 540 return IPPROTO_DONE; 541 } 542 #endif 543 544 /* 545 * TCP input routine, follows pages 65-76 of the 546 * protocol specification dated September, 1981 very closely. 547 */ 548 void 549 #if __STDC__ 550 tcp_input(struct mbuf *m, ...) 551 #else 552 tcp_input(m, va_alist) 553 struct mbuf *m; 554 #endif 555 { 556 int proto; 557 struct tcphdr *th; 558 struct ip *ip; 559 struct inpcb *inp; 560 #ifdef INET6 561 struct ip6_hdr *ip6; 562 struct in6pcb *in6p; 563 #endif 564 caddr_t optp = NULL; 565 int optlen = 0; 566 int len, tlen, toff, hdroptlen = 0; 567 struct tcpcb *tp = 0; 568 int tiflags; 569 struct socket *so = NULL; 570 int todrop, acked, ourfinisacked, needoutput = 0; 571 short ostate = 0; 572 int iss = 0; 573 u_long tiwin; 574 struct tcp_opt_info opti; 575 int off, iphlen; 576 va_list ap; 577 int af; /* af on the wire */ 578 struct mbuf *tcp_saveti = NULL; 579 580 va_start(ap, m); 581 toff = va_arg(ap, int); 582 proto = va_arg(ap, int); 583 va_end(ap); 584 585 tcpstat.tcps_rcvtotal++; 586 587 bzero(&opti, sizeof(opti)); 588 opti.ts_present = 0; 589 opti.maxseg = 0; 590 591 /* 592 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN. 593 * 594 * TCP is, by definition, unicast, so we reject all 595 * multicast outright. 596 * 597 * Note, there are additional src/dst address checks in 598 * the AF-specific code below. 599 */ 600 if (m->m_flags & (M_BCAST|M_MCAST)) { 601 /* XXX stat */ 602 goto drop; 603 } 604 #ifdef INET6 605 if (m->m_flags & M_ANYCAST6) { 606 /* XXX stat */ 607 goto drop; 608 } 609 #endif 610 611 /* 612 * Get IP and TCP header together in first mbuf. 613 * Note: IP leaves IP header in first mbuf. 614 */ 615 ip = mtod(m, struct ip *); 616 #ifdef INET6 617 ip6 = NULL; 618 #endif 619 switch (ip->ip_v) { 620 #ifdef INET 621 case 4: 622 af = AF_INET; 623 iphlen = sizeof(struct ip); 624 #ifndef PULLDOWN_TEST 625 /* would like to get rid of this... */ 626 if (toff > sizeof (struct ip)) { 627 ip_stripoptions(m, (struct mbuf *)0); 628 toff = sizeof(struct ip); 629 } 630 if (m->m_len < toff + sizeof (struct tcphdr)) { 631 if ((m = m_pullup(m, toff + sizeof (struct tcphdr))) == 0) { 632 tcpstat.tcps_rcvshort++; 633 return; 634 } 635 } 636 ip = mtod(m, struct ip *); 637 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 638 #else 639 ip = mtod(m, struct ip *); 640 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 641 sizeof(struct tcphdr)); 642 if (th == NULL) { 643 tcpstat.tcps_rcvshort++; 644 return; 645 } 646 #endif 647 648 /* 649 * Make sure destination address is not multicast. 650 * Source address checked in ip_input(). 651 */ 652 if (IN_MULTICAST(ip->ip_dst.s_addr)) { 653 /* XXX stat */ 654 goto drop; 655 } 656 657 /* We do the checksum after PCB lookup... */ 658 len = ip->ip_len; 659 tlen = len - toff; 660 break; 661 #endif 662 #ifdef INET6 663 case 6: 664 ip = NULL; 665 iphlen = sizeof(struct ip6_hdr); 666 af = AF_INET6; 667 #ifndef PULLDOWN_TEST 668 if (m->m_len < toff + sizeof(struct tcphdr)) { 669 m = m_pullup(m, toff + sizeof(struct tcphdr)); /*XXX*/ 670 if (m == NULL) { 671 tcpstat.tcps_rcvshort++; 672 return; 673 } 674 } 675 ip6 = mtod(m, struct ip6_hdr *); 676 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 677 #else 678 ip6 = mtod(m, struct ip6_hdr *); 679 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 680 sizeof(struct tcphdr)); 681 if (th == NULL) { 682 tcpstat.tcps_rcvshort++; 683 return; 684 } 685 #endif 686 687 /* Be proactive about malicious use of IPv4 mapped address */ 688 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 689 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 690 /* XXX stat */ 691 goto drop; 692 } 693 694 /* 695 * Be proactive about unspecified IPv6 address in source. 696 * As we use all-zero to indicate unbounded/unconnected pcb, 697 * unspecified IPv6 address can be used to confuse us. 698 * 699 * Note that packets with unspecified IPv6 destination is 700 * already dropped in ip6_input. 701 */ 702 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 703 /* XXX stat */ 704 goto drop; 705 } 706 707 /* 708 * Make sure destination address is not multicast. 709 * Source address checked in ip6_input(). 710 */ 711 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 712 /* XXX stat */ 713 goto drop; 714 } 715 716 /* We do the checksum after PCB lookup... */ 717 len = m->m_pkthdr.len; 718 tlen = len - toff; 719 break; 720 #endif 721 default: 722 m_freem(m); 723 return; 724 } 725 726 /* 727 * Check that TCP offset makes sense, 728 * pull out TCP options and adjust length. XXX 729 */ 730 off = th->th_off << 2; 731 if (off < sizeof (struct tcphdr) || off > tlen) { 732 tcpstat.tcps_rcvbadoff++; 733 goto drop; 734 } 735 tlen -= off; 736 737 /* 738 * tcp_input() has been modified to use tlen to mean the TCP data 739 * length throughout the function. Other functions can use 740 * m->m_pkthdr.len as the basis for calculating the TCP data length. 741 * rja 742 */ 743 744 if (off > sizeof (struct tcphdr)) { 745 #ifndef PULLDOWN_TEST 746 if (m->m_len < toff + off) { 747 if ((m = m_pullup(m, toff + off)) == 0) { 748 tcpstat.tcps_rcvshort++; 749 return; 750 } 751 switch (af) { 752 #ifdef INET 753 case AF_INET: 754 ip = mtod(m, struct ip *); 755 break; 756 #endif 757 #ifdef INET6 758 case AF_INET6: 759 ip6 = mtod(m, struct ip6_hdr *); 760 break; 761 #endif 762 } 763 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 764 } 765 #else 766 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off); 767 if (th == NULL) { 768 tcpstat.tcps_rcvshort++; 769 return; 770 } 771 /* 772 * NOTE: ip/ip6 will not be affected by m_pulldown() 773 * (as they're before toff) and we don't need to update those. 774 */ 775 #endif 776 optlen = off - sizeof (struct tcphdr); 777 optp = ((caddr_t)th) + sizeof(struct tcphdr); 778 /* 779 * Do quick retrieval of timestamp options ("options 780 * prediction?"). If timestamp is the only option and it's 781 * formatted as recommended in RFC 1323 appendix A, we 782 * quickly get the values now and not bother calling 783 * tcp_dooptions(), etc. 784 */ 785 if ((optlen == TCPOLEN_TSTAMP_APPA || 786 (optlen > TCPOLEN_TSTAMP_APPA && 787 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 788 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 789 (th->th_flags & TH_SYN) == 0) { 790 opti.ts_present = 1; 791 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 792 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 793 optp = NULL; /* we've parsed the options */ 794 } 795 } 796 tiflags = th->th_flags; 797 798 /* 799 * Locate pcb for segment. 800 */ 801 findpcb: 802 inp = NULL; 803 #ifdef INET6 804 in6p = NULL; 805 #endif 806 switch (af) { 807 #ifdef INET 808 case AF_INET: 809 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport, 810 ip->ip_dst, th->th_dport); 811 if (inp == 0) { 812 ++tcpstat.tcps_pcbhashmiss; 813 inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport); 814 } 815 #ifdef INET6 816 if (inp == 0) { 817 struct in6_addr s, d; 818 819 /* mapped addr case */ 820 bzero(&s, sizeof(s)); 821 s.s6_addr16[5] = htons(0xffff); 822 bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src)); 823 bzero(&d, sizeof(d)); 824 d.s6_addr16[5] = htons(0xffff); 825 bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst)); 826 in6p = in6_pcblookup_connect(&tcb6, &s, th->th_sport, 827 &d, th->th_dport, 0); 828 if (in6p == 0) { 829 ++tcpstat.tcps_pcbhashmiss; 830 in6p = in6_pcblookup_bind(&tcb6, &d, 831 th->th_dport, 0); 832 } 833 } 834 #endif 835 #ifndef INET6 836 if (inp == 0) 837 #else 838 if (inp == 0 && in6p == 0) 839 #endif 840 { 841 ++tcpstat.tcps_noport; 842 if (tcp_log_refused && (tiflags & TH_SYN)) { 843 #ifndef INET6 844 char src[4*sizeof "123"]; 845 char dst[4*sizeof "123"]; 846 #else 847 char src[INET6_ADDRSTRLEN]; 848 char dst[INET6_ADDRSTRLEN]; 849 #endif 850 if (ip) { 851 strcpy(src, inet_ntoa(ip->ip_src)); 852 strcpy(dst, inet_ntoa(ip->ip_dst)); 853 } 854 #ifdef INET6 855 else if (ip6) { 856 strcpy(src, ip6_sprintf(&ip6->ip6_src)); 857 strcpy(dst, ip6_sprintf(&ip6->ip6_dst)); 858 } 859 #endif 860 else { 861 strcpy(src, "(unknown)"); 862 strcpy(dst, "(unknown)"); 863 } 864 log(LOG_INFO, 865 "Connection attempt to TCP %s:%d from %s:%d\n", 866 dst, ntohs(th->th_dport), 867 src, ntohs(th->th_sport)); 868 } 869 TCP_FIELDS_TO_HOST(th); 870 goto dropwithreset_ratelim; 871 } 872 #ifdef IPSEC 873 if (inp && ipsec4_in_reject(m, inp)) { 874 ipsecstat.in_polvio++; 875 goto drop; 876 } 877 #ifdef INET6 878 else if (in6p && ipsec4_in_reject_so(m, in6p->in6p_socket)) { 879 ipsecstat.in_polvio++; 880 goto drop; 881 } 882 #endif 883 #endif /*IPSEC*/ 884 break; 885 #endif /*INET*/ 886 #ifdef INET6 887 case AF_INET6: 888 { 889 int faith; 890 891 #if defined(NFAITH) && NFAITH > 0 892 if (m->m_pkthdr.rcvif 893 && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { 894 faith = 1; 895 } else 896 faith = 0; 897 #else 898 faith = 0; 899 #endif 900 in6p = in6_pcblookup_connect(&tcb6, &ip6->ip6_src, th->th_sport, 901 &ip6->ip6_dst, th->th_dport, faith); 902 if (in6p == NULL) { 903 ++tcpstat.tcps_pcbhashmiss; 904 in6p = in6_pcblookup_bind(&tcb6, &ip6->ip6_dst, 905 th->th_dport, faith); 906 } 907 if (in6p == NULL) { 908 ++tcpstat.tcps_noport; 909 TCP_FIELDS_TO_HOST(th); 910 goto dropwithreset_ratelim; 911 } 912 #ifdef IPSEC 913 if (ipsec6_in_reject(m, in6p)) { 914 ipsec6stat.in_polvio++; 915 goto drop; 916 } 917 #endif /*IPSEC*/ 918 break; 919 } 920 #endif 921 } 922 923 /* 924 * If the state is CLOSED (i.e., TCB does not exist) then 925 * all data in the incoming segment is discarded. 926 * If the TCB exists but is in CLOSED state, it is embryonic, 927 * but should either do a listen or a connect soon. 928 */ 929 tp = NULL; 930 so = NULL; 931 if (inp) { 932 tp = intotcpcb(inp); 933 so = inp->inp_socket; 934 } 935 #ifdef INET6 936 else if (in6p) { 937 tp = in6totcpcb(in6p); 938 so = in6p->in6p_socket; 939 } 940 #endif 941 if (tp == 0) { 942 TCP_FIELDS_TO_HOST(th); 943 goto dropwithreset_ratelim; 944 } 945 if (tp->t_state == TCPS_CLOSED) 946 goto drop; 947 948 /* 949 * Checksum extended TCP header and data. 950 */ 951 switch (af) { 952 #ifdef INET 953 case AF_INET: 954 #ifndef PULLDOWN_TEST 955 { 956 struct ipovly *ipov; 957 ipov = (struct ipovly *)ip; 958 bzero(ipov->ih_x1, sizeof ipov->ih_x1); 959 ipov->ih_len = htons(tlen + off); 960 961 if (in_cksum(m, len) != 0) { 962 tcpstat.tcps_rcvbadsum++; 963 goto drop; 964 } 965 } 966 #else 967 if (in4_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) { 968 tcpstat.tcps_rcvbadsum++; 969 goto drop; 970 } 971 #endif 972 break; 973 #endif 974 975 #ifdef INET6 976 case AF_INET6: 977 if (in6_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) { 978 tcpstat.tcps_rcvbadsum++; 979 goto drop; 980 } 981 break; 982 #endif 983 } 984 985 TCP_FIELDS_TO_HOST(th); 986 987 /* Unscale the window into a 32-bit value. */ 988 if ((tiflags & TH_SYN) == 0) 989 tiwin = th->th_win << tp->snd_scale; 990 else 991 tiwin = th->th_win; 992 993 #ifdef INET6 994 /* save packet options if user wanted */ 995 if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) { 996 if (in6p->in6p_options) { 997 m_freem(in6p->in6p_options); 998 in6p->in6p_options = 0; 999 } 1000 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m); 1001 } 1002 #endif 1003 1004 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 1005 union syn_cache_sa src; 1006 union syn_cache_sa dst; 1007 1008 bzero(&src, sizeof(src)); 1009 bzero(&dst, sizeof(dst)); 1010 switch (af) { 1011 #ifdef INET 1012 case AF_INET: 1013 src.sin.sin_len = sizeof(struct sockaddr_in); 1014 src.sin.sin_family = AF_INET; 1015 src.sin.sin_addr = ip->ip_src; 1016 src.sin.sin_port = th->th_sport; 1017 1018 dst.sin.sin_len = sizeof(struct sockaddr_in); 1019 dst.sin.sin_family = AF_INET; 1020 dst.sin.sin_addr = ip->ip_dst; 1021 dst.sin.sin_port = th->th_dport; 1022 break; 1023 #endif 1024 #ifdef INET6 1025 case AF_INET6: 1026 src.sin6.sin6_len = sizeof(struct sockaddr_in6); 1027 src.sin6.sin6_family = AF_INET6; 1028 src.sin6.sin6_addr = ip6->ip6_src; 1029 src.sin6.sin6_port = th->th_sport; 1030 1031 dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 1032 dst.sin6.sin6_family = AF_INET6; 1033 dst.sin6.sin6_addr = ip6->ip6_dst; 1034 dst.sin6.sin6_port = th->th_dport; 1035 break; 1036 #endif /* INET6 */ 1037 default: 1038 goto badsyn; /*sanity*/ 1039 } 1040 1041 if (so->so_options & SO_DEBUG) { 1042 ostate = tp->t_state; 1043 1044 tcp_saveti = NULL; 1045 if (iphlen + sizeof(struct tcphdr) > MHLEN) 1046 goto nosave; 1047 1048 if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) { 1049 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT); 1050 if (!tcp_saveti) 1051 goto nosave; 1052 } else { 1053 MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER); 1054 if (!tcp_saveti) 1055 goto nosave; 1056 tcp_saveti->m_len = iphlen; 1057 m_copydata(m, 0, iphlen, 1058 mtod(tcp_saveti, caddr_t)); 1059 } 1060 1061 if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) { 1062 m_freem(tcp_saveti); 1063 tcp_saveti = NULL; 1064 } else { 1065 tcp_saveti->m_len += sizeof(struct tcphdr); 1066 bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen, 1067 sizeof(struct tcphdr)); 1068 } 1069 if (tcp_saveti) { 1070 /* 1071 * need to recover version # field, which was 1072 * overwritten on ip_cksum computation. 1073 */ 1074 struct ip *sip; 1075 sip = mtod(tcp_saveti, struct ip *); 1076 switch (af) { 1077 #ifdef INET 1078 case AF_INET: 1079 sip->ip_v = 4; 1080 break; 1081 #endif 1082 #ifdef INET6 1083 case AF_INET6: 1084 sip->ip_v = 6; 1085 break; 1086 #endif 1087 } 1088 } 1089 nosave:; 1090 } 1091 if (so->so_options & SO_ACCEPTCONN) { 1092 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 1093 if (tiflags & TH_RST) { 1094 syn_cache_reset(&src.sa, &dst.sa, th); 1095 } else if ((tiflags & (TH_ACK|TH_SYN)) == 1096 (TH_ACK|TH_SYN)) { 1097 /* 1098 * Received a SYN,ACK. This should 1099 * never happen while we are in 1100 * LISTEN. Send an RST. 1101 */ 1102 goto badsyn; 1103 } else if (tiflags & TH_ACK) { 1104 so = syn_cache_get(&src.sa, &dst.sa, 1105 th, toff, tlen, so, m); 1106 if (so == NULL) { 1107 /* 1108 * We don't have a SYN for 1109 * this ACK; send an RST. 1110 */ 1111 goto badsyn; 1112 } else if (so == 1113 (struct socket *)(-1)) { 1114 /* 1115 * We were unable to create 1116 * the connection. If the 1117 * 3-way handshake was 1118 * completed, and RST has 1119 * been sent to the peer. 1120 * Since the mbuf might be 1121 * in use for the reply, 1122 * do not free it. 1123 */ 1124 m = NULL; 1125 } else { 1126 /* 1127 * We have created a 1128 * full-blown connection. 1129 */ 1130 tp = NULL; 1131 inp = NULL; 1132 #ifdef INET6 1133 in6p = NULL; 1134 #endif 1135 switch (so->so_proto->pr_domain->dom_family) { 1136 #ifdef INET 1137 case AF_INET: 1138 inp = sotoinpcb(so); 1139 tp = intotcpcb(inp); 1140 break; 1141 #endif 1142 #ifdef INET6 1143 case AF_INET6: 1144 in6p = sotoin6pcb(so); 1145 tp = in6totcpcb(in6p); 1146 break; 1147 #endif 1148 } 1149 if (tp == NULL) 1150 goto badsyn; /*XXX*/ 1151 tiwin <<= tp->snd_scale; 1152 goto after_listen; 1153 } 1154 } else { 1155 /* 1156 * None of RST, SYN or ACK was set. 1157 * This is an invalid packet for a 1158 * TCB in LISTEN state. Send a RST. 1159 */ 1160 goto badsyn; 1161 } 1162 } else { 1163 /* 1164 * Received a SYN. 1165 */ 1166 1167 /* 1168 * LISTEN socket received a SYN 1169 * from itself? This can't possibly 1170 * be valid; drop the packet. 1171 */ 1172 if (th->th_sport == th->th_dport) { 1173 int i; 1174 1175 switch (af) { 1176 #ifdef INET 1177 case AF_INET: 1178 i = in_hosteq(ip->ip_src, ip->ip_dst); 1179 break; 1180 #endif 1181 #ifdef INET6 1182 case AF_INET6: 1183 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst); 1184 break; 1185 #endif 1186 default: 1187 i = 1; 1188 } 1189 if (i) { 1190 tcpstat.tcps_badsyn++; 1191 goto drop; 1192 } 1193 } 1194 1195 /* 1196 * SYN looks ok; create compressed TCP 1197 * state for it. 1198 */ 1199 if (so->so_qlen <= so->so_qlimit && 1200 syn_cache_add(&src.sa, &dst.sa, th, tlen, 1201 so, m, optp, optlen, &opti)) 1202 m = NULL; 1203 } 1204 goto drop; 1205 } 1206 } 1207 1208 after_listen: 1209 #ifdef DIAGNOSTIC 1210 /* 1211 * Should not happen now that all embryonic connections 1212 * are handled with compressed state. 1213 */ 1214 if (tp->t_state == TCPS_LISTEN) 1215 panic("tcp_input: TCPS_LISTEN"); 1216 #endif 1217 1218 /* 1219 * Segment received on connection. 1220 * Reset idle time and keep-alive timer. 1221 */ 1222 tp->t_idle = 0; 1223 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1224 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1225 1226 /* 1227 * Process options. 1228 */ 1229 if (optp) 1230 tcp_dooptions(tp, optp, optlen, th, &opti); 1231 1232 /* 1233 * Header prediction: check for the two common cases 1234 * of a uni-directional data xfer. If the packet has 1235 * no control flags, is in-sequence, the window didn't 1236 * change and we're not retransmitting, it's a 1237 * candidate. If the length is zero and the ack moved 1238 * forward, we're the sender side of the xfer. Just 1239 * free the data acked & wake any higher level process 1240 * that was blocked waiting for space. If the length 1241 * is non-zero and the ack didn't move, we're the 1242 * receiver side. If we're getting packets in-order 1243 * (the reassembly queue is empty), add the data to 1244 * the socket buffer and note that we need a delayed ack. 1245 */ 1246 if (tp->t_state == TCPS_ESTABLISHED && 1247 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1248 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1249 th->th_seq == tp->rcv_nxt && 1250 tiwin && tiwin == tp->snd_wnd && 1251 tp->snd_nxt == tp->snd_max) { 1252 1253 /* 1254 * If last ACK falls within this segment's sequence numbers, 1255 * record the timestamp. 1256 */ 1257 if (opti.ts_present && 1258 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1259 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen)) { 1260 tp->ts_recent_age = TCP_TIMESTAMP(tp); 1261 tp->ts_recent = opti.ts_val; 1262 } 1263 1264 if (tlen == 0) { 1265 if (SEQ_GT(th->th_ack, tp->snd_una) && 1266 SEQ_LEQ(th->th_ack, tp->snd_max) && 1267 tp->snd_cwnd >= tp->snd_wnd && 1268 tp->t_dupacks < tcprexmtthresh) { 1269 /* 1270 * this is a pure ack for outstanding data. 1271 */ 1272 ++tcpstat.tcps_predack; 1273 if (opti.ts_present && opti.ts_ecr) 1274 tcp_xmit_timer(tp, 1275 TCP_TIMESTAMP(tp) - opti.ts_ecr + 1); 1276 else if (tp->t_rtt && 1277 SEQ_GT(th->th_ack, tp->t_rtseq)) 1278 tcp_xmit_timer(tp, tp->t_rtt); 1279 acked = th->th_ack - tp->snd_una; 1280 tcpstat.tcps_rcvackpack++; 1281 tcpstat.tcps_rcvackbyte += acked; 1282 ND6_HINT(tp); 1283 sbdrop(&so->so_snd, acked); 1284 /* 1285 * We want snd_recover to track snd_una to 1286 * avoid sequence wraparound problems for 1287 * very large transfers. 1288 */ 1289 tp->snd_una = tp->snd_recover = th->th_ack; 1290 m_freem(m); 1291 1292 /* 1293 * If all outstanding data are acked, stop 1294 * retransmit timer, otherwise restart timer 1295 * using current (possibly backed-off) value. 1296 * If process is waiting for space, 1297 * wakeup/selwakeup/signal. If data 1298 * are ready to send, let tcp_output 1299 * decide between more output or persist. 1300 */ 1301 if (tp->snd_una == tp->snd_max) 1302 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1303 else if (TCP_TIMER_ISARMED(tp, 1304 TCPT_PERSIST) == 0) 1305 TCP_TIMER_ARM(tp, TCPT_REXMT, 1306 tp->t_rxtcur); 1307 1308 sowwakeup(so); 1309 if (so->so_snd.sb_cc) 1310 (void) tcp_output(tp); 1311 if (tcp_saveti) 1312 m_freem(tcp_saveti); 1313 return; 1314 } 1315 } else if (th->th_ack == tp->snd_una && 1316 tp->segq.lh_first == NULL && 1317 tlen <= sbspace(&so->so_rcv)) { 1318 /* 1319 * this is a pure, in-sequence data packet 1320 * with nothing on the reassembly queue and 1321 * we have enough buffer space to take it. 1322 */ 1323 ++tcpstat.tcps_preddat; 1324 tp->rcv_nxt += tlen; 1325 tcpstat.tcps_rcvpack++; 1326 tcpstat.tcps_rcvbyte += tlen; 1327 ND6_HINT(tp); 1328 /* 1329 * Drop TCP, IP headers and TCP options then add data 1330 * to socket buffer. 1331 */ 1332 m_adj(m, toff + off); 1333 sbappend(&so->so_rcv, m); 1334 sorwakeup(so); 1335 TCP_SETUP_ACK(tp, th); 1336 if (tp->t_flags & TF_ACKNOW) 1337 (void) tcp_output(tp); 1338 if (tcp_saveti) 1339 m_freem(tcp_saveti); 1340 return; 1341 } 1342 } 1343 1344 /* 1345 * Compute mbuf offset to TCP data segment. 1346 */ 1347 hdroptlen = toff + off; 1348 1349 /* 1350 * Calculate amount of space in receive window, 1351 * and then do TCP input processing. 1352 * Receive window is amount of space in rcv queue, 1353 * but not less than advertised window. 1354 */ 1355 { int win; 1356 1357 win = sbspace(&so->so_rcv); 1358 if (win < 0) 1359 win = 0; 1360 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1361 } 1362 1363 switch (tp->t_state) { 1364 1365 /* 1366 * If the state is SYN_SENT: 1367 * if seg contains an ACK, but not for our SYN, drop the input. 1368 * if seg contains a RST, then drop the connection. 1369 * if seg does not contain SYN, then drop it. 1370 * Otherwise this is an acceptable SYN segment 1371 * initialize tp->rcv_nxt and tp->irs 1372 * if seg contains ack then advance tp->snd_una 1373 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1374 * arrange for segment to be acked (eventually) 1375 * continue processing rest of data/controls, beginning with URG 1376 */ 1377 case TCPS_SYN_SENT: 1378 if ((tiflags & TH_ACK) && 1379 (SEQ_LEQ(th->th_ack, tp->iss) || 1380 SEQ_GT(th->th_ack, tp->snd_max))) 1381 goto dropwithreset; 1382 if (tiflags & TH_RST) { 1383 if (tiflags & TH_ACK) 1384 tp = tcp_drop(tp, ECONNREFUSED); 1385 goto drop; 1386 } 1387 if ((tiflags & TH_SYN) == 0) 1388 goto drop; 1389 if (tiflags & TH_ACK) { 1390 tp->snd_una = tp->snd_recover = th->th_ack; 1391 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1392 tp->snd_nxt = tp->snd_una; 1393 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1394 } 1395 tp->irs = th->th_seq; 1396 tcp_rcvseqinit(tp); 1397 tp->t_flags |= TF_ACKNOW; 1398 tcp_mss_from_peer(tp, opti.maxseg); 1399 1400 /* 1401 * Initialize the initial congestion window. If we 1402 * had to retransmit the SYN, we must initialize cwnd 1403 * to 1 segment (i.e. the Loss Window). 1404 */ 1405 if (tp->t_flags & TF_SYN_REXMT) 1406 tp->snd_cwnd = tp->t_peermss; 1407 else 1408 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, 1409 tp->t_peermss); 1410 1411 tcp_rmx_rtt(tp); 1412 if (tiflags & TH_ACK) { 1413 tcpstat.tcps_connects++; 1414 soisconnected(so); 1415 tcp_established(tp); 1416 /* Do window scaling on this connection? */ 1417 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1418 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1419 tp->snd_scale = tp->requested_s_scale; 1420 tp->rcv_scale = tp->request_r_scale; 1421 } 1422 TCP_REASS_LOCK(tp); 1423 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1424 TCP_REASS_UNLOCK(tp); 1425 /* 1426 * if we didn't have to retransmit the SYN, 1427 * use its rtt as our initial srtt & rtt var. 1428 */ 1429 if (tp->t_rtt) 1430 tcp_xmit_timer(tp, tp->t_rtt); 1431 } else 1432 tp->t_state = TCPS_SYN_RECEIVED; 1433 1434 /* 1435 * Advance th->th_seq to correspond to first data byte. 1436 * If data, trim to stay within window, 1437 * dropping FIN if necessary. 1438 */ 1439 th->th_seq++; 1440 if (tlen > tp->rcv_wnd) { 1441 todrop = tlen - tp->rcv_wnd; 1442 m_adj(m, -todrop); 1443 tlen = tp->rcv_wnd; 1444 tiflags &= ~TH_FIN; 1445 tcpstat.tcps_rcvpackafterwin++; 1446 tcpstat.tcps_rcvbyteafterwin += todrop; 1447 } 1448 tp->snd_wl1 = th->th_seq - 1; 1449 tp->rcv_up = th->th_seq; 1450 goto step6; 1451 1452 /* 1453 * If the state is SYN_RECEIVED: 1454 * If seg contains an ACK, but not for our SYN, drop the input 1455 * and generate an RST. See page 36, rfc793 1456 */ 1457 case TCPS_SYN_RECEIVED: 1458 if ((tiflags & TH_ACK) && 1459 (SEQ_LEQ(th->th_ack, tp->iss) || 1460 SEQ_GT(th->th_ack, tp->snd_max))) 1461 goto dropwithreset; 1462 break; 1463 } 1464 1465 /* 1466 * States other than LISTEN or SYN_SENT. 1467 * First check timestamp, if present. 1468 * Then check that at least some bytes of segment are within 1469 * receive window. If segment begins before rcv_nxt, 1470 * drop leading data (and SYN); if nothing left, just ack. 1471 * 1472 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1473 * and it's less than ts_recent, drop it. 1474 */ 1475 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1476 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1477 1478 /* Check to see if ts_recent is over 24 days old. */ 1479 if ((int)(TCP_TIMESTAMP(tp) - tp->ts_recent_age) > 1480 TCP_PAWS_IDLE) { 1481 /* 1482 * Invalidate ts_recent. If this segment updates 1483 * ts_recent, the age will be reset later and ts_recent 1484 * will get a valid value. If it does not, setting 1485 * ts_recent to zero will at least satisfy the 1486 * requirement that zero be placed in the timestamp 1487 * echo reply when ts_recent isn't valid. The 1488 * age isn't reset until we get a valid ts_recent 1489 * because we don't want out-of-order segments to be 1490 * dropped when ts_recent is old. 1491 */ 1492 tp->ts_recent = 0; 1493 } else { 1494 tcpstat.tcps_rcvduppack++; 1495 tcpstat.tcps_rcvdupbyte += tlen; 1496 tcpstat.tcps_pawsdrop++; 1497 goto dropafterack; 1498 } 1499 } 1500 1501 todrop = tp->rcv_nxt - th->th_seq; 1502 if (todrop > 0) { 1503 if (tiflags & TH_SYN) { 1504 tiflags &= ~TH_SYN; 1505 th->th_seq++; 1506 if (th->th_urp > 1) 1507 th->th_urp--; 1508 else { 1509 tiflags &= ~TH_URG; 1510 th->th_urp = 0; 1511 } 1512 todrop--; 1513 } 1514 if (todrop > tlen || 1515 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1516 /* 1517 * Any valid FIN must be to the left of the window. 1518 * At this point the FIN must be a duplicate or 1519 * out of sequence; drop it. 1520 */ 1521 tiflags &= ~TH_FIN; 1522 /* 1523 * Send an ACK to resynchronize and drop any data. 1524 * But keep on processing for RST or ACK. 1525 */ 1526 tp->t_flags |= TF_ACKNOW; 1527 todrop = tlen; 1528 tcpstat.tcps_rcvdupbyte += todrop; 1529 tcpstat.tcps_rcvduppack++; 1530 } else { 1531 tcpstat.tcps_rcvpartduppack++; 1532 tcpstat.tcps_rcvpartdupbyte += todrop; 1533 } 1534 hdroptlen += todrop; /*drop from head afterwards*/ 1535 th->th_seq += todrop; 1536 tlen -= todrop; 1537 if (th->th_urp > todrop) 1538 th->th_urp -= todrop; 1539 else { 1540 tiflags &= ~TH_URG; 1541 th->th_urp = 0; 1542 } 1543 } 1544 1545 /* 1546 * If new data are received on a connection after the 1547 * user processes are gone, then RST the other end. 1548 */ 1549 if ((so->so_state & SS_NOFDREF) && 1550 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1551 tp = tcp_close(tp); 1552 tcpstat.tcps_rcvafterclose++; 1553 goto dropwithreset; 1554 } 1555 1556 /* 1557 * If segment ends after window, drop trailing data 1558 * (and PUSH and FIN); if nothing left, just ACK. 1559 */ 1560 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1561 if (todrop > 0) { 1562 tcpstat.tcps_rcvpackafterwin++; 1563 if (todrop >= tlen) { 1564 tcpstat.tcps_rcvbyteafterwin += tlen; 1565 /* 1566 * If a new connection request is received 1567 * while in TIME_WAIT, drop the old connection 1568 * and start over if the sequence numbers 1569 * are above the previous ones. 1570 */ 1571 if (tiflags & TH_SYN && 1572 tp->t_state == TCPS_TIME_WAIT && 1573 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1574 iss = tcp_new_iss(tp, tp->snd_nxt); 1575 tp = tcp_close(tp); 1576 goto findpcb; 1577 } 1578 /* 1579 * If window is closed can only take segments at 1580 * window edge, and have to drop data and PUSH from 1581 * incoming segments. Continue processing, but 1582 * remember to ack. Otherwise, drop segment 1583 * and ack. 1584 */ 1585 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1586 tp->t_flags |= TF_ACKNOW; 1587 tcpstat.tcps_rcvwinprobe++; 1588 } else 1589 goto dropafterack; 1590 } else 1591 tcpstat.tcps_rcvbyteafterwin += todrop; 1592 m_adj(m, -todrop); 1593 tlen -= todrop; 1594 tiflags &= ~(TH_PUSH|TH_FIN); 1595 } 1596 1597 /* 1598 * If last ACK falls within this segment's sequence numbers, 1599 * and the timestamp is newer, record it. 1600 */ 1601 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1602 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1603 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen + 1604 ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 1605 tp->ts_recent_age = TCP_TIMESTAMP(tp); 1606 tp->ts_recent = opti.ts_val; 1607 } 1608 1609 /* 1610 * If the RST bit is set examine the state: 1611 * SYN_RECEIVED STATE: 1612 * If passive open, return to LISTEN state. 1613 * If active open, inform user that connection was refused. 1614 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1615 * Inform user that connection was reset, and close tcb. 1616 * CLOSING, LAST_ACK, TIME_WAIT STATES 1617 * Close the tcb. 1618 */ 1619 if (tiflags&TH_RST) switch (tp->t_state) { 1620 1621 case TCPS_SYN_RECEIVED: 1622 so->so_error = ECONNREFUSED; 1623 goto close; 1624 1625 case TCPS_ESTABLISHED: 1626 case TCPS_FIN_WAIT_1: 1627 case TCPS_FIN_WAIT_2: 1628 case TCPS_CLOSE_WAIT: 1629 so->so_error = ECONNRESET; 1630 close: 1631 tp->t_state = TCPS_CLOSED; 1632 tcpstat.tcps_drops++; 1633 tp = tcp_close(tp); 1634 goto drop; 1635 1636 case TCPS_CLOSING: 1637 case TCPS_LAST_ACK: 1638 case TCPS_TIME_WAIT: 1639 tp = tcp_close(tp); 1640 goto drop; 1641 } 1642 1643 /* 1644 * If a SYN is in the window, then this is an 1645 * error and we send an RST and drop the connection. 1646 */ 1647 if (tiflags & TH_SYN) { 1648 tp = tcp_drop(tp, ECONNRESET); 1649 goto dropwithreset; 1650 } 1651 1652 /* 1653 * If the ACK bit is off we drop the segment and return. 1654 */ 1655 if ((tiflags & TH_ACK) == 0) { 1656 if (tp->t_flags & TF_ACKNOW) 1657 goto dropafterack; 1658 else 1659 goto drop; 1660 } 1661 1662 /* 1663 * Ack processing. 1664 */ 1665 switch (tp->t_state) { 1666 1667 /* 1668 * In SYN_RECEIVED state if the ack ACKs our SYN then enter 1669 * ESTABLISHED state and continue processing, otherwise 1670 * send an RST. 1671 */ 1672 case TCPS_SYN_RECEIVED: 1673 if (SEQ_GT(tp->snd_una, th->th_ack) || 1674 SEQ_GT(th->th_ack, tp->snd_max)) 1675 goto dropwithreset; 1676 tcpstat.tcps_connects++; 1677 soisconnected(so); 1678 tcp_established(tp); 1679 /* Do window scaling? */ 1680 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1681 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1682 tp->snd_scale = tp->requested_s_scale; 1683 tp->rcv_scale = tp->request_r_scale; 1684 } 1685 TCP_REASS_LOCK(tp); 1686 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1687 TCP_REASS_UNLOCK(tp); 1688 tp->snd_wl1 = th->th_seq - 1; 1689 /* fall into ... */ 1690 1691 /* 1692 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1693 * ACKs. If the ack is in the range 1694 * tp->snd_una < th->th_ack <= tp->snd_max 1695 * then advance tp->snd_una to th->th_ack and drop 1696 * data from the retransmission queue. If this ACK reflects 1697 * more up to date window information we update our window information. 1698 */ 1699 case TCPS_ESTABLISHED: 1700 case TCPS_FIN_WAIT_1: 1701 case TCPS_FIN_WAIT_2: 1702 case TCPS_CLOSE_WAIT: 1703 case TCPS_CLOSING: 1704 case TCPS_LAST_ACK: 1705 case TCPS_TIME_WAIT: 1706 1707 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1708 if (tlen == 0 && tiwin == tp->snd_wnd) { 1709 tcpstat.tcps_rcvdupack++; 1710 /* 1711 * If we have outstanding data (other than 1712 * a window probe), this is a completely 1713 * duplicate ack (ie, window info didn't 1714 * change), the ack is the biggest we've 1715 * seen and we've seen exactly our rexmt 1716 * threshhold of them, assume a packet 1717 * has been dropped and retransmit it. 1718 * Kludge snd_nxt & the congestion 1719 * window so we send only this one 1720 * packet. 1721 * 1722 * We know we're losing at the current 1723 * window size so do congestion avoidance 1724 * (set ssthresh to half the current window 1725 * and pull our congestion window back to 1726 * the new ssthresh). 1727 * 1728 * Dup acks mean that packets have left the 1729 * network (they're now cached at the receiver) 1730 * so bump cwnd by the amount in the receiver 1731 * to keep a constant cwnd packets in the 1732 * network. 1733 */ 1734 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 || 1735 th->th_ack != tp->snd_una) 1736 tp->t_dupacks = 0; 1737 else if (++tp->t_dupacks == tcprexmtthresh) { 1738 tcp_seq onxt = tp->snd_nxt; 1739 u_int win = 1740 min(tp->snd_wnd, tp->snd_cwnd) / 1741 2 / tp->t_segsz; 1742 if (tcp_do_newreno && SEQ_LT(th->th_ack, 1743 tp->snd_recover)) { 1744 /* 1745 * False fast retransmit after 1746 * timeout. Do not cut window. 1747 */ 1748 tp->snd_cwnd += tp->t_segsz; 1749 tp->t_dupacks = 0; 1750 (void) tcp_output(tp); 1751 goto drop; 1752 } 1753 1754 if (win < 2) 1755 win = 2; 1756 tp->snd_ssthresh = win * tp->t_segsz; 1757 tp->snd_recover = tp->snd_max; 1758 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1759 tp->t_rtt = 0; 1760 tp->snd_nxt = th->th_ack; 1761 tp->snd_cwnd = tp->t_segsz; 1762 (void) tcp_output(tp); 1763 tp->snd_cwnd = tp->snd_ssthresh + 1764 tp->t_segsz * tp->t_dupacks; 1765 if (SEQ_GT(onxt, tp->snd_nxt)) 1766 tp->snd_nxt = onxt; 1767 goto drop; 1768 } else if (tp->t_dupacks > tcprexmtthresh) { 1769 tp->snd_cwnd += tp->t_segsz; 1770 (void) tcp_output(tp); 1771 goto drop; 1772 } 1773 } else 1774 tp->t_dupacks = 0; 1775 break; 1776 } 1777 /* 1778 * If the congestion window was inflated to account 1779 * for the other side's cached packets, retract it. 1780 */ 1781 if (tcp_do_newreno == 0) { 1782 if (tp->t_dupacks >= tcprexmtthresh && 1783 tp->snd_cwnd > tp->snd_ssthresh) 1784 tp->snd_cwnd = tp->snd_ssthresh; 1785 tp->t_dupacks = 0; 1786 } else if (tp->t_dupacks >= tcprexmtthresh && 1787 tcp_newreno(tp, th) == 0) { 1788 tp->snd_cwnd = tp->snd_ssthresh; 1789 /* 1790 * Window inflation should have left us with approx. 1791 * snd_ssthresh outstanding data. But in case we 1792 * would be inclined to send a burst, better to do 1793 * it via the slow start mechanism. 1794 */ 1795 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 1796 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 1797 + tp->t_segsz; 1798 tp->t_dupacks = 0; 1799 } 1800 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1801 tcpstat.tcps_rcvacktoomuch++; 1802 goto dropafterack; 1803 } 1804 acked = th->th_ack - tp->snd_una; 1805 tcpstat.tcps_rcvackpack++; 1806 tcpstat.tcps_rcvackbyte += acked; 1807 1808 /* 1809 * If we have a timestamp reply, update smoothed 1810 * round trip time. If no timestamp is present but 1811 * transmit timer is running and timed sequence 1812 * number was acked, update smoothed round trip time. 1813 * Since we now have an rtt measurement, cancel the 1814 * timer backoff (cf., Phil Karn's retransmit alg.). 1815 * Recompute the initial retransmit timer. 1816 */ 1817 if (opti.ts_present && opti.ts_ecr) 1818 tcp_xmit_timer(tp, TCP_TIMESTAMP(tp) - opti.ts_ecr + 1); 1819 else if (tp->t_rtt && SEQ_GT(th->th_ack, tp->t_rtseq)) 1820 tcp_xmit_timer(tp,tp->t_rtt); 1821 1822 /* 1823 * If all outstanding data is acked, stop retransmit 1824 * timer and remember to restart (more output or persist). 1825 * If there is more data to be acked, restart retransmit 1826 * timer, using current (possibly backed-off) value. 1827 */ 1828 if (th->th_ack == tp->snd_max) { 1829 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1830 needoutput = 1; 1831 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1832 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1833 /* 1834 * When new data is acked, open the congestion window. 1835 * If the window gives us less than ssthresh packets 1836 * in flight, open exponentially (segsz per packet). 1837 * Otherwise open linearly: segsz per window 1838 * (segsz^2 / cwnd per packet), plus a constant 1839 * fraction of a packet (segsz/8) to help larger windows 1840 * open quickly enough. 1841 */ 1842 { 1843 u_int cw = tp->snd_cwnd; 1844 u_int incr = tp->t_segsz; 1845 1846 if (cw > tp->snd_ssthresh) 1847 incr = incr * incr / cw; 1848 if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover)) 1849 tp->snd_cwnd = min(cw + incr, 1850 TCP_MAXWIN << tp->snd_scale); 1851 } 1852 ND6_HINT(tp); 1853 if (acked > so->so_snd.sb_cc) { 1854 tp->snd_wnd -= so->so_snd.sb_cc; 1855 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1856 ourfinisacked = 1; 1857 } else { 1858 sbdrop(&so->so_snd, acked); 1859 tp->snd_wnd -= acked; 1860 ourfinisacked = 0; 1861 } 1862 sowwakeup(so); 1863 /* 1864 * We want snd_recover to track snd_una to 1865 * avoid sequence wraparound problems for 1866 * very large transfers. 1867 */ 1868 tp->snd_una = tp->snd_recover = th->th_ack; 1869 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1870 tp->snd_nxt = tp->snd_una; 1871 1872 switch (tp->t_state) { 1873 1874 /* 1875 * In FIN_WAIT_1 STATE in addition to the processing 1876 * for the ESTABLISHED state if our FIN is now acknowledged 1877 * then enter FIN_WAIT_2. 1878 */ 1879 case TCPS_FIN_WAIT_1: 1880 if (ourfinisacked) { 1881 /* 1882 * If we can't receive any more 1883 * data, then closing user can proceed. 1884 * Starting the timer is contrary to the 1885 * specification, but if we don't get a FIN 1886 * we'll hang forever. 1887 */ 1888 if (so->so_state & SS_CANTRCVMORE) { 1889 soisdisconnected(so); 1890 if (tcp_maxidle > 0) 1891 TCP_TIMER_ARM(tp, TCPT_2MSL, 1892 tcp_maxidle); 1893 } 1894 tp->t_state = TCPS_FIN_WAIT_2; 1895 } 1896 break; 1897 1898 /* 1899 * In CLOSING STATE in addition to the processing for 1900 * the ESTABLISHED state if the ACK acknowledges our FIN 1901 * then enter the TIME-WAIT state, otherwise ignore 1902 * the segment. 1903 */ 1904 case TCPS_CLOSING: 1905 if (ourfinisacked) { 1906 tp->t_state = TCPS_TIME_WAIT; 1907 tcp_canceltimers(tp); 1908 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1909 soisdisconnected(so); 1910 } 1911 break; 1912 1913 /* 1914 * In LAST_ACK, we may still be waiting for data to drain 1915 * and/or to be acked, as well as for the ack of our FIN. 1916 * If our FIN is now acknowledged, delete the TCB, 1917 * enter the closed state and return. 1918 */ 1919 case TCPS_LAST_ACK: 1920 if (ourfinisacked) { 1921 tp = tcp_close(tp); 1922 goto drop; 1923 } 1924 break; 1925 1926 /* 1927 * In TIME_WAIT state the only thing that should arrive 1928 * is a retransmission of the remote FIN. Acknowledge 1929 * it and restart the finack timer. 1930 */ 1931 case TCPS_TIME_WAIT: 1932 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1933 goto dropafterack; 1934 } 1935 } 1936 1937 step6: 1938 /* 1939 * Update window information. 1940 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1941 */ 1942 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 1943 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) || 1944 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) { 1945 /* keep track of pure window updates */ 1946 if (tlen == 0 && 1947 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1948 tcpstat.tcps_rcvwinupd++; 1949 tp->snd_wnd = tiwin; 1950 tp->snd_wl1 = th->th_seq; 1951 tp->snd_wl2 = th->th_ack; 1952 if (tp->snd_wnd > tp->max_sndwnd) 1953 tp->max_sndwnd = tp->snd_wnd; 1954 needoutput = 1; 1955 } 1956 1957 /* 1958 * Process segments with URG. 1959 */ 1960 if ((tiflags & TH_URG) && th->th_urp && 1961 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1962 /* 1963 * This is a kludge, but if we receive and accept 1964 * random urgent pointers, we'll crash in 1965 * soreceive. It's hard to imagine someone 1966 * actually wanting to send this much urgent data. 1967 */ 1968 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1969 th->th_urp = 0; /* XXX */ 1970 tiflags &= ~TH_URG; /* XXX */ 1971 goto dodata; /* XXX */ 1972 } 1973 /* 1974 * If this segment advances the known urgent pointer, 1975 * then mark the data stream. This should not happen 1976 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1977 * a FIN has been received from the remote side. 1978 * In these states we ignore the URG. 1979 * 1980 * According to RFC961 (Assigned Protocols), 1981 * the urgent pointer points to the last octet 1982 * of urgent data. We continue, however, 1983 * to consider it to indicate the first octet 1984 * of data past the urgent section as the original 1985 * spec states (in one of two places). 1986 */ 1987 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1988 tp->rcv_up = th->th_seq + th->th_urp; 1989 so->so_oobmark = so->so_rcv.sb_cc + 1990 (tp->rcv_up - tp->rcv_nxt) - 1; 1991 if (so->so_oobmark == 0) 1992 so->so_state |= SS_RCVATMARK; 1993 sohasoutofband(so); 1994 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1995 } 1996 /* 1997 * Remove out of band data so doesn't get presented to user. 1998 * This can happen independent of advancing the URG pointer, 1999 * but if two URG's are pending at once, some out-of-band 2000 * data may creep in... ick. 2001 */ 2002 if (th->th_urp <= (u_int16_t) tlen 2003 #ifdef SO_OOBINLINE 2004 && (so->so_options & SO_OOBINLINE) == 0 2005 #endif 2006 ) 2007 tcp_pulloutofband(so, th, m, hdroptlen); 2008 } else 2009 /* 2010 * If no out of band data is expected, 2011 * pull receive urgent pointer along 2012 * with the receive window. 2013 */ 2014 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2015 tp->rcv_up = tp->rcv_nxt; 2016 dodata: /* XXX */ 2017 2018 /* 2019 * Process the segment text, merging it into the TCP sequencing queue, 2020 * and arranging for acknowledgement of receipt if necessary. 2021 * This process logically involves adjusting tp->rcv_wnd as data 2022 * is presented to the user (this happens in tcp_usrreq.c, 2023 * case PRU_RCVD). If a FIN has already been received on this 2024 * connection then we just ignore the text. 2025 */ 2026 if ((tlen || (tiflags & TH_FIN)) && 2027 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2028 /* 2029 * Insert segment ti into reassembly queue of tcp with 2030 * control block tp. Return TH_FIN if reassembly now includes 2031 * a segment with FIN. The macro form does the common case 2032 * inline (segment is the next to be received on an 2033 * established connection, and the queue is empty), 2034 * avoiding linkage into and removal from the queue and 2035 * repetition of various conversions. 2036 * Set DELACK for segments received in order, but ack 2037 * immediately when segments are out of order 2038 * (so fast retransmit can work). 2039 */ 2040 /* NOTE: this was TCP_REASS() macro, but used only once */ 2041 TCP_REASS_LOCK(tp); 2042 if (th->th_seq == tp->rcv_nxt && 2043 tp->segq.lh_first == NULL && 2044 tp->t_state == TCPS_ESTABLISHED) { 2045 TCP_SETUP_ACK(tp, th); 2046 tp->rcv_nxt += tlen; 2047 tiflags = th->th_flags & TH_FIN; 2048 tcpstat.tcps_rcvpack++; 2049 tcpstat.tcps_rcvbyte += tlen; 2050 ND6_HINT(tp); 2051 m_adj(m, hdroptlen); 2052 sbappend(&(so)->so_rcv, m); 2053 sorwakeup(so); 2054 } else { 2055 m_adj(m, hdroptlen); 2056 tiflags = tcp_reass(tp, th, m, &tlen); 2057 tp->t_flags |= TF_ACKNOW; 2058 } 2059 TCP_REASS_UNLOCK(tp); 2060 2061 /* 2062 * Note the amount of data that peer has sent into 2063 * our window, in order to estimate the sender's 2064 * buffer size. 2065 */ 2066 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2067 } else { 2068 m_freem(m); 2069 m = NULL; 2070 tiflags &= ~TH_FIN; 2071 } 2072 2073 /* 2074 * If FIN is received ACK the FIN and let the user know 2075 * that the connection is closing. Ignore a FIN received before 2076 * the connection is fully established. 2077 */ 2078 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2079 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2080 socantrcvmore(so); 2081 tp->t_flags |= TF_ACKNOW; 2082 tp->rcv_nxt++; 2083 } 2084 switch (tp->t_state) { 2085 2086 /* 2087 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2088 */ 2089 case TCPS_ESTABLISHED: 2090 tp->t_state = TCPS_CLOSE_WAIT; 2091 break; 2092 2093 /* 2094 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2095 * enter the CLOSING state. 2096 */ 2097 case TCPS_FIN_WAIT_1: 2098 tp->t_state = TCPS_CLOSING; 2099 break; 2100 2101 /* 2102 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2103 * starting the time-wait timer, turning off the other 2104 * standard timers. 2105 */ 2106 case TCPS_FIN_WAIT_2: 2107 tp->t_state = TCPS_TIME_WAIT; 2108 tcp_canceltimers(tp); 2109 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2110 soisdisconnected(so); 2111 break; 2112 2113 /* 2114 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2115 */ 2116 case TCPS_TIME_WAIT: 2117 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2118 break; 2119 } 2120 } 2121 if (so->so_options & SO_DEBUG) { 2122 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); 2123 } 2124 2125 /* 2126 * Return any desired output. 2127 */ 2128 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2129 (void) tcp_output(tp); 2130 if (tcp_saveti) 2131 m_freem(tcp_saveti); 2132 return; 2133 2134 badsyn: 2135 /* 2136 * Received a bad SYN. Increment counters and dropwithreset. 2137 */ 2138 tcpstat.tcps_badsyn++; 2139 tp = NULL; 2140 goto dropwithreset; 2141 2142 dropafterack: 2143 /* 2144 * Generate an ACK dropping incoming segment if it occupies 2145 * sequence space, where the ACK reflects our state. 2146 */ 2147 if (tiflags & TH_RST) 2148 goto drop; 2149 m_freem(m); 2150 tp->t_flags |= TF_ACKNOW; 2151 (void) tcp_output(tp); 2152 if (tcp_saveti) 2153 m_freem(tcp_saveti); 2154 return; 2155 2156 dropwithreset_ratelim: 2157 /* 2158 * We may want to rate-limit RSTs in certain situations, 2159 * particularly if we are sending an RST in response to 2160 * an attempt to connect to or otherwise communicate with 2161 * a port for which we have no socket. 2162 */ 2163 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2164 tcp_rst_ppslim) == 0) { 2165 /* XXX stat */ 2166 goto drop; 2167 } 2168 /* ...fall into dropwithreset... */ 2169 2170 dropwithreset: 2171 /* 2172 * Generate a RST, dropping incoming segment. 2173 * Make ACK acceptable to originator of segment. 2174 */ 2175 if (tiflags & TH_RST) 2176 goto drop; 2177 { 2178 /* 2179 * need to recover version # field, which was overwritten on 2180 * ip_cksum computation. 2181 */ 2182 struct ip *sip; 2183 sip = mtod(m, struct ip *); 2184 switch (af) { 2185 #ifdef INET 2186 case AF_INET: 2187 sip->ip_v = 4; 2188 break; 2189 #endif 2190 #ifdef INET6 2191 case AF_INET6: 2192 sip->ip_v = 6; 2193 break; 2194 #endif 2195 } 2196 } 2197 if (tiflags & TH_ACK) 2198 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 2199 else { 2200 if (tiflags & TH_SYN) 2201 tlen++; 2202 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, 2203 TH_RST|TH_ACK); 2204 } 2205 if (tcp_saveti) 2206 m_freem(tcp_saveti); 2207 return; 2208 2209 drop: 2210 /* 2211 * Drop space held by incoming segment and return. 2212 */ 2213 if (tp) { 2214 if (tp->t_inpcb) 2215 so = tp->t_inpcb->inp_socket; 2216 #ifdef INET6 2217 else if (tp->t_in6pcb) 2218 so = tp->t_in6pcb->in6p_socket; 2219 #endif 2220 else 2221 so = NULL; 2222 if (so && (so->so_options & SO_DEBUG) != 0) 2223 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); 2224 } 2225 if (tcp_saveti) 2226 m_freem(tcp_saveti); 2227 m_freem(m); 2228 return; 2229 } 2230 2231 void 2232 tcp_dooptions(tp, cp, cnt, th, oi) 2233 struct tcpcb *tp; 2234 u_char *cp; 2235 int cnt; 2236 struct tcphdr *th; 2237 struct tcp_opt_info *oi; 2238 { 2239 u_int16_t mss; 2240 int opt, optlen; 2241 2242 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2243 opt = cp[0]; 2244 if (opt == TCPOPT_EOL) 2245 break; 2246 if (opt == TCPOPT_NOP) 2247 optlen = 1; 2248 else { 2249 if (cnt < 2) 2250 break; 2251 optlen = cp[1]; 2252 if (optlen < 2 || optlen > cnt) 2253 break; 2254 } 2255 switch (opt) { 2256 2257 default: 2258 continue; 2259 2260 case TCPOPT_MAXSEG: 2261 if (optlen != TCPOLEN_MAXSEG) 2262 continue; 2263 if (!(th->th_flags & TH_SYN)) 2264 continue; 2265 bcopy(cp + 2, &mss, sizeof(mss)); 2266 oi->maxseg = ntohs(mss); 2267 break; 2268 2269 case TCPOPT_WINDOW: 2270 if (optlen != TCPOLEN_WINDOW) 2271 continue; 2272 if (!(th->th_flags & TH_SYN)) 2273 continue; 2274 tp->t_flags |= TF_RCVD_SCALE; 2275 tp->requested_s_scale = cp[2]; 2276 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { 2277 #if 0 /*XXX*/ 2278 char *p; 2279 2280 if (ip) 2281 p = ntohl(ip->ip_src); 2282 #ifdef INET6 2283 else if (ip6) 2284 p = ip6_sprintf(&ip6->ip6_src); 2285 #endif 2286 else 2287 p = "(unknown)"; 2288 log(LOG_ERR, "TCP: invalid wscale %d from %s, " 2289 "assuming %d\n", 2290 tp->requested_s_scale, p, 2291 TCP_MAX_WINSHIFT); 2292 #else 2293 log(LOG_ERR, "TCP: invalid wscale %d, " 2294 "assuming %d\n", 2295 tp->requested_s_scale, 2296 TCP_MAX_WINSHIFT); 2297 #endif 2298 tp->requested_s_scale = TCP_MAX_WINSHIFT; 2299 } 2300 break; 2301 2302 case TCPOPT_TIMESTAMP: 2303 if (optlen != TCPOLEN_TIMESTAMP) 2304 continue; 2305 oi->ts_present = 1; 2306 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2307 NTOHL(oi->ts_val); 2308 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2309 NTOHL(oi->ts_ecr); 2310 2311 /* 2312 * A timestamp received in a SYN makes 2313 * it ok to send timestamp requests and replies. 2314 */ 2315 if (th->th_flags & TH_SYN) { 2316 tp->t_flags |= TF_RCVD_TSTMP; 2317 tp->ts_recent = oi->ts_val; 2318 tp->ts_recent_age = TCP_TIMESTAMP(tp); 2319 } 2320 break; 2321 case TCPOPT_SACK_PERMITTED: 2322 if (optlen != TCPOLEN_SACK_PERMITTED) 2323 continue; 2324 if (!(th->th_flags & TH_SYN)) 2325 continue; 2326 tp->t_flags &= ~TF_CANT_TXSACK; 2327 break; 2328 2329 case TCPOPT_SACK: 2330 if (tp->t_flags & TF_IGNR_RXSACK) 2331 continue; 2332 if (optlen % 8 != 2 || optlen < 10) 2333 continue; 2334 cp += 2; 2335 optlen -= 2; 2336 for (; optlen > 0; cp -= 8, optlen -= 8) { 2337 tcp_seq lwe, rwe; 2338 bcopy((char *)cp, (char *) &lwe, sizeof(lwe)); 2339 NTOHL(lwe); 2340 bcopy((char *)cp, (char *) &rwe, sizeof(rwe)); 2341 NTOHL(rwe); 2342 /* tcp_mark_sacked(tp, lwe, rwe); */ 2343 } 2344 break; 2345 } 2346 } 2347 } 2348 2349 /* 2350 * Pull out of band byte out of a segment so 2351 * it doesn't appear in the user's data queue. 2352 * It is still reflected in the segment length for 2353 * sequencing purposes. 2354 */ 2355 void 2356 tcp_pulloutofband(so, th, m, off) 2357 struct socket *so; 2358 struct tcphdr *th; 2359 struct mbuf *m; 2360 int off; 2361 { 2362 int cnt = off + th->th_urp - 1; 2363 2364 while (cnt >= 0) { 2365 if (m->m_len > cnt) { 2366 char *cp = mtod(m, caddr_t) + cnt; 2367 struct tcpcb *tp = sototcpcb(so); 2368 2369 tp->t_iobc = *cp; 2370 tp->t_oobflags |= TCPOOB_HAVEDATA; 2371 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2372 m->m_len--; 2373 return; 2374 } 2375 cnt -= m->m_len; 2376 m = m->m_next; 2377 if (m == 0) 2378 break; 2379 } 2380 panic("tcp_pulloutofband"); 2381 } 2382 2383 /* 2384 * Collect new round-trip time estimate 2385 * and update averages and current timeout. 2386 */ 2387 void 2388 tcp_xmit_timer(tp, rtt) 2389 struct tcpcb *tp; 2390 short rtt; 2391 { 2392 short delta; 2393 short rttmin; 2394 2395 tcpstat.tcps_rttupdated++; 2396 --rtt; 2397 if (tp->t_srtt != 0) { 2398 /* 2399 * srtt is stored as fixed point with 3 bits after the 2400 * binary point (i.e., scaled by 8). The following magic 2401 * is equivalent to the smoothing algorithm in rfc793 with 2402 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2403 * point). Adjust rtt to origin 0. 2404 */ 2405 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 2406 if ((tp->t_srtt += delta) <= 0) 2407 tp->t_srtt = 1 << 2; 2408 /* 2409 * We accumulate a smoothed rtt variance (actually, a 2410 * smoothed mean difference), then set the retransmit 2411 * timer to smoothed rtt + 4 times the smoothed variance. 2412 * rttvar is stored as fixed point with 2 bits after the 2413 * binary point (scaled by 4). The following is 2414 * equivalent to rfc793 smoothing with an alpha of .75 2415 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2416 * rfc793's wired-in beta. 2417 */ 2418 if (delta < 0) 2419 delta = -delta; 2420 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2421 if ((tp->t_rttvar += delta) <= 0) 2422 tp->t_rttvar = 1 << 2; 2423 } else { 2424 /* 2425 * No rtt measurement yet - use the unsmoothed rtt. 2426 * Set the variance to half the rtt (so our first 2427 * retransmit happens at 3*rtt). 2428 */ 2429 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 2430 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 2431 } 2432 tp->t_rtt = 0; 2433 tp->t_rxtshift = 0; 2434 2435 /* 2436 * the retransmit should happen at rtt + 4 * rttvar. 2437 * Because of the way we do the smoothing, srtt and rttvar 2438 * will each average +1/2 tick of bias. When we compute 2439 * the retransmit timer, we want 1/2 tick of rounding and 2440 * 1 extra tick because of +-1/2 tick uncertainty in the 2441 * firing of the timer. The bias will give us exactly the 2442 * 1.5 tick we need. But, because the bias is 2443 * statistical, we have to test that we don't drop below 2444 * the minimum feasible timer (which is 2 ticks). 2445 */ 2446 if (tp->t_rttmin > rtt + 2) 2447 rttmin = tp->t_rttmin; 2448 else 2449 rttmin = rtt + 2; 2450 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2451 2452 /* 2453 * We received an ack for a packet that wasn't retransmitted; 2454 * it is probably safe to discard any error indications we've 2455 * received recently. This isn't quite right, but close enough 2456 * for now (a route might have failed after we sent a segment, 2457 * and the return path might not be symmetrical). 2458 */ 2459 tp->t_softerror = 0; 2460 } 2461 2462 /* 2463 * Checks for partial ack. If partial ack arrives, force the retransmission 2464 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 2465 * 1. By setting snd_nxt to th_ack, this forces retransmission timer to 2466 * be started again. If the ack advances at least to tp->snd_recover, return 0. 2467 */ 2468 int 2469 tcp_newreno(tp, th) 2470 struct tcpcb *tp; 2471 struct tcphdr *th; 2472 { 2473 tcp_seq onxt = tp->snd_nxt; 2474 u_long ocwnd = tp->snd_cwnd; 2475 2476 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2477 /* 2478 * snd_una has not yet been updated and the socket's send 2479 * buffer has not yet drained off the ACK'd data, so we 2480 * have to leave snd_una as it was to get the correct data 2481 * offset in tcp_output(). 2482 */ 2483 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2484 tp->t_rtt = 0; 2485 tp->snd_nxt = th->th_ack; 2486 /* 2487 * Set snd_cwnd to one segment beyond ACK'd offset. snd_una 2488 * is not yet updated when we're called. 2489 */ 2490 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); 2491 (void) tcp_output(tp); 2492 tp->snd_cwnd = ocwnd; 2493 if (SEQ_GT(onxt, tp->snd_nxt)) 2494 tp->snd_nxt = onxt; 2495 /* 2496 * Partial window deflation. Relies on fact that tp->snd_una 2497 * not updated yet. 2498 */ 2499 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz); 2500 return 1; 2501 } 2502 return 0; 2503 } 2504 2505 2506 /* 2507 * TCP compressed state engine. Currently used to hold compressed 2508 * state for SYN_RECEIVED. 2509 */ 2510 2511 u_long syn_cache_count; 2512 u_int32_t syn_hash1, syn_hash2; 2513 2514 #define SYN_HASH(sa, sp, dp) \ 2515 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 2516 ((u_int32_t)(sp)))^syn_hash2))) 2517 #ifndef INET6 2518 #define SYN_HASHALL(hash, src, dst) \ 2519 do { \ 2520 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2521 ((struct sockaddr_in *)(src))->sin_port, \ 2522 ((struct sockaddr_in *)(dst))->sin_port); \ 2523 } while (0) 2524 #else 2525 #define SYN_HASH6(sa, sp, dp) \ 2526 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 2527 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 2528 & 0x7fffffff) 2529 2530 #define SYN_HASHALL(hash, src, dst) \ 2531 do { \ 2532 switch ((src)->sa_family) { \ 2533 case AF_INET: \ 2534 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2535 ((struct sockaddr_in *)(src))->sin_port, \ 2536 ((struct sockaddr_in *)(dst))->sin_port); \ 2537 break; \ 2538 case AF_INET6: \ 2539 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 2540 ((struct sockaddr_in6 *)(src))->sin6_port, \ 2541 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 2542 break; \ 2543 default: \ 2544 hash = 0; \ 2545 } \ 2546 } while (0) 2547 #endif /* INET6 */ 2548 2549 #define SYN_CACHE_RM(sc) \ 2550 do { \ 2551 LIST_REMOVE((sc), sc_bucketq); \ 2552 (sc)->sc_tp = NULL; \ 2553 LIST_REMOVE((sc), sc_tpq); \ 2554 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 2555 TAILQ_REMOVE(&tcp_syn_cache_timeq[(sc)->sc_rxtshift], (sc), sc_timeq); \ 2556 syn_cache_count--; \ 2557 } while (0) 2558 2559 #define SYN_CACHE_PUT(sc) \ 2560 do { \ 2561 if ((sc)->sc_ipopts) \ 2562 (void) m_free((sc)->sc_ipopts); \ 2563 if ((sc)->sc_route4.ro_rt != NULL) \ 2564 RTFREE((sc)->sc_route4.ro_rt); \ 2565 pool_put(&syn_cache_pool, (sc)); \ 2566 } while (0) 2567 2568 struct pool syn_cache_pool; 2569 2570 /* 2571 * We don't estimate RTT with SYNs, so each packet starts with the default 2572 * RTT and each timer queue has a fixed timeout value. This allows us to 2573 * optimize the timer queues somewhat. 2574 */ 2575 #define SYN_CACHE_TIMER_ARM(sc) \ 2576 do { \ 2577 TCPT_RANGESET((sc)->sc_rxtcur, \ 2578 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 2579 TCPTV_REXMTMAX); \ 2580 PRT_SLOW_ARM((sc)->sc_rexmt, (sc)->sc_rxtcur); \ 2581 } while (0) 2582 2583 TAILQ_HEAD(, syn_cache) tcp_syn_cache_timeq[TCP_MAXRXTSHIFT + 1]; 2584 2585 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase) 2586 2587 void 2588 syn_cache_init() 2589 { 2590 int i; 2591 2592 /* Initialize the hash buckets. */ 2593 for (i = 0; i < tcp_syn_cache_size; i++) 2594 LIST_INIT(&tcp_syn_cache[i].sch_bucket); 2595 2596 /* Initialize the timer queues. */ 2597 for (i = 0; i <= TCP_MAXRXTSHIFT; i++) 2598 TAILQ_INIT(&tcp_syn_cache_timeq[i]); 2599 2600 /* Initialize the syn cache pool. */ 2601 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 2602 "synpl", 0, NULL, NULL, M_PCB); 2603 } 2604 2605 void 2606 syn_cache_insert(sc, tp) 2607 struct syn_cache *sc; 2608 struct tcpcb *tp; 2609 { 2610 struct syn_cache_head *scp; 2611 struct syn_cache *sc2; 2612 int s, i; 2613 2614 /* 2615 * If there are no entries in the hash table, reinitialize 2616 * the hash secrets. 2617 */ 2618 if (syn_cache_count == 0) { 2619 struct timeval tv; 2620 microtime(&tv); 2621 syn_hash1 = random() ^ (u_long)≻ 2622 syn_hash2 = random() ^ tv.tv_usec; 2623 } 2624 2625 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 2626 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 2627 scp = &tcp_syn_cache[sc->sc_bucketidx]; 2628 2629 /* 2630 * Make sure that we don't overflow the per-bucket 2631 * limit or the total cache size limit. 2632 */ 2633 s = splsoftnet(); 2634 if (scp->sch_length >= tcp_syn_bucket_limit) { 2635 tcpstat.tcps_sc_bucketoverflow++; 2636 /* 2637 * The bucket is full. Toss the oldest element in the 2638 * bucket. This will be the entry with our bucket 2639 * index closest to the front of the timer queue with 2640 * the largest timeout value. 2641 * 2642 * Note: This timer queue traversal may be expensive, so 2643 * we hope that this doesn't happen very often. It is 2644 * much more likely that we'll overflow the entire 2645 * cache, which is much easier to handle; see below. 2646 */ 2647 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) { 2648 for (sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2649 sc2 != NULL; 2650 sc2 = TAILQ_NEXT(sc2, sc_timeq)) { 2651 if (sc2->sc_bucketidx == sc->sc_bucketidx) { 2652 SYN_CACHE_RM(sc2); 2653 SYN_CACHE_PUT(sc2); 2654 goto insert; /* 2 level break */ 2655 } 2656 } 2657 } 2658 #ifdef DIAGNOSTIC 2659 /* 2660 * This should never happen; we should always find an 2661 * entry in our bucket. 2662 */ 2663 panic("syn_cache_insert: bucketoverflow: impossible"); 2664 #endif 2665 } else if (syn_cache_count >= tcp_syn_cache_limit) { 2666 tcpstat.tcps_sc_overflowed++; 2667 /* 2668 * The cache is full. Toss the oldest entry in the 2669 * entire cache. This is the front entry in the 2670 * first non-empty timer queue with the largest 2671 * timeout value. 2672 */ 2673 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) { 2674 sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2675 if (sc2 == NULL) 2676 continue; 2677 SYN_CACHE_RM(sc2); 2678 SYN_CACHE_PUT(sc2); 2679 goto insert; /* symmetry with above */ 2680 } 2681 #ifdef DIAGNOSTIC 2682 /* 2683 * This should never happen; we should always find an 2684 * entry in the cache. 2685 */ 2686 panic("syn_cache_insert: cache overflow: impossible"); 2687 #endif 2688 } 2689 2690 insert: 2691 /* 2692 * Initialize the entry's timer. 2693 */ 2694 sc->sc_rxttot = 0; 2695 sc->sc_rxtshift = 0; 2696 SYN_CACHE_TIMER_ARM(sc); 2697 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], sc, sc_timeq); 2698 2699 /* Link it from tcpcb entry */ 2700 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 2701 2702 /* Put it into the bucket. */ 2703 LIST_INSERT_HEAD(&scp->sch_bucket, sc, sc_bucketq); 2704 scp->sch_length++; 2705 syn_cache_count++; 2706 2707 tcpstat.tcps_sc_added++; 2708 splx(s); 2709 } 2710 2711 /* 2712 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 2713 * If we have retransmitted an entry the maximum number of times, expire 2714 * that entry. 2715 */ 2716 void 2717 syn_cache_timer() 2718 { 2719 struct syn_cache *sc, *nsc; 2720 int i, s; 2721 2722 s = splsoftnet(); 2723 2724 /* 2725 * First, get all the entries that need to be retransmitted, or 2726 * must be expired due to exceeding the initial keepalive time. 2727 */ 2728 for (i = 0; i < TCP_MAXRXTSHIFT; i++) { 2729 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2730 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt); 2731 sc = nsc) { 2732 nsc = TAILQ_NEXT(sc, sc_timeq); 2733 2734 /* 2735 * Compute the total amount of time this entry has 2736 * been on a queue. If this entry has been on longer 2737 * than the keep alive timer would allow, expire it. 2738 */ 2739 sc->sc_rxttot += sc->sc_rxtcur; 2740 if (sc->sc_rxttot >= TCPTV_KEEP_INIT) { 2741 tcpstat.tcps_sc_timed_out++; 2742 SYN_CACHE_RM(sc); 2743 SYN_CACHE_PUT(sc); 2744 continue; 2745 } 2746 2747 tcpstat.tcps_sc_retransmitted++; 2748 (void) syn_cache_respond(sc, NULL); 2749 2750 /* Advance this entry onto the next timer queue. */ 2751 TAILQ_REMOVE(&tcp_syn_cache_timeq[i], sc, sc_timeq); 2752 sc->sc_rxtshift = i + 1; 2753 SYN_CACHE_TIMER_ARM(sc); 2754 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], 2755 sc, sc_timeq); 2756 } 2757 } 2758 2759 /* 2760 * Now get all the entries that are expired due to too many 2761 * retransmissions. 2762 */ 2763 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[TCP_MAXRXTSHIFT]); 2764 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt); 2765 sc = nsc) { 2766 nsc = TAILQ_NEXT(sc, sc_timeq); 2767 tcpstat.tcps_sc_timed_out++; 2768 SYN_CACHE_RM(sc); 2769 SYN_CACHE_PUT(sc); 2770 } 2771 splx(s); 2772 } 2773 2774 /* 2775 * Remove syn cache created by the specified tcb entry, 2776 * because this does not make sense to keep them 2777 * (if there's no tcb entry, syn cache entry will never be used) 2778 */ 2779 void 2780 syn_cache_cleanup(tp) 2781 struct tcpcb *tp; 2782 { 2783 struct syn_cache *sc, *nsc; 2784 int s; 2785 2786 s = splsoftnet(); 2787 2788 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 2789 nsc = LIST_NEXT(sc, sc_tpq); 2790 2791 #ifdef DIAGNOSTIC 2792 if (sc->sc_tp != tp) 2793 panic("invalid sc_tp in syn_cache_cleanup"); 2794 #endif 2795 SYN_CACHE_RM(sc); 2796 SYN_CACHE_PUT(sc); 2797 } 2798 /* just for safety */ 2799 LIST_INIT(&tp->t_sc); 2800 2801 splx(s); 2802 } 2803 2804 /* 2805 * Find an entry in the syn cache. 2806 */ 2807 struct syn_cache * 2808 syn_cache_lookup(src, dst, headp) 2809 struct sockaddr *src; 2810 struct sockaddr *dst; 2811 struct syn_cache_head **headp; 2812 { 2813 struct syn_cache *sc; 2814 struct syn_cache_head *scp; 2815 u_int32_t hash; 2816 int s; 2817 2818 SYN_HASHALL(hash, src, dst); 2819 2820 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 2821 *headp = scp; 2822 s = splsoftnet(); 2823 for (sc = LIST_FIRST(&scp->sch_bucket); sc != NULL; 2824 sc = LIST_NEXT(sc, sc_bucketq)) { 2825 if (sc->sc_hash != hash) 2826 continue; 2827 if (!bcmp(&sc->sc_src, src, src->sa_len) && 2828 !bcmp(&sc->sc_dst, dst, dst->sa_len)) { 2829 splx(s); 2830 return (sc); 2831 } 2832 } 2833 splx(s); 2834 return (NULL); 2835 } 2836 2837 /* 2838 * This function gets called when we receive an ACK for a 2839 * socket in the LISTEN state. We look up the connection 2840 * in the syn cache, and if its there, we pull it out of 2841 * the cache and turn it into a full-blown connection in 2842 * the SYN-RECEIVED state. 2843 * 2844 * The return values may not be immediately obvious, and their effects 2845 * can be subtle, so here they are: 2846 * 2847 * NULL SYN was not found in cache; caller should drop the 2848 * packet and send an RST. 2849 * 2850 * -1 We were unable to create the new connection, and are 2851 * aborting it. An ACK,RST is being sent to the peer 2852 * (unless we got screwey sequence numbners; see below), 2853 * because the 3-way handshake has been completed. Caller 2854 * should not free the mbuf, since we may be using it. If 2855 * we are not, we will free it. 2856 * 2857 * Otherwise, the return value is a pointer to the new socket 2858 * associated with the connection. 2859 */ 2860 struct socket * 2861 syn_cache_get(src, dst, th, hlen, tlen, so, m) 2862 struct sockaddr *src; 2863 struct sockaddr *dst; 2864 struct tcphdr *th; 2865 unsigned int hlen, tlen; 2866 struct socket *so; 2867 struct mbuf *m; 2868 { 2869 struct syn_cache *sc; 2870 struct syn_cache_head *scp; 2871 struct inpcb *inp = NULL; 2872 #ifdef INET6 2873 struct in6pcb *in6p = NULL; 2874 #endif 2875 struct tcpcb *tp = 0; 2876 struct mbuf *am; 2877 int s; 2878 struct socket *oso; 2879 2880 s = splsoftnet(); 2881 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 2882 splx(s); 2883 return (NULL); 2884 } 2885 2886 /* 2887 * Verify the sequence and ack numbers. Try getting the correct 2888 * response again. 2889 */ 2890 if ((th->th_ack != sc->sc_iss + 1) || 2891 SEQ_LEQ(th->th_seq, sc->sc_irs) || 2892 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 2893 (void) syn_cache_respond(sc, m); 2894 splx(s); 2895 return ((struct socket *)(-1)); 2896 } 2897 2898 /* Remove this cache entry */ 2899 SYN_CACHE_RM(sc); 2900 splx(s); 2901 2902 /* 2903 * Ok, create the full blown connection, and set things up 2904 * as they would have been set up if we had created the 2905 * connection when the SYN arrived. If we can't create 2906 * the connection, abort it. 2907 */ 2908 /* 2909 * inp still has the OLD in_pcb stuff, set the 2910 * v6-related flags on the new guy, too. This is 2911 * done particularly for the case where an AF_INET6 2912 * socket is bound only to a port, and a v4 connection 2913 * comes in on that port. 2914 * we also copy the flowinfo from the original pcb 2915 * to the new one. 2916 */ 2917 { 2918 struct inpcb *parentinpcb; 2919 2920 parentinpcb = (struct inpcb *)so->so_pcb; 2921 2922 oso = so; 2923 so = sonewconn(so, SS_ISCONNECTED); 2924 if (so == NULL) 2925 goto resetandabort; 2926 2927 switch (so->so_proto->pr_domain->dom_family) { 2928 #ifdef INET 2929 case AF_INET: 2930 inp = sotoinpcb(so); 2931 break; 2932 #endif 2933 #ifdef INET6 2934 case AF_INET6: 2935 in6p = sotoin6pcb(so); 2936 break; 2937 #endif 2938 } 2939 } 2940 switch (src->sa_family) { 2941 #ifdef INET 2942 case AF_INET: 2943 if (inp) { 2944 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 2945 inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; 2946 inp->inp_options = ip_srcroute(); 2947 in_pcbstate(inp, INP_BOUND); 2948 if (inp->inp_options == NULL) { 2949 inp->inp_options = sc->sc_ipopts; 2950 sc->sc_ipopts = NULL; 2951 } 2952 } 2953 #ifdef INET6 2954 else if (in6p) { 2955 /* IPv4 packet to AF_INET6 socket */ 2956 bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr)); 2957 in6p->in6p_laddr.s6_addr16[5] = htons(0xffff); 2958 bcopy(&((struct sockaddr_in *)dst)->sin_addr, 2959 &in6p->in6p_laddr.s6_addr32[3], 2960 sizeof(((struct sockaddr_in *)dst)->sin_addr)); 2961 in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port; 2962 in6totcpcb(in6p)->t_family = AF_INET; 2963 } 2964 #endif 2965 break; 2966 #endif 2967 #ifdef INET6 2968 case AF_INET6: 2969 if (in6p) { 2970 in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr; 2971 in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port; 2972 #if 0 2973 in6p->in6p_flowinfo = ip6->ip6_flow & IPV6_FLOWINFO_MASK; 2974 /*inp->inp_options = ip6_srcroute();*/ /* soon. */ 2975 #endif 2976 } 2977 break; 2978 #endif 2979 } 2980 #ifdef INET6 2981 if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) { 2982 struct in6pcb *oin6p = sotoin6pcb(oso); 2983 /* inherit socket options from the listening socket */ 2984 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS); 2985 if (in6p->in6p_flags & IN6P_CONTROLOPTS) { 2986 m_freem(in6p->in6p_options); 2987 in6p->in6p_options = 0; 2988 } 2989 ip6_savecontrol(in6p, &in6p->in6p_options, 2990 mtod(m, struct ip6_hdr *), m); 2991 } 2992 #endif 2993 2994 #ifdef IPSEC 2995 /* 2996 * we make a copy of policy, instead of sharing the policy, 2997 * for better behavior in terms of SA lookup and dead SA removal. 2998 */ 2999 if (inp) { 3000 /* copy old policy into new socket's */ 3001 if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) 3002 printf("tcp_input: could not copy policy\n"); 3003 } 3004 #ifdef INET6 3005 else if (in6p) { 3006 /* copy old policy into new socket's */ 3007 if (ipsec_copy_policy(sotoin6pcb(oso)->in6p_sp, in6p->in6p_sp)) 3008 printf("tcp_input: could not copy policy\n"); 3009 } 3010 #endif 3011 #endif 3012 3013 /* 3014 * Give the new socket our cached route reference. 3015 */ 3016 if (inp) 3017 inp->inp_route = sc->sc_route4; /* struct assignment */ 3018 #ifdef INET6 3019 else 3020 in6p->in6p_route = sc->sc_route6; 3021 #endif 3022 sc->sc_route4.ro_rt = NULL; 3023 3024 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3025 if (am == NULL) 3026 goto resetandabort; 3027 am->m_len = src->sa_len; 3028 bcopy(src, mtod(am, caddr_t), src->sa_len); 3029 if (inp) { 3030 if (in_pcbconnect(inp, am)) { 3031 (void) m_free(am); 3032 goto resetandabort; 3033 } 3034 } 3035 #ifdef INET6 3036 else if (in6p) { 3037 if (src->sa_family == AF_INET) { 3038 /* IPv4 packet to AF_INET6 socket */ 3039 struct sockaddr_in6 *sin6; 3040 sin6 = mtod(am, struct sockaddr_in6 *); 3041 am->m_len = sizeof(*sin6); 3042 bzero(sin6, sizeof(*sin6)); 3043 sin6->sin6_family = AF_INET6; 3044 sin6->sin6_len = sizeof(*sin6); 3045 sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port; 3046 sin6->sin6_addr.s6_addr16[5] = htons(0xffff); 3047 bcopy(&((struct sockaddr_in *)src)->sin_addr, 3048 &sin6->sin6_addr.s6_addr32[3], 3049 sizeof(sin6->sin6_addr.s6_addr32[3])); 3050 } 3051 if (in6_pcbconnect(in6p, am)) { 3052 (void) m_free(am); 3053 goto resetandabort; 3054 } 3055 } 3056 #endif 3057 else { 3058 (void) m_free(am); 3059 goto resetandabort; 3060 } 3061 (void) m_free(am); 3062 3063 if (inp) 3064 tp = intotcpcb(inp); 3065 #ifdef INET6 3066 else if (in6p) 3067 tp = in6totcpcb(in6p); 3068 #endif 3069 else 3070 tp = NULL; 3071 if (sc->sc_request_r_scale != 15) { 3072 tp->requested_s_scale = sc->sc_requested_s_scale; 3073 tp->request_r_scale = sc->sc_request_r_scale; 3074 tp->snd_scale = sc->sc_requested_s_scale; 3075 tp->rcv_scale = sc->sc_request_r_scale; 3076 tp->t_flags |= TF_RCVD_SCALE; 3077 } 3078 if (sc->sc_flags & SCF_TIMESTAMP) 3079 tp->t_flags |= TF_RCVD_TSTMP; 3080 tp->ts_timebase = sc->sc_timebase; 3081 3082 tp->t_template = tcp_template(tp); 3083 if (tp->t_template == 0) { 3084 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3085 so = NULL; 3086 m_freem(m); 3087 goto abort; 3088 } 3089 3090 tp->iss = sc->sc_iss; 3091 tp->irs = sc->sc_irs; 3092 tcp_sendseqinit(tp); 3093 tcp_rcvseqinit(tp); 3094 tp->t_state = TCPS_SYN_RECEIVED; 3095 TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT); 3096 tcpstat.tcps_accepts++; 3097 3098 /* Initialize tp->t_ourmss before we deal with the peer's! */ 3099 tp->t_ourmss = sc->sc_ourmaxseg; 3100 tcp_mss_from_peer(tp, sc->sc_peermaxseg); 3101 3102 /* 3103 * Initialize the initial congestion window. If we 3104 * had to retransmit the SYN,ACK, we must initialize cwnd 3105 * to 1 segment (i.e. the Loss Window). 3106 */ 3107 if (sc->sc_rxtshift) 3108 tp->snd_cwnd = tp->t_peermss; 3109 else 3110 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, tp->t_peermss); 3111 3112 tcp_rmx_rtt(tp); 3113 tp->snd_wl1 = sc->sc_irs; 3114 tp->rcv_up = sc->sc_irs + 1; 3115 3116 /* 3117 * This is what whould have happened in tcp_ouput() when 3118 * the SYN,ACK was sent. 3119 */ 3120 tp->snd_up = tp->snd_una; 3121 tp->snd_max = tp->snd_nxt = tp->iss+1; 3122 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3123 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3124 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3125 tp->last_ack_sent = tp->rcv_nxt; 3126 3127 tcpstat.tcps_sc_completed++; 3128 SYN_CACHE_PUT(sc); 3129 return (so); 3130 3131 resetandabort: 3132 (void) tcp_respond(NULL, m, m, th, 3133 th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK); 3134 abort: 3135 if (so != NULL) 3136 (void) soabort(so); 3137 SYN_CACHE_PUT(sc); 3138 tcpstat.tcps_sc_aborted++; 3139 return ((struct socket *)(-1)); 3140 } 3141 3142 /* 3143 * This function is called when we get a RST for a 3144 * non-existant connection, so that we can see if the 3145 * connection is in the syn cache. If it is, zap it. 3146 */ 3147 3148 void 3149 syn_cache_reset(src, dst, th) 3150 struct sockaddr *src; 3151 struct sockaddr *dst; 3152 struct tcphdr *th; 3153 { 3154 struct syn_cache *sc; 3155 struct syn_cache_head *scp; 3156 int s = splsoftnet(); 3157 3158 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3159 splx(s); 3160 return; 3161 } 3162 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3163 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3164 splx(s); 3165 return; 3166 } 3167 SYN_CACHE_RM(sc); 3168 splx(s); 3169 tcpstat.tcps_sc_reset++; 3170 SYN_CACHE_PUT(sc); 3171 } 3172 3173 void 3174 syn_cache_unreach(src, dst, th) 3175 struct sockaddr *src; 3176 struct sockaddr *dst; 3177 struct tcphdr *th; 3178 { 3179 struct syn_cache *sc; 3180 struct syn_cache_head *scp; 3181 int s; 3182 3183 s = splsoftnet(); 3184 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3185 splx(s); 3186 return; 3187 } 3188 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3189 if (ntohl (th->th_seq) != sc->sc_iss) { 3190 splx(s); 3191 return; 3192 } 3193 3194 /* 3195 * If we've rertransmitted 3 times and this is our second error, 3196 * we remove the entry. Otherwise, we allow it to continue on. 3197 * This prevents us from incorrectly nuking an entry during a 3198 * spurious network outage. 3199 * 3200 * See tcp_notify(). 3201 */ 3202 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3203 sc->sc_flags |= SCF_UNREACH; 3204 splx(s); 3205 return; 3206 } 3207 3208 SYN_CACHE_RM(sc); 3209 splx(s); 3210 tcpstat.tcps_sc_unreach++; 3211 SYN_CACHE_PUT(sc); 3212 } 3213 3214 /* 3215 * Given a LISTEN socket and an inbound SYN request, add 3216 * this to the syn cache, and send back a segment: 3217 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3218 * to the source. 3219 * 3220 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3221 * Doing so would require that we hold onto the data and deliver it 3222 * to the application. However, if we are the target of a SYN-flood 3223 * DoS attack, an attacker could send data which would eventually 3224 * consume all available buffer space if it were ACKed. By not ACKing 3225 * the data, we avoid this DoS scenario. 3226 */ 3227 3228 int 3229 syn_cache_add(src, dst, th, hlen, so, m, optp, optlen, oi) 3230 struct sockaddr *src; 3231 struct sockaddr *dst; 3232 struct tcphdr *th; 3233 unsigned int hlen; 3234 struct socket *so; 3235 struct mbuf *m; 3236 u_char *optp; 3237 int optlen; 3238 struct tcp_opt_info *oi; 3239 { 3240 struct tcpcb tb, *tp; 3241 long win; 3242 struct syn_cache *sc; 3243 struct syn_cache_head *scp; 3244 struct mbuf *ipopts; 3245 3246 tp = sototcpcb(so); 3247 3248 /* 3249 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3250 * 3251 * Note this check is performed in tcp_input() very early on. 3252 */ 3253 3254 /* 3255 * Initialize some local state. 3256 */ 3257 win = sbspace(&so->so_rcv); 3258 if (win > TCP_MAXWIN) 3259 win = TCP_MAXWIN; 3260 3261 switch (src->sa_family) { 3262 #ifdef INET 3263 case AF_INET: 3264 /* 3265 * Remember the IP options, if any. 3266 */ 3267 ipopts = ip_srcroute(); 3268 break; 3269 #endif 3270 default: 3271 ipopts = NULL; 3272 } 3273 3274 if (optp) { 3275 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3276 tcp_dooptions(&tb, optp, optlen, th, oi); 3277 } else 3278 tb.t_flags = 0; 3279 3280 /* 3281 * See if we already have an entry for this connection. 3282 * If we do, resend the SYN,ACK. We do not count this 3283 * as a retransmission (XXX though maybe we should). 3284 */ 3285 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 3286 tcpstat.tcps_sc_dupesyn++; 3287 if (ipopts) { 3288 /* 3289 * If we were remembering a previous source route, 3290 * forget it and use the new one we've been given. 3291 */ 3292 if (sc->sc_ipopts) 3293 (void) m_free(sc->sc_ipopts); 3294 sc->sc_ipopts = ipopts; 3295 } 3296 sc->sc_timestamp = tb.ts_recent; 3297 if (syn_cache_respond(sc, m) == 0) { 3298 tcpstat.tcps_sndacks++; 3299 tcpstat.tcps_sndtotal++; 3300 } 3301 return (1); 3302 } 3303 3304 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 3305 if (sc == NULL) { 3306 if (ipopts) 3307 (void) m_free(ipopts); 3308 return (0); 3309 } 3310 3311 /* 3312 * Fill in the cache, and put the necessary IP and TCP 3313 * options into the reply. 3314 */ 3315 bzero(sc, sizeof(struct syn_cache)); 3316 bcopy(src, &sc->sc_src, src->sa_len); 3317 bcopy(dst, &sc->sc_dst, dst->sa_len); 3318 sc->sc_flags = 0; 3319 sc->sc_ipopts = ipopts; 3320 sc->sc_irs = th->th_seq; 3321 switch (src->sa_family) { 3322 #ifdef INET 3323 case AF_INET: 3324 { 3325 struct sockaddr_in *srcin = (void *) src; 3326 struct sockaddr_in *dstin = (void *) dst; 3327 3328 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr, 3329 &srcin->sin_addr, dstin->sin_port, 3330 srcin->sin_port, sizeof(dstin->sin_addr), 0); 3331 break; 3332 } 3333 #endif /* INET */ 3334 #ifdef INET6 3335 case AF_INET6: 3336 { 3337 struct sockaddr_in6 *srcin6 = (void *) src; 3338 struct sockaddr_in6 *dstin6 = (void *) dst; 3339 3340 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr, 3341 &srcin6->sin6_addr, dstin6->sin6_port, 3342 srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0); 3343 break; 3344 } 3345 #endif /* INET6 */ 3346 } 3347 sc->sc_peermaxseg = oi->maxseg; 3348 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? 3349 m->m_pkthdr.rcvif : NULL, 3350 sc->sc_src.sa.sa_family); 3351 sc->sc_win = win; 3352 sc->sc_timebase = tcp_now; /* see tcp_newtcpcb() */ 3353 sc->sc_timestamp = tb.ts_recent; 3354 if (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP)) 3355 sc->sc_flags |= SCF_TIMESTAMP; 3356 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3357 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3358 sc->sc_requested_s_scale = tb.requested_s_scale; 3359 sc->sc_request_r_scale = 0; 3360 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3361 TCP_MAXWIN << sc->sc_request_r_scale < 3362 so->so_rcv.sb_hiwat) 3363 sc->sc_request_r_scale++; 3364 } else { 3365 sc->sc_requested_s_scale = 15; 3366 sc->sc_request_r_scale = 15; 3367 } 3368 sc->sc_tp = tp; 3369 if (syn_cache_respond(sc, m) == 0) { 3370 syn_cache_insert(sc, tp); 3371 tcpstat.tcps_sndacks++; 3372 tcpstat.tcps_sndtotal++; 3373 } else { 3374 SYN_CACHE_PUT(sc); 3375 tcpstat.tcps_sc_dropped++; 3376 } 3377 return (1); 3378 } 3379 3380 int 3381 syn_cache_respond(sc, m) 3382 struct syn_cache *sc; 3383 struct mbuf *m; 3384 { 3385 struct route *ro; 3386 u_int8_t *optp; 3387 int optlen, error; 3388 u_int16_t tlen; 3389 struct ip *ip = NULL; 3390 #ifdef INET6 3391 struct ip6_hdr *ip6 = NULL; 3392 #endif 3393 struct tcphdr *th; 3394 u_int hlen; 3395 3396 switch (sc->sc_src.sa.sa_family) { 3397 case AF_INET: 3398 hlen = sizeof(struct ip); 3399 ro = &sc->sc_route4; 3400 break; 3401 #ifdef INET6 3402 case AF_INET6: 3403 hlen = sizeof(struct ip6_hdr); 3404 ro = (struct route *)&sc->sc_route6; 3405 break; 3406 #endif 3407 default: 3408 if (m) 3409 m_freem(m); 3410 return EAFNOSUPPORT; 3411 } 3412 3413 /* Compute the size of the TCP options. */ 3414 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3415 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3416 3417 tlen = hlen + sizeof(struct tcphdr) + optlen; 3418 3419 /* 3420 * Create the IP+TCP header from scratch. 3421 */ 3422 if (m) 3423 m_freem(m); 3424 #ifdef DIAGNOSTIC 3425 if (max_linkhdr + tlen > MCLBYTES) 3426 return (ENOBUFS); 3427 #endif 3428 MGETHDR(m, M_DONTWAIT, MT_DATA); 3429 if (m && tlen > MHLEN) { 3430 MCLGET(m, M_DONTWAIT); 3431 if ((m->m_flags & M_EXT) == 0) { 3432 m_freem(m); 3433 m = NULL; 3434 } 3435 } 3436 if (m == NULL) 3437 return (ENOBUFS); 3438 3439 /* Fixup the mbuf. */ 3440 m->m_data += max_linkhdr; 3441 m->m_len = m->m_pkthdr.len = tlen; 3442 #ifdef IPSEC 3443 if (sc->sc_tp) { 3444 struct tcpcb *tp; 3445 struct socket *so; 3446 3447 tp = sc->sc_tp; 3448 if (tp->t_inpcb) 3449 so = tp->t_inpcb->inp_socket; 3450 #ifdef INET6 3451 else if (tp->t_in6pcb) 3452 so = tp->t_in6pcb->in6p_socket; 3453 #endif 3454 else 3455 so = NULL; 3456 /* use IPsec policy on listening socket, on SYN ACK */ 3457 if (ipsec_setsocket(m, so) != 0) { 3458 m_freem(m); 3459 return ENOBUFS; 3460 } 3461 } 3462 #endif 3463 m->m_pkthdr.rcvif = NULL; 3464 memset(mtod(m, u_char *), 0, tlen); 3465 3466 switch (sc->sc_src.sa.sa_family) { 3467 case AF_INET: 3468 ip = mtod(m, struct ip *); 3469 ip->ip_dst = sc->sc_src.sin.sin_addr; 3470 ip->ip_src = sc->sc_dst.sin.sin_addr; 3471 ip->ip_p = IPPROTO_TCP; 3472 th = (struct tcphdr *)(ip + 1); 3473 th->th_dport = sc->sc_src.sin.sin_port; 3474 th->th_sport = sc->sc_dst.sin.sin_port; 3475 break; 3476 #ifdef INET6 3477 case AF_INET6: 3478 ip6 = mtod(m, struct ip6_hdr *); 3479 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3480 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3481 ip6->ip6_nxt = IPPROTO_TCP; 3482 /* ip6_plen will be updated in ip6_output() */ 3483 th = (struct tcphdr *)(ip6 + 1); 3484 th->th_dport = sc->sc_src.sin6.sin6_port; 3485 th->th_sport = sc->sc_dst.sin6.sin6_port; 3486 break; 3487 #endif 3488 default: 3489 th = NULL; 3490 } 3491 3492 th->th_seq = htonl(sc->sc_iss); 3493 th->th_ack = htonl(sc->sc_irs + 1); 3494 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 3495 th->th_flags = TH_SYN|TH_ACK; 3496 th->th_win = htons(sc->sc_win); 3497 /* th_sum already 0 */ 3498 /* th_urp already 0 */ 3499 3500 /* Tack on the TCP options. */ 3501 optp = (u_int8_t *)(th + 1); 3502 *optp++ = TCPOPT_MAXSEG; 3503 *optp++ = 4; 3504 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 3505 *optp++ = sc->sc_ourmaxseg & 0xff; 3506 3507 if (sc->sc_request_r_scale != 15) { 3508 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 3509 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 3510 sc->sc_request_r_scale); 3511 optp += 4; 3512 } 3513 3514 if (sc->sc_flags & SCF_TIMESTAMP) { 3515 u_int32_t *lp = (u_int32_t *)(optp); 3516 /* Form timestamp option as shown in appendix A of RFC 1323. */ 3517 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 3518 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc)); 3519 *lp = htonl(sc->sc_timestamp); 3520 optp += TCPOLEN_TSTAMP_APPA; 3521 } 3522 3523 /* Compute the packet's checksum. */ 3524 switch (sc->sc_src.sa.sa_family) { 3525 case AF_INET: 3526 ip->ip_len = htons(tlen - hlen); 3527 th->th_sum = 0; 3528 th->th_sum = in_cksum(m, tlen); 3529 break; 3530 #ifdef INET6 3531 case AF_INET6: 3532 ip6->ip6_plen = htons(tlen - hlen); 3533 th->th_sum = 0; 3534 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 3535 break; 3536 #endif 3537 } 3538 3539 /* 3540 * Fill in some straggling IP bits. Note the stack expects 3541 * ip_len to be in host order, for convenience. 3542 */ 3543 switch (sc->sc_src.sa.sa_family) { 3544 #ifdef INET 3545 case AF_INET: 3546 ip->ip_len = tlen; 3547 ip->ip_ttl = ip_defttl; 3548 /* XXX tos? */ 3549 break; 3550 #endif 3551 #ifdef INET6 3552 case AF_INET6: 3553 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 3554 ip6->ip6_vfc |= IPV6_VERSION; 3555 ip6->ip6_plen = htons(tlen - hlen); 3556 /* ip6_hlim will be initialized afterwards */ 3557 /* XXX flowlabel? */ 3558 break; 3559 #endif 3560 } 3561 3562 switch (sc->sc_src.sa.sa_family) { 3563 #ifdef INET 3564 case AF_INET: 3565 error = ip_output(m, sc->sc_ipopts, ro, 3566 (ip_mtudisc ? IP_MTUDISC : 0), 3567 NULL); 3568 break; 3569 #endif 3570 #ifdef INET6 3571 case AF_INET6: 3572 ip6->ip6_hlim = in6_selecthlim(NULL, 3573 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL); 3574 3575 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 3576 0, NULL, NULL); 3577 break; 3578 #endif 3579 default: 3580 error = EAFNOSUPPORT; 3581 break; 3582 } 3583 return (error); 3584 } 3585