1 /* $NetBSD: tcp_input.c,v 1.113 2000/07/09 12:49:08 itojun Exp $ */ 2 3 /* 4 %%% portions-copyright-nrl-95 5 Portions of this software are Copyright 1995-1998 by Randall Atkinson, 6 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights 7 Reserved. All rights under this copyright have been assigned to the US 8 Naval Research Laboratory (NRL). The NRL Copyright Notice and License 9 Agreement Version 1.1 (January 17, 1995) applies to these portions of the 10 software. 11 You should have received a copy of the license with this software. If you 12 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>. 13 14 */ 15 16 /* 17 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 18 * All rights reserved. 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 1. Redistributions of source code must retain the above copyright 24 * notice, this list of conditions and the following disclaimer. 25 * 2. Redistributions in binary form must reproduce the above copyright 26 * notice, this list of conditions and the following disclaimer in the 27 * documentation and/or other materials provided with the distribution. 28 * 3. Neither the name of the project nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 45 /*- 46 * Copyright (c) 1997, 1998, 1999 The NetBSD Foundation, Inc. 47 * All rights reserved. 48 * 49 * This code is derived from software contributed to The NetBSD Foundation 50 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 51 * Facility, NASA Ames Research Center. 52 * 53 * Redistribution and use in source and binary forms, with or without 54 * modification, are permitted provided that the following conditions 55 * are met: 56 * 1. Redistributions of source code must retain the above copyright 57 * notice, this list of conditions and the following disclaimer. 58 * 2. Redistributions in binary form must reproduce the above copyright 59 * notice, this list of conditions and the following disclaimer in the 60 * documentation and/or other materials provided with the distribution. 61 * 3. All advertising materials mentioning features or use of this software 62 * must display the following acknowledgement: 63 * This product includes software developed by the NetBSD 64 * Foundation, Inc. and its contributors. 65 * 4. Neither the name of The NetBSD Foundation nor the names of its 66 * contributors may be used to endorse or promote products derived 67 * from this software without specific prior written permission. 68 * 69 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 70 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 71 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 72 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 73 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 74 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 75 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 76 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 77 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 78 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 79 * POSSIBILITY OF SUCH DAMAGE. 80 */ 81 82 /* 83 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 84 * The Regents of the University of California. All rights reserved. 85 * 86 * Redistribution and use in source and binary forms, with or without 87 * modification, are permitted provided that the following conditions 88 * are met: 89 * 1. Redistributions of source code must retain the above copyright 90 * notice, this list of conditions and the following disclaimer. 91 * 2. Redistributions in binary form must reproduce the above copyright 92 * notice, this list of conditions and the following disclaimer in the 93 * documentation and/or other materials provided with the distribution. 94 * 3. All advertising materials mentioning features or use of this software 95 * must display the following acknowledgement: 96 * This product includes software developed by the University of 97 * California, Berkeley and its contributors. 98 * 4. Neither the name of the University nor the names of its contributors 99 * may be used to endorse or promote products derived from this software 100 * without specific prior written permission. 101 * 102 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 103 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 104 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 105 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 106 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 107 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 108 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 109 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 110 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 111 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 112 * SUCH DAMAGE. 113 * 114 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 115 */ 116 117 /* 118 * TODO list for SYN cache stuff: 119 * 120 * Find room for a "state" field, which is needed to keep a 121 * compressed state for TIME_WAIT TCBs. It's been noted already 122 * that this is fairly important for very high-volume web and 123 * mail servers, which use a large number of short-lived 124 * connections. 125 */ 126 127 #include "opt_inet.h" 128 #include "opt_ipsec.h" 129 130 #include <sys/param.h> 131 #include <sys/systm.h> 132 #include <sys/malloc.h> 133 #include <sys/mbuf.h> 134 #include <sys/protosw.h> 135 #include <sys/socket.h> 136 #include <sys/socketvar.h> 137 #include <sys/errno.h> 138 #include <sys/syslog.h> 139 #include <sys/pool.h> 140 #include <sys/domain.h> 141 142 #include <net/if.h> 143 #include <net/route.h> 144 #include <net/if_types.h> 145 146 #include <netinet/in.h> 147 #include <netinet/in_systm.h> 148 #include <netinet/ip.h> 149 #include <netinet/in_pcb.h> 150 #include <netinet/ip_var.h> 151 152 #ifdef INET6 153 #ifndef INET 154 #include <netinet/in.h> 155 #endif 156 #include <netinet/ip6.h> 157 #include <netinet6/ip6_var.h> 158 #include <netinet6/in6_pcb.h> 159 #include <netinet6/ip6_var.h> 160 #include <netinet6/in6_var.h> 161 #include <netinet/icmp6.h> 162 #include <netinet6/nd6.h> 163 #endif 164 165 #ifdef PULLDOWN_TEST 166 #ifndef INET6 167 /* always need ip6.h for IP6_EXTHDR_GET */ 168 #include <netinet/ip6.h> 169 #endif 170 #endif 171 172 #include <netinet/tcp.h> 173 #include <netinet/tcp_fsm.h> 174 #include <netinet/tcp_seq.h> 175 #include <netinet/tcp_timer.h> 176 #include <netinet/tcp_var.h> 177 #include <netinet/tcpip.h> 178 #include <netinet/tcp_debug.h> 179 180 #include <machine/stdarg.h> 181 182 #ifdef IPSEC 183 #include <netinet6/ipsec.h> 184 #include <netkey/key.h> 185 #endif /*IPSEC*/ 186 #ifdef INET6 187 #include "faith.h" 188 #endif 189 190 int tcprexmtthresh = 3; 191 int tcp_log_refused; 192 193 struct timeval tcp_rst_ratelim_last; 194 195 #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) 196 197 /* for modulo comparisons of timestamps */ 198 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 199 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 200 201 /* 202 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 203 */ 204 #ifdef INET6 205 #define ND6_HINT(tp) \ 206 do { \ 207 if (tp && tp->t_in6pcb && tp->t_family == AF_INET6 \ 208 && tp->t_in6pcb->in6p_route.ro_rt) { \ 209 nd6_nud_hint(tp->t_in6pcb->in6p_route.ro_rt, NULL, 0); \ 210 } \ 211 } while (0) 212 #else 213 #define ND6_HINT(tp) 214 #endif 215 216 /* 217 * Macro to compute ACK transmission behavior. Delay the ACK unless 218 * we have already delayed an ACK (must send an ACK every two segments). 219 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 220 * option is enabled. 221 */ 222 #define TCP_SETUP_ACK(tp, th) \ 223 do { \ 224 if ((tp)->t_flags & TF_DELACK || \ 225 (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \ 226 tp->t_flags |= TF_ACKNOW; \ 227 else \ 228 TCP_SET_DELACK(tp); \ 229 } while (0) 230 231 /* 232 * Convert TCP protocol fields to host order for easier processing. 233 */ 234 #define TCP_FIELDS_TO_HOST(th) \ 235 do { \ 236 NTOHL((th)->th_seq); \ 237 NTOHL((th)->th_ack); \ 238 NTOHS((th)->th_win); \ 239 NTOHS((th)->th_urp); \ 240 } while (0) 241 242 int 243 tcp_reass(tp, th, m, tlen) 244 struct tcpcb *tp; 245 struct tcphdr *th; 246 struct mbuf *m; 247 int *tlen; 248 { 249 struct ipqent *p, *q, *nq, *tiqe = NULL; 250 struct socket *so = NULL; 251 int pkt_flags; 252 tcp_seq pkt_seq; 253 unsigned pkt_len; 254 u_long rcvpartdupbyte = 0; 255 u_long rcvoobyte; 256 257 if (tp->t_inpcb) 258 so = tp->t_inpcb->inp_socket; 259 #ifdef INET6 260 else if (tp->t_in6pcb) 261 so = tp->t_in6pcb->in6p_socket; 262 #endif 263 264 TCP_REASS_LOCK_CHECK(tp); 265 266 /* 267 * Call with th==0 after become established to 268 * force pre-ESTABLISHED data up to user socket. 269 */ 270 if (th == 0) 271 goto present; 272 273 rcvoobyte = *tlen; 274 /* 275 * Copy these to local variables because the tcpiphdr 276 * gets munged while we are collapsing mbufs. 277 */ 278 pkt_seq = th->th_seq; 279 pkt_len = *tlen; 280 pkt_flags = th->th_flags; 281 /* 282 * Find a segment which begins after this one does. 283 */ 284 for (p = NULL, q = tp->segq.lh_first; q != NULL; q = nq) { 285 nq = q->ipqe_q.le_next; 286 /* 287 * If the received segment is just right after this 288 * fragment, merge the two together and then check 289 * for further overlaps. 290 */ 291 if (q->ipqe_seq + q->ipqe_len == pkt_seq) { 292 #ifdef TCPREASS_DEBUG 293 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n", 294 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 295 q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len); 296 #endif 297 pkt_len += q->ipqe_len; 298 pkt_flags |= q->ipqe_flags; 299 pkt_seq = q->ipqe_seq; 300 m_cat(q->ipqe_m, m); 301 m = q->ipqe_m; 302 goto free_ipqe; 303 } 304 /* 305 * If the received segment is completely past this 306 * fragment, we need to go the next fragment. 307 */ 308 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 309 p = q; 310 continue; 311 } 312 /* 313 * If the fragment is past the received segment, 314 * it (or any following) can't be concatenated. 315 */ 316 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) 317 break; 318 /* 319 * We've received all the data in this segment before. 320 * mark it as a duplicate and return. 321 */ 322 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) && 323 SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 324 tcpstat.tcps_rcvduppack++; 325 tcpstat.tcps_rcvdupbyte += pkt_len; 326 m_freem(m); 327 if (tiqe != NULL) 328 pool_put(&ipqent_pool, tiqe); 329 return (0); 330 } 331 /* 332 * Received segment completely overlaps this fragment 333 * so we drop the fragment (this keeps the temporal 334 * ordering of segments correct). 335 */ 336 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) && 337 SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 338 rcvpartdupbyte += q->ipqe_len; 339 m_freem(q->ipqe_m); 340 goto free_ipqe; 341 } 342 /* 343 * RX'ed segment extends past the end of the 344 * fragment. Drop the overlapping bytes. Then 345 * merge the fragment and segment then treat as 346 * a longer received packet. 347 */ 348 if (SEQ_LT(q->ipqe_seq, pkt_seq) 349 && SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 350 int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq; 351 #ifdef TCPREASS_DEBUG 352 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n", 353 tp, overlap, 354 pkt_seq, pkt_seq + pkt_len, pkt_len); 355 #endif 356 m_adj(m, overlap); 357 rcvpartdupbyte += overlap; 358 m_cat(q->ipqe_m, m); 359 m = q->ipqe_m; 360 pkt_seq = q->ipqe_seq; 361 pkt_len += q->ipqe_len - overlap; 362 rcvoobyte -= overlap; 363 goto free_ipqe; 364 } 365 /* 366 * RX'ed segment extends past the front of the 367 * fragment. Drop the overlapping bytes on the 368 * received packet. The packet will then be 369 * contatentated with this fragment a bit later. 370 */ 371 if (SEQ_GT(q->ipqe_seq, pkt_seq) 372 && SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) { 373 int overlap = pkt_seq + pkt_len - q->ipqe_seq; 374 #ifdef TCPREASS_DEBUG 375 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n", 376 tp, overlap, 377 pkt_seq, pkt_seq + pkt_len, pkt_len); 378 #endif 379 m_adj(m, -overlap); 380 pkt_len -= overlap; 381 rcvpartdupbyte += overlap; 382 rcvoobyte -= overlap; 383 } 384 /* 385 * If the received segment immediates precedes this 386 * fragment then tack the fragment onto this segment 387 * and reinsert the data. 388 */ 389 if (q->ipqe_seq == pkt_seq + pkt_len) { 390 #ifdef TCPREASS_DEBUG 391 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n", 392 tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len, 393 pkt_seq, pkt_seq + pkt_len, pkt_len); 394 #endif 395 pkt_len += q->ipqe_len; 396 pkt_flags |= q->ipqe_flags; 397 m_cat(m, q->ipqe_m); 398 LIST_REMOVE(q, ipqe_q); 399 LIST_REMOVE(q, ipqe_timeq); 400 if (tiqe == NULL) { 401 tiqe = q; 402 } else { 403 pool_put(&ipqent_pool, q); 404 } 405 break; 406 } 407 /* 408 * If the fragment is before the segment, remember it. 409 * When this loop is terminated, p will contain the 410 * pointer to fragment that is right before the received 411 * segment. 412 */ 413 if (SEQ_LEQ(q->ipqe_seq, pkt_seq)) 414 p = q; 415 416 continue; 417 418 /* 419 * This is a common operation. It also will allow 420 * to save doing a malloc/free in most instances. 421 */ 422 free_ipqe: 423 LIST_REMOVE(q, ipqe_q); 424 LIST_REMOVE(q, ipqe_timeq); 425 if (tiqe == NULL) { 426 tiqe = q; 427 } else { 428 pool_put(&ipqent_pool, q); 429 } 430 } 431 432 /* 433 * Allocate a new queue entry since the received segment did not 434 * collapse onto any other out-of-order block; thus we are allocating 435 * a new block. If it had collapsed, tiqe would not be NULL and 436 * we would be reusing it. 437 * XXX If we can't, just drop the packet. XXX 438 */ 439 if (tiqe == NULL) { 440 tiqe = pool_get(&ipqent_pool, PR_NOWAIT); 441 if (tiqe == NULL) { 442 tcpstat.tcps_rcvmemdrop++; 443 m_freem(m); 444 return (0); 445 } 446 } 447 448 /* 449 * Update the counters. 450 */ 451 tcpstat.tcps_rcvoopack++; 452 tcpstat.tcps_rcvoobyte += rcvoobyte; 453 if (rcvpartdupbyte) { 454 tcpstat.tcps_rcvpartduppack++; 455 tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte; 456 } 457 458 /* 459 * Insert the new fragment queue entry into both queues. 460 */ 461 tiqe->ipqe_m = m; 462 tiqe->ipqe_seq = pkt_seq; 463 tiqe->ipqe_len = pkt_len; 464 tiqe->ipqe_flags = pkt_flags; 465 if (p == NULL) { 466 LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q); 467 #ifdef TCPREASS_DEBUG 468 if (tiqe->ipqe_seq != tp->rcv_nxt) 469 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n", 470 tp, pkt_seq, pkt_seq + pkt_len, pkt_len); 471 #endif 472 } else { 473 LIST_INSERT_AFTER(p, tiqe, ipqe_q); 474 #ifdef TCPREASS_DEBUG 475 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n", 476 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 477 p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len); 478 #endif 479 } 480 481 LIST_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq); 482 483 present: 484 /* 485 * Present data to user, advancing rcv_nxt through 486 * completed sequence space. 487 */ 488 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 489 return (0); 490 q = tp->segq.lh_first; 491 if (q == NULL || q->ipqe_seq != tp->rcv_nxt) 492 return (0); 493 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len) 494 return (0); 495 496 tp->rcv_nxt += q->ipqe_len; 497 pkt_flags = q->ipqe_flags & TH_FIN; 498 ND6_HINT(tp); 499 500 LIST_REMOVE(q, ipqe_q); 501 LIST_REMOVE(q, ipqe_timeq); 502 if (so->so_state & SS_CANTRCVMORE) 503 m_freem(q->ipqe_m); 504 else 505 sbappend(&so->so_rcv, q->ipqe_m); 506 pool_put(&ipqent_pool, q); 507 sorwakeup(so); 508 return (pkt_flags); 509 } 510 511 #if defined(INET6) && !defined(TCP6) 512 int 513 tcp6_input(mp, offp, proto) 514 struct mbuf **mp; 515 int *offp, proto; 516 { 517 struct mbuf *m = *mp; 518 519 /* 520 * draft-itojun-ipv6-tcp-to-anycast 521 * better place to put this in? 522 */ 523 if (m->m_flags & M_ANYCAST6) { 524 struct ip6_hdr *ip6; 525 if (m->m_len < sizeof(struct ip6_hdr)) { 526 if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { 527 tcpstat.tcps_rcvshort++; 528 return IPPROTO_DONE; 529 } 530 } 531 ip6 = mtod(m, struct ip6_hdr *); 532 icmp6_error(m, ICMP6_DST_UNREACH, 533 ICMP6_DST_UNREACH_ADDR, 534 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 535 return IPPROTO_DONE; 536 } 537 538 tcp_input(m, *offp, proto); 539 return IPPROTO_DONE; 540 } 541 #endif 542 543 /* 544 * TCP input routine, follows pages 65-76 of the 545 * protocol specification dated September, 1981 very closely. 546 */ 547 void 548 #if __STDC__ 549 tcp_input(struct mbuf *m, ...) 550 #else 551 tcp_input(m, va_alist) 552 struct mbuf *m; 553 #endif 554 { 555 int proto; 556 struct tcphdr *th; 557 struct ip *ip; 558 struct inpcb *inp; 559 #ifdef INET6 560 struct ip6_hdr *ip6; 561 struct in6pcb *in6p; 562 #endif 563 caddr_t optp = NULL; 564 int optlen = 0; 565 int len, tlen, toff, hdroptlen = 0; 566 struct tcpcb *tp = 0; 567 int tiflags; 568 struct socket *so = NULL; 569 int todrop, acked, ourfinisacked, needoutput = 0; 570 short ostate = 0; 571 int iss = 0; 572 u_long tiwin; 573 struct tcp_opt_info opti; 574 int off, iphlen; 575 va_list ap; 576 int af; /* af on the wire */ 577 struct mbuf *tcp_saveti = NULL; 578 579 va_start(ap, m); 580 toff = va_arg(ap, int); 581 proto = va_arg(ap, int); 582 va_end(ap); 583 584 tcpstat.tcps_rcvtotal++; 585 586 bzero(&opti, sizeof(opti)); 587 opti.ts_present = 0; 588 opti.maxseg = 0; 589 590 /* 591 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN. 592 * 593 * TCP is, by definition, unicast, so we reject all 594 * multicast outright. 595 * 596 * Note, there are additional src/dst address checks in 597 * the AF-specific code below. 598 */ 599 if (m->m_flags & (M_BCAST|M_MCAST)) { 600 /* XXX stat */ 601 goto drop; 602 } 603 #ifdef INET6 604 if (m->m_flags & M_ANYCAST6) { 605 /* XXX stat */ 606 goto drop; 607 } 608 #endif 609 610 /* 611 * Get IP and TCP header together in first mbuf. 612 * Note: IP leaves IP header in first mbuf. 613 */ 614 ip = mtod(m, struct ip *); 615 #ifdef INET6 616 ip6 = NULL; 617 #endif 618 switch (ip->ip_v) { 619 case 4: 620 af = AF_INET; 621 iphlen = sizeof(struct ip); 622 #ifndef PULLDOWN_TEST 623 /* would like to get rid of this... */ 624 if (toff > sizeof (struct ip)) { 625 ip_stripoptions(m, (struct mbuf *)0); 626 toff = sizeof(struct ip); 627 } 628 if (m->m_len < toff + sizeof (struct tcphdr)) { 629 if ((m = m_pullup(m, toff + sizeof (struct tcphdr))) == 0) { 630 tcpstat.tcps_rcvshort++; 631 return; 632 } 633 } 634 ip = mtod(m, struct ip *); 635 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 636 #else 637 ip = mtod(m, struct ip *); 638 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 639 sizeof(struct tcphdr)); 640 if (th == NULL) { 641 tcpstat.tcps_rcvshort++; 642 return; 643 } 644 #endif 645 646 /* 647 * Make sure destination address is not multicast. 648 * Source address checked in ip_input(). 649 */ 650 if (IN_MULTICAST(ip->ip_dst.s_addr)) { 651 /* XXX stat */ 652 goto drop; 653 } 654 655 /* We do the checksum after PCB lookup... */ 656 len = ip->ip_len; 657 tlen = len - toff; 658 break; 659 #ifdef INET6 660 case 6: 661 ip = NULL; 662 iphlen = sizeof(struct ip6_hdr); 663 af = AF_INET6; 664 #ifndef PULLDOWN_TEST 665 if (m->m_len < toff + sizeof(struct tcphdr)) { 666 m = m_pullup(m, toff + sizeof(struct tcphdr)); /*XXX*/ 667 if (m == NULL) { 668 tcpstat.tcps_rcvshort++; 669 return; 670 } 671 } 672 ip6 = mtod(m, struct ip6_hdr *); 673 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 674 #else 675 ip6 = mtod(m, struct ip6_hdr *); 676 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 677 sizeof(struct tcphdr)); 678 if (th == NULL) { 679 tcpstat.tcps_rcvshort++; 680 return; 681 } 682 #endif 683 684 /* Be proactive about malicious use of IPv4 mapped address */ 685 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 686 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 687 /* XXX stat */ 688 goto drop; 689 } 690 691 /* 692 * Make sure destination address is not multicast. 693 * Source address checked in ip6_input(). 694 */ 695 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 696 /* XXX stat */ 697 goto drop; 698 } 699 700 /* We do the checksum after PCB lookup... */ 701 len = m->m_pkthdr.len; 702 tlen = len - toff; 703 break; 704 #endif 705 default: 706 m_freem(m); 707 return; 708 } 709 710 /* 711 * Check that TCP offset makes sense, 712 * pull out TCP options and adjust length. XXX 713 */ 714 off = th->th_off << 2; 715 if (off < sizeof (struct tcphdr) || off > tlen) { 716 tcpstat.tcps_rcvbadoff++; 717 goto drop; 718 } 719 tlen -= off; 720 721 /* 722 * tcp_input() has been modified to use tlen to mean the TCP data 723 * length throughout the function. Other functions can use 724 * m->m_pkthdr.len as the basis for calculating the TCP data length. 725 * rja 726 */ 727 728 if (off > sizeof (struct tcphdr)) { 729 #ifndef PULLDOWN_TEST 730 if (m->m_len < toff + off) { 731 if ((m = m_pullup(m, toff + off)) == 0) { 732 tcpstat.tcps_rcvshort++; 733 return; 734 } 735 switch (af) { 736 case AF_INET: 737 ip = mtod(m, struct ip *); 738 break; 739 #ifdef INET6 740 case AF_INET6: 741 ip6 = mtod(m, struct ip6_hdr *); 742 break; 743 #endif 744 } 745 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 746 } 747 #else 748 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off); 749 if (th == NULL) { 750 tcpstat.tcps_rcvshort++; 751 return; 752 } 753 /* 754 * NOTE: ip/ip6 will not be affected by m_pulldown() 755 * (as they're before toff) and we don't need to update those. 756 */ 757 #endif 758 optlen = off - sizeof (struct tcphdr); 759 optp = ((caddr_t)th) + sizeof(struct tcphdr); 760 /* 761 * Do quick retrieval of timestamp options ("options 762 * prediction?"). If timestamp is the only option and it's 763 * formatted as recommended in RFC 1323 appendix A, we 764 * quickly get the values now and not bother calling 765 * tcp_dooptions(), etc. 766 */ 767 if ((optlen == TCPOLEN_TSTAMP_APPA || 768 (optlen > TCPOLEN_TSTAMP_APPA && 769 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 770 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 771 (th->th_flags & TH_SYN) == 0) { 772 opti.ts_present = 1; 773 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 774 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 775 optp = NULL; /* we've parsed the options */ 776 } 777 } 778 tiflags = th->th_flags; 779 780 /* 781 * Locate pcb for segment. 782 */ 783 findpcb: 784 inp = NULL; 785 #ifdef INET6 786 in6p = NULL; 787 #endif 788 switch (af) { 789 case AF_INET: 790 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport, 791 ip->ip_dst, th->th_dport); 792 if (inp == 0) { 793 ++tcpstat.tcps_pcbhashmiss; 794 inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport); 795 } 796 #if defined(INET6) && !defined(TCP6) 797 if (inp == 0) { 798 struct in6_addr s, d; 799 800 /* mapped addr case */ 801 bzero(&s, sizeof(s)); 802 s.s6_addr16[5] = htons(0xffff); 803 bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src)); 804 bzero(&d, sizeof(d)); 805 d.s6_addr16[5] = htons(0xffff); 806 bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst)); 807 in6p = in6_pcblookup_connect(&tcb6, &s, th->th_sport, 808 &d, th->th_dport, 0); 809 if (in6p == 0) { 810 ++tcpstat.tcps_pcbhashmiss; 811 in6p = in6_pcblookup_bind(&tcb6, &d, 812 th->th_dport, 0); 813 } 814 } 815 #endif 816 #ifndef INET6 817 if (inp == 0) 818 #else 819 if (inp == 0 && in6p == 0) 820 #endif 821 { 822 ++tcpstat.tcps_noport; 823 if (tcp_log_refused && (tiflags & TH_SYN)) { 824 #ifndef INET6 825 char src[4*sizeof "123"]; 826 char dst[4*sizeof "123"]; 827 #else 828 char src[INET6_ADDRSTRLEN]; 829 char dst[INET6_ADDRSTRLEN]; 830 #endif 831 if (ip) { 832 strcpy(src, inet_ntoa(ip->ip_src)); 833 strcpy(dst, inet_ntoa(ip->ip_dst)); 834 } 835 #ifdef INET6 836 else if (ip6) { 837 strcpy(src, ip6_sprintf(&ip6->ip6_src)); 838 strcpy(dst, ip6_sprintf(&ip6->ip6_dst)); 839 } 840 #endif 841 else { 842 strcpy(src, "(unknown)"); 843 strcpy(dst, "(unknown)"); 844 } 845 log(LOG_INFO, 846 "Connection attempt to TCP %s:%d from %s:%d\n", 847 dst, ntohs(th->th_dport), 848 src, ntohs(th->th_sport)); 849 } 850 TCP_FIELDS_TO_HOST(th); 851 goto dropwithreset_ratelim; 852 } 853 #ifdef IPSEC 854 if (inp && ipsec4_in_reject(m, inp)) { 855 ipsecstat.in_polvio++; 856 goto drop; 857 } 858 #ifdef INET6 859 else if (in6p && ipsec4_in_reject_so(m, in6p->in6p_socket)) { 860 ipsecstat.in_polvio++; 861 goto drop; 862 } 863 #endif 864 #endif /*IPSEC*/ 865 break; 866 #if defined(INET6) && !defined(TCP6) 867 case AF_INET6: 868 { 869 int faith; 870 871 #if defined(NFAITH) && NFAITH > 0 872 if (m->m_pkthdr.rcvif 873 && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { 874 faith = 1; 875 } else 876 faith = 0; 877 #else 878 faith = 0; 879 #endif 880 in6p = in6_pcblookup_connect(&tcb6, &ip6->ip6_src, th->th_sport, 881 &ip6->ip6_dst, th->th_dport, faith); 882 if (in6p == NULL) { 883 ++tcpstat.tcps_pcbhashmiss; 884 in6p = in6_pcblookup_bind(&tcb6, &ip6->ip6_dst, 885 th->th_dport, faith); 886 } 887 if (in6p == NULL) { 888 ++tcpstat.tcps_noport; 889 TCP_FIELDS_TO_HOST(th); 890 goto dropwithreset_ratelim; 891 } 892 #ifdef IPSEC 893 if (ipsec6_in_reject(m, in6p)) { 894 ipsec6stat.in_polvio++; 895 goto drop; 896 } 897 #endif /*IPSEC*/ 898 break; 899 } 900 #endif 901 } 902 903 /* 904 * If the state is CLOSED (i.e., TCB does not exist) then 905 * all data in the incoming segment is discarded. 906 * If the TCB exists but is in CLOSED state, it is embryonic, 907 * but should either do a listen or a connect soon. 908 */ 909 tp = NULL; 910 so = NULL; 911 if (inp) { 912 tp = intotcpcb(inp); 913 so = inp->inp_socket; 914 } 915 #ifdef INET6 916 else if (in6p) { 917 tp = in6totcpcb(in6p); 918 so = in6p->in6p_socket; 919 } 920 #endif 921 if (tp == 0) { 922 TCP_FIELDS_TO_HOST(th); 923 goto dropwithreset_ratelim; 924 } 925 if (tp->t_state == TCPS_CLOSED) 926 goto drop; 927 928 /* 929 * Checksum extended TCP header and data. 930 */ 931 switch (af) { 932 case AF_INET: 933 #ifndef PULLDOWN_TEST 934 { 935 struct ipovly *ipov; 936 ipov = (struct ipovly *)ip; 937 bzero(ipov->ih_x1, sizeof ipov->ih_x1); 938 ipov->ih_len = htons(tlen + off); 939 940 if (in_cksum(m, len) != 0) { 941 tcpstat.tcps_rcvbadsum++; 942 goto drop; 943 } 944 } 945 #else 946 if (in4_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) { 947 tcpstat.tcps_rcvbadsum++; 948 goto drop; 949 } 950 #endif 951 break; 952 953 #ifdef INET6 954 case AF_INET6: 955 if (in6_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) { 956 tcpstat.tcps_rcvbadsum++; 957 goto drop; 958 } 959 break; 960 #endif 961 } 962 963 TCP_FIELDS_TO_HOST(th); 964 965 /* Unscale the window into a 32-bit value. */ 966 if ((tiflags & TH_SYN) == 0) 967 tiwin = th->th_win << tp->snd_scale; 968 else 969 tiwin = th->th_win; 970 971 #ifdef INET6 972 /* save packet options if user wanted */ 973 if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) { 974 if (in6p->in6p_options) { 975 m_freem(in6p->in6p_options); 976 in6p->in6p_options = 0; 977 } 978 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m); 979 } 980 #endif 981 982 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 983 union syn_cache_sa src; 984 union syn_cache_sa dst; 985 986 bzero(&src, sizeof(src)); 987 bzero(&dst, sizeof(dst)); 988 switch (af) { 989 case AF_INET: 990 src.sin.sin_len = sizeof(struct sockaddr_in); 991 src.sin.sin_family = AF_INET; 992 src.sin.sin_addr = ip->ip_src; 993 src.sin.sin_port = th->th_sport; 994 995 dst.sin.sin_len = sizeof(struct sockaddr_in); 996 dst.sin.sin_family = AF_INET; 997 dst.sin.sin_addr = ip->ip_dst; 998 dst.sin.sin_port = th->th_dport; 999 break; 1000 #ifdef INET6 1001 case AF_INET6: 1002 src.sin6.sin6_len = sizeof(struct sockaddr_in6); 1003 src.sin6.sin6_family = AF_INET6; 1004 src.sin6.sin6_addr = ip6->ip6_src; 1005 src.sin6.sin6_port = th->th_sport; 1006 1007 dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 1008 dst.sin6.sin6_family = AF_INET6; 1009 dst.sin6.sin6_addr = ip6->ip6_dst; 1010 dst.sin6.sin6_port = th->th_dport; 1011 break; 1012 #endif /* INET6 */ 1013 default: 1014 goto badsyn; /*sanity*/ 1015 } 1016 1017 if (so->so_options & SO_DEBUG) { 1018 ostate = tp->t_state; 1019 1020 tcp_saveti = NULL; 1021 if (iphlen + sizeof(struct tcphdr) > MHLEN) 1022 goto nosave; 1023 1024 if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) { 1025 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT); 1026 if (!tcp_saveti) 1027 goto nosave; 1028 } else { 1029 MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER); 1030 if (!tcp_saveti) 1031 goto nosave; 1032 tcp_saveti->m_len = iphlen; 1033 m_copydata(m, 0, iphlen, 1034 mtod(tcp_saveti, caddr_t)); 1035 } 1036 1037 if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) { 1038 m_freem(tcp_saveti); 1039 tcp_saveti = NULL; 1040 } else { 1041 tcp_saveti->m_len += sizeof(struct tcphdr); 1042 bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen, 1043 sizeof(struct tcphdr)); 1044 } 1045 if (tcp_saveti) { 1046 /* 1047 * need to recover version # field, which was 1048 * overwritten on ip_cksum computation. 1049 */ 1050 struct ip *sip; 1051 sip = mtod(tcp_saveti, struct ip *); 1052 switch (af) { 1053 case AF_INET: 1054 sip->ip_v = 4; 1055 break; 1056 #ifdef INET6 1057 case AF_INET6: 1058 sip->ip_v = 6; 1059 break; 1060 #endif 1061 } 1062 } 1063 nosave:; 1064 } 1065 if (so->so_options & SO_ACCEPTCONN) { 1066 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 1067 if (tiflags & TH_RST) { 1068 syn_cache_reset(&src.sa, &dst.sa, th); 1069 } else if ((tiflags & (TH_ACK|TH_SYN)) == 1070 (TH_ACK|TH_SYN)) { 1071 /* 1072 * Received a SYN,ACK. This should 1073 * never happen while we are in 1074 * LISTEN. Send an RST. 1075 */ 1076 goto badsyn; 1077 } else if (tiflags & TH_ACK) { 1078 so = syn_cache_get(&src.sa, &dst.sa, 1079 th, toff, tlen, so, m); 1080 if (so == NULL) { 1081 /* 1082 * We don't have a SYN for 1083 * this ACK; send an RST. 1084 */ 1085 goto badsyn; 1086 } else if (so == 1087 (struct socket *)(-1)) { 1088 /* 1089 * We were unable to create 1090 * the connection. If the 1091 * 3-way handshake was 1092 * completed, and RST has 1093 * been sent to the peer. 1094 * Since the mbuf might be 1095 * in use for the reply, 1096 * do not free it. 1097 */ 1098 m = NULL; 1099 } else { 1100 /* 1101 * We have created a 1102 * full-blown connection. 1103 */ 1104 tp = NULL; 1105 inp = NULL; 1106 #ifdef INET6 1107 in6p = NULL; 1108 #endif 1109 switch (so->so_proto->pr_domain->dom_family) { 1110 case AF_INET: 1111 inp = sotoinpcb(so); 1112 tp = intotcpcb(inp); 1113 break; 1114 #ifdef INET6 1115 case AF_INET6: 1116 in6p = sotoin6pcb(so); 1117 tp = in6totcpcb(in6p); 1118 break; 1119 #endif 1120 } 1121 if (tp == NULL) 1122 goto badsyn; /*XXX*/ 1123 tiwin <<= tp->snd_scale; 1124 goto after_listen; 1125 } 1126 } else { 1127 /* 1128 * None of RST, SYN or ACK was set. 1129 * This is an invalid packet for a 1130 * TCB in LISTEN state. Send a RST. 1131 */ 1132 goto badsyn; 1133 } 1134 } else { 1135 /* 1136 * Received a SYN. 1137 */ 1138 1139 /* 1140 * LISTEN socket received a SYN 1141 * from itself? This can't possibly 1142 * be valid; drop the packet. 1143 */ 1144 if (th->th_sport == th->th_dport) { 1145 int i; 1146 1147 switch (af) { 1148 case AF_INET: 1149 i = in_hosteq(ip->ip_src, ip->ip_dst); 1150 break; 1151 #ifdef INET6 1152 case AF_INET6: 1153 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst); 1154 break; 1155 #endif 1156 default: 1157 i = 1; 1158 } 1159 if (i) { 1160 tcpstat.tcps_badsyn++; 1161 goto drop; 1162 } 1163 } 1164 1165 /* 1166 * SYN looks ok; create compressed TCP 1167 * state for it. 1168 */ 1169 if (so->so_qlen <= so->so_qlimit && 1170 syn_cache_add(&src.sa, &dst.sa, th, tlen, 1171 so, m, optp, optlen, &opti)) 1172 m = NULL; 1173 } 1174 goto drop; 1175 } 1176 } 1177 1178 after_listen: 1179 #ifdef DIAGNOSTIC 1180 /* 1181 * Should not happen now that all embryonic connections 1182 * are handled with compressed state. 1183 */ 1184 if (tp->t_state == TCPS_LISTEN) 1185 panic("tcp_input: TCPS_LISTEN"); 1186 #endif 1187 1188 /* 1189 * Segment received on connection. 1190 * Reset idle time and keep-alive timer. 1191 */ 1192 tp->t_idle = 0; 1193 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1194 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1195 1196 /* 1197 * Process options. 1198 */ 1199 if (optp) 1200 tcp_dooptions(tp, optp, optlen, th, &opti); 1201 1202 /* 1203 * Header prediction: check for the two common cases 1204 * of a uni-directional data xfer. If the packet has 1205 * no control flags, is in-sequence, the window didn't 1206 * change and we're not retransmitting, it's a 1207 * candidate. If the length is zero and the ack moved 1208 * forward, we're the sender side of the xfer. Just 1209 * free the data acked & wake any higher level process 1210 * that was blocked waiting for space. If the length 1211 * is non-zero and the ack didn't move, we're the 1212 * receiver side. If we're getting packets in-order 1213 * (the reassembly queue is empty), add the data to 1214 * the socket buffer and note that we need a delayed ack. 1215 */ 1216 if (tp->t_state == TCPS_ESTABLISHED && 1217 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1218 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1219 th->th_seq == tp->rcv_nxt && 1220 tiwin && tiwin == tp->snd_wnd && 1221 tp->snd_nxt == tp->snd_max) { 1222 1223 /* 1224 * If last ACK falls within this segment's sequence numbers, 1225 * record the timestamp. 1226 */ 1227 if (opti.ts_present && 1228 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1229 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen)) { 1230 tp->ts_recent_age = tcp_now; 1231 tp->ts_recent = opti.ts_val; 1232 } 1233 1234 if (tlen == 0) { 1235 if (SEQ_GT(th->th_ack, tp->snd_una) && 1236 SEQ_LEQ(th->th_ack, tp->snd_max) && 1237 tp->snd_cwnd >= tp->snd_wnd && 1238 tp->t_dupacks < tcprexmtthresh) { 1239 /* 1240 * this is a pure ack for outstanding data. 1241 */ 1242 ++tcpstat.tcps_predack; 1243 if (opti.ts_present && opti.ts_ecr) 1244 tcp_xmit_timer(tp, 1245 tcp_now - opti.ts_ecr + 1); 1246 else if (tp->t_rtt && 1247 SEQ_GT(th->th_ack, tp->t_rtseq)) 1248 tcp_xmit_timer(tp, tp->t_rtt); 1249 acked = th->th_ack - tp->snd_una; 1250 tcpstat.tcps_rcvackpack++; 1251 tcpstat.tcps_rcvackbyte += acked; 1252 ND6_HINT(tp); 1253 sbdrop(&so->so_snd, acked); 1254 /* 1255 * We want snd_recover to track snd_una to 1256 * avoid sequence wraparound problems for 1257 * very large transfers. 1258 */ 1259 tp->snd_una = tp->snd_recover = th->th_ack; 1260 m_freem(m); 1261 1262 /* 1263 * If all outstanding data are acked, stop 1264 * retransmit timer, otherwise restart timer 1265 * using current (possibly backed-off) value. 1266 * If process is waiting for space, 1267 * wakeup/selwakeup/signal. If data 1268 * are ready to send, let tcp_output 1269 * decide between more output or persist. 1270 */ 1271 if (tp->snd_una == tp->snd_max) 1272 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1273 else if (TCP_TIMER_ISARMED(tp, 1274 TCPT_PERSIST) == 0) 1275 TCP_TIMER_ARM(tp, TCPT_REXMT, 1276 tp->t_rxtcur); 1277 1278 sowwakeup(so); 1279 if (so->so_snd.sb_cc) 1280 (void) tcp_output(tp); 1281 if (tcp_saveti) 1282 m_freem(tcp_saveti); 1283 return; 1284 } 1285 } else if (th->th_ack == tp->snd_una && 1286 tp->segq.lh_first == NULL && 1287 tlen <= sbspace(&so->so_rcv)) { 1288 /* 1289 * this is a pure, in-sequence data packet 1290 * with nothing on the reassembly queue and 1291 * we have enough buffer space to take it. 1292 */ 1293 ++tcpstat.tcps_preddat; 1294 tp->rcv_nxt += tlen; 1295 tcpstat.tcps_rcvpack++; 1296 tcpstat.tcps_rcvbyte += tlen; 1297 ND6_HINT(tp); 1298 /* 1299 * Drop TCP, IP headers and TCP options then add data 1300 * to socket buffer. 1301 */ 1302 m_adj(m, toff + off); 1303 sbappend(&so->so_rcv, m); 1304 sorwakeup(so); 1305 TCP_SETUP_ACK(tp, th); 1306 if (tp->t_flags & TF_ACKNOW) 1307 (void) tcp_output(tp); 1308 if (tcp_saveti) 1309 m_freem(tcp_saveti); 1310 return; 1311 } 1312 } 1313 1314 /* 1315 * Compute mbuf offset to TCP data segment. 1316 */ 1317 hdroptlen = toff + off; 1318 1319 /* 1320 * Calculate amount of space in receive window, 1321 * and then do TCP input processing. 1322 * Receive window is amount of space in rcv queue, 1323 * but not less than advertised window. 1324 */ 1325 { int win; 1326 1327 win = sbspace(&so->so_rcv); 1328 if (win < 0) 1329 win = 0; 1330 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1331 } 1332 1333 switch (tp->t_state) { 1334 1335 /* 1336 * If the state is SYN_SENT: 1337 * if seg contains an ACK, but not for our SYN, drop the input. 1338 * if seg contains a RST, then drop the connection. 1339 * if seg does not contain SYN, then drop it. 1340 * Otherwise this is an acceptable SYN segment 1341 * initialize tp->rcv_nxt and tp->irs 1342 * if seg contains ack then advance tp->snd_una 1343 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1344 * arrange for segment to be acked (eventually) 1345 * continue processing rest of data/controls, beginning with URG 1346 */ 1347 case TCPS_SYN_SENT: 1348 if ((tiflags & TH_ACK) && 1349 (SEQ_LEQ(th->th_ack, tp->iss) || 1350 SEQ_GT(th->th_ack, tp->snd_max))) 1351 goto dropwithreset; 1352 if (tiflags & TH_RST) { 1353 if (tiflags & TH_ACK) 1354 tp = tcp_drop(tp, ECONNREFUSED); 1355 goto drop; 1356 } 1357 if ((tiflags & TH_SYN) == 0) 1358 goto drop; 1359 if (tiflags & TH_ACK) { 1360 tp->snd_una = tp->snd_recover = th->th_ack; 1361 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1362 tp->snd_nxt = tp->snd_una; 1363 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1364 } 1365 tp->irs = th->th_seq; 1366 tcp_rcvseqinit(tp); 1367 tp->t_flags |= TF_ACKNOW; 1368 tcp_mss_from_peer(tp, opti.maxseg); 1369 1370 /* 1371 * Initialize the initial congestion window. If we 1372 * had to retransmit the SYN, we must initialize cwnd 1373 * to 1 segment (i.e. the Loss Window). 1374 */ 1375 if (tp->t_flags & TF_SYN_REXMT) 1376 tp->snd_cwnd = tp->t_peermss; 1377 else 1378 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, 1379 tp->t_peermss); 1380 1381 tcp_rmx_rtt(tp); 1382 if (tiflags & TH_ACK) { 1383 tcpstat.tcps_connects++; 1384 soisconnected(so); 1385 tcp_established(tp); 1386 /* Do window scaling on this connection? */ 1387 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1388 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1389 tp->snd_scale = tp->requested_s_scale; 1390 tp->rcv_scale = tp->request_r_scale; 1391 } 1392 TCP_REASS_LOCK(tp); 1393 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1394 TCP_REASS_UNLOCK(tp); 1395 /* 1396 * if we didn't have to retransmit the SYN, 1397 * use its rtt as our initial srtt & rtt var. 1398 */ 1399 if (tp->t_rtt) 1400 tcp_xmit_timer(tp, tp->t_rtt); 1401 } else 1402 tp->t_state = TCPS_SYN_RECEIVED; 1403 1404 /* 1405 * Advance th->th_seq to correspond to first data byte. 1406 * If data, trim to stay within window, 1407 * dropping FIN if necessary. 1408 */ 1409 th->th_seq++; 1410 if (tlen > tp->rcv_wnd) { 1411 todrop = tlen - tp->rcv_wnd; 1412 m_adj(m, -todrop); 1413 tlen = tp->rcv_wnd; 1414 tiflags &= ~TH_FIN; 1415 tcpstat.tcps_rcvpackafterwin++; 1416 tcpstat.tcps_rcvbyteafterwin += todrop; 1417 } 1418 tp->snd_wl1 = th->th_seq - 1; 1419 tp->rcv_up = th->th_seq; 1420 goto step6; 1421 1422 /* 1423 * If the state is SYN_RECEIVED: 1424 * If seg contains an ACK, but not for our SYN, drop the input 1425 * and generate an RST. See page 36, rfc793 1426 */ 1427 case TCPS_SYN_RECEIVED: 1428 if ((tiflags & TH_ACK) && 1429 (SEQ_LEQ(th->th_ack, tp->iss) || 1430 SEQ_GT(th->th_ack, tp->snd_max))) 1431 goto dropwithreset; 1432 break; 1433 } 1434 1435 /* 1436 * States other than LISTEN or SYN_SENT. 1437 * First check timestamp, if present. 1438 * Then check that at least some bytes of segment are within 1439 * receive window. If segment begins before rcv_nxt, 1440 * drop leading data (and SYN); if nothing left, just ack. 1441 * 1442 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1443 * and it's less than ts_recent, drop it. 1444 */ 1445 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1446 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1447 1448 /* Check to see if ts_recent is over 24 days old. */ 1449 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1450 /* 1451 * Invalidate ts_recent. If this segment updates 1452 * ts_recent, the age will be reset later and ts_recent 1453 * will get a valid value. If it does not, setting 1454 * ts_recent to zero will at least satisfy the 1455 * requirement that zero be placed in the timestamp 1456 * echo reply when ts_recent isn't valid. The 1457 * age isn't reset until we get a valid ts_recent 1458 * because we don't want out-of-order segments to be 1459 * dropped when ts_recent is old. 1460 */ 1461 tp->ts_recent = 0; 1462 } else { 1463 tcpstat.tcps_rcvduppack++; 1464 tcpstat.tcps_rcvdupbyte += tlen; 1465 tcpstat.tcps_pawsdrop++; 1466 goto dropafterack; 1467 } 1468 } 1469 1470 todrop = tp->rcv_nxt - th->th_seq; 1471 if (todrop > 0) { 1472 if (tiflags & TH_SYN) { 1473 tiflags &= ~TH_SYN; 1474 th->th_seq++; 1475 if (th->th_urp > 1) 1476 th->th_urp--; 1477 else { 1478 tiflags &= ~TH_URG; 1479 th->th_urp = 0; 1480 } 1481 todrop--; 1482 } 1483 if (todrop > tlen || 1484 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1485 /* 1486 * Any valid FIN must be to the left of the window. 1487 * At this point the FIN must be a duplicate or 1488 * out of sequence; drop it. 1489 */ 1490 tiflags &= ~TH_FIN; 1491 /* 1492 * Send an ACK to resynchronize and drop any data. 1493 * But keep on processing for RST or ACK. 1494 */ 1495 tp->t_flags |= TF_ACKNOW; 1496 todrop = tlen; 1497 tcpstat.tcps_rcvdupbyte += todrop; 1498 tcpstat.tcps_rcvduppack++; 1499 } else { 1500 tcpstat.tcps_rcvpartduppack++; 1501 tcpstat.tcps_rcvpartdupbyte += todrop; 1502 } 1503 hdroptlen += todrop; /*drop from head afterwards*/ 1504 th->th_seq += todrop; 1505 tlen -= todrop; 1506 if (th->th_urp > todrop) 1507 th->th_urp -= todrop; 1508 else { 1509 tiflags &= ~TH_URG; 1510 th->th_urp = 0; 1511 } 1512 } 1513 1514 /* 1515 * If new data are received on a connection after the 1516 * user processes are gone, then RST the other end. 1517 */ 1518 if ((so->so_state & SS_NOFDREF) && 1519 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1520 tp = tcp_close(tp); 1521 tcpstat.tcps_rcvafterclose++; 1522 goto dropwithreset; 1523 } 1524 1525 /* 1526 * If segment ends after window, drop trailing data 1527 * (and PUSH and FIN); if nothing left, just ACK. 1528 */ 1529 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1530 if (todrop > 0) { 1531 tcpstat.tcps_rcvpackafterwin++; 1532 if (todrop >= tlen) { 1533 tcpstat.tcps_rcvbyteafterwin += tlen; 1534 /* 1535 * If a new connection request is received 1536 * while in TIME_WAIT, drop the old connection 1537 * and start over if the sequence numbers 1538 * are above the previous ones. 1539 */ 1540 if (tiflags & TH_SYN && 1541 tp->t_state == TCPS_TIME_WAIT && 1542 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1543 iss = tcp_new_iss(tp, sizeof(struct tcpcb), 1544 tp->snd_nxt); 1545 tp = tcp_close(tp); 1546 goto findpcb; 1547 } 1548 /* 1549 * If window is closed can only take segments at 1550 * window edge, and have to drop data and PUSH from 1551 * incoming segments. Continue processing, but 1552 * remember to ack. Otherwise, drop segment 1553 * and ack. 1554 */ 1555 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1556 tp->t_flags |= TF_ACKNOW; 1557 tcpstat.tcps_rcvwinprobe++; 1558 } else 1559 goto dropafterack; 1560 } else 1561 tcpstat.tcps_rcvbyteafterwin += todrop; 1562 m_adj(m, -todrop); 1563 tlen -= todrop; 1564 tiflags &= ~(TH_PUSH|TH_FIN); 1565 } 1566 1567 /* 1568 * If last ACK falls within this segment's sequence numbers, 1569 * and the timestamp is newer, record it. 1570 */ 1571 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1572 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1573 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen + 1574 ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 1575 tp->ts_recent_age = tcp_now; 1576 tp->ts_recent = opti.ts_val; 1577 } 1578 1579 /* 1580 * If the RST bit is set examine the state: 1581 * SYN_RECEIVED STATE: 1582 * If passive open, return to LISTEN state. 1583 * If active open, inform user that connection was refused. 1584 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1585 * Inform user that connection was reset, and close tcb. 1586 * CLOSING, LAST_ACK, TIME_WAIT STATES 1587 * Close the tcb. 1588 */ 1589 if (tiflags&TH_RST) switch (tp->t_state) { 1590 1591 case TCPS_SYN_RECEIVED: 1592 so->so_error = ECONNREFUSED; 1593 goto close; 1594 1595 case TCPS_ESTABLISHED: 1596 case TCPS_FIN_WAIT_1: 1597 case TCPS_FIN_WAIT_2: 1598 case TCPS_CLOSE_WAIT: 1599 so->so_error = ECONNRESET; 1600 close: 1601 tp->t_state = TCPS_CLOSED; 1602 tcpstat.tcps_drops++; 1603 tp = tcp_close(tp); 1604 goto drop; 1605 1606 case TCPS_CLOSING: 1607 case TCPS_LAST_ACK: 1608 case TCPS_TIME_WAIT: 1609 tp = tcp_close(tp); 1610 goto drop; 1611 } 1612 1613 /* 1614 * If a SYN is in the window, then this is an 1615 * error and we send an RST and drop the connection. 1616 */ 1617 if (tiflags & TH_SYN) { 1618 tp = tcp_drop(tp, ECONNRESET); 1619 goto dropwithreset; 1620 } 1621 1622 /* 1623 * If the ACK bit is off we drop the segment and return. 1624 */ 1625 if ((tiflags & TH_ACK) == 0) { 1626 if (tp->t_flags & TF_ACKNOW) 1627 goto dropafterack; 1628 else 1629 goto drop; 1630 } 1631 1632 /* 1633 * Ack processing. 1634 */ 1635 switch (tp->t_state) { 1636 1637 /* 1638 * In SYN_RECEIVED state if the ack ACKs our SYN then enter 1639 * ESTABLISHED state and continue processing, otherwise 1640 * send an RST. 1641 */ 1642 case TCPS_SYN_RECEIVED: 1643 if (SEQ_GT(tp->snd_una, th->th_ack) || 1644 SEQ_GT(th->th_ack, tp->snd_max)) 1645 goto dropwithreset; 1646 tcpstat.tcps_connects++; 1647 soisconnected(so); 1648 tcp_established(tp); 1649 /* Do window scaling? */ 1650 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1651 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1652 tp->snd_scale = tp->requested_s_scale; 1653 tp->rcv_scale = tp->request_r_scale; 1654 } 1655 TCP_REASS_LOCK(tp); 1656 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1657 TCP_REASS_UNLOCK(tp); 1658 tp->snd_wl1 = th->th_seq - 1; 1659 /* fall into ... */ 1660 1661 /* 1662 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1663 * ACKs. If the ack is in the range 1664 * tp->snd_una < th->th_ack <= tp->snd_max 1665 * then advance tp->snd_una to th->th_ack and drop 1666 * data from the retransmission queue. If this ACK reflects 1667 * more up to date window information we update our window information. 1668 */ 1669 case TCPS_ESTABLISHED: 1670 case TCPS_FIN_WAIT_1: 1671 case TCPS_FIN_WAIT_2: 1672 case TCPS_CLOSE_WAIT: 1673 case TCPS_CLOSING: 1674 case TCPS_LAST_ACK: 1675 case TCPS_TIME_WAIT: 1676 1677 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1678 if (tlen == 0 && tiwin == tp->snd_wnd) { 1679 tcpstat.tcps_rcvdupack++; 1680 /* 1681 * If we have outstanding data (other than 1682 * a window probe), this is a completely 1683 * duplicate ack (ie, window info didn't 1684 * change), the ack is the biggest we've 1685 * seen and we've seen exactly our rexmt 1686 * threshhold of them, assume a packet 1687 * has been dropped and retransmit it. 1688 * Kludge snd_nxt & the congestion 1689 * window so we send only this one 1690 * packet. 1691 * 1692 * We know we're losing at the current 1693 * window size so do congestion avoidance 1694 * (set ssthresh to half the current window 1695 * and pull our congestion window back to 1696 * the new ssthresh). 1697 * 1698 * Dup acks mean that packets have left the 1699 * network (they're now cached at the receiver) 1700 * so bump cwnd by the amount in the receiver 1701 * to keep a constant cwnd packets in the 1702 * network. 1703 */ 1704 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 || 1705 th->th_ack != tp->snd_una) 1706 tp->t_dupacks = 0; 1707 else if (++tp->t_dupacks == tcprexmtthresh) { 1708 tcp_seq onxt = tp->snd_nxt; 1709 u_int win = 1710 min(tp->snd_wnd, tp->snd_cwnd) / 1711 2 / tp->t_segsz; 1712 if (tcp_do_newreno && SEQ_LT(th->th_ack, 1713 tp->snd_recover)) { 1714 /* 1715 * False fast retransmit after 1716 * timeout. Do not cut window. 1717 */ 1718 tp->snd_cwnd += tp->t_segsz; 1719 tp->t_dupacks = 0; 1720 (void) tcp_output(tp); 1721 goto drop; 1722 } 1723 1724 if (win < 2) 1725 win = 2; 1726 tp->snd_ssthresh = win * tp->t_segsz; 1727 tp->snd_recover = tp->snd_max; 1728 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1729 tp->t_rtt = 0; 1730 tp->snd_nxt = th->th_ack; 1731 tp->snd_cwnd = tp->t_segsz; 1732 (void) tcp_output(tp); 1733 tp->snd_cwnd = tp->snd_ssthresh + 1734 tp->t_segsz * tp->t_dupacks; 1735 if (SEQ_GT(onxt, tp->snd_nxt)) 1736 tp->snd_nxt = onxt; 1737 goto drop; 1738 } else if (tp->t_dupacks > tcprexmtthresh) { 1739 tp->snd_cwnd += tp->t_segsz; 1740 (void) tcp_output(tp); 1741 goto drop; 1742 } 1743 } else 1744 tp->t_dupacks = 0; 1745 break; 1746 } 1747 /* 1748 * If the congestion window was inflated to account 1749 * for the other side's cached packets, retract it. 1750 */ 1751 if (tcp_do_newreno == 0) { 1752 if (tp->t_dupacks >= tcprexmtthresh && 1753 tp->snd_cwnd > tp->snd_ssthresh) 1754 tp->snd_cwnd = tp->snd_ssthresh; 1755 tp->t_dupacks = 0; 1756 } else if (tp->t_dupacks >= tcprexmtthresh && 1757 tcp_newreno(tp, th) == 0) { 1758 tp->snd_cwnd = tp->snd_ssthresh; 1759 /* 1760 * Window inflation should have left us with approx. 1761 * snd_ssthresh outstanding data. But in case we 1762 * would be inclined to send a burst, better to do 1763 * it via the slow start mechanism. 1764 */ 1765 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 1766 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 1767 + tp->t_segsz; 1768 tp->t_dupacks = 0; 1769 } 1770 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1771 tcpstat.tcps_rcvacktoomuch++; 1772 goto dropafterack; 1773 } 1774 acked = th->th_ack - tp->snd_una; 1775 tcpstat.tcps_rcvackpack++; 1776 tcpstat.tcps_rcvackbyte += acked; 1777 1778 /* 1779 * If we have a timestamp reply, update smoothed 1780 * round trip time. If no timestamp is present but 1781 * transmit timer is running and timed sequence 1782 * number was acked, update smoothed round trip time. 1783 * Since we now have an rtt measurement, cancel the 1784 * timer backoff (cf., Phil Karn's retransmit alg.). 1785 * Recompute the initial retransmit timer. 1786 */ 1787 if (opti.ts_present && opti.ts_ecr) 1788 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr + 1); 1789 else if (tp->t_rtt && SEQ_GT(th->th_ack, tp->t_rtseq)) 1790 tcp_xmit_timer(tp,tp->t_rtt); 1791 1792 /* 1793 * If all outstanding data is acked, stop retransmit 1794 * timer and remember to restart (more output or persist). 1795 * If there is more data to be acked, restart retransmit 1796 * timer, using current (possibly backed-off) value. 1797 */ 1798 if (th->th_ack == tp->snd_max) { 1799 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1800 needoutput = 1; 1801 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1802 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1803 /* 1804 * When new data is acked, open the congestion window. 1805 * If the window gives us less than ssthresh packets 1806 * in flight, open exponentially (segsz per packet). 1807 * Otherwise open linearly: segsz per window 1808 * (segsz^2 / cwnd per packet), plus a constant 1809 * fraction of a packet (segsz/8) to help larger windows 1810 * open quickly enough. 1811 */ 1812 { 1813 u_int cw = tp->snd_cwnd; 1814 u_int incr = tp->t_segsz; 1815 1816 if (cw > tp->snd_ssthresh) 1817 incr = incr * incr / cw; 1818 if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover)) 1819 tp->snd_cwnd = min(cw + incr, 1820 TCP_MAXWIN << tp->snd_scale); 1821 } 1822 ND6_HINT(tp); 1823 if (acked > so->so_snd.sb_cc) { 1824 tp->snd_wnd -= so->so_snd.sb_cc; 1825 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1826 ourfinisacked = 1; 1827 } else { 1828 sbdrop(&so->so_snd, acked); 1829 tp->snd_wnd -= acked; 1830 ourfinisacked = 0; 1831 } 1832 sowwakeup(so); 1833 /* 1834 * We want snd_recover to track snd_una to 1835 * avoid sequence wraparound problems for 1836 * very large transfers. 1837 */ 1838 tp->snd_una = tp->snd_recover = th->th_ack; 1839 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1840 tp->snd_nxt = tp->snd_una; 1841 1842 switch (tp->t_state) { 1843 1844 /* 1845 * In FIN_WAIT_1 STATE in addition to the processing 1846 * for the ESTABLISHED state if our FIN is now acknowledged 1847 * then enter FIN_WAIT_2. 1848 */ 1849 case TCPS_FIN_WAIT_1: 1850 if (ourfinisacked) { 1851 /* 1852 * If we can't receive any more 1853 * data, then closing user can proceed. 1854 * Starting the timer is contrary to the 1855 * specification, but if we don't get a FIN 1856 * we'll hang forever. 1857 */ 1858 if (so->so_state & SS_CANTRCVMORE) { 1859 soisdisconnected(so); 1860 if (tcp_maxidle > 0) 1861 TCP_TIMER_ARM(tp, TCPT_2MSL, 1862 tcp_maxidle); 1863 } 1864 tp->t_state = TCPS_FIN_WAIT_2; 1865 } 1866 break; 1867 1868 /* 1869 * In CLOSING STATE in addition to the processing for 1870 * the ESTABLISHED state if the ACK acknowledges our FIN 1871 * then enter the TIME-WAIT state, otherwise ignore 1872 * the segment. 1873 */ 1874 case TCPS_CLOSING: 1875 if (ourfinisacked) { 1876 tp->t_state = TCPS_TIME_WAIT; 1877 tcp_canceltimers(tp); 1878 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1879 soisdisconnected(so); 1880 } 1881 break; 1882 1883 /* 1884 * In LAST_ACK, we may still be waiting for data to drain 1885 * and/or to be acked, as well as for the ack of our FIN. 1886 * If our FIN is now acknowledged, delete the TCB, 1887 * enter the closed state and return. 1888 */ 1889 case TCPS_LAST_ACK: 1890 if (ourfinisacked) { 1891 tp = tcp_close(tp); 1892 goto drop; 1893 } 1894 break; 1895 1896 /* 1897 * In TIME_WAIT state the only thing that should arrive 1898 * is a retransmission of the remote FIN. Acknowledge 1899 * it and restart the finack timer. 1900 */ 1901 case TCPS_TIME_WAIT: 1902 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1903 goto dropafterack; 1904 } 1905 } 1906 1907 step6: 1908 /* 1909 * Update window information. 1910 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1911 */ 1912 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 1913 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) || 1914 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) { 1915 /* keep track of pure window updates */ 1916 if (tlen == 0 && 1917 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1918 tcpstat.tcps_rcvwinupd++; 1919 tp->snd_wnd = tiwin; 1920 tp->snd_wl1 = th->th_seq; 1921 tp->snd_wl2 = th->th_ack; 1922 if (tp->snd_wnd > tp->max_sndwnd) 1923 tp->max_sndwnd = tp->snd_wnd; 1924 needoutput = 1; 1925 } 1926 1927 /* 1928 * Process segments with URG. 1929 */ 1930 if ((tiflags & TH_URG) && th->th_urp && 1931 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1932 /* 1933 * This is a kludge, but if we receive and accept 1934 * random urgent pointers, we'll crash in 1935 * soreceive. It's hard to imagine someone 1936 * actually wanting to send this much urgent data. 1937 */ 1938 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1939 th->th_urp = 0; /* XXX */ 1940 tiflags &= ~TH_URG; /* XXX */ 1941 goto dodata; /* XXX */ 1942 } 1943 /* 1944 * If this segment advances the known urgent pointer, 1945 * then mark the data stream. This should not happen 1946 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1947 * a FIN has been received from the remote side. 1948 * In these states we ignore the URG. 1949 * 1950 * According to RFC961 (Assigned Protocols), 1951 * the urgent pointer points to the last octet 1952 * of urgent data. We continue, however, 1953 * to consider it to indicate the first octet 1954 * of data past the urgent section as the original 1955 * spec states (in one of two places). 1956 */ 1957 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1958 tp->rcv_up = th->th_seq + th->th_urp; 1959 so->so_oobmark = so->so_rcv.sb_cc + 1960 (tp->rcv_up - tp->rcv_nxt) - 1; 1961 if (so->so_oobmark == 0) 1962 so->so_state |= SS_RCVATMARK; 1963 sohasoutofband(so); 1964 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1965 } 1966 /* 1967 * Remove out of band data so doesn't get presented to user. 1968 * This can happen independent of advancing the URG pointer, 1969 * but if two URG's are pending at once, some out-of-band 1970 * data may creep in... ick. 1971 */ 1972 if (th->th_urp <= (u_int16_t) tlen 1973 #ifdef SO_OOBINLINE 1974 && (so->so_options & SO_OOBINLINE) == 0 1975 #endif 1976 ) 1977 tcp_pulloutofband(so, th, m, hdroptlen); 1978 } else 1979 /* 1980 * If no out of band data is expected, 1981 * pull receive urgent pointer along 1982 * with the receive window. 1983 */ 1984 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1985 tp->rcv_up = tp->rcv_nxt; 1986 dodata: /* XXX */ 1987 1988 /* 1989 * Process the segment text, merging it into the TCP sequencing queue, 1990 * and arranging for acknowledgement of receipt if necessary. 1991 * This process logically involves adjusting tp->rcv_wnd as data 1992 * is presented to the user (this happens in tcp_usrreq.c, 1993 * case PRU_RCVD). If a FIN has already been received on this 1994 * connection then we just ignore the text. 1995 */ 1996 if ((tlen || (tiflags & TH_FIN)) && 1997 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1998 /* 1999 * Insert segment ti into reassembly queue of tcp with 2000 * control block tp. Return TH_FIN if reassembly now includes 2001 * a segment with FIN. The macro form does the common case 2002 * inline (segment is the next to be received on an 2003 * established connection, and the queue is empty), 2004 * avoiding linkage into and removal from the queue and 2005 * repetition of various conversions. 2006 * Set DELACK for segments received in order, but ack 2007 * immediately when segments are out of order 2008 * (so fast retransmit can work). 2009 */ 2010 /* NOTE: this was TCP_REASS() macro, but used only once */ 2011 TCP_REASS_LOCK(tp); 2012 if (th->th_seq == tp->rcv_nxt && 2013 tp->segq.lh_first == NULL && 2014 tp->t_state == TCPS_ESTABLISHED) { 2015 TCP_SETUP_ACK(tp, th); 2016 tp->rcv_nxt += tlen; 2017 tiflags = th->th_flags & TH_FIN; 2018 tcpstat.tcps_rcvpack++; 2019 tcpstat.tcps_rcvbyte += tlen; 2020 ND6_HINT(tp); 2021 m_adj(m, hdroptlen); 2022 sbappend(&(so)->so_rcv, m); 2023 sorwakeup(so); 2024 } else { 2025 m_adj(m, hdroptlen); 2026 tiflags = tcp_reass(tp, th, m, &tlen); 2027 tp->t_flags |= TF_ACKNOW; 2028 } 2029 TCP_REASS_UNLOCK(tp); 2030 2031 /* 2032 * Note the amount of data that peer has sent into 2033 * our window, in order to estimate the sender's 2034 * buffer size. 2035 */ 2036 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2037 } else { 2038 m_freem(m); 2039 m = NULL; 2040 tiflags &= ~TH_FIN; 2041 } 2042 2043 /* 2044 * If FIN is received ACK the FIN and let the user know 2045 * that the connection is closing. Ignore a FIN received before 2046 * the connection is fully established. 2047 */ 2048 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2049 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2050 socantrcvmore(so); 2051 tp->t_flags |= TF_ACKNOW; 2052 tp->rcv_nxt++; 2053 } 2054 switch (tp->t_state) { 2055 2056 /* 2057 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2058 */ 2059 case TCPS_ESTABLISHED: 2060 tp->t_state = TCPS_CLOSE_WAIT; 2061 break; 2062 2063 /* 2064 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2065 * enter the CLOSING state. 2066 */ 2067 case TCPS_FIN_WAIT_1: 2068 tp->t_state = TCPS_CLOSING; 2069 break; 2070 2071 /* 2072 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2073 * starting the time-wait timer, turning off the other 2074 * standard timers. 2075 */ 2076 case TCPS_FIN_WAIT_2: 2077 tp->t_state = TCPS_TIME_WAIT; 2078 tcp_canceltimers(tp); 2079 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2080 soisdisconnected(so); 2081 break; 2082 2083 /* 2084 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2085 */ 2086 case TCPS_TIME_WAIT: 2087 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2088 break; 2089 } 2090 } 2091 if (so->so_options & SO_DEBUG) { 2092 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); 2093 } 2094 2095 /* 2096 * Return any desired output. 2097 */ 2098 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2099 (void) tcp_output(tp); 2100 if (tcp_saveti) 2101 m_freem(tcp_saveti); 2102 return; 2103 2104 badsyn: 2105 /* 2106 * Received a bad SYN. Increment counters and dropwithreset. 2107 */ 2108 tcpstat.tcps_badsyn++; 2109 tp = NULL; 2110 goto dropwithreset; 2111 2112 dropafterack: 2113 /* 2114 * Generate an ACK dropping incoming segment if it occupies 2115 * sequence space, where the ACK reflects our state. 2116 */ 2117 if (tiflags & TH_RST) 2118 goto drop; 2119 m_freem(m); 2120 tp->t_flags |= TF_ACKNOW; 2121 (void) tcp_output(tp); 2122 if (tcp_saveti) 2123 m_freem(tcp_saveti); 2124 return; 2125 2126 dropwithreset_ratelim: 2127 /* 2128 * We may want to rate-limit RSTs in certain situations, 2129 * particularly if we are sending an RST in response to 2130 * an attempt to connect to or otherwise communicate with 2131 * a port for which we have no socket. 2132 */ 2133 if (ratecheck(&tcp_rst_ratelim_last, &tcp_rst_ratelim) == 0) { 2134 /* XXX stat */ 2135 goto drop; 2136 } 2137 /* ...fall into dropwithreset... */ 2138 2139 dropwithreset: 2140 /* 2141 * Generate a RST, dropping incoming segment. 2142 * Make ACK acceptable to originator of segment. 2143 */ 2144 if (tiflags & TH_RST) 2145 goto drop; 2146 { 2147 /* 2148 * need to recover version # field, which was overwritten on 2149 * ip_cksum computation. 2150 */ 2151 struct ip *sip; 2152 sip = mtod(m, struct ip *); 2153 switch (af) { 2154 case AF_INET: 2155 sip->ip_v = 4; 2156 break; 2157 #ifdef INET6 2158 case AF_INET6: 2159 sip->ip_v = 6; 2160 break; 2161 #endif 2162 } 2163 } 2164 if (tiflags & TH_ACK) 2165 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 2166 else { 2167 if (tiflags & TH_SYN) 2168 tlen++; 2169 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, 2170 TH_RST|TH_ACK); 2171 } 2172 if (tcp_saveti) 2173 m_freem(tcp_saveti); 2174 return; 2175 2176 drop: 2177 /* 2178 * Drop space held by incoming segment and return. 2179 */ 2180 if (tp) { 2181 if (tp->t_inpcb) 2182 so = tp->t_inpcb->inp_socket; 2183 #ifdef INET6 2184 else if (tp->t_in6pcb) 2185 so = tp->t_in6pcb->in6p_socket; 2186 #endif 2187 else 2188 so = NULL; 2189 if (so && (so->so_options & SO_DEBUG) != 0) 2190 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); 2191 } 2192 if (tcp_saveti) 2193 m_freem(tcp_saveti); 2194 m_freem(m); 2195 return; 2196 } 2197 2198 void 2199 tcp_dooptions(tp, cp, cnt, th, oi) 2200 struct tcpcb *tp; 2201 u_char *cp; 2202 int cnt; 2203 struct tcphdr *th; 2204 struct tcp_opt_info *oi; 2205 { 2206 u_int16_t mss; 2207 int opt, optlen; 2208 2209 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2210 opt = cp[0]; 2211 if (opt == TCPOPT_EOL) 2212 break; 2213 if (opt == TCPOPT_NOP) 2214 optlen = 1; 2215 else { 2216 if (cnt < 2) 2217 break; 2218 optlen = cp[1]; 2219 if (optlen < 2 || optlen > cnt) 2220 break; 2221 } 2222 switch (opt) { 2223 2224 default: 2225 continue; 2226 2227 case TCPOPT_MAXSEG: 2228 if (optlen != TCPOLEN_MAXSEG) 2229 continue; 2230 if (!(th->th_flags & TH_SYN)) 2231 continue; 2232 bcopy(cp + 2, &mss, sizeof(mss)); 2233 oi->maxseg = ntohs(mss); 2234 break; 2235 2236 case TCPOPT_WINDOW: 2237 if (optlen != TCPOLEN_WINDOW) 2238 continue; 2239 if (!(th->th_flags & TH_SYN)) 2240 continue; 2241 tp->t_flags |= TF_RCVD_SCALE; 2242 tp->requested_s_scale = cp[2]; 2243 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { 2244 #if 0 /*XXX*/ 2245 char *p; 2246 2247 if (ip) 2248 p = ntohl(ip->ip_src); 2249 #ifdef INET6 2250 else if (ip6) 2251 p = ip6_sprintf(&ip6->ip6_src); 2252 #endif 2253 else 2254 p = "(unknown)"; 2255 log(LOG_ERR, "TCP: invalid wscale %d from %s, " 2256 "assuming %d\n", 2257 tp->requested_s_scale, p, 2258 TCP_MAX_WINSHIFT); 2259 #else 2260 log(LOG_ERR, "TCP: invalid wscale %d, " 2261 "assuming %d\n", 2262 tp->requested_s_scale, 2263 TCP_MAX_WINSHIFT); 2264 #endif 2265 tp->requested_s_scale = TCP_MAX_WINSHIFT; 2266 } 2267 break; 2268 2269 case TCPOPT_TIMESTAMP: 2270 if (optlen != TCPOLEN_TIMESTAMP) 2271 continue; 2272 oi->ts_present = 1; 2273 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2274 NTOHL(oi->ts_val); 2275 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2276 NTOHL(oi->ts_ecr); 2277 2278 /* 2279 * A timestamp received in a SYN makes 2280 * it ok to send timestamp requests and replies. 2281 */ 2282 if (th->th_flags & TH_SYN) { 2283 tp->t_flags |= TF_RCVD_TSTMP; 2284 tp->ts_recent = oi->ts_val; 2285 tp->ts_recent_age = tcp_now; 2286 } 2287 break; 2288 case TCPOPT_SACK_PERMITTED: 2289 if (optlen != TCPOLEN_SACK_PERMITTED) 2290 continue; 2291 if (!(th->th_flags & TH_SYN)) 2292 continue; 2293 tp->t_flags &= ~TF_CANT_TXSACK; 2294 break; 2295 2296 case TCPOPT_SACK: 2297 if (tp->t_flags & TF_IGNR_RXSACK) 2298 continue; 2299 if (optlen % 8 != 2 || optlen < 10) 2300 continue; 2301 cp += 2; 2302 optlen -= 2; 2303 for (; optlen > 0; cp -= 8, optlen -= 8) { 2304 tcp_seq lwe, rwe; 2305 bcopy((char *)cp, (char *) &lwe, sizeof(lwe)); 2306 NTOHL(lwe); 2307 bcopy((char *)cp, (char *) &rwe, sizeof(rwe)); 2308 NTOHL(rwe); 2309 /* tcp_mark_sacked(tp, lwe, rwe); */ 2310 } 2311 break; 2312 } 2313 } 2314 } 2315 2316 /* 2317 * Pull out of band byte out of a segment so 2318 * it doesn't appear in the user's data queue. 2319 * It is still reflected in the segment length for 2320 * sequencing purposes. 2321 */ 2322 void 2323 tcp_pulloutofband(so, th, m, off) 2324 struct socket *so; 2325 struct tcphdr *th; 2326 struct mbuf *m; 2327 int off; 2328 { 2329 int cnt = off + th->th_urp - 1; 2330 2331 while (cnt >= 0) { 2332 if (m->m_len > cnt) { 2333 char *cp = mtod(m, caddr_t) + cnt; 2334 struct tcpcb *tp = sototcpcb(so); 2335 2336 tp->t_iobc = *cp; 2337 tp->t_oobflags |= TCPOOB_HAVEDATA; 2338 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2339 m->m_len--; 2340 return; 2341 } 2342 cnt -= m->m_len; 2343 m = m->m_next; 2344 if (m == 0) 2345 break; 2346 } 2347 panic("tcp_pulloutofband"); 2348 } 2349 2350 /* 2351 * Collect new round-trip time estimate 2352 * and update averages and current timeout. 2353 */ 2354 void 2355 tcp_xmit_timer(tp, rtt) 2356 struct tcpcb *tp; 2357 short rtt; 2358 { 2359 short delta; 2360 short rttmin; 2361 2362 tcpstat.tcps_rttupdated++; 2363 --rtt; 2364 if (tp->t_srtt != 0) { 2365 /* 2366 * srtt is stored as fixed point with 3 bits after the 2367 * binary point (i.e., scaled by 8). The following magic 2368 * is equivalent to the smoothing algorithm in rfc793 with 2369 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2370 * point). Adjust rtt to origin 0. 2371 */ 2372 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 2373 if ((tp->t_srtt += delta) <= 0) 2374 tp->t_srtt = 1 << 2; 2375 /* 2376 * We accumulate a smoothed rtt variance (actually, a 2377 * smoothed mean difference), then set the retransmit 2378 * timer to smoothed rtt + 4 times the smoothed variance. 2379 * rttvar is stored as fixed point with 2 bits after the 2380 * binary point (scaled by 4). The following is 2381 * equivalent to rfc793 smoothing with an alpha of .75 2382 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2383 * rfc793's wired-in beta. 2384 */ 2385 if (delta < 0) 2386 delta = -delta; 2387 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2388 if ((tp->t_rttvar += delta) <= 0) 2389 tp->t_rttvar = 1 << 2; 2390 } else { 2391 /* 2392 * No rtt measurement yet - use the unsmoothed rtt. 2393 * Set the variance to half the rtt (so our first 2394 * retransmit happens at 3*rtt). 2395 */ 2396 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 2397 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 2398 } 2399 tp->t_rtt = 0; 2400 tp->t_rxtshift = 0; 2401 2402 /* 2403 * the retransmit should happen at rtt + 4 * rttvar. 2404 * Because of the way we do the smoothing, srtt and rttvar 2405 * will each average +1/2 tick of bias. When we compute 2406 * the retransmit timer, we want 1/2 tick of rounding and 2407 * 1 extra tick because of +-1/2 tick uncertainty in the 2408 * firing of the timer. The bias will give us exactly the 2409 * 1.5 tick we need. But, because the bias is 2410 * statistical, we have to test that we don't drop below 2411 * the minimum feasible timer (which is 2 ticks). 2412 */ 2413 if (tp->t_rttmin > rtt + 2) 2414 rttmin = tp->t_rttmin; 2415 else 2416 rttmin = rtt + 2; 2417 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2418 2419 /* 2420 * We received an ack for a packet that wasn't retransmitted; 2421 * it is probably safe to discard any error indications we've 2422 * received recently. This isn't quite right, but close enough 2423 * for now (a route might have failed after we sent a segment, 2424 * and the return path might not be symmetrical). 2425 */ 2426 tp->t_softerror = 0; 2427 } 2428 2429 /* 2430 * Checks for partial ack. If partial ack arrives, force the retransmission 2431 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 2432 * 1. By setting snd_nxt to th_ack, this forces retransmission timer to 2433 * be started again. If the ack advances at least to tp->snd_recover, return 0. 2434 */ 2435 int 2436 tcp_newreno(tp, th) 2437 struct tcpcb *tp; 2438 struct tcphdr *th; 2439 { 2440 tcp_seq onxt = tp->snd_nxt; 2441 u_long ocwnd = tp->snd_cwnd; 2442 2443 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2444 /* 2445 * snd_una has not yet been updated and the socket's send 2446 * buffer has not yet drained off the ACK'd data, so we 2447 * have to leave snd_una as it was to get the correct data 2448 * offset in tcp_output(). 2449 */ 2450 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2451 tp->t_rtt = 0; 2452 tp->snd_nxt = th->th_ack; 2453 /* 2454 * Set snd_cwnd to one segment beyond ACK'd offset. snd_una 2455 * is not yet updated when we're called. 2456 */ 2457 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); 2458 (void) tcp_output(tp); 2459 tp->snd_cwnd = ocwnd; 2460 if (SEQ_GT(onxt, tp->snd_nxt)) 2461 tp->snd_nxt = onxt; 2462 /* 2463 * Partial window deflation. Relies on fact that tp->snd_una 2464 * not updated yet. 2465 */ 2466 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz); 2467 return 1; 2468 } 2469 return 0; 2470 } 2471 2472 2473 /* 2474 * TCP compressed state engine. Currently used to hold compressed 2475 * state for SYN_RECEIVED. 2476 */ 2477 2478 u_long syn_cache_count; 2479 u_int32_t syn_hash1, syn_hash2; 2480 2481 #define SYN_HASH(sa, sp, dp) \ 2482 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 2483 ((u_int32_t)(sp)))^syn_hash2))) 2484 #ifndef INET6 2485 #define SYN_HASHALL(hash, src, dst) \ 2486 do { \ 2487 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2488 ((struct sockaddr_in *)(src))->sin_port, \ 2489 ((struct sockaddr_in *)(dst))->sin_port); \ 2490 } while (0) 2491 #else 2492 #define SYN_HASH6(sa, sp, dp) \ 2493 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 2494 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 2495 & 0x7fffffff) 2496 2497 #define SYN_HASHALL(hash, src, dst) \ 2498 do { \ 2499 switch ((src)->sa_family) { \ 2500 case AF_INET: \ 2501 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2502 ((struct sockaddr_in *)(src))->sin_port, \ 2503 ((struct sockaddr_in *)(dst))->sin_port); \ 2504 break; \ 2505 case AF_INET6: \ 2506 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 2507 ((struct sockaddr_in6 *)(src))->sin6_port, \ 2508 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 2509 break; \ 2510 default: \ 2511 hash = 0; \ 2512 } \ 2513 } while (0) 2514 #endif /* INET6 */ 2515 2516 #define SYN_CACHE_RM(sc) \ 2517 do { \ 2518 LIST_REMOVE((sc), sc_bucketq); \ 2519 (sc)->sc_tp = NULL; \ 2520 LIST_REMOVE((sc), sc_tpq); \ 2521 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 2522 TAILQ_REMOVE(&tcp_syn_cache_timeq[(sc)->sc_rxtshift], (sc), sc_timeq); \ 2523 syn_cache_count--; \ 2524 } while (0) 2525 2526 #define SYN_CACHE_PUT(sc) \ 2527 do { \ 2528 if ((sc)->sc_ipopts) \ 2529 (void) m_free((sc)->sc_ipopts); \ 2530 if ((sc)->sc_route4.ro_rt != NULL) \ 2531 RTFREE((sc)->sc_route4.ro_rt); \ 2532 pool_put(&syn_cache_pool, (sc)); \ 2533 } while (0) 2534 2535 struct pool syn_cache_pool; 2536 2537 /* 2538 * We don't estimate RTT with SYNs, so each packet starts with the default 2539 * RTT and each timer queue has a fixed timeout value. This allows us to 2540 * optimize the timer queues somewhat. 2541 */ 2542 #define SYN_CACHE_TIMER_ARM(sc) \ 2543 do { \ 2544 TCPT_RANGESET((sc)->sc_rxtcur, \ 2545 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 2546 TCPTV_REXMTMAX); \ 2547 PRT_SLOW_ARM((sc)->sc_rexmt, (sc)->sc_rxtcur); \ 2548 } while (0) 2549 2550 TAILQ_HEAD(, syn_cache) tcp_syn_cache_timeq[TCP_MAXRXTSHIFT + 1]; 2551 2552 void 2553 syn_cache_init() 2554 { 2555 int i; 2556 2557 /* Initialize the hash buckets. */ 2558 for (i = 0; i < tcp_syn_cache_size; i++) 2559 LIST_INIT(&tcp_syn_cache[i].sch_bucket); 2560 2561 /* Initialize the timer queues. */ 2562 for (i = 0; i <= TCP_MAXRXTSHIFT; i++) 2563 TAILQ_INIT(&tcp_syn_cache_timeq[i]); 2564 2565 /* Initialize the syn cache pool. */ 2566 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 2567 "synpl", 0, NULL, NULL, M_PCB); 2568 } 2569 2570 void 2571 syn_cache_insert(sc, tp) 2572 struct syn_cache *sc; 2573 struct tcpcb *tp; 2574 { 2575 struct syn_cache_head *scp; 2576 struct syn_cache *sc2; 2577 int s, i; 2578 2579 /* 2580 * If there are no entries in the hash table, reinitialize 2581 * the hash secrets. 2582 */ 2583 if (syn_cache_count == 0) { 2584 struct timeval tv; 2585 microtime(&tv); 2586 syn_hash1 = random() ^ (u_long)≻ 2587 syn_hash2 = random() ^ tv.tv_usec; 2588 } 2589 2590 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 2591 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 2592 scp = &tcp_syn_cache[sc->sc_bucketidx]; 2593 2594 /* 2595 * Make sure that we don't overflow the per-bucket 2596 * limit or the total cache size limit. 2597 */ 2598 s = splsoftnet(); 2599 if (scp->sch_length >= tcp_syn_bucket_limit) { 2600 tcpstat.tcps_sc_bucketoverflow++; 2601 /* 2602 * The bucket is full. Toss the oldest element in the 2603 * bucket. This will be the entry with our bucket 2604 * index closest to the front of the timer queue with 2605 * the largest timeout value. 2606 * 2607 * Note: This timer queue traversal may be expensive, so 2608 * we hope that this doesn't happen very often. It is 2609 * much more likely that we'll overflow the entire 2610 * cache, which is much easier to handle; see below. 2611 */ 2612 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) { 2613 for (sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2614 sc2 != NULL; 2615 sc2 = TAILQ_NEXT(sc2, sc_timeq)) { 2616 if (sc2->sc_bucketidx == sc->sc_bucketidx) { 2617 SYN_CACHE_RM(sc2); 2618 SYN_CACHE_PUT(sc2); 2619 goto insert; /* 2 level break */ 2620 } 2621 } 2622 } 2623 #ifdef DIAGNOSTIC 2624 /* 2625 * This should never happen; we should always find an 2626 * entry in our bucket. 2627 */ 2628 panic("syn_cache_insert: bucketoverflow: impossible"); 2629 #endif 2630 } else if (syn_cache_count >= tcp_syn_cache_limit) { 2631 tcpstat.tcps_sc_overflowed++; 2632 /* 2633 * The cache is full. Toss the oldest entry in the 2634 * entire cache. This is the front entry in the 2635 * first non-empty timer queue with the largest 2636 * timeout value. 2637 */ 2638 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) { 2639 sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2640 if (sc2 == NULL) 2641 continue; 2642 SYN_CACHE_RM(sc2); 2643 SYN_CACHE_PUT(sc2); 2644 goto insert; /* symmetry with above */ 2645 } 2646 #ifdef DIAGNOSTIC 2647 /* 2648 * This should never happen; we should always find an 2649 * entry in the cache. 2650 */ 2651 panic("syn_cache_insert: cache overflow: impossible"); 2652 #endif 2653 } 2654 2655 insert: 2656 /* 2657 * Initialize the entry's timer. 2658 */ 2659 sc->sc_rxttot = 0; 2660 sc->sc_rxtshift = 0; 2661 SYN_CACHE_TIMER_ARM(sc); 2662 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], sc, sc_timeq); 2663 2664 /* Link it from tcpcb entry */ 2665 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 2666 2667 /* Put it into the bucket. */ 2668 LIST_INSERT_HEAD(&scp->sch_bucket, sc, sc_bucketq); 2669 scp->sch_length++; 2670 syn_cache_count++; 2671 2672 tcpstat.tcps_sc_added++; 2673 splx(s); 2674 } 2675 2676 /* 2677 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 2678 * If we have retransmitted an entry the maximum number of times, expire 2679 * that entry. 2680 */ 2681 void 2682 syn_cache_timer() 2683 { 2684 struct syn_cache *sc, *nsc; 2685 int i, s; 2686 2687 s = splsoftnet(); 2688 2689 /* 2690 * First, get all the entries that need to be retransmitted, or 2691 * must be expired due to exceeding the initial keepalive time. 2692 */ 2693 for (i = 0; i < TCP_MAXRXTSHIFT; i++) { 2694 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2695 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt); 2696 sc = nsc) { 2697 nsc = TAILQ_NEXT(sc, sc_timeq); 2698 2699 /* 2700 * Compute the total amount of time this entry has 2701 * been on a queue. If this entry has been on longer 2702 * than the keep alive timer would allow, expire it. 2703 */ 2704 sc->sc_rxttot += sc->sc_rxtcur; 2705 if (sc->sc_rxttot >= TCPTV_KEEP_INIT) { 2706 tcpstat.tcps_sc_timed_out++; 2707 SYN_CACHE_RM(sc); 2708 SYN_CACHE_PUT(sc); 2709 continue; 2710 } 2711 2712 tcpstat.tcps_sc_retransmitted++; 2713 (void) syn_cache_respond(sc, NULL); 2714 2715 /* Advance this entry onto the next timer queue. */ 2716 TAILQ_REMOVE(&tcp_syn_cache_timeq[i], sc, sc_timeq); 2717 sc->sc_rxtshift = i + 1; 2718 SYN_CACHE_TIMER_ARM(sc); 2719 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], 2720 sc, sc_timeq); 2721 } 2722 } 2723 2724 /* 2725 * Now get all the entries that are expired due to too many 2726 * retransmissions. 2727 */ 2728 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[TCP_MAXRXTSHIFT]); 2729 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt); 2730 sc = nsc) { 2731 nsc = TAILQ_NEXT(sc, sc_timeq); 2732 tcpstat.tcps_sc_timed_out++; 2733 SYN_CACHE_RM(sc); 2734 SYN_CACHE_PUT(sc); 2735 } 2736 splx(s); 2737 } 2738 2739 /* 2740 * Remove syn cache created by the specified tcb entry, 2741 * because this does not make sense to keep them 2742 * (if there's no tcb entry, syn cache entry will never be used) 2743 */ 2744 void 2745 syn_cache_cleanup(tp) 2746 struct tcpcb *tp; 2747 { 2748 struct syn_cache *sc, *nsc; 2749 int s; 2750 2751 s = splsoftnet(); 2752 2753 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 2754 nsc = LIST_NEXT(sc, sc_tpq); 2755 2756 #ifdef DIAGNOSTIC 2757 if (sc->sc_tp != tp) 2758 panic("invalid sc_tp in syn_cache_cleanup"); 2759 #endif 2760 SYN_CACHE_RM(sc); 2761 SYN_CACHE_PUT(sc); 2762 } 2763 /* just for safety */ 2764 LIST_INIT(&tp->t_sc); 2765 2766 splx(s); 2767 } 2768 2769 /* 2770 * Find an entry in the syn cache. 2771 */ 2772 struct syn_cache * 2773 syn_cache_lookup(src, dst, headp) 2774 struct sockaddr *src; 2775 struct sockaddr *dst; 2776 struct syn_cache_head **headp; 2777 { 2778 struct syn_cache *sc; 2779 struct syn_cache_head *scp; 2780 u_int32_t hash; 2781 int s; 2782 2783 SYN_HASHALL(hash, src, dst); 2784 2785 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 2786 *headp = scp; 2787 s = splsoftnet(); 2788 for (sc = LIST_FIRST(&scp->sch_bucket); sc != NULL; 2789 sc = LIST_NEXT(sc, sc_bucketq)) { 2790 if (sc->sc_hash != hash) 2791 continue; 2792 if (!bcmp(&sc->sc_src, src, src->sa_len) && 2793 !bcmp(&sc->sc_dst, dst, dst->sa_len)) { 2794 splx(s); 2795 return (sc); 2796 } 2797 } 2798 splx(s); 2799 return (NULL); 2800 } 2801 2802 /* 2803 * This function gets called when we receive an ACK for a 2804 * socket in the LISTEN state. We look up the connection 2805 * in the syn cache, and if its there, we pull it out of 2806 * the cache and turn it into a full-blown connection in 2807 * the SYN-RECEIVED state. 2808 * 2809 * The return values may not be immediately obvious, and their effects 2810 * can be subtle, so here they are: 2811 * 2812 * NULL SYN was not found in cache; caller should drop the 2813 * packet and send an RST. 2814 * 2815 * -1 We were unable to create the new connection, and are 2816 * aborting it. An ACK,RST is being sent to the peer 2817 * (unless we got screwey sequence numbners; see below), 2818 * because the 3-way handshake has been completed. Caller 2819 * should not free the mbuf, since we may be using it. If 2820 * we are not, we will free it. 2821 * 2822 * Otherwise, the return value is a pointer to the new socket 2823 * associated with the connection. 2824 */ 2825 struct socket * 2826 syn_cache_get(src, dst, th, hlen, tlen, so, m) 2827 struct sockaddr *src; 2828 struct sockaddr *dst; 2829 struct tcphdr *th; 2830 unsigned int hlen, tlen; 2831 struct socket *so; 2832 struct mbuf *m; 2833 { 2834 struct syn_cache *sc; 2835 struct syn_cache_head *scp; 2836 struct inpcb *inp = NULL; 2837 #ifdef INET6 2838 struct in6pcb *in6p = NULL; 2839 #endif 2840 struct tcpcb *tp = 0; 2841 struct mbuf *am; 2842 int s; 2843 struct socket *oso; 2844 2845 s = splsoftnet(); 2846 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 2847 splx(s); 2848 return (NULL); 2849 } 2850 2851 /* 2852 * Verify the sequence and ack numbers. Try getting the correct 2853 * response again. 2854 */ 2855 if ((th->th_ack != sc->sc_iss + 1) || 2856 SEQ_LEQ(th->th_seq, sc->sc_irs) || 2857 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 2858 (void) syn_cache_respond(sc, m); 2859 splx(s); 2860 return ((struct socket *)(-1)); 2861 } 2862 2863 /* Remove this cache entry */ 2864 SYN_CACHE_RM(sc); 2865 splx(s); 2866 2867 /* 2868 * Ok, create the full blown connection, and set things up 2869 * as they would have been set up if we had created the 2870 * connection when the SYN arrived. If we can't create 2871 * the connection, abort it. 2872 */ 2873 /* 2874 * inp still has the OLD in_pcb stuff, set the 2875 * v6-related flags on the new guy, too. This is 2876 * done particularly for the case where an AF_INET6 2877 * socket is bound only to a port, and a v4 connection 2878 * comes in on that port. 2879 * we also copy the flowinfo from the original pcb 2880 * to the new one. 2881 */ 2882 { 2883 struct inpcb *parentinpcb; 2884 2885 parentinpcb = (struct inpcb *)so->so_pcb; 2886 2887 oso = so; 2888 so = sonewconn(so, SS_ISCONNECTED); 2889 if (so == NULL) 2890 goto resetandabort; 2891 2892 switch (so->so_proto->pr_domain->dom_family) { 2893 case AF_INET: 2894 inp = sotoinpcb(so); 2895 break; 2896 #ifdef INET6 2897 case AF_INET6: 2898 in6p = sotoin6pcb(so); 2899 #if 0 /*def INET6*/ 2900 inp->inp_flags |= (parentinpcb->inp_flags & 2901 (INP_IPV6 | INP_IPV6_UNDEC | INP_IPV6_MAPPED)); 2902 if ((inp->inp_flags & INP_IPV6) && 2903 !(inp->inp_flags & INP_IPV6_MAPPED)) { 2904 inp->inp_ipv6.ip6_hlim = parentinpcb->inp_ipv6.ip6_hlim; 2905 inp->inp_ipv6.ip6_vfc = parentinpcb->inp_ipv6.ip6_vfc; 2906 } 2907 #endif 2908 break; 2909 #endif 2910 } 2911 } 2912 switch (src->sa_family) { 2913 case AF_INET: 2914 if (inp) { 2915 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 2916 inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; 2917 inp->inp_options = ip_srcroute(); 2918 in_pcbstate(inp, INP_BOUND); 2919 if (inp->inp_options == NULL) { 2920 inp->inp_options = sc->sc_ipopts; 2921 sc->sc_ipopts = NULL; 2922 } 2923 } 2924 #ifdef INET6 2925 else if (in6p) { 2926 /* IPv4 packet to AF_INET6 socket */ 2927 bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr)); 2928 in6p->in6p_laddr.s6_addr16[5] = htons(0xffff); 2929 bcopy(&((struct sockaddr_in *)dst)->sin_addr, 2930 &in6p->in6p_laddr.s6_addr32[3], 2931 sizeof(((struct sockaddr_in *)dst)->sin_addr)); 2932 in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port; 2933 in6totcpcb(in6p)->t_family = AF_INET; 2934 } 2935 #endif 2936 break; 2937 #ifdef INET6 2938 case AF_INET6: 2939 if (in6p) { 2940 in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr; 2941 in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port; 2942 #if 0 2943 in6p->in6p_flowinfo = ip6->ip6_flow & IPV6_FLOWINFO_MASK; 2944 /*inp->inp_options = ip6_srcroute();*/ /* soon. */ 2945 #endif 2946 } 2947 break; 2948 #endif 2949 } 2950 #ifdef INET6 2951 if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) { 2952 struct in6pcb *oin6p = sotoin6pcb(oso); 2953 /* inherit socket options from the listening socket */ 2954 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS); 2955 if (in6p->in6p_flags & IN6P_CONTROLOPTS) { 2956 m_freem(in6p->in6p_options); 2957 in6p->in6p_options = 0; 2958 } 2959 ip6_savecontrol(in6p, &in6p->in6p_options, 2960 mtod(m, struct ip6_hdr *), m); 2961 } 2962 #endif 2963 2964 #ifdef IPSEC 2965 /* 2966 * we make a copy of policy, instead of sharing the policy, 2967 * for better behavior in terms of SA lookup and dead SA removal. 2968 */ 2969 if (inp) { 2970 /* copy old policy into new socket's */ 2971 if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) 2972 printf("tcp_input: could not copy policy\n"); 2973 } 2974 #ifdef INET6 2975 else if (in6p) { 2976 /* copy old policy into new socket's */ 2977 if (ipsec_copy_policy(sotoin6pcb(oso)->in6p_sp, in6p->in6p_sp)) 2978 printf("tcp_input: could not copy policy\n"); 2979 } 2980 #endif 2981 #endif 2982 2983 /* 2984 * Give the new socket our cached route reference. 2985 */ 2986 if (inp) 2987 inp->inp_route = sc->sc_route4; /* struct assignment */ 2988 #ifdef INET6 2989 else 2990 in6p->in6p_route = sc->sc_route6; 2991 #endif 2992 sc->sc_route4.ro_rt = NULL; 2993 2994 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 2995 if (am == NULL) 2996 goto resetandabort; 2997 am->m_len = src->sa_len; 2998 bcopy(src, mtod(am, caddr_t), src->sa_len); 2999 if (inp) { 3000 if (in_pcbconnect(inp, am)) { 3001 (void) m_free(am); 3002 goto resetandabort; 3003 } 3004 } 3005 #ifdef INET6 3006 else if (in6p) { 3007 if (src->sa_family == AF_INET) { 3008 /* IPv4 packet to AF_INET6 socket */ 3009 struct sockaddr_in6 *sin6; 3010 sin6 = mtod(am, struct sockaddr_in6 *); 3011 am->m_len = sizeof(*sin6); 3012 bzero(sin6, sizeof(*sin6)); 3013 sin6->sin6_family = AF_INET6; 3014 sin6->sin6_len = sizeof(*sin6); 3015 sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port; 3016 sin6->sin6_addr.s6_addr16[5] = htons(0xffff); 3017 bcopy(&((struct sockaddr_in *)src)->sin_addr, 3018 &sin6->sin6_addr.s6_addr32[3], 3019 sizeof(sin6->sin6_addr.s6_addr32[3])); 3020 } 3021 if (in6_pcbconnect(in6p, am)) { 3022 (void) m_free(am); 3023 goto resetandabort; 3024 } 3025 } 3026 #endif 3027 else { 3028 (void) m_free(am); 3029 goto resetandabort; 3030 } 3031 (void) m_free(am); 3032 3033 if (inp) 3034 tp = intotcpcb(inp); 3035 #ifdef INET6 3036 else if (in6p) 3037 tp = in6totcpcb(in6p); 3038 #endif 3039 else 3040 tp = NULL; 3041 if (sc->sc_request_r_scale != 15) { 3042 tp->requested_s_scale = sc->sc_requested_s_scale; 3043 tp->request_r_scale = sc->sc_request_r_scale; 3044 tp->snd_scale = sc->sc_requested_s_scale; 3045 tp->rcv_scale = sc->sc_request_r_scale; 3046 tp->t_flags |= TF_RCVD_SCALE; 3047 } 3048 if (sc->sc_flags & SCF_TIMESTAMP) 3049 tp->t_flags |= TF_RCVD_TSTMP; 3050 3051 tp->t_template = tcp_template(tp); 3052 if (tp->t_template == 0) { 3053 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3054 so = NULL; 3055 m_freem(m); 3056 goto abort; 3057 } 3058 3059 tp->iss = sc->sc_iss; 3060 tp->irs = sc->sc_irs; 3061 tcp_sendseqinit(tp); 3062 tcp_rcvseqinit(tp); 3063 tp->t_state = TCPS_SYN_RECEIVED; 3064 TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT); 3065 tcpstat.tcps_accepts++; 3066 3067 /* Initialize tp->t_ourmss before we deal with the peer's! */ 3068 tp->t_ourmss = sc->sc_ourmaxseg; 3069 tcp_mss_from_peer(tp, sc->sc_peermaxseg); 3070 3071 /* 3072 * Initialize the initial congestion window. If we 3073 * had to retransmit the SYN,ACK, we must initialize cwnd 3074 * to 1 segment (i.e. the Loss Window). 3075 */ 3076 if (sc->sc_rxtshift) 3077 tp->snd_cwnd = tp->t_peermss; 3078 else 3079 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, tp->t_peermss); 3080 3081 tcp_rmx_rtt(tp); 3082 tp->snd_wl1 = sc->sc_irs; 3083 tp->rcv_up = sc->sc_irs + 1; 3084 3085 /* 3086 * This is what whould have happened in tcp_ouput() when 3087 * the SYN,ACK was sent. 3088 */ 3089 tp->snd_up = tp->snd_una; 3090 tp->snd_max = tp->snd_nxt = tp->iss+1; 3091 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3092 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3093 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3094 tp->last_ack_sent = tp->rcv_nxt; 3095 3096 tcpstat.tcps_sc_completed++; 3097 SYN_CACHE_PUT(sc); 3098 return (so); 3099 3100 resetandabort: 3101 (void) tcp_respond(NULL, m, m, th, 3102 th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK); 3103 abort: 3104 if (so != NULL) 3105 (void) soabort(so); 3106 SYN_CACHE_PUT(sc); 3107 tcpstat.tcps_sc_aborted++; 3108 return ((struct socket *)(-1)); 3109 } 3110 3111 /* 3112 * This function is called when we get a RST for a 3113 * non-existant connection, so that we can see if the 3114 * connection is in the syn cache. If it is, zap it. 3115 */ 3116 3117 void 3118 syn_cache_reset(src, dst, th) 3119 struct sockaddr *src; 3120 struct sockaddr *dst; 3121 struct tcphdr *th; 3122 { 3123 struct syn_cache *sc; 3124 struct syn_cache_head *scp; 3125 int s = splsoftnet(); 3126 3127 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3128 splx(s); 3129 return; 3130 } 3131 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3132 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3133 splx(s); 3134 return; 3135 } 3136 SYN_CACHE_RM(sc); 3137 splx(s); 3138 tcpstat.tcps_sc_reset++; 3139 SYN_CACHE_PUT(sc); 3140 } 3141 3142 void 3143 syn_cache_unreach(src, dst, th) 3144 struct sockaddr *src; 3145 struct sockaddr *dst; 3146 struct tcphdr *th; 3147 { 3148 struct syn_cache *sc; 3149 struct syn_cache_head *scp; 3150 int s; 3151 3152 s = splsoftnet(); 3153 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3154 splx(s); 3155 return; 3156 } 3157 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3158 if (ntohl (th->th_seq) != sc->sc_iss) { 3159 splx(s); 3160 return; 3161 } 3162 3163 /* 3164 * If we've rertransmitted 3 times and this is our second error, 3165 * we remove the entry. Otherwise, we allow it to continue on. 3166 * This prevents us from incorrectly nuking an entry during a 3167 * spurious network outage. 3168 * 3169 * See tcp_notify(). 3170 */ 3171 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3172 sc->sc_flags |= SCF_UNREACH; 3173 splx(s); 3174 return; 3175 } 3176 3177 SYN_CACHE_RM(sc); 3178 splx(s); 3179 tcpstat.tcps_sc_unreach++; 3180 SYN_CACHE_PUT(sc); 3181 } 3182 3183 /* 3184 * Given a LISTEN socket and an inbound SYN request, add 3185 * this to the syn cache, and send back a segment: 3186 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3187 * to the source. 3188 * 3189 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3190 * Doing so would require that we hold onto the data and deliver it 3191 * to the application. However, if we are the target of a SYN-flood 3192 * DoS attack, an attacker could send data which would eventually 3193 * consume all available buffer space if it were ACKed. By not ACKing 3194 * the data, we avoid this DoS scenario. 3195 */ 3196 3197 int 3198 syn_cache_add(src, dst, th, hlen, so, m, optp, optlen, oi) 3199 struct sockaddr *src; 3200 struct sockaddr *dst; 3201 struct tcphdr *th; 3202 unsigned int hlen; 3203 struct socket *so; 3204 struct mbuf *m; 3205 u_char *optp; 3206 int optlen; 3207 struct tcp_opt_info *oi; 3208 { 3209 struct tcpcb tb, *tp; 3210 long win; 3211 struct syn_cache *sc; 3212 struct syn_cache_head *scp; 3213 struct mbuf *ipopts; 3214 3215 tp = sototcpcb(so); 3216 3217 /* 3218 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3219 * 3220 * Note this check is performed in tcp_input() very early on. 3221 */ 3222 3223 /* 3224 * Initialize some local state. 3225 */ 3226 win = sbspace(&so->so_rcv); 3227 if (win > TCP_MAXWIN) 3228 win = TCP_MAXWIN; 3229 3230 if (src->sa_family == AF_INET) { 3231 /* 3232 * Remember the IP options, if any. 3233 */ 3234 ipopts = ip_srcroute(); 3235 } else 3236 ipopts = NULL; 3237 3238 if (optp) { 3239 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3240 tcp_dooptions(&tb, optp, optlen, th, oi); 3241 } else 3242 tb.t_flags = 0; 3243 3244 /* 3245 * See if we already have an entry for this connection. 3246 * If we do, resend the SYN,ACK. We do not count this 3247 * as a retransmission (XXX though maybe we should). 3248 */ 3249 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 3250 tcpstat.tcps_sc_dupesyn++; 3251 if (ipopts) { 3252 /* 3253 * If we were remembering a previous source route, 3254 * forget it and use the new one we've been given. 3255 */ 3256 if (sc->sc_ipopts) 3257 (void) m_free(sc->sc_ipopts); 3258 sc->sc_ipopts = ipopts; 3259 } 3260 sc->sc_timestamp = tb.ts_recent; 3261 if (syn_cache_respond(sc, m) == 0) { 3262 tcpstat.tcps_sndacks++; 3263 tcpstat.tcps_sndtotal++; 3264 } 3265 return (1); 3266 } 3267 3268 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 3269 if (sc == NULL) { 3270 if (ipopts) 3271 (void) m_free(ipopts); 3272 return (0); 3273 } 3274 3275 /* 3276 * Fill in the cache, and put the necessary IP and TCP 3277 * options into the reply. 3278 */ 3279 bzero(sc, sizeof(struct syn_cache)); 3280 bcopy(src, &sc->sc_src, src->sa_len); 3281 bcopy(dst, &sc->sc_dst, dst->sa_len); 3282 sc->sc_flags = 0; 3283 sc->sc_ipopts = ipopts; 3284 sc->sc_irs = th->th_seq; 3285 sc->sc_iss = tcp_new_iss(sc, sizeof(struct syn_cache), 0); 3286 sc->sc_peermaxseg = oi->maxseg; 3287 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? 3288 m->m_pkthdr.rcvif : NULL, 3289 sc->sc_src.sa.sa_family); 3290 sc->sc_win = win; 3291 sc->sc_timestamp = tb.ts_recent; 3292 if (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP)) 3293 sc->sc_flags |= SCF_TIMESTAMP; 3294 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3295 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3296 sc->sc_requested_s_scale = tb.requested_s_scale; 3297 sc->sc_request_r_scale = 0; 3298 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3299 TCP_MAXWIN << sc->sc_request_r_scale < 3300 so->so_rcv.sb_hiwat) 3301 sc->sc_request_r_scale++; 3302 } else { 3303 sc->sc_requested_s_scale = 15; 3304 sc->sc_request_r_scale = 15; 3305 } 3306 sc->sc_tp = tp; 3307 if (syn_cache_respond(sc, m) == 0) { 3308 syn_cache_insert(sc, tp); 3309 tcpstat.tcps_sndacks++; 3310 tcpstat.tcps_sndtotal++; 3311 } else { 3312 SYN_CACHE_PUT(sc); 3313 tcpstat.tcps_sc_dropped++; 3314 } 3315 return (1); 3316 } 3317 3318 int 3319 syn_cache_respond(sc, m) 3320 struct syn_cache *sc; 3321 struct mbuf *m; 3322 { 3323 struct route *ro; 3324 struct rtentry *rt; 3325 u_int8_t *optp; 3326 int optlen, error; 3327 u_int16_t tlen; 3328 struct ip *ip = NULL; 3329 #ifdef INET6 3330 struct ip6_hdr *ip6 = NULL; 3331 #endif 3332 struct tcphdr *th; 3333 u_int hlen; 3334 3335 switch (sc->sc_src.sa.sa_family) { 3336 case AF_INET: 3337 hlen = sizeof(struct ip); 3338 ro = &sc->sc_route4; 3339 break; 3340 #ifdef INET6 3341 case AF_INET6: 3342 hlen = sizeof(struct ip6_hdr); 3343 ro = (struct route *)&sc->sc_route6; 3344 break; 3345 #endif 3346 default: 3347 if (m) 3348 m_freem(m); 3349 return EAFNOSUPPORT; 3350 } 3351 3352 /* Compute the size of the TCP options. */ 3353 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3354 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3355 3356 tlen = hlen + sizeof(struct tcphdr) + optlen; 3357 3358 /* 3359 * Create the IP+TCP header from scratch. 3360 */ 3361 if (m) 3362 m_freem(m); 3363 MGETHDR(m, M_DONTWAIT, MT_DATA); 3364 if (m && tlen > MHLEN) { 3365 MCLGET(m, M_DONTWAIT); 3366 if ((m->m_flags & M_EXT) == 0) { 3367 m_freem(m); 3368 m = NULL; 3369 } 3370 } 3371 if (m == NULL) 3372 return (ENOBUFS); 3373 3374 /* Fixup the mbuf. */ 3375 m->m_data += max_linkhdr; 3376 m->m_len = m->m_pkthdr.len = tlen; 3377 #ifdef IPSEC 3378 if (sc->sc_tp) { 3379 struct tcpcb *tp; 3380 struct socket *so; 3381 3382 tp = sc->sc_tp; 3383 if (tp->t_inpcb) 3384 so = tp->t_inpcb->inp_socket; 3385 #ifdef INET6 3386 else if (tp->t_in6pcb) 3387 so = tp->t_in6pcb->in6p_socket; 3388 #endif 3389 else 3390 so = NULL; 3391 /* use IPsec policy on listening socket, on SYN ACK */ 3392 ipsec_setsocket(m, so); 3393 } 3394 #endif 3395 m->m_pkthdr.rcvif = NULL; 3396 memset(mtod(m, u_char *), 0, tlen); 3397 3398 switch (sc->sc_src.sa.sa_family) { 3399 case AF_INET: 3400 ip = mtod(m, struct ip *); 3401 ip->ip_dst = sc->sc_src.sin.sin_addr; 3402 ip->ip_src = sc->sc_dst.sin.sin_addr; 3403 ip->ip_p = IPPROTO_TCP; 3404 th = (struct tcphdr *)(ip + 1); 3405 th->th_dport = sc->sc_src.sin.sin_port; 3406 th->th_sport = sc->sc_dst.sin.sin_port; 3407 break; 3408 #ifdef INET6 3409 case AF_INET6: 3410 ip6 = mtod(m, struct ip6_hdr *); 3411 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3412 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3413 ip6->ip6_nxt = IPPROTO_TCP; 3414 /* ip6_plen will be updated in ip6_output() */ 3415 th = (struct tcphdr *)(ip6 + 1); 3416 th->th_dport = sc->sc_src.sin6.sin6_port; 3417 th->th_sport = sc->sc_dst.sin6.sin6_port; 3418 break; 3419 #endif 3420 default: 3421 th = NULL; 3422 } 3423 3424 th->th_seq = htonl(sc->sc_iss); 3425 th->th_ack = htonl(sc->sc_irs + 1); 3426 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 3427 th->th_flags = TH_SYN|TH_ACK; 3428 th->th_win = htons(sc->sc_win); 3429 /* th_sum already 0 */ 3430 /* th_urp already 0 */ 3431 3432 /* Tack on the TCP options. */ 3433 optp = (u_int8_t *)(th + 1); 3434 *optp++ = TCPOPT_MAXSEG; 3435 *optp++ = 4; 3436 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 3437 *optp++ = sc->sc_ourmaxseg & 0xff; 3438 3439 if (sc->sc_request_r_scale != 15) { 3440 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 3441 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 3442 sc->sc_request_r_scale); 3443 optp += 4; 3444 } 3445 3446 if (sc->sc_flags & SCF_TIMESTAMP) { 3447 u_int32_t *lp = (u_int32_t *)(optp); 3448 /* Form timestamp option as shown in appendix A of RFC 1323. */ 3449 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 3450 *lp++ = htonl(tcp_now); 3451 *lp = htonl(sc->sc_timestamp); 3452 optp += TCPOLEN_TSTAMP_APPA; 3453 } 3454 3455 /* Compute the packet's checksum. */ 3456 switch (sc->sc_src.sa.sa_family) { 3457 case AF_INET: 3458 ip->ip_len = htons(tlen - hlen); 3459 th->th_sum = 0; 3460 th->th_sum = in_cksum(m, tlen); 3461 break; 3462 #ifdef INET6 3463 case AF_INET6: 3464 ip6->ip6_plen = htons(tlen - hlen); 3465 th->th_sum = 0; 3466 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 3467 break; 3468 #endif 3469 } 3470 3471 /* 3472 * Fill in some straggling IP bits. Note the stack expects 3473 * ip_len to be in host order, for convenience. 3474 */ 3475 switch (sc->sc_src.sa.sa_family) { 3476 case AF_INET: 3477 ip->ip_len = tlen; 3478 ip->ip_ttl = ip_defttl; 3479 /* XXX tos? */ 3480 break; 3481 #ifdef INET6 3482 case AF_INET6: 3483 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 3484 ip6->ip6_vfc |= IPV6_VERSION; 3485 ip6->ip6_plen = htons(tlen - hlen); 3486 /* ip6_hlim will be initialized afterwards */ 3487 /* XXX flowlabel? */ 3488 break; 3489 #endif 3490 } 3491 3492 /* 3493 * If we're doing Path MTU discovery, we need to set DF unless 3494 * the route's MTU is locked. If we don't yet know the route, 3495 * look it up now. We will copy this reference to the inpcb 3496 * when we finish creating the connection. 3497 */ 3498 if ((rt = ro->ro_rt) == NULL || (rt->rt_flags & RTF_UP) == 0) { 3499 if (ro->ro_rt != NULL) { 3500 RTFREE(ro->ro_rt); 3501 ro->ro_rt = NULL; 3502 } 3503 bcopy(&sc->sc_src, &ro->ro_dst, sc->sc_src.sa.sa_len); 3504 rtalloc(ro); 3505 if ((rt = ro->ro_rt) == NULL) { 3506 m_freem(m); 3507 switch (sc->sc_src.sa.sa_family) { 3508 case AF_INET: 3509 ipstat.ips_noroute++; 3510 break; 3511 #ifdef INET6 3512 case AF_INET6: 3513 ip6stat.ip6s_noroute++; 3514 break; 3515 #endif 3516 } 3517 return (EHOSTUNREACH); 3518 } 3519 } 3520 3521 switch (sc->sc_src.sa.sa_family) { 3522 case AF_INET: 3523 if (ip_mtudisc != 0 && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0) 3524 ip->ip_off |= IP_DF; 3525 3526 /* ...and send it off! */ 3527 error = ip_output(m, sc->sc_ipopts, ro, 0, NULL); 3528 break; 3529 #ifdef INET6 3530 case AF_INET6: 3531 ip6->ip6_hlim = in6_selecthlim(NULL, 3532 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL); 3533 3534 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 3535 0, NULL, NULL); 3536 break; 3537 #endif 3538 default: 3539 error = EAFNOSUPPORT; 3540 break; 3541 } 3542 return (error); 3543 } 3544