/*	$NetBSD: tcp_input.c,v 1.127 2001/07/08 16:18:57 abs Exp $	*/

/*
%%% portions-copyright-nrl-95
Portions of this software are Copyright 1995-1998 by Randall Atkinson,
Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
Reserved. All rights under this copyright have been assigned to the US
Naval Research Laboratory (NRL). The NRL Copyright Notice and License
Agreement Version 1.1 (January 17, 1995) applies to these portions of the
software.
You should have received a copy of the license with this software. If you
didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>.

*/

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1997, 1998, 1999, 2001 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
 * Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

/*
 * TODO list for SYN cache stuff:
 *
 *	Find room for a "state" field, which is needed to keep a
 *	compressed state for TIME_WAIT TCBs.  It's been noted already
 *	that this is fairly important for very high-volume web and
 *	mail servers, which use a large number of short-lived
 *	connections.
 */
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>

#include <net/if.h>
#include <net/route.h>
#include <net/if_types.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>

#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#endif

#ifdef PULLDOWN_TEST
#ifndef INET6
/* always need ip6.h for IP6_EXTHDR_GET */
#include <netinet/ip6.h>
#endif
#endif

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#include <machine/stdarg.h>

#ifdef IPSEC
#include <netinet6/ipsec.h>
#include <netkey/key.h>
#endif /*IPSEC*/
#ifdef INET6
#include "faith.h"
#if defined(NFAITH) && NFAITH > 0
#include <net/if_faith.h>
#endif
#endif

int	tcprexmtthresh = 3;
int	tcp_log_refused;

static int tcp_rst_ppslim_count = 0;
static struct timeval tcp_rst_ppslim_last;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)
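/*
 * Example of the modulo comparison (illustrative values): with
 * a = 0xfffffff0 and b = 0x00000010, (int)(a - b) == (int)0xffffffe0,
 * which is negative, so TSTMP_LT(a, b) holds even though a > b as an
 * unsigned value; timestamps therefore compare correctly across 32-bit
 * wraparound.  Assuming the usual PR_SLOWHZ of 2, TCP_PAWS_IDLE above
 * works out to 24 * 24 * 60 * 60 * 2 = 4147200 slow-timer ticks,
 * i.e. 24 days.
 */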
/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_in6pcb && tp->t_family == AF_INET6 \
	 && tp->t_in6pcb->in6p_route.ro_rt) { \
		nd6_nud_hint(tp->t_in6pcb->in6p_route.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define TCP_SETUP_ACK(tp, th) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \
		(tp)->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)
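/*
 * For example, on an established connection receiving an in-order data
 * stream without TH_PUSH set: the first segment finds TF_DELACK clear
 * and schedules a delayed ACK, the second finds TF_DELACK set and
 * forces TF_ACKNOW, so an ACK is sent for at least every second
 * segment, as RFC 1122, 4.2.3.2 recommends.
 */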
/*
 * Convert TCP protocol fields to host order for easier processing.
 */
#define	TCP_FIELDS_TO_HOST(th) \
do { \
	NTOHL((th)->th_seq); \
	NTOHL((th)->th_ack); \
	NTOHS((th)->th_win); \
	NTOHS((th)->th_urp); \
} while (0)

#ifdef TCP_CSUM_COUNTERS
#include <sys/device.h>

extern struct evcnt tcp_hwcsum_ok;
extern struct evcnt tcp_hwcsum_bad;
extern struct evcnt tcp_hwcsum_data;
extern struct evcnt tcp_swcsum;

#define	TCP_CSUM_COUNTER_INCR(ev)	(ev)->ev_count++

#else

#define	TCP_CSUM_COUNTER_INCR(ev)	/* nothing */

#endif /* TCP_CSUM_COUNTERS */

int
tcp_reass(tp, th, m, tlen)
	struct tcpcb *tp;
	struct tcphdr *th;
	struct mbuf *m;
	int *tlen;
{
	struct ipqent *p, *q, *nq, *tiqe = NULL;
	struct socket *so = NULL;
	int pkt_flags;
	tcp_seq pkt_seq;
	unsigned pkt_len;
	u_long rcvpartdupbyte = 0;
	u_long rcvoobyte;

	if (tp->t_inpcb)
		so = tp->t_inpcb->inp_socket;
#ifdef INET6
	else if (tp->t_in6pcb)
		so = tp->t_in6pcb->in6p_socket;
#endif

	TCP_REASS_LOCK_CHECK(tp);

	/*
	 * Call with th==0 after becoming established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	rcvoobyte = *tlen;
	/*
	 * Copy these to local variables because the tcpiphdr
	 * gets munged while we are collapsing mbufs.
	 */
	pkt_seq = th->th_seq;
	pkt_len = *tlen;
	pkt_flags = th->th_flags;
	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = tp->segq.lh_first; q != NULL; q = nq) {
		nq = q->ipqe_q.le_next;
		/*
		 * If the received segment is just right after this
		 * fragment, merge the two together and then check
		 * for further overlaps.
		 */
		if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
			       tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
			       q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
#endif
			pkt_len += q->ipqe_len;
			pkt_flags |= q->ipqe_flags;
			pkt_seq = q->ipqe_seq;
			m_cat(q->ipqe_m, m);
			m = q->ipqe_m;
			goto free_ipqe;
		}
		/*
		 * If the received segment is completely past this
		 * fragment, we need to go to the next fragment.
		 */
		if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
			p = q;
			continue;
		}
		/*
		 * If the fragment is past the received segment,
		 * it (or any following) can't be concatenated.
		 */
		if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len))
			break;
		/*
		 * We've received all the data in this segment before.
		 * Mark it as a duplicate and return.
		 */
		if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
		    SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += pkt_len;
			m_freem(m);
			if (tiqe != NULL)
				pool_put(&ipqent_pool, tiqe);
			return (0);
		}
		/*
		 * Received segment completely overlaps this fragment
		 * so we drop the fragment (this keeps the temporal
		 * ordering of segments correct).
		 */
		if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
		    SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
			rcvpartdupbyte += q->ipqe_len;
			m_freem(q->ipqe_m);
			goto free_ipqe;
		}
		/*
		 * Received segment extends past the end of the
		 * fragment.  Drop the overlapping bytes, then merge
		 * the fragment and segment and treat the result as
		 * a longer received packet.
		 */
		if (SEQ_LT(q->ipqe_seq, pkt_seq)
		    && SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
			int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
			       tp, overlap,
			       pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			m_adj(m, overlap);
			rcvpartdupbyte += overlap;
			m_cat(q->ipqe_m, m);
			m = q->ipqe_m;
			pkt_seq = q->ipqe_seq;
			pkt_len += q->ipqe_len - overlap;
			rcvoobyte -= overlap;
			goto free_ipqe;
		}
		/*
		 * Received segment extends past the front of the
		 * fragment.  Drop the overlapping bytes on the
		 * received packet.  The packet will then be
		 * concatenated with this fragment a bit later.
		 */
		if (SEQ_GT(q->ipqe_seq, pkt_seq)
		    && SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) {
			int overlap = pkt_seq + pkt_len - q->ipqe_seq;
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
			       tp, overlap,
			       pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			m_adj(m, -overlap);
			pkt_len -= overlap;
			rcvpartdupbyte += overlap;
			rcvoobyte -= overlap;
		}
		/*
		 * If the received segment immediately precedes this
		 * fragment then tack the fragment onto this segment
		 * and reinsert the data.
		 */
		if (q->ipqe_seq == pkt_seq + pkt_len) {
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
			       tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
			       pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			pkt_len += q->ipqe_len;
			pkt_flags |= q->ipqe_flags;
			m_cat(m, q->ipqe_m);
			LIST_REMOVE(q, ipqe_q);
			LIST_REMOVE(q, ipqe_timeq);
			if (tiqe == NULL) {
				tiqe = q;
			} else {
				pool_put(&ipqent_pool, q);
			}
			break;
		}
		/*
		 * If the fragment is before the segment, remember it.
		 * When this loop is terminated, p will contain the
		 * pointer to the fragment that is right before the
		 * received segment.
		 */
		if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
			p = q;

		continue;

		/*
		 * This is a common operation.  It also saves a
		 * malloc/free in most instances.
		 */
	  free_ipqe:
		LIST_REMOVE(q, ipqe_q);
		LIST_REMOVE(q, ipqe_timeq);
		if (tiqe == NULL) {
			tiqe = q;
		} else {
			pool_put(&ipqent_pool, q);
		}
	}
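	/*
	 * Worked example of the overlap handling above (illustrative
	 * sequence numbers): suppose the queue holds a fragment covering
	 * 100:200 and a segment arrives covering 150:300.  The segment
	 * starts before the end of the fragment, so the 50 duplicate
	 * bytes (150:200) are trimmed off its front with m_adj(), the
	 * mbuf chains are concatenated, and the merged entry covers
	 * 100:300, with the 50 bytes counted in rcvpartdupbyte.
	 */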
	/*
	 * Allocate a new queue entry since the received segment did not
	 * collapse onto any other out-of-order block; thus we are allocating
	 * a new block.  If it had collapsed, tiqe would not be NULL and
	 * we would be reusing it.
	 * XXX If we can't, just drop the packet.  XXX
	 */
	if (tiqe == NULL) {
		tiqe = pool_get(&ipqent_pool, PR_NOWAIT);
		if (tiqe == NULL) {
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Update the counters.
	 */
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += rcvoobyte;
	if (rcvpartdupbyte) {
		tcpstat.tcps_rcvpartduppack++;
		tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte;
	}

	/*
	 * Insert the new fragment queue entry into both queues.
	 */
	tiqe->ipqe_m = m;
	tiqe->ipqe_seq = pkt_seq;
	tiqe->ipqe_len = pkt_len;
	tiqe->ipqe_flags = pkt_flags;
	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
#ifdef TCPREASS_DEBUG
		if (tiqe->ipqe_seq != tp->rcv_nxt)
			printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
			       tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
	} else {
		LIST_INSERT_AFTER(p, tiqe, ipqe_q);
#ifdef TCPREASS_DEBUG
		printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
		       tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
		       p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
#endif
	}

	LIST_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = tp->segq.lh_first;
	if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
		return (0);

	tp->rcv_nxt += q->ipqe_len;
	pkt_flags = q->ipqe_flags & TH_FIN;
	ND6_HINT(tp);

	LIST_REMOVE(q, ipqe_q);
	LIST_REMOVE(q, ipqe_timeq);
	if (so->so_state & SS_CANTRCVMORE)
		m_freem(q->ipqe_m);
	else
		sbappend(&so->so_rcv, q->ipqe_m);
	pool_put(&ipqent_pool, q);
	sorwakeup(so);
	return (pkt_flags);
}

#ifdef INET6
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	struct mbuf *m = *mp;

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * is there a better place to put this?
	 */
	if (m->m_flags & M_ANYCAST6) {
		struct ip6_hdr *ip6;
		if (m->m_len < sizeof(struct ip6_hdr)) {
			if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
				tcpstat.tcps_rcvshort++;
				return IPPROTO_DONE;
			}
		}
		ip6 = mtod(m, struct ip6_hdr *);
		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
		    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
#if __STDC__
tcp_input(struct mbuf *m, ...)
#else
tcp_input(m, va_alist)
	struct mbuf *m;
#endif
{
	int proto;
	struct tcphdr *th;
	struct ip *ip;
	struct inpcb *inp;
#ifdef INET6
	struct ip6_hdr *ip6;
	struct in6pcb *in6p;
#endif
	caddr_t optp = NULL;
	int optlen = 0;
	int len, tlen, toff, hdroptlen = 0;
	struct tcpcb *tp = 0;
	int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	short ostate = 0;
	int iss = 0;
	u_long tiwin;
	struct tcp_opt_info opti;
	int off, iphlen;
	va_list ap;
	int af;		/* af on the wire */
	struct mbuf *tcp_saveti = NULL;

	va_start(ap, m);
	toff = va_arg(ap, int);
	proto = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	bzero(&opti, sizeof(opti));
	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
	 *
	 * TCP is, by definition, unicast, so we reject all
	 * multicast outright.
	 *
	 * Note, there are additional src/dst address checks in
	 * the AF-specific code below.
	 */
	if (m->m_flags & (M_BCAST|M_MCAST)) {
		/* XXX stat */
		goto drop;
	}
#ifdef INET6
	if (m->m_flags & M_ANYCAST6) {
		/* XXX stat */
		goto drop;
	}
#endif

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	ip = mtod(m, struct ip *);
#ifdef INET6
	ip6 = NULL;
#endif
	switch (ip->ip_v) {
#ifdef INET
	case 4:
		af = AF_INET;
		iphlen = sizeof(struct ip);
#ifndef PULLDOWN_TEST
		/* would like to get rid of this... */
		if (toff > sizeof (struct ip)) {
			ip_stripoptions(m, (struct mbuf *)0);
			toff = sizeof(struct ip);
		}
		if (m->m_len < toff + sizeof (struct tcphdr)) {
			if ((m = m_pullup(m, toff + sizeof (struct tcphdr))) == 0) {
				tcpstat.tcps_rcvshort++;
				return;
			}
		}
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(mtod(m, caddr_t) + toff);
#else
		ip = mtod(m, struct ip *);
		IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
			sizeof(struct tcphdr));
		if (th == NULL) {
			tcpstat.tcps_rcvshort++;
			return;
		}
#endif

		/*
		 * Make sure destination address is not multicast.
		 * Source address checked in ip_input().
		 */
		if (IN_MULTICAST(ip->ip_dst.s_addr)) {
			/* XXX stat */
			goto drop;
		}

		/* We do the checksum after PCB lookup... */
		len = ip->ip_len;
		tlen = len - toff;
		break;
#endif
#ifdef INET6
	case 6:
		ip = NULL;
		iphlen = sizeof(struct ip6_hdr);
		af = AF_INET6;
#ifndef PULLDOWN_TEST
		if (m->m_len < toff + sizeof(struct tcphdr)) {
			m = m_pullup(m, toff + sizeof(struct tcphdr));	/*XXX*/
			if (m == NULL) {
				tcpstat.tcps_rcvshort++;
				return;
			}
		}
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(mtod(m, caddr_t) + toff);
#else
		ip6 = mtod(m, struct ip6_hdr *);
		IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
			sizeof(struct tcphdr));
		if (th == NULL) {
			tcpstat.tcps_rcvshort++;
			return;
		}
#endif

		/* Be proactive about malicious use of IPv4 mapped address */
		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate an unbound/unconnected pcb,
		 * an unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}

		/*
		 * Make sure destination address is not multicast.
		 * Source address checked in ip6_input().
		 */
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
			/* XXX stat */
			goto drop;
		}

		/* We do the checksum after PCB lookup... */
		len = m->m_pkthdr.len;
		tlen = len - toff;
		break;
#endif
	default:
		m_freem(m);
		return;
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
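	/*
	 * (th_off counts 32-bit words, so the shift above yields the
	 * header length in bytes: the minimum legal value of 5 gives a
	 * 20-byte header, the maximum of 15 gives 60 bytes, leaving at
	 * most 40 bytes of options.)
	 */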
	/*
	 * tcp_input() has been modified to use tlen to mean the TCP data
	 * length throughout the function.  Other functions can use
	 * m->m_pkthdr.len as the basis for calculating the TCP data length.
	 * rja
	 */

	if (off > sizeof (struct tcphdr)) {
#ifndef PULLDOWN_TEST
		if (m->m_len < toff + off) {
			if ((m = m_pullup(m, toff + off)) == 0) {
				tcpstat.tcps_rcvshort++;
				return;
			}
			switch (af) {
#ifdef INET
			case AF_INET:
				ip = mtod(m, struct ip *);
				break;
#endif
#ifdef INET6
			case AF_INET6:
				ip6 = mtod(m, struct ip6_hdr *);
				break;
#endif
			}
			th = (struct tcphdr *)(mtod(m, caddr_t) + toff);
		}
#else
		IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off);
		if (th == NULL) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		/*
		 * NOTE: ip/ip6 will not be affected by m_pulldown()
		 * (as they're before toff) and we don't need to update those.
		 */
#endif
		optlen = off - sizeof (struct tcphdr);
		optp = ((caddr_t)th) + sizeof(struct tcphdr);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		     (optlen > TCPOLEN_TSTAMP_APPA &&
		      optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
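	/*
	 * (The RFC 1323 appendix A layout tested above is NOP, NOP,
	 * TIMESTAMP, length 10: one 32-bit word, TCPOPT_TSTAMP_HDR,
	 * followed by the 4-byte timestamp value and the 4-byte echo
	 * reply, 12 bytes (TCPOLEN_TSTAMP_APPA) in all.)
	 */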
	tiflags = th->th_flags;

	/*
	 * Locate pcb for segment.
	 */
findpcb:
	inp = NULL;
#ifdef INET6
	in6p = NULL;
#endif
	switch (af) {
#ifdef INET
	case AF_INET:
		inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport);
		if (inp == 0) {
			++tcpstat.tcps_pcbhashmiss;
			inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport);
		}
#ifdef INET6
		if (inp == 0) {
			struct in6_addr s, d;

			/* mapped addr case */
			bzero(&s, sizeof(s));
			s.s6_addr16[5] = htons(0xffff);
			bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src));
			bzero(&d, sizeof(d));
			d.s6_addr16[5] = htons(0xffff);
			bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst));
			in6p = in6_pcblookup_connect(&tcb6, &s, th->th_sport,
			    &d, th->th_dport, 0);
			if (in6p == 0) {
				++tcpstat.tcps_pcbhashmiss;
				in6p = in6_pcblookup_bind(&tcb6, &d,
				    th->th_dport, 0);
			}
		}
#endif
#ifndef INET6
		if (inp == 0)
#else
		if (inp == 0 && in6p == 0)
#endif
		{
			++tcpstat.tcps_noport;
			if (tcp_log_refused && (tiflags & TH_SYN)) {
#ifndef INET6
				char src[4*sizeof "123"];
				char dst[4*sizeof "123"];
#else
				char src[INET6_ADDRSTRLEN];
				char dst[INET6_ADDRSTRLEN];
#endif
				if (ip) {
					strcpy(src, inet_ntoa(ip->ip_src));
					strcpy(dst, inet_ntoa(ip->ip_dst));
				}
#ifdef INET6
				else if (ip6) {
					strcpy(src, ip6_sprintf(&ip6->ip6_src));
					strcpy(dst, ip6_sprintf(&ip6->ip6_dst));
				}
#endif
				else {
					strcpy(src, "(unknown)");
					strcpy(dst, "(unknown)");
				}
				log(LOG_INFO,
				    "Connection attempt to TCP %s:%d from %s:%d\n",
				    dst, ntohs(th->th_dport),
				    src, ntohs(th->th_sport));
			}
			TCP_FIELDS_TO_HOST(th);
			goto dropwithreset_ratelim;
		}
#ifdef IPSEC
		if (inp && ipsec4_in_reject(m, inp)) {
			ipsecstat.in_polvio++;
			goto drop;
		}
#ifdef INET6
		else if (in6p && ipsec4_in_reject_so(m, in6p->in6p_socket)) {
			ipsecstat.in_polvio++;
			goto drop;
		}
#endif
#endif /*IPSEC*/
		break;
#endif /*INET*/
#ifdef INET6
	case AF_INET6:
	    {
		int faith;

#if defined(NFAITH) && NFAITH > 0
		faith = faithprefix(&ip6->ip6_dst);
#else
		faith = 0;
#endif
		in6p = in6_pcblookup_connect(&tcb6, &ip6->ip6_src, th->th_sport,
		    &ip6->ip6_dst, th->th_dport, faith);
		if (in6p == NULL) {
			++tcpstat.tcps_pcbhashmiss;
			in6p = in6_pcblookup_bind(&tcb6, &ip6->ip6_dst,
			    th->th_dport, faith);
		}
		if (in6p == NULL) {
			++tcpstat.tcps_noport;
			TCP_FIELDS_TO_HOST(th);
			goto dropwithreset_ratelim;
		}
#ifdef IPSEC
		if (ipsec6_in_reject(m, in6p)) {
			ipsec6stat.in_polvio++;
			goto drop;
		}
#endif /*IPSEC*/
		break;
	    }
#endif
	}

	/*
	 * If the state is CLOSED (i.e., TCB does not exist) then
	 * all data in the incoming segment is discarded.
	 * If the TCB exists but is in CLOSED state, it is embryonic,
	 * but should either do a listen or a connect soon.
	 */
	tp = NULL;
	so = NULL;
	if (inp) {
		tp = intotcpcb(inp);
		so = inp->inp_socket;
	}
#ifdef INET6
	else if (in6p) {
		tp = in6totcpcb(in6p);
		so = in6p->in6p_socket;
	}
#endif
	if (tp == 0) {
		TCP_FIELDS_TO_HOST(th);
		goto dropwithreset_ratelim;
	}
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/*
	 * Checksum extended TCP header and data.
	 */
	switch (af) {
#ifdef INET
	case AF_INET:
		switch (m->m_pkthdr.csum_flags &
			((m->m_pkthdr.rcvif->if_csum_flags & M_CSUM_TCPv4) |
			 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
		case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD:
			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
			goto badcsum;

		case M_CSUM_TCPv4|M_CSUM_DATA:
			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
			if ((m->m_pkthdr.csum_data ^ 0xffff) != 0)
				goto badcsum;
			break;

		case M_CSUM_TCPv4:
			/* Checksum was okay. */
			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
			break;

		default:
			/* Must compute it ourselves. */
			TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
#ifndef PULLDOWN_TEST
		    {
			struct ipovly *ipov;
			ipov = (struct ipovly *)ip;
			bzero(ipov->ih_x1, sizeof ipov->ih_x1);
			ipov->ih_len = htons(tlen + off);

			if (in_cksum(m, len) != 0)
				goto badcsum;
		    }
#else
			if (in4_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0)
				goto badcsum;
#endif /* ! PULLDOWN_TEST */
			break;
		}
		break;
#endif /* INET */

#ifdef INET6
	case AF_INET6:
		if (in6_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0)
			goto badcsum;
		break;
#endif /* INET6 */
	}

	TCP_FIELDS_TO_HOST(th);

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
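	/*
	 * (For example, a received th_win of 0xffff with snd_scale 3
	 * yields an effective window of 65535 << 3 = 524280 bytes.  SYN
	 * segments are never scaled, per RFC 1323, hence the TH_SYN test.)
	 */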
#ifdef INET6
	/* save packet options if user wanted */
	if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) {
		if (in6p->in6p_options) {
			m_freem(in6p->in6p_options);
			in6p->in6p_options = 0;
		}
		ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m);
	}
#endif

	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		union syn_cache_sa src;
		union syn_cache_sa dst;

		bzero(&src, sizeof(src));
		bzero(&dst, sizeof(dst));
		switch (af) {
#ifdef INET
		case AF_INET:
			src.sin.sin_len = sizeof(struct sockaddr_in);
			src.sin.sin_family = AF_INET;
			src.sin.sin_addr = ip->ip_src;
			src.sin.sin_port = th->th_sport;

			dst.sin.sin_len = sizeof(struct sockaddr_in);
			dst.sin.sin_family = AF_INET;
			dst.sin.sin_addr = ip->ip_dst;
			dst.sin.sin_port = th->th_dport;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
			src.sin6.sin6_family = AF_INET6;
			src.sin6.sin6_addr = ip6->ip6_src;
			src.sin6.sin6_port = th->th_sport;

			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
			dst.sin6.sin6_family = AF_INET6;
			dst.sin6.sin6_addr = ip6->ip6_dst;
			dst.sin6.sin6_port = th->th_dport;
			break;
#endif /* INET6 */
		default:
			goto badsyn;	/*sanity*/
		}

		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;

			tcp_saveti = NULL;
			if (iphlen + sizeof(struct tcphdr) > MHLEN)
				goto nosave;

			if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) {
				tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT);
				if (!tcp_saveti)
					goto nosave;
			} else {
				MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER);
				if (!tcp_saveti)
					goto nosave;
				tcp_saveti->m_len = iphlen;
				m_copydata(m, 0, iphlen,
				    mtod(tcp_saveti, caddr_t));
			}

			if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) {
				m_freem(tcp_saveti);
				tcp_saveti = NULL;
			} else {
				tcp_saveti->m_len += sizeof(struct tcphdr);
				bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen,
				    sizeof(struct tcphdr));
			}
			if (tcp_saveti) {
				/*
				 * need to recover version # field, which was
				 * overwritten on ip_cksum computation.
				 */
				struct ip *sip;
				sip = mtod(tcp_saveti, struct ip *);
				switch (af) {
#ifdef INET
				case AF_INET:
					sip->ip_v = 4;
					break;
#endif
#ifdef INET6
				case AF_INET6:
					sip->ip_v = 6;
					break;
#endif
				}
			}
	nosave:;
		}
		if (so->so_options & SO_ACCEPTCONN) {
			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				if (tiflags & TH_RST) {
					syn_cache_reset(&src.sa, &dst.sa, th);
				} else if ((tiflags & (TH_ACK|TH_SYN)) ==
				    (TH_ACK|TH_SYN)) {
					/*
					 * Received a SYN,ACK.  This should
					 * never happen while we are in
					 * LISTEN.  Send an RST.
					 */
					goto badsyn;
				} else if (tiflags & TH_ACK) {
					so = syn_cache_get(&src.sa, &dst.sa,
						th, toff, tlen, so, m);
					if (so == NULL) {
						/*
						 * We don't have a SYN for
						 * this ACK; send an RST.
						 */
						goto badsyn;
					} else if (so ==
					    (struct socket *)(-1)) {
						/*
						 * We were unable to create
						 * the connection.  If the
						 * 3-way handshake was
						 * completed, an RST has
						 * been sent to the peer.
						 * Since the mbuf might be
						 * in use for the reply,
						 * do not free it.
						 */
						m = NULL;
					} else {
						/*
						 * We have created a
						 * full-blown connection.
						 */
						tp = NULL;
						inp = NULL;
#ifdef INET6
						in6p = NULL;
#endif
						switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET
						case AF_INET:
							inp = sotoinpcb(so);
							tp = intotcpcb(inp);
							break;
#endif
#ifdef INET6
						case AF_INET6:
							in6p = sotoin6pcb(so);
							tp = in6totcpcb(in6p);
							break;
#endif
						}
						if (tp == NULL)
							goto badsyn;	/*XXX*/
						tiwin <<= tp->snd_scale;
						goto after_listen;
					}
				} else {
					/*
					 * None of RST, SYN or ACK was set.
					 * This is an invalid packet for a
					 * TCB in LISTEN state.  Send an RST.
					 */
					goto badsyn;
				}
			} else {
				/*
				 * Received a SYN.
				 */

				/*
				 * LISTEN socket received a SYN
				 * from itself?  This can't possibly
				 * be valid; drop the packet.
				 */
				if (th->th_sport == th->th_dport) {
					int i;

					switch (af) {
#ifdef INET
					case AF_INET:
						i = in_hosteq(ip->ip_src, ip->ip_dst);
						break;
#endif
#ifdef INET6
					case AF_INET6:
						i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst);
						break;
#endif
					default:
						i = 1;
					}
					if (i) {
						tcpstat.tcps_badsyn++;
						goto drop;
					}
				}

				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen <= so->so_qlimit &&
				    syn_cache_add(&src.sa, &dst.sa, th, tlen,
						so, m, optp, optlen, &opti))
					m = NULL;
			}
			goto drop;
		}
	}

after_listen:
#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_idle = 0;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	/*
	 * Process options.
	 */
	if (optp)
		tcp_dooptions(tp, optp, optlen, th, &opti);

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
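	/*
	 * (For example, during a one-way bulk receive, each in-order
	 * segment arrives with only TH_ACK set, th_seq == rcv_nxt, an
	 * unchanged window, and an ack that doesn't move, so it takes
	 * the tcps_preddat path below; the mirror-image pure-ack case on
	 * the sending side takes the tcps_predack path.)
	 */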
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 */
		if (opti.ts_present &&
		    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
		    SEQ_LT(tp->last_ack_sent, th->th_seq + tlen)) {
			tp->ts_recent_age = TCP_TIMESTAMP(tp);
			tp->ts_recent = opti.ts_val;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks < tcprexmtthresh) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present && opti.ts_ecr)
					tcp_xmit_timer(tp,
					    TCP_TIMESTAMP(tp) - opti.ts_ecr + 1);
				else if (tp->t_rtt &&
				    SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, tp->t_rtt);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				ND6_HINT(tp);
				sbdrop(&so->so_snd, acked);
				/*
				 * We want snd_recover to track snd_una to
				 * avoid sequence wraparound problems for
				 * very large transfers.
				 */
				tp->snd_una = tp->snd_recover = th->th_ack;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp,
				    TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT,
					    tp->t_rxtcur);

				sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				if (tcp_saveti)
					m_freem(tcp_saveti);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    tp->segq.lh_first == NULL &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * this is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			m_adj(m, toff + off);
			sbappend(&so->so_rcv, m);
			sorwakeup(so);
			TCP_SETUP_ACK(tp, th);
			if (tp->t_flags & TF_ACKNOW)
				(void) tcp_output(tp);
			if (tcp_saveti)
				m_freem(tcp_saveti);
			return;
		}
	}

	/*
	 * Compute mbuf offset to TCP data segment.
	 */
	hdroptlen = toff + off;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		     SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = tp->snd_recover = th->th_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
		}
		tp->irs = th->th_seq;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		tcp_mss_from_peer(tp, opti.maxseg);

		/*
		 * Initialize the initial congestion window.  If we
		 * had to retransmit the SYN, we must initialize cwnd
		 * to 1 segment (i.e. the Loss Window).
		 */
		if (tp->t_flags & TF_SYN_REXMT)
			tp->snd_cwnd = tp->t_peermss;
		else
			tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win,
			    tp->t_peermss);

		tcp_rmx_rtt(tp);
		if (tiflags & TH_ACK) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tcp_established(tp);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			TCP_REASS_LOCK(tp);
			(void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
			TCP_REASS_UNLOCK(tp);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtt)
				tcp_xmit_timer(tp, tp->t_rtt);
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		goto step6;

	/*
	 * If the state is SYN_RECEIVED:
	 *	If seg contains an ACK, but not for our SYN, drop the input
	 *	and generate an RST.  See page 36, rfc793
	 */
	case TCPS_SYN_RECEIVED:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		     SEQ_GT(th->th_ack, tp->snd_max)))
			goto dropwithreset;
		break;
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(TCP_TIMESTAMP(tp) - tp->ts_recent_age) >
		    TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}
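	/*
	 * (PAWS example, illustrative values: if ts_recent is 1000 and a
	 * non-RST segment arrives carrying ts_val 900, TSTMP_LT(900, 1000)
	 * holds and the segment is dropped, with an ACK, as a duplicate
	 * from an earlier window, unless ts_recent is more than 24 days
	 * stale and was just invalidated above.)
	 */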
	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else {
				tiflags &= ~TH_URG;
				th->th_urp = 0;
			}
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the window.
			 * At this point the FIN must be a duplicate or
			 * out of sequence; drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send an ACK to resynchronize and drop any data.
			 * But keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat.tcps_rcvdupbyte += todrop;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		hdroptlen += todrop;	/* drop from head afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= tlen) {
			tcpstat.tcps_rcvbyteafterwin += tlen;
			/*
			 * If a new connection request is received
			 * while in TIME_WAIT, drop the old connection
			 * and start over if the sequence numbers
			 * are above the previous ones.
			 */
			if (tiflags & TH_SYN &&
			    tp->t_state == TCPS_TIME_WAIT &&
			    SEQ_GT(th->th_seq, tp->rcv_nxt)) {
				iss = tcp_new_iss(tp, tp->snd_nxt);
				tp = tcp_close(tp);
				goto findpcb;
			}
			/*
			 * If the window is closed we can only take
			 * segments at the window edge, and have to drop
			 * data and PUSH from incoming segments.  Continue
			 * processing, but remember to ack.  Otherwise,
			 * drop segment and ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * and the timestamp is newer, record it.
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
	    SEQ_LT(tp->last_ack_sent, th->th_seq + tlen +
		   ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
		tp->ts_recent_age = TCP_TIMESTAMP(tp);
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) switch (tp->t_state) {

	case TCPS_SYN_RECEIVED:
		so->so_error = ECONNREFUSED;
		goto close;

	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
		so->so_error = ECONNRESET;
	close:
		tp->t_state = TCPS_CLOSED;
		tcpstat.tcps_drops++;
		tp = tcp_close(tp);
		goto drop;

	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
		tp = tcp_close(tp);
		goto drop;
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we send an RST and drop the connection.
	 */
	if (tiflags & TH_SYN) {
		tp = tcp_drop(tp, ECONNRESET);
		goto dropwithreset;
	}

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
	 * ESTABLISHED state and continue processing, otherwise
	 * send an RST.
	 */
	case TCPS_SYN_RECEIVED:
		if (SEQ_GT(tp->snd_una, th->th_ack) ||
		    SEQ_GT(th->th_ack, tp->snd_max))
			goto dropwithreset;
		tcpstat.tcps_connects++;
		soisconnected(so);
		tcp_established(tp);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
		}
		TCP_REASS_LOCK(tp);
		(void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
		TCP_REASS_UNLOCK(tp);
		tp->snd_wl1 = th->th_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:

		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
			if (tlen == 0 && tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
				    th->th_ack != tp->snd_una)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_int win =
					    min(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_segsz;
					if (tcp_do_newreno && SEQ_LT(th->th_ack,
					    tp->snd_recover)) {
						/*
						 * False fast retransmit after
						 * timeout.  Do not cut window.
						 */
						tp->snd_cwnd += tp->t_segsz;
						tp->t_dupacks = 0;
						(void) tcp_output(tp);
						goto drop;
					}

					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_segsz;
					tp->snd_recover = tp->snd_max;
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtt = 0;
					tp->snd_nxt = th->th_ack;
					tp->snd_cwnd = tp->t_segsz;
					(void) tcp_output(tp);
					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_segsz * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_segsz;
					(void) tcp_output(tp);
					goto drop;
				}
			} else
				tp->t_dupacks = 0;
			break;
		}
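		/*
		 * (Worked example with illustrative numbers: with t_segsz
		 * 1460 and min(snd_wnd, snd_cwnd) of 16 segments, the
		 * third duplicate ack above sets snd_ssthresh to 8
		 * segments, retransmits one segment with snd_cwnd
		 * momentarily set to a single segment, then reopens
		 * snd_cwnd to ssthresh plus 3 segments; each further
		 * duplicate ack inflates it by one more segment.)
		 */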
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
		if (tcp_do_newreno == 0) {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    tp->snd_cwnd > tp->snd_ssthresh)
				tp->snd_cwnd = tp->snd_ssthresh;
			tp->t_dupacks = 0;
		} else if (tp->t_dupacks >= tcprexmtthresh &&
		    tcp_newreno(tp, th) == 0) {
			tp->snd_cwnd = tp->snd_ssthresh;
			/*
			 * Window inflation should have left us with approx.
			 * snd_ssthresh outstanding data.  But in case we
			 * would be inclined to send a burst, better to do
			 * it via the slow start mechanism.
			 */
			if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
				tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
				    + tp->t_segsz;
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(th->th_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack;
		}
		acked = th->th_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present && opti.ts_ecr)
			tcp_xmit_timer(tp, TCP_TIMESTAMP(tp) - opti.ts_ecr + 1);
		else if (tp->t_rtt && SEQ_GT(th->th_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tp->t_rtt);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (th->th_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			needoutput = 1;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (segsz per packet).
		 * Otherwise open linearly: segsz per window
		 * (segsz^2 / cwnd per packet).
		 */
		{
		u_int cw = tp->snd_cwnd;
		u_int incr = tp->t_segsz;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
		if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover))
			tp->snd_cwnd = min(cw + incr,
			    TCP_MAXWIN << tp->snd_scale);
		}
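		/*
		 * (For example, with t_segsz 1460: below ssthresh each
		 * ack grows snd_cwnd by a full 1460 bytes, doubling the
		 * window every round trip; once snd_cwnd reaches, say,
		 * 64 segments, each ack adds 1460*1460/93440 = 22 bytes,
		 * i.e. roughly one segment per window per round trip.)
		 */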
		ND6_HINT(tp);
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}
		sowwakeup(so);
		/*
		 * We want snd_recover to track snd_una to
		 * avoid sequence wraparound problems for
		 * very large transfers.
		 */
		tp->snd_una = tp->snd_recover = th->th_ack;
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					if (tcp_maxidle > 0)
						TCP_TIMER_ARM(tp, TCPT_2MSL,
						    tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1974 */ 1975 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 1976 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) || 1977 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) { 1978 /* keep track of pure window updates */ 1979 if (tlen == 0 && 1980 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1981 tcpstat.tcps_rcvwinupd++; 1982 tp->snd_wnd = tiwin; 1983 tp->snd_wl1 = th->th_seq; 1984 tp->snd_wl2 = th->th_ack; 1985 if (tp->snd_wnd > tp->max_sndwnd) 1986 tp->max_sndwnd = tp->snd_wnd; 1987 needoutput = 1; 1988 } 1989 1990 /* 1991 * Process segments with URG. 1992 */ 1993 if ((tiflags & TH_URG) && th->th_urp && 1994 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1995 /* 1996 * This is a kludge, but if we receive and accept 1997 * random urgent pointers, we'll crash in 1998 * soreceive. It's hard to imagine someone 1999 * actually wanting to send this much urgent data. 2000 */ 2001 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2002 th->th_urp = 0; /* XXX */ 2003 tiflags &= ~TH_URG; /* XXX */ 2004 goto dodata; /* XXX */ 2005 } 2006 /* 2007 * If this segment advances the known urgent pointer, 2008 * then mark the data stream. This should not happen 2009 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2010 * a FIN has been received from the remote side. 2011 * In these states we ignore the URG. 2012 * 2013 * According to RFC961 (Assigned Protocols), 2014 * the urgent pointer points to the last octet 2015 * of urgent data. We continue, however, 2016 * to consider it to indicate the first octet 2017 * of data past the urgent section as the original 2018 * spec states (in one of two places). 2019 */ 2020 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2021 tp->rcv_up = th->th_seq + th->th_urp; 2022 so->so_oobmark = so->so_rcv.sb_cc + 2023 (tp->rcv_up - tp->rcv_nxt) - 1; 2024 if (so->so_oobmark == 0) 2025 so->so_state |= SS_RCVATMARK; 2026 sohasoutofband(so); 2027 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2028 } 2029 /* 2030 * Remove out of band data so doesn't get presented to user. 2031 * This can happen independent of advancing the URG pointer, 2032 * but if two URG's are pending at once, some out-of-band 2033 * data may creep in... ick. 2034 */ 2035 if (th->th_urp <= (u_int16_t) tlen 2036 #ifdef SO_OOBINLINE 2037 && (so->so_options & SO_OOBINLINE) == 0 2038 #endif 2039 ) 2040 tcp_pulloutofband(so, th, m, hdroptlen); 2041 } else 2042 /* 2043 * If no out of band data is expected, 2044 * pull receive urgent pointer along 2045 * with the receive window. 2046 */ 2047 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2048 tp->rcv_up = tp->rcv_nxt; 2049 dodata: /* XXX */ 2050 2051 /* 2052 * Process the segment text, merging it into the TCP sequencing queue, 2053 * and arranging for acknowledgement of receipt if necessary. 2054 * This process logically involves adjusting tp->rcv_wnd as data 2055 * is presented to the user (this happens in tcp_usrreq.c, 2056 * case PRU_RCVD). If a FIN has already been received on this 2057 * connection then we just ignore the text. 2058 */ 2059 if ((tlen || (tiflags & TH_FIN)) && 2060 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2061 /* 2062 * Insert segment ti into reassembly queue of tcp with 2063 * control block tp. Return TH_FIN if reassembly now includes 2064 * a segment with FIN. The macro form does the common case 2065 * inline (segment is the next to be received on an 2066 * established connection, and the queue is empty), 2067 * avoiding linkage into and removal from the queue and 2068 * repetition of various conversions. 
2069 * Set DELACK for segments received in order, but ack 2070 * immediately when segments are out of order 2071 * (so fast retransmit can work). 2072 */ 2073 /* NOTE: this was TCP_REASS() macro, but used only once */ 2074 TCP_REASS_LOCK(tp); 2075 if (th->th_seq == tp->rcv_nxt && 2076 tp->segq.lh_first == NULL && 2077 tp->t_state == TCPS_ESTABLISHED) { 2078 TCP_SETUP_ACK(tp, th); 2079 tp->rcv_nxt += tlen; 2080 tiflags = th->th_flags & TH_FIN; 2081 tcpstat.tcps_rcvpack++; 2082 tcpstat.tcps_rcvbyte += tlen; 2083 ND6_HINT(tp); 2084 m_adj(m, hdroptlen); 2085 sbappend(&(so)->so_rcv, m); 2086 sorwakeup(so); 2087 } else { 2088 m_adj(m, hdroptlen); 2089 tiflags = tcp_reass(tp, th, m, &tlen); 2090 tp->t_flags |= TF_ACKNOW; 2091 } 2092 TCP_REASS_UNLOCK(tp); 2093 2094 /* 2095 * Note the amount of data that peer has sent into 2096 * our window, in order to estimate the sender's 2097 * buffer size. 2098 */ 2099 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2100 } else { 2101 m_freem(m); 2102 m = NULL; 2103 tiflags &= ~TH_FIN; 2104 } 2105 2106 /* 2107 * If FIN is received ACK the FIN and let the user know 2108 * that the connection is closing. Ignore a FIN received before 2109 * the connection is fully established. 2110 */ 2111 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2112 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2113 socantrcvmore(so); 2114 tp->t_flags |= TF_ACKNOW; 2115 tp->rcv_nxt++; 2116 } 2117 switch (tp->t_state) { 2118 2119 /* 2120 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2121 */ 2122 case TCPS_ESTABLISHED: 2123 tp->t_state = TCPS_CLOSE_WAIT; 2124 break; 2125 2126 /* 2127 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2128 * enter the CLOSING state. 2129 */ 2130 case TCPS_FIN_WAIT_1: 2131 tp->t_state = TCPS_CLOSING; 2132 break; 2133 2134 /* 2135 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2136 * starting the time-wait timer, turning off the other 2137 * standard timers. 2138 */ 2139 case TCPS_FIN_WAIT_2: 2140 tp->t_state = TCPS_TIME_WAIT; 2141 tcp_canceltimers(tp); 2142 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2143 soisdisconnected(so); 2144 break; 2145 2146 /* 2147 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2148 */ 2149 case TCPS_TIME_WAIT: 2150 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2151 break; 2152 } 2153 } 2154 #ifdef TCP_DEBUG 2155 if (so->so_options & SO_DEBUG) 2156 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); 2157 #endif 2158 2159 /* 2160 * Return any desired output. 2161 */ 2162 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2163 (void) tcp_output(tp); 2164 if (tcp_saveti) 2165 m_freem(tcp_saveti); 2166 return; 2167 2168 badsyn: 2169 /* 2170 * Received a bad SYN. Increment counters and dropwithreset. 2171 */ 2172 tcpstat.tcps_badsyn++; 2173 tp = NULL; 2174 goto dropwithreset; 2175 2176 dropafterack: 2177 /* 2178 * Generate an ACK dropping incoming segment if it occupies 2179 * sequence space, where the ACK reflects our state. 2180 */ 2181 if (tiflags & TH_RST) 2182 goto drop; 2183 m_freem(m); 2184 tp->t_flags |= TF_ACKNOW; 2185 (void) tcp_output(tp); 2186 if (tcp_saveti) 2187 m_freem(tcp_saveti); 2188 return; 2189 2190 dropwithreset_ratelim: 2191 /* 2192 * We may want to rate-limit RSTs in certain situations, 2193 * particularly if we are sending an RST in response to 2194 * an attempt to connect to or otherwise communicate with 2195 * a port for which we have no socket. 
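	 *
	 * ppsratecheck() enforces a simple packets-per-second budget: it
	 * returns non-zero while fewer than tcp_rst_ppslim packets have
	 * been sent in the current one-second window, so (for example)
	 * with tcp_rst_ppslim set to 100, at most 100 RSTs go out per
	 * second and everything over budget is silently dropped, as in
	 * the check below.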
2196 */ 2197 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2198 tcp_rst_ppslim) == 0) { 2199 /* XXX stat */ 2200 goto drop; 2201 } 2202 /* ...fall into dropwithreset... */ 2203 2204 dropwithreset: 2205 /* 2206 * Generate a RST, dropping incoming segment. 2207 * Make ACK acceptable to originator of segment. 2208 */ 2209 if (tiflags & TH_RST) 2210 goto drop; 2211 { 2212 /* 2213 * need to recover version # field, which was overwritten on 2214 * ip_cksum computation. 2215 */ 2216 struct ip *sip; 2217 sip = mtod(m, struct ip *); 2218 switch (af) { 2219 #ifdef INET 2220 case AF_INET: 2221 sip->ip_v = 4; 2222 break; 2223 #endif 2224 #ifdef INET6 2225 case AF_INET6: 2226 sip->ip_v = 6; 2227 break; 2228 #endif 2229 } 2230 } 2231 if (tiflags & TH_ACK) 2232 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 2233 else { 2234 if (tiflags & TH_SYN) 2235 tlen++; 2236 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, 2237 TH_RST|TH_ACK); 2238 } 2239 if (tcp_saveti) 2240 m_freem(tcp_saveti); 2241 return; 2242 2243 badcsum: 2244 tcpstat.tcps_rcvbadsum++; 2245 drop: 2246 /* 2247 * Drop space held by incoming segment and return. 2248 */ 2249 if (tp) { 2250 if (tp->t_inpcb) 2251 so = tp->t_inpcb->inp_socket; 2252 #ifdef INET6 2253 else if (tp->t_in6pcb) 2254 so = tp->t_in6pcb->in6p_socket; 2255 #endif 2256 else 2257 so = NULL; 2258 #ifdef TCP_DEBUG 2259 if (so && (so->so_options & SO_DEBUG) != 0) 2260 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); 2261 #endif 2262 } 2263 if (tcp_saveti) 2264 m_freem(tcp_saveti); 2265 m_freem(m); 2266 return; 2267 } 2268 2269 void 2270 tcp_dooptions(tp, cp, cnt, th, oi) 2271 struct tcpcb *tp; 2272 u_char *cp; 2273 int cnt; 2274 struct tcphdr *th; 2275 struct tcp_opt_info *oi; 2276 { 2277 u_int16_t mss; 2278 int opt, optlen; 2279 2280 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2281 opt = cp[0]; 2282 if (opt == TCPOPT_EOL) 2283 break; 2284 if (opt == TCPOPT_NOP) 2285 optlen = 1; 2286 else { 2287 if (cnt < 2) 2288 break; 2289 optlen = cp[1]; 2290 if (optlen < 2 || optlen > cnt) 2291 break; 2292 } 2293 switch (opt) { 2294 2295 default: 2296 continue; 2297 2298 case TCPOPT_MAXSEG: 2299 if (optlen != TCPOLEN_MAXSEG) 2300 continue; 2301 if (!(th->th_flags & TH_SYN)) 2302 continue; 2303 bcopy(cp + 2, &mss, sizeof(mss)); 2304 oi->maxseg = ntohs(mss); 2305 break; 2306 2307 case TCPOPT_WINDOW: 2308 if (optlen != TCPOLEN_WINDOW) 2309 continue; 2310 if (!(th->th_flags & TH_SYN)) 2311 continue; 2312 tp->t_flags |= TF_RCVD_SCALE; 2313 tp->requested_s_scale = cp[2]; 2314 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { 2315 #if 0 /*XXX*/ 2316 char *p; 2317 2318 if (ip) 2319 p = ntohl(ip->ip_src); 2320 #ifdef INET6 2321 else if (ip6) 2322 p = ip6_sprintf(&ip6->ip6_src); 2323 #endif 2324 else 2325 p = "(unknown)"; 2326 log(LOG_ERR, "TCP: invalid wscale %d from %s, " 2327 "assuming %d\n", 2328 tp->requested_s_scale, p, 2329 TCP_MAX_WINSHIFT); 2330 #else 2331 log(LOG_ERR, "TCP: invalid wscale %d, " 2332 "assuming %d\n", 2333 tp->requested_s_scale, 2334 TCP_MAX_WINSHIFT); 2335 #endif 2336 tp->requested_s_scale = TCP_MAX_WINSHIFT; 2337 } 2338 break; 2339 2340 case TCPOPT_TIMESTAMP: 2341 if (optlen != TCPOLEN_TIMESTAMP) 2342 continue; 2343 oi->ts_present = 1; 2344 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2345 NTOHL(oi->ts_val); 2346 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2347 NTOHL(oi->ts_ecr); 2348 2349 /* 2350 * A timestamp received in a SYN makes 2351 * it ok to send timestamp requests and replies. 
2352 */ 2353 if (th->th_flags & TH_SYN) { 2354 tp->t_flags |= TF_RCVD_TSTMP; 2355 tp->ts_recent = oi->ts_val; 2356 tp->ts_recent_age = TCP_TIMESTAMP(tp); 2357 } 2358 break; 2359 case TCPOPT_SACK_PERMITTED: 2360 if (optlen != TCPOLEN_SACK_PERMITTED) 2361 continue; 2362 if (!(th->th_flags & TH_SYN)) 2363 continue; 2364 tp->t_flags &= ~TF_CANT_TXSACK; 2365 break; 2366 2367 case TCPOPT_SACK: 2368 if (tp->t_flags & TF_IGNR_RXSACK) 2369 continue; 2370 if (optlen % 8 != 2 || optlen < 10) 2371 continue; 2372 cp += 2; 2373 optlen -= 2; 2374 for (; optlen > 0; cp -= 8, optlen -= 8) { 2375 tcp_seq lwe, rwe; 2376 bcopy((char *)cp, (char *) &lwe, sizeof(lwe)); 2377 NTOHL(lwe); 2378 bcopy((char *)cp, (char *) &rwe, sizeof(rwe)); 2379 NTOHL(rwe); 2380 /* tcp_mark_sacked(tp, lwe, rwe); */ 2381 } 2382 break; 2383 } 2384 } 2385 } 2386 2387 /* 2388 * Pull out of band byte out of a segment so 2389 * it doesn't appear in the user's data queue. 2390 * It is still reflected in the segment length for 2391 * sequencing purposes. 2392 */ 2393 void 2394 tcp_pulloutofband(so, th, m, off) 2395 struct socket *so; 2396 struct tcphdr *th; 2397 struct mbuf *m; 2398 int off; 2399 { 2400 int cnt = off + th->th_urp - 1; 2401 2402 while (cnt >= 0) { 2403 if (m->m_len > cnt) { 2404 char *cp = mtod(m, caddr_t) + cnt; 2405 struct tcpcb *tp = sototcpcb(so); 2406 2407 tp->t_iobc = *cp; 2408 tp->t_oobflags |= TCPOOB_HAVEDATA; 2409 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2410 m->m_len--; 2411 return; 2412 } 2413 cnt -= m->m_len; 2414 m = m->m_next; 2415 if (m == 0) 2416 break; 2417 } 2418 panic("tcp_pulloutofband"); 2419 } 2420 2421 /* 2422 * Collect new round-trip time estimate 2423 * and update averages and current timeout. 2424 */ 2425 void 2426 tcp_xmit_timer(tp, rtt) 2427 struct tcpcb *tp; 2428 short rtt; 2429 { 2430 short delta; 2431 short rttmin; 2432 2433 tcpstat.tcps_rttupdated++; 2434 --rtt; 2435 if (tp->t_srtt != 0) { 2436 /* 2437 * srtt is stored as fixed point with 3 bits after the 2438 * binary point (i.e., scaled by 8). The following magic 2439 * is equivalent to the smoothing algorithm in rfc793 with 2440 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2441 * point). Adjust rtt to origin 0. 2442 */ 2443 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 2444 if ((tp->t_srtt += delta) <= 0) 2445 tp->t_srtt = 1 << 2; 2446 /* 2447 * We accumulate a smoothed rtt variance (actually, a 2448 * smoothed mean difference), then set the retransmit 2449 * timer to smoothed rtt + 4 times the smoothed variance. 2450 * rttvar is stored as fixed point with 2 bits after the 2451 * binary point (scaled by 4). The following is 2452 * equivalent to rfc793 smoothing with an alpha of .75 2453 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2454 * rfc793's wired-in beta. 2455 */ 2456 if (delta < 0) 2457 delta = -delta; 2458 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2459 if ((tp->t_rttvar += delta) <= 0) 2460 tp->t_rttvar = 1 << 2; 2461 } else { 2462 /* 2463 * No rtt measurement yet - use the unsmoothed rtt. 2464 * Set the variance to half the rtt (so our first 2465 * retransmit happens at 3*rtt). 2466 */ 2467 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 2468 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 2469 } 2470 tp->t_rtt = 0; 2471 tp->t_rxtshift = 0; 2472 2473 /* 2474 * the retransmit should happen at rtt + 4 * rttvar. 2475 * Because of the way we do the smoothing, srtt and rttvar 2476 * will each average +1/2 tick of bias. 
When we compute 2477 * the retransmit timer, we want 1/2 tick of rounding and 2478 * 1 extra tick because of +-1/2 tick uncertainty in the 2479 * firing of the timer. The bias will give us exactly the 2480 * 1.5 tick we need. But, because the bias is 2481 * statistical, we have to test that we don't drop below 2482 * the minimum feasible timer (which is 2 ticks). 2483 */ 2484 if (tp->t_rttmin > rtt + 2) 2485 rttmin = tp->t_rttmin; 2486 else 2487 rttmin = rtt + 2; 2488 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2489 2490 /* 2491 * We received an ack for a packet that wasn't retransmitted; 2492 * it is probably safe to discard any error indications we've 2493 * received recently. This isn't quite right, but close enough 2494 * for now (a route might have failed after we sent a segment, 2495 * and the return path might not be symmetrical). 2496 */ 2497 tp->t_softerror = 0; 2498 } 2499 2500 /* 2501 * Checks for partial ack. If partial ack arrives, force the retransmission 2502 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 2503 * 1. By setting snd_nxt to th_ack, this forces retransmission timer to 2504 * be started again. If the ack advances at least to tp->snd_recover, return 0. 2505 */ 2506 int 2507 tcp_newreno(tp, th) 2508 struct tcpcb *tp; 2509 struct tcphdr *th; 2510 { 2511 tcp_seq onxt = tp->snd_nxt; 2512 u_long ocwnd = tp->snd_cwnd; 2513 2514 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2515 /* 2516 * snd_una has not yet been updated and the socket's send 2517 * buffer has not yet drained off the ACK'd data, so we 2518 * have to leave snd_una as it was to get the correct data 2519 * offset in tcp_output(). 2520 */ 2521 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2522 tp->t_rtt = 0; 2523 tp->snd_nxt = th->th_ack; 2524 /* 2525 * Set snd_cwnd to one segment beyond ACK'd offset. snd_una 2526 * is not yet updated when we're called. 2527 */ 2528 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); 2529 (void) tcp_output(tp); 2530 tp->snd_cwnd = ocwnd; 2531 if (SEQ_GT(onxt, tp->snd_nxt)) 2532 tp->snd_nxt = onxt; 2533 /* 2534 * Partial window deflation. Relies on fact that tp->snd_una 2535 * not updated yet. 2536 */ 2537 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz); 2538 return 1; 2539 } 2540 return 0; 2541 } 2542 2543 2544 /* 2545 * TCP compressed state engine. Currently used to hold compressed 2546 * state for SYN_RECEIVED. 
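 *
 * Layout of the cache (a summary of the code below): each entry sits on
 * two queues at once --
 *
 *	tcp_syn_cache[hash % tcp_syn_cache_size]	hash bucket, for lookup
 *	tcp_syn_cache_timeq[sc_rxtshift]		timer queue, per backoff
 *
 * so a lookup costs one bucket scan, and because every entry on a given
 * timer queue has the same fixed timeout and is appended at the tail,
 * each timer queue stays sorted by expiry time.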
2547 */ 2548 2549 u_long syn_cache_count; 2550 u_int32_t syn_hash1, syn_hash2; 2551 2552 #define SYN_HASH(sa, sp, dp) \ 2553 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 2554 ((u_int32_t)(sp)))^syn_hash2))) 2555 #ifndef INET6 2556 #define SYN_HASHALL(hash, src, dst) \ 2557 do { \ 2558 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2559 ((struct sockaddr_in *)(src))->sin_port, \ 2560 ((struct sockaddr_in *)(dst))->sin_port); \ 2561 } while (0) 2562 #else 2563 #define SYN_HASH6(sa, sp, dp) \ 2564 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 2565 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 2566 & 0x7fffffff) 2567 2568 #define SYN_HASHALL(hash, src, dst) \ 2569 do { \ 2570 switch ((src)->sa_family) { \ 2571 case AF_INET: \ 2572 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2573 ((struct sockaddr_in *)(src))->sin_port, \ 2574 ((struct sockaddr_in *)(dst))->sin_port); \ 2575 break; \ 2576 case AF_INET6: \ 2577 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 2578 ((struct sockaddr_in6 *)(src))->sin6_port, \ 2579 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 2580 break; \ 2581 default: \ 2582 hash = 0; \ 2583 } \ 2584 } while (0) 2585 #endif /* INET6 */ 2586 2587 #define SYN_CACHE_RM(sc) \ 2588 do { \ 2589 LIST_REMOVE((sc), sc_bucketq); \ 2590 (sc)->sc_tp = NULL; \ 2591 LIST_REMOVE((sc), sc_tpq); \ 2592 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 2593 TAILQ_REMOVE(&tcp_syn_cache_timeq[(sc)->sc_rxtshift], (sc), sc_timeq); \ 2594 syn_cache_count--; \ 2595 } while (0) 2596 2597 #define SYN_CACHE_PUT(sc) \ 2598 do { \ 2599 if ((sc)->sc_ipopts) \ 2600 (void) m_free((sc)->sc_ipopts); \ 2601 if ((sc)->sc_route4.ro_rt != NULL) \ 2602 RTFREE((sc)->sc_route4.ro_rt); \ 2603 pool_put(&syn_cache_pool, (sc)); \ 2604 } while (0) 2605 2606 struct pool syn_cache_pool; 2607 2608 /* 2609 * We don't estimate RTT with SYNs, so each packet starts with the default 2610 * RTT and each timer queue has a fixed timeout value. This allows us to 2611 * optimize the timer queues somewhat. 2612 */ 2613 #define SYN_CACHE_TIMER_ARM(sc) \ 2614 do { \ 2615 TCPT_RANGESET((sc)->sc_rxtcur, \ 2616 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 2617 TCPTV_REXMTMAX); \ 2618 PRT_SLOW_ARM((sc)->sc_rexmt, (sc)->sc_rxtcur); \ 2619 } while (0) 2620 2621 TAILQ_HEAD(, syn_cache) tcp_syn_cache_timeq[TCP_MAXRXTSHIFT + 1]; 2622 2623 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase) 2624 2625 void 2626 syn_cache_init() 2627 { 2628 int i; 2629 2630 /* Initialize the hash buckets. */ 2631 for (i = 0; i < tcp_syn_cache_size; i++) 2632 LIST_INIT(&tcp_syn_cache[i].sch_bucket); 2633 2634 /* Initialize the timer queues. */ 2635 for (i = 0; i <= TCP_MAXRXTSHIFT; i++) 2636 TAILQ_INIT(&tcp_syn_cache_timeq[i]); 2637 2638 /* Initialize the syn cache pool. */ 2639 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 2640 "synpl", 0, NULL, NULL, M_PCB); 2641 } 2642 2643 void 2644 syn_cache_insert(sc, tp) 2645 struct syn_cache *sc; 2646 struct tcpcb *tp; 2647 { 2648 struct syn_cache_head *scp; 2649 struct syn_cache *sc2; 2650 int s, i; 2651 2652 /* 2653 * If there are no entries in the hash table, reinitialize 2654 * the hash secrets. 
2655 */ 2656 if (syn_cache_count == 0) { 2657 struct timeval tv; 2658 microtime(&tv); 2659 syn_hash1 = random() ^ (u_long)≻ 2660 syn_hash2 = random() ^ tv.tv_usec; 2661 } 2662 2663 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 2664 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 2665 scp = &tcp_syn_cache[sc->sc_bucketidx]; 2666 2667 /* 2668 * Make sure that we don't overflow the per-bucket 2669 * limit or the total cache size limit. 2670 */ 2671 s = splsoftnet(); 2672 if (scp->sch_length >= tcp_syn_bucket_limit) { 2673 tcpstat.tcps_sc_bucketoverflow++; 2674 /* 2675 * The bucket is full. Toss the oldest element in the 2676 * bucket. This will be the entry with our bucket 2677 * index closest to the front of the timer queue with 2678 * the largest timeout value. 2679 * 2680 * Note: This timer queue traversal may be expensive, so 2681 * we hope that this doesn't happen very often. It is 2682 * much more likely that we'll overflow the entire 2683 * cache, which is much easier to handle; see below. 2684 */ 2685 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) { 2686 for (sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2687 sc2 != NULL; 2688 sc2 = TAILQ_NEXT(sc2, sc_timeq)) { 2689 if (sc2->sc_bucketidx == sc->sc_bucketidx) { 2690 SYN_CACHE_RM(sc2); 2691 SYN_CACHE_PUT(sc2); 2692 goto insert; /* 2 level break */ 2693 } 2694 } 2695 } 2696 #ifdef DIAGNOSTIC 2697 /* 2698 * This should never happen; we should always find an 2699 * entry in our bucket. 2700 */ 2701 panic("syn_cache_insert: bucketoverflow: impossible"); 2702 #endif 2703 } else if (syn_cache_count >= tcp_syn_cache_limit) { 2704 tcpstat.tcps_sc_overflowed++; 2705 /* 2706 * The cache is full. Toss the oldest entry in the 2707 * entire cache. This is the front entry in the 2708 * first non-empty timer queue with the largest 2709 * timeout value. 2710 */ 2711 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) { 2712 sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2713 if (sc2 == NULL) 2714 continue; 2715 SYN_CACHE_RM(sc2); 2716 SYN_CACHE_PUT(sc2); 2717 goto insert; /* symmetry with above */ 2718 } 2719 #ifdef DIAGNOSTIC 2720 /* 2721 * This should never happen; we should always find an 2722 * entry in the cache. 2723 */ 2724 panic("syn_cache_insert: cache overflow: impossible"); 2725 #endif 2726 } 2727 2728 insert: 2729 /* 2730 * Initialize the entry's timer. 2731 */ 2732 sc->sc_rxttot = 0; 2733 sc->sc_rxtshift = 0; 2734 SYN_CACHE_TIMER_ARM(sc); 2735 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], sc, sc_timeq); 2736 2737 /* Link it from tcpcb entry */ 2738 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 2739 2740 /* Put it into the bucket. */ 2741 LIST_INSERT_HEAD(&scp->sch_bucket, sc, sc_bucketq); 2742 scp->sch_length++; 2743 syn_cache_count++; 2744 2745 tcpstat.tcps_sc_added++; 2746 splx(s); 2747 } 2748 2749 /* 2750 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 2751 * If we have retransmitted an entry the maximum number of times, expire 2752 * that entry. 2753 */ 2754 void 2755 syn_cache_timer() 2756 { 2757 struct syn_cache *sc, *nsc; 2758 int i, s; 2759 2760 s = splsoftnet(); 2761 2762 /* 2763 * First, get all the entries that need to be retransmitted, or 2764 * must be expired due to exceeding the initial keepalive time. 
2765 */ 2766 for (i = 0; i < TCP_MAXRXTSHIFT; i++) { 2767 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2768 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt); 2769 sc = nsc) { 2770 nsc = TAILQ_NEXT(sc, sc_timeq); 2771 2772 /* 2773 * Compute the total amount of time this entry has 2774 * been on a queue. If this entry has been on longer 2775 * than the keep alive timer would allow, expire it. 2776 */ 2777 sc->sc_rxttot += sc->sc_rxtcur; 2778 if (sc->sc_rxttot >= TCPTV_KEEP_INIT) { 2779 tcpstat.tcps_sc_timed_out++; 2780 SYN_CACHE_RM(sc); 2781 SYN_CACHE_PUT(sc); 2782 continue; 2783 } 2784 2785 tcpstat.tcps_sc_retransmitted++; 2786 (void) syn_cache_respond(sc, NULL); 2787 2788 /* Advance this entry onto the next timer queue. */ 2789 TAILQ_REMOVE(&tcp_syn_cache_timeq[i], sc, sc_timeq); 2790 sc->sc_rxtshift = i + 1; 2791 SYN_CACHE_TIMER_ARM(sc); 2792 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], 2793 sc, sc_timeq); 2794 } 2795 } 2796 2797 /* 2798 * Now get all the entries that are expired due to too many 2799 * retransmissions. 2800 */ 2801 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[TCP_MAXRXTSHIFT]); 2802 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt); 2803 sc = nsc) { 2804 nsc = TAILQ_NEXT(sc, sc_timeq); 2805 tcpstat.tcps_sc_timed_out++; 2806 SYN_CACHE_RM(sc); 2807 SYN_CACHE_PUT(sc); 2808 } 2809 splx(s); 2810 } 2811 2812 /* 2813 * Remove syn cache created by the specified tcb entry, 2814 * because this does not make sense to keep them 2815 * (if there's no tcb entry, syn cache entry will never be used) 2816 */ 2817 void 2818 syn_cache_cleanup(tp) 2819 struct tcpcb *tp; 2820 { 2821 struct syn_cache *sc, *nsc; 2822 int s; 2823 2824 s = splsoftnet(); 2825 2826 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 2827 nsc = LIST_NEXT(sc, sc_tpq); 2828 2829 #ifdef DIAGNOSTIC 2830 if (sc->sc_tp != tp) 2831 panic("invalid sc_tp in syn_cache_cleanup"); 2832 #endif 2833 SYN_CACHE_RM(sc); 2834 SYN_CACHE_PUT(sc); 2835 } 2836 /* just for safety */ 2837 LIST_INIT(&tp->t_sc); 2838 2839 splx(s); 2840 } 2841 2842 /* 2843 * Find an entry in the syn cache. 2844 */ 2845 struct syn_cache * 2846 syn_cache_lookup(src, dst, headp) 2847 struct sockaddr *src; 2848 struct sockaddr *dst; 2849 struct syn_cache_head **headp; 2850 { 2851 struct syn_cache *sc; 2852 struct syn_cache_head *scp; 2853 u_int32_t hash; 2854 int s; 2855 2856 SYN_HASHALL(hash, src, dst); 2857 2858 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 2859 *headp = scp; 2860 s = splsoftnet(); 2861 for (sc = LIST_FIRST(&scp->sch_bucket); sc != NULL; 2862 sc = LIST_NEXT(sc, sc_bucketq)) { 2863 if (sc->sc_hash != hash) 2864 continue; 2865 if (!bcmp(&sc->sc_src, src, src->sa_len) && 2866 !bcmp(&sc->sc_dst, dst, dst->sa_len)) { 2867 splx(s); 2868 return (sc); 2869 } 2870 } 2871 splx(s); 2872 return (NULL); 2873 } 2874 2875 /* 2876 * This function gets called when we receive an ACK for a 2877 * socket in the LISTEN state. We look up the connection 2878 * in the syn cache, and if its there, we pull it out of 2879 * the cache and turn it into a full-blown connection in 2880 * the SYN-RECEIVED state. 2881 * 2882 * The return values may not be immediately obvious, and their effects 2883 * can be subtle, so here they are: 2884 * 2885 * NULL SYN was not found in cache; caller should drop the 2886 * packet and send an RST. 2887 * 2888 * -1 We were unable to create the new connection, and are 2889 * aborting it. 
 *		An ACK,RST is being sent to the peer
2890  *		(unless we got screwy sequence numbers; see below),
2891  *		because the 3-way handshake has been completed.  Caller
2892  *		should not free the mbuf, since we may be using it.  If
2893  *		we are not, we will free it.
2894  *
2895  *	Otherwise, the return value is a pointer to the new socket
2896  *	associated with the connection.
2897  */
2898 struct socket *
2899 syn_cache_get(src, dst, th, hlen, tlen, so, m)
2900 	struct sockaddr *src;
2901 	struct sockaddr *dst;
2902 	struct tcphdr *th;
2903 	unsigned int hlen, tlen;
2904 	struct socket *so;
2905 	struct mbuf *m;
2906 {
2907 	struct syn_cache *sc;
2908 	struct syn_cache_head *scp;
2909 	struct inpcb *inp = NULL;
2910 #ifdef INET6
2911 	struct in6pcb *in6p = NULL;
2912 #endif
2913 	struct tcpcb *tp = 0;
2914 	struct mbuf *am;
2915 	int s;
2916 	struct socket *oso;
2917 
2918 	s = splsoftnet();
2919 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
2920 		splx(s);
2921 		return (NULL);
2922 	}
2923 
2924 	/*
2925 	 * Verify the sequence and ack numbers.  Try getting the correct
2926 	 * response again.
2927 	 */
2928 	if ((th->th_ack != sc->sc_iss + 1) ||
2929 	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
2930 	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
2931 		(void) syn_cache_respond(sc, m);
2932 		splx(s);
2933 		return ((struct socket *)(-1));
2934 	}
2935 
2936 	/* Remove this cache entry */
2937 	SYN_CACHE_RM(sc);
2938 	splx(s);
2939 
2940 	/*
2941 	 * Ok, create the full blown connection, and set things up
2942 	 * as they would have been set up if we had created the
2943 	 * connection when the SYN arrived.  If we can't create
2944 	 * the connection, abort it.
2945 	 */
2946 	/*
2947 	 * inp still has the OLD in_pcb stuff, set the
2948 	 * v6-related flags on the new guy, too.  This is
2949 	 * done particularly for the case where an AF_INET6
2950 	 * socket is bound only to a port, and a v4 connection
2951 	 * comes in on that port.
2952 	 * We also copy the flowinfo from the original pcb
2953 	 * to the new one.
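	 *
	 * For example, a v4 SYN from 192.0.2.1 accepted by an AF_INET6
	 * listener leaves the new in6pcb bound to the v4-mapped address
	 * ::ffff:192.0.2.1 -- s6_addr16[5] gets the 0xffff marker and
	 * s6_addr32[3] the IPv4 address, as set up below.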
2954 */ 2955 { 2956 struct inpcb *parentinpcb; 2957 2958 parentinpcb = (struct inpcb *)so->so_pcb; 2959 2960 oso = so; 2961 so = sonewconn(so, SS_ISCONNECTED); 2962 if (so == NULL) 2963 goto resetandabort; 2964 2965 switch (so->so_proto->pr_domain->dom_family) { 2966 #ifdef INET 2967 case AF_INET: 2968 inp = sotoinpcb(so); 2969 break; 2970 #endif 2971 #ifdef INET6 2972 case AF_INET6: 2973 in6p = sotoin6pcb(so); 2974 break; 2975 #endif 2976 } 2977 } 2978 switch (src->sa_family) { 2979 #ifdef INET 2980 case AF_INET: 2981 if (inp) { 2982 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 2983 inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; 2984 inp->inp_options = ip_srcroute(); 2985 in_pcbstate(inp, INP_BOUND); 2986 if (inp->inp_options == NULL) { 2987 inp->inp_options = sc->sc_ipopts; 2988 sc->sc_ipopts = NULL; 2989 } 2990 } 2991 #ifdef INET6 2992 else if (in6p) { 2993 /* IPv4 packet to AF_INET6 socket */ 2994 bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr)); 2995 in6p->in6p_laddr.s6_addr16[5] = htons(0xffff); 2996 bcopy(&((struct sockaddr_in *)dst)->sin_addr, 2997 &in6p->in6p_laddr.s6_addr32[3], 2998 sizeof(((struct sockaddr_in *)dst)->sin_addr)); 2999 in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port; 3000 in6totcpcb(in6p)->t_family = AF_INET; 3001 } 3002 #endif 3003 break; 3004 #endif 3005 #ifdef INET6 3006 case AF_INET6: 3007 if (in6p) { 3008 in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr; 3009 in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port; 3010 #if 0 3011 in6p->in6p_flowinfo = ip6->ip6_flow & IPV6_FLOWINFO_MASK; 3012 /*inp->inp_options = ip6_srcroute();*/ /* soon. */ 3013 #endif 3014 } 3015 break; 3016 #endif 3017 } 3018 #ifdef INET6 3019 if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) { 3020 struct in6pcb *oin6p = sotoin6pcb(oso); 3021 /* inherit socket options from the listening socket */ 3022 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS); 3023 if (in6p->in6p_flags & IN6P_CONTROLOPTS) { 3024 m_freem(in6p->in6p_options); 3025 in6p->in6p_options = 0; 3026 } 3027 ip6_savecontrol(in6p, &in6p->in6p_options, 3028 mtod(m, struct ip6_hdr *), m); 3029 } 3030 #endif 3031 3032 #ifdef IPSEC 3033 /* 3034 * we make a copy of policy, instead of sharing the policy, 3035 * for better behavior in terms of SA lookup and dead SA removal. 3036 */ 3037 if (inp) { 3038 /* copy old policy into new socket's */ 3039 if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) 3040 printf("tcp_input: could not copy policy\n"); 3041 } 3042 #ifdef INET6 3043 else if (in6p) { 3044 /* copy old policy into new socket's */ 3045 if (ipsec_copy_policy(sotoin6pcb(oso)->in6p_sp, in6p->in6p_sp)) 3046 printf("tcp_input: could not copy policy\n"); 3047 } 3048 #endif 3049 #endif 3050 3051 /* 3052 * Give the new socket our cached route reference. 
3053 */ 3054 if (inp) 3055 inp->inp_route = sc->sc_route4; /* struct assignment */ 3056 #ifdef INET6 3057 else 3058 in6p->in6p_route = sc->sc_route6; 3059 #endif 3060 sc->sc_route4.ro_rt = NULL; 3061 3062 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3063 if (am == NULL) 3064 goto resetandabort; 3065 am->m_len = src->sa_len; 3066 bcopy(src, mtod(am, caddr_t), src->sa_len); 3067 if (inp) { 3068 if (in_pcbconnect(inp, am)) { 3069 (void) m_free(am); 3070 goto resetandabort; 3071 } 3072 } 3073 #ifdef INET6 3074 else if (in6p) { 3075 if (src->sa_family == AF_INET) { 3076 /* IPv4 packet to AF_INET6 socket */ 3077 struct sockaddr_in6 *sin6; 3078 sin6 = mtod(am, struct sockaddr_in6 *); 3079 am->m_len = sizeof(*sin6); 3080 bzero(sin6, sizeof(*sin6)); 3081 sin6->sin6_family = AF_INET6; 3082 sin6->sin6_len = sizeof(*sin6); 3083 sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port; 3084 sin6->sin6_addr.s6_addr16[5] = htons(0xffff); 3085 bcopy(&((struct sockaddr_in *)src)->sin_addr, 3086 &sin6->sin6_addr.s6_addr32[3], 3087 sizeof(sin6->sin6_addr.s6_addr32[3])); 3088 } 3089 if (in6_pcbconnect(in6p, am)) { 3090 (void) m_free(am); 3091 goto resetandabort; 3092 } 3093 } 3094 #endif 3095 else { 3096 (void) m_free(am); 3097 goto resetandabort; 3098 } 3099 (void) m_free(am); 3100 3101 if (inp) 3102 tp = intotcpcb(inp); 3103 #ifdef INET6 3104 else if (in6p) 3105 tp = in6totcpcb(in6p); 3106 #endif 3107 else 3108 tp = NULL; 3109 if (sc->sc_request_r_scale != 15) { 3110 tp->requested_s_scale = sc->sc_requested_s_scale; 3111 tp->request_r_scale = sc->sc_request_r_scale; 3112 tp->snd_scale = sc->sc_requested_s_scale; 3113 tp->rcv_scale = sc->sc_request_r_scale; 3114 tp->t_flags |= TF_RCVD_SCALE; 3115 } 3116 if (sc->sc_flags & SCF_TIMESTAMP) 3117 tp->t_flags |= TF_RCVD_TSTMP; 3118 tp->ts_timebase = sc->sc_timebase; 3119 3120 tp->t_template = tcp_template(tp); 3121 if (tp->t_template == 0) { 3122 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3123 so = NULL; 3124 m_freem(m); 3125 goto abort; 3126 } 3127 3128 tp->iss = sc->sc_iss; 3129 tp->irs = sc->sc_irs; 3130 tcp_sendseqinit(tp); 3131 tcp_rcvseqinit(tp); 3132 tp->t_state = TCPS_SYN_RECEIVED; 3133 TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT); 3134 tcpstat.tcps_accepts++; 3135 3136 /* Initialize tp->t_ourmss before we deal with the peer's! */ 3137 tp->t_ourmss = sc->sc_ourmaxseg; 3138 tcp_mss_from_peer(tp, sc->sc_peermaxseg); 3139 3140 /* 3141 * Initialize the initial congestion window. If we 3142 * had to retransmit the SYN,ACK, we must initialize cwnd 3143 * to 1 segment (i.e. the Loss Window). 3144 */ 3145 if (sc->sc_rxtshift) 3146 tp->snd_cwnd = tp->t_peermss; 3147 else 3148 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, tp->t_peermss); 3149 3150 tcp_rmx_rtt(tp); 3151 tp->snd_wl1 = sc->sc_irs; 3152 tp->rcv_up = sc->sc_irs + 1; 3153 3154 /* 3155 * This is what whould have happened in tcp_ouput() when 3156 * the SYN,ACK was sent. 
3157 */ 3158 tp->snd_up = tp->snd_una; 3159 tp->snd_max = tp->snd_nxt = tp->iss+1; 3160 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3161 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3162 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3163 tp->last_ack_sent = tp->rcv_nxt; 3164 3165 tcpstat.tcps_sc_completed++; 3166 SYN_CACHE_PUT(sc); 3167 return (so); 3168 3169 resetandabort: 3170 (void) tcp_respond(NULL, m, m, th, 3171 th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK); 3172 abort: 3173 if (so != NULL) 3174 (void) soabort(so); 3175 SYN_CACHE_PUT(sc); 3176 tcpstat.tcps_sc_aborted++; 3177 return ((struct socket *)(-1)); 3178 } 3179 3180 /* 3181 * This function is called when we get a RST for a 3182 * non-existent connection, so that we can see if the 3183 * connection is in the syn cache. If it is, zap it. 3184 */ 3185 3186 void 3187 syn_cache_reset(src, dst, th) 3188 struct sockaddr *src; 3189 struct sockaddr *dst; 3190 struct tcphdr *th; 3191 { 3192 struct syn_cache *sc; 3193 struct syn_cache_head *scp; 3194 int s = splsoftnet(); 3195 3196 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3197 splx(s); 3198 return; 3199 } 3200 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3201 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3202 splx(s); 3203 return; 3204 } 3205 SYN_CACHE_RM(sc); 3206 splx(s); 3207 tcpstat.tcps_sc_reset++; 3208 SYN_CACHE_PUT(sc); 3209 } 3210 3211 void 3212 syn_cache_unreach(src, dst, th) 3213 struct sockaddr *src; 3214 struct sockaddr *dst; 3215 struct tcphdr *th; 3216 { 3217 struct syn_cache *sc; 3218 struct syn_cache_head *scp; 3219 int s; 3220 3221 s = splsoftnet(); 3222 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3223 splx(s); 3224 return; 3225 } 3226 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3227 if (ntohl (th->th_seq) != sc->sc_iss) { 3228 splx(s); 3229 return; 3230 } 3231 3232 /* 3233 * If we've rertransmitted 3 times and this is our second error, 3234 * we remove the entry. Otherwise, we allow it to continue on. 3235 * This prevents us from incorrectly nuking an entry during a 3236 * spurious network outage. 3237 * 3238 * See tcp_notify(). 3239 */ 3240 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3241 sc->sc_flags |= SCF_UNREACH; 3242 splx(s); 3243 return; 3244 } 3245 3246 SYN_CACHE_RM(sc); 3247 splx(s); 3248 tcpstat.tcps_sc_unreach++; 3249 SYN_CACHE_PUT(sc); 3250 } 3251 3252 /* 3253 * Given a LISTEN socket and an inbound SYN request, add 3254 * this to the syn cache, and send back a segment: 3255 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3256 * to the source. 3257 * 3258 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3259 * Doing so would require that we hold onto the data and deliver it 3260 * to the application. However, if we are the target of a SYN-flood 3261 * DoS attack, an attacker could send data which would eventually 3262 * consume all available buffer space if it were ACKed. By not ACKing 3263 * the data, we avoid this DoS scenario. 3264 */ 3265 3266 int 3267 syn_cache_add(src, dst, th, hlen, so, m, optp, optlen, oi) 3268 struct sockaddr *src; 3269 struct sockaddr *dst; 3270 struct tcphdr *th; 3271 unsigned int hlen; 3272 struct socket *so; 3273 struct mbuf *m; 3274 u_char *optp; 3275 int optlen; 3276 struct tcp_opt_info *oi; 3277 { 3278 struct tcpcb tb, *tp; 3279 long win; 3280 struct syn_cache *sc; 3281 struct syn_cache_head *scp; 3282 struct mbuf *ipopts; 3283 3284 tp = sototcpcb(so); 3285 3286 /* 3287 * RFC1122 4.2.3.10, p. 
104: discard bcast/mcast SYN 3288 * 3289 * Note this check is performed in tcp_input() very early on. 3290 */ 3291 3292 /* 3293 * Initialize some local state. 3294 */ 3295 win = sbspace(&so->so_rcv); 3296 if (win > TCP_MAXWIN) 3297 win = TCP_MAXWIN; 3298 3299 switch (src->sa_family) { 3300 #ifdef INET 3301 case AF_INET: 3302 /* 3303 * Remember the IP options, if any. 3304 */ 3305 ipopts = ip_srcroute(); 3306 break; 3307 #endif 3308 default: 3309 ipopts = NULL; 3310 } 3311 3312 if (optp) { 3313 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3314 tcp_dooptions(&tb, optp, optlen, th, oi); 3315 } else 3316 tb.t_flags = 0; 3317 3318 /* 3319 * See if we already have an entry for this connection. 3320 * If we do, resend the SYN,ACK. We do not count this 3321 * as a retransmission (XXX though maybe we should). 3322 */ 3323 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 3324 tcpstat.tcps_sc_dupesyn++; 3325 if (ipopts) { 3326 /* 3327 * If we were remembering a previous source route, 3328 * forget it and use the new one we've been given. 3329 */ 3330 if (sc->sc_ipopts) 3331 (void) m_free(sc->sc_ipopts); 3332 sc->sc_ipopts = ipopts; 3333 } 3334 sc->sc_timestamp = tb.ts_recent; 3335 if (syn_cache_respond(sc, m) == 0) { 3336 tcpstat.tcps_sndacks++; 3337 tcpstat.tcps_sndtotal++; 3338 } 3339 return (1); 3340 } 3341 3342 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 3343 if (sc == NULL) { 3344 if (ipopts) 3345 (void) m_free(ipopts); 3346 return (0); 3347 } 3348 3349 /* 3350 * Fill in the cache, and put the necessary IP and TCP 3351 * options into the reply. 3352 */ 3353 bzero(sc, sizeof(struct syn_cache)); 3354 bcopy(src, &sc->sc_src, src->sa_len); 3355 bcopy(dst, &sc->sc_dst, dst->sa_len); 3356 sc->sc_flags = 0; 3357 sc->sc_ipopts = ipopts; 3358 sc->sc_irs = th->th_seq; 3359 switch (src->sa_family) { 3360 #ifdef INET 3361 case AF_INET: 3362 { 3363 struct sockaddr_in *srcin = (void *) src; 3364 struct sockaddr_in *dstin = (void *) dst; 3365 3366 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr, 3367 &srcin->sin_addr, dstin->sin_port, 3368 srcin->sin_port, sizeof(dstin->sin_addr), 0); 3369 break; 3370 } 3371 #endif /* INET */ 3372 #ifdef INET6 3373 case AF_INET6: 3374 { 3375 struct sockaddr_in6 *srcin6 = (void *) src; 3376 struct sockaddr_in6 *dstin6 = (void *) dst; 3377 3378 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr, 3379 &srcin6->sin6_addr, dstin6->sin6_port, 3380 srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0); 3381 break; 3382 } 3383 #endif /* INET6 */ 3384 } 3385 sc->sc_peermaxseg = oi->maxseg; 3386 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? 
3387 m->m_pkthdr.rcvif : NULL, 3388 sc->sc_src.sa.sa_family); 3389 sc->sc_win = win; 3390 sc->sc_timebase = tcp_now; /* see tcp_newtcpcb() */ 3391 sc->sc_timestamp = tb.ts_recent; 3392 if (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP)) 3393 sc->sc_flags |= SCF_TIMESTAMP; 3394 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3395 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3396 sc->sc_requested_s_scale = tb.requested_s_scale; 3397 sc->sc_request_r_scale = 0; 3398 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3399 TCP_MAXWIN << sc->sc_request_r_scale < 3400 so->so_rcv.sb_hiwat) 3401 sc->sc_request_r_scale++; 3402 } else { 3403 sc->sc_requested_s_scale = 15; 3404 sc->sc_request_r_scale = 15; 3405 } 3406 sc->sc_tp = tp; 3407 if (syn_cache_respond(sc, m) == 0) { 3408 syn_cache_insert(sc, tp); 3409 tcpstat.tcps_sndacks++; 3410 tcpstat.tcps_sndtotal++; 3411 } else { 3412 SYN_CACHE_PUT(sc); 3413 tcpstat.tcps_sc_dropped++; 3414 } 3415 return (1); 3416 } 3417 3418 int 3419 syn_cache_respond(sc, m) 3420 struct syn_cache *sc; 3421 struct mbuf *m; 3422 { 3423 struct route *ro; 3424 u_int8_t *optp; 3425 int optlen, error; 3426 u_int16_t tlen; 3427 struct ip *ip = NULL; 3428 #ifdef INET6 3429 struct ip6_hdr *ip6 = NULL; 3430 #endif 3431 struct tcphdr *th; 3432 u_int hlen; 3433 3434 switch (sc->sc_src.sa.sa_family) { 3435 case AF_INET: 3436 hlen = sizeof(struct ip); 3437 ro = &sc->sc_route4; 3438 break; 3439 #ifdef INET6 3440 case AF_INET6: 3441 hlen = sizeof(struct ip6_hdr); 3442 ro = (struct route *)&sc->sc_route6; 3443 break; 3444 #endif 3445 default: 3446 if (m) 3447 m_freem(m); 3448 return EAFNOSUPPORT; 3449 } 3450 3451 /* Compute the size of the TCP options. */ 3452 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3453 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3454 3455 tlen = hlen + sizeof(struct tcphdr) + optlen; 3456 3457 /* 3458 * Create the IP+TCP header from scratch. 3459 */ 3460 if (m) 3461 m_freem(m); 3462 #ifdef DIAGNOSTIC 3463 if (max_linkhdr + tlen > MCLBYTES) 3464 return (ENOBUFS); 3465 #endif 3466 MGETHDR(m, M_DONTWAIT, MT_DATA); 3467 if (m && tlen > MHLEN) { 3468 MCLGET(m, M_DONTWAIT); 3469 if ((m->m_flags & M_EXT) == 0) { 3470 m_freem(m); 3471 m = NULL; 3472 } 3473 } 3474 if (m == NULL) 3475 return (ENOBUFS); 3476 3477 /* Fixup the mbuf. 
*/ 3478 m->m_data += max_linkhdr; 3479 m->m_len = m->m_pkthdr.len = tlen; 3480 #ifdef IPSEC 3481 if (sc->sc_tp) { 3482 struct tcpcb *tp; 3483 struct socket *so; 3484 3485 tp = sc->sc_tp; 3486 if (tp->t_inpcb) 3487 so = tp->t_inpcb->inp_socket; 3488 #ifdef INET6 3489 else if (tp->t_in6pcb) 3490 so = tp->t_in6pcb->in6p_socket; 3491 #endif 3492 else 3493 so = NULL; 3494 /* use IPsec policy on listening socket, on SYN ACK */ 3495 if (ipsec_setsocket(m, so) != 0) { 3496 m_freem(m); 3497 return ENOBUFS; 3498 } 3499 } 3500 #endif 3501 m->m_pkthdr.rcvif = NULL; 3502 memset(mtod(m, u_char *), 0, tlen); 3503 3504 switch (sc->sc_src.sa.sa_family) { 3505 case AF_INET: 3506 ip = mtod(m, struct ip *); 3507 ip->ip_dst = sc->sc_src.sin.sin_addr; 3508 ip->ip_src = sc->sc_dst.sin.sin_addr; 3509 ip->ip_p = IPPROTO_TCP; 3510 th = (struct tcphdr *)(ip + 1); 3511 th->th_dport = sc->sc_src.sin.sin_port; 3512 th->th_sport = sc->sc_dst.sin.sin_port; 3513 break; 3514 #ifdef INET6 3515 case AF_INET6: 3516 ip6 = mtod(m, struct ip6_hdr *); 3517 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3518 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3519 ip6->ip6_nxt = IPPROTO_TCP; 3520 /* ip6_plen will be updated in ip6_output() */ 3521 th = (struct tcphdr *)(ip6 + 1); 3522 th->th_dport = sc->sc_src.sin6.sin6_port; 3523 th->th_sport = sc->sc_dst.sin6.sin6_port; 3524 break; 3525 #endif 3526 default: 3527 th = NULL; 3528 } 3529 3530 th->th_seq = htonl(sc->sc_iss); 3531 th->th_ack = htonl(sc->sc_irs + 1); 3532 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 3533 th->th_flags = TH_SYN|TH_ACK; 3534 th->th_win = htons(sc->sc_win); 3535 /* th_sum already 0 */ 3536 /* th_urp already 0 */ 3537 3538 /* Tack on the TCP options. */ 3539 optp = (u_int8_t *)(th + 1); 3540 *optp++ = TCPOPT_MAXSEG; 3541 *optp++ = 4; 3542 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 3543 *optp++ = sc->sc_ourmaxseg & 0xff; 3544 3545 if (sc->sc_request_r_scale != 15) { 3546 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 3547 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 3548 sc->sc_request_r_scale); 3549 optp += 4; 3550 } 3551 3552 if (sc->sc_flags & SCF_TIMESTAMP) { 3553 u_int32_t *lp = (u_int32_t *)(optp); 3554 /* Form timestamp option as shown in appendix A of RFC 1323. */ 3555 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 3556 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc)); 3557 *lp = htonl(sc->sc_timestamp); 3558 optp += TCPOLEN_TSTAMP_APPA; 3559 } 3560 3561 /* Compute the packet's checksum. */ 3562 switch (sc->sc_src.sa.sa_family) { 3563 case AF_INET: 3564 ip->ip_len = htons(tlen - hlen); 3565 th->th_sum = 0; 3566 th->th_sum = in_cksum(m, tlen); 3567 break; 3568 #ifdef INET6 3569 case AF_INET6: 3570 ip6->ip6_plen = htons(tlen - hlen); 3571 th->th_sum = 0; 3572 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 3573 break; 3574 #endif 3575 } 3576 3577 /* 3578 * Fill in some straggling IP bits. Note the stack expects 3579 * ip_len to be in host order, for convenience. 3580 */ 3581 switch (sc->sc_src.sa.sa_family) { 3582 #ifdef INET 3583 case AF_INET: 3584 ip->ip_len = tlen; 3585 ip->ip_ttl = ip_defttl; 3586 /* XXX tos? */ 3587 break; 3588 #endif 3589 #ifdef INET6 3590 case AF_INET6: 3591 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 3592 ip6->ip6_vfc |= IPV6_VERSION; 3593 ip6->ip6_plen = htons(tlen - hlen); 3594 /* ip6_hlim will be initialized afterwards */ 3595 /* XXX flowlabel? */ 3596 break; 3597 #endif 3598 } 3599 3600 switch (sc->sc_src.sa.sa_family) { 3601 #ifdef INET 3602 case AF_INET: 3603 error = ip_output(m, sc->sc_ipopts, ro, 3604 (ip_mtudisc ? 
IP_MTUDISC : 0), 3605 NULL); 3606 break; 3607 #endif 3608 #ifdef INET6 3609 case AF_INET6: 3610 ip6->ip6_hlim = in6_selecthlim(NULL, 3611 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL); 3612 3613 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 3614 0, NULL, NULL); 3615 break; 3616 #endif 3617 default: 3618 error = EAFNOSUPPORT; 3619 break; 3620 } 3621 return (error); 3622 } 3623
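#if 0
/*
 * Illustrative sketch (deliberately not compiled): the fixed-point RTT
 * smoothing done by tcp_xmit_timer() above, restated as a standalone
 * function.  The literal shifts assume TCP_RTT_SHIFT == 3 (srtt scaled
 * by 8, alpha = 7/8) and TCP_RTTVAR_SHIFT == 2 (rttvar scaled by 4,
 * beta = 3/4), matching the comments in tcp_xmit_timer(); the function
 * name is hypothetical.
 */
static void
example_rtt_smooth(short *srtt, short *rttvar, short rtt)
{
	short delta;

	--rtt;					/* adjust rtt to origin 0 */
	if (*srtt != 0) {
		/* srtt += (rtt - srtt) / 8, in fixed point */
		delta = (rtt << 2) - (*srtt >> 3);
		if ((*srtt += delta) <= 0)
			*srtt = 1 << 2;
		/* rttvar += (|delta| - rttvar) / 4, in fixed point */
		if (delta < 0)
			delta = -delta;
		delta -= (*rttvar >> 2);
		if ((*rttvar += delta) <= 0)
			*rttvar = 1 << 2;
	} else {
		/* first measurement: srtt = rtt, rttvar = rtt / 2 */
		*srtt = rtt << (3 + 2);
		*rttvar = rtt << (2 + 2 - 1);
	}
}
#endif /* 0 */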