1 /* $NetBSD: tcp_input.c,v 1.133 2001/11/13 00:32:40 lukem Exp $ */ 2 3 /* 4 %%% portions-copyright-nrl-95 5 Portions of this software are Copyright 1995-1998 by Randall Atkinson, 6 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights 7 Reserved. All rights under this copyright have been assigned to the US 8 Naval Research Laboratory (NRL). The NRL Copyright Notice and License 9 Agreement Version 1.1 (January 17, 1995) applies to these portions of the 10 software. 11 You should have received a copy of the license with this software. If you 12 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>. 13 14 */ 15 16 /* 17 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 18 * All rights reserved. 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 1. Redistributions of source code must retain the above copyright 24 * notice, this list of conditions and the following disclaimer. 25 * 2. Redistributions in binary form must reproduce the above copyright 26 * notice, this list of conditions and the following disclaimer in the 27 * documentation and/or other materials provided with the distribution. 28 * 3. Neither the name of the project nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 45 /*- 46 * Copyright (c) 1997, 1998, 1999, 2001 The NetBSD Foundation, Inc. 47 * All rights reserved. 48 * 49 * This code is derived from software contributed to The NetBSD Foundation 50 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 51 * Facility, NASA Ames Research Center. 52 * 53 * Redistribution and use in source and binary forms, with or without 54 * modification, are permitted provided that the following conditions 55 * are met: 56 * 1. Redistributions of source code must retain the above copyright 57 * notice, this list of conditions and the following disclaimer. 58 * 2. Redistributions in binary form must reproduce the above copyright 59 * notice, this list of conditions and the following disclaimer in the 60 * documentation and/or other materials provided with the distribution. 61 * 3. All advertising materials mentioning features or use of this software 62 * must display the following acknowledgement: 63 * This product includes software developed by the NetBSD 64 * Foundation, Inc. and its contributors. 65 * 4. Neither the name of The NetBSD Foundation nor the names of its 66 * contributors may be used to endorse or promote products derived 67 * from this software without specific prior written permission. 68 * 69 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 70 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 71 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 72 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 73 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 74 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 75 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 76 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 77 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 78 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 79 * POSSIBILITY OF SUCH DAMAGE. 80 */ 81 82 /* 83 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 84 * The Regents of the University of California. All rights reserved. 85 * 86 * Redistribution and use in source and binary forms, with or without 87 * modification, are permitted provided that the following conditions 88 * are met: 89 * 1. Redistributions of source code must retain the above copyright 90 * notice, this list of conditions and the following disclaimer. 91 * 2. Redistributions in binary form must reproduce the above copyright 92 * notice, this list of conditions and the following disclaimer in the 93 * documentation and/or other materials provided with the distribution. 94 * 3. All advertising materials mentioning features or use of this software 95 * must display the following acknowledgement: 96 * This product includes software developed by the University of 97 * California, Berkeley and its contributors. 98 * 4. Neither the name of the University nor the names of its contributors 99 * may be used to endorse or promote products derived from this software 100 * without specific prior written permission. 101 * 102 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 103 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 104 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 105 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 106 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 107 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 108 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 109 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 110 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 111 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 112 * SUCH DAMAGE. 113 * 114 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 115 */ 116 117 /* 118 * TODO list for SYN cache stuff: 119 * 120 * Find room for a "state" field, which is needed to keep a 121 * compressed state for TIME_WAIT TCBs. It's been noted already 122 * that this is fairly important for very high-volume web and 123 * mail servers, which use a large number of short-lived 124 * connections. 125 */ 126 127 #include <sys/cdefs.h> 128 __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.133 2001/11/13 00:32:40 lukem Exp $"); 129 130 #include "opt_inet.h" 131 #include "opt_ipsec.h" 132 #include "opt_inet_csum.h" 133 #include "opt_tcp_debug.h" 134 135 #include <sys/param.h> 136 #include <sys/systm.h> 137 #include <sys/malloc.h> 138 #include <sys/mbuf.h> 139 #include <sys/protosw.h> 140 #include <sys/socket.h> 141 #include <sys/socketvar.h> 142 #include <sys/errno.h> 143 #include <sys/syslog.h> 144 #include <sys/pool.h> 145 #include <sys/domain.h> 146 #include <sys/kernel.h> 147 148 #include <net/if.h> 149 #include <net/route.h> 150 #include <net/if_types.h> 151 152 #include <netinet/in.h> 153 #include <netinet/in_systm.h> 154 #include <netinet/ip.h> 155 #include <netinet/in_pcb.h> 156 #include <netinet/ip_var.h> 157 158 #ifdef INET6 159 #ifndef INET 160 #include <netinet/in.h> 161 #endif 162 #include <netinet/ip6.h> 163 #include <netinet6/ip6_var.h> 164 #include <netinet6/in6_pcb.h> 165 #include <netinet6/ip6_var.h> 166 #include <netinet6/in6_var.h> 167 #include <netinet/icmp6.h> 168 #include <netinet6/nd6.h> 169 #endif 170 171 #ifdef PULLDOWN_TEST 172 #ifndef INET6 173 /* always need ip6.h for IP6_EXTHDR_GET */ 174 #include <netinet/ip6.h> 175 #endif 176 #endif 177 178 #include <netinet/tcp.h> 179 #include <netinet/tcp_fsm.h> 180 #include <netinet/tcp_seq.h> 181 #include <netinet/tcp_timer.h> 182 #include <netinet/tcp_var.h> 183 #include <netinet/tcpip.h> 184 #include <netinet/tcp_debug.h> 185 186 #include <machine/stdarg.h> 187 188 #ifdef IPSEC 189 #include <netinet6/ipsec.h> 190 #include <netkey/key.h> 191 #endif /*IPSEC*/ 192 #ifdef INET6 193 #include "faith.h" 194 #if defined(NFAITH) && NFAITH > 0 195 #include <net/if_faith.h> 196 #endif 197 #endif 198 199 int tcprexmtthresh = 3; 200 int tcp_log_refused; 201 202 static int tcp_rst_ppslim_count = 0; 203 static struct timeval tcp_rst_ppslim_last; 204 205 #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) 206 207 /* for modulo comparisons of timestamps */ 208 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 209 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 210 211 /* 212 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 213 */ 214 #ifdef INET6 215 #define ND6_HINT(tp) \ 216 do { \ 217 if (tp && tp->t_in6pcb && tp->t_family == AF_INET6 \ 218 && tp->t_in6pcb->in6p_route.ro_rt) { \ 219 nd6_nud_hint(tp->t_in6pcb->in6p_route.ro_rt, NULL, 0); \ 220 } \ 221 } while (0) 222 #else 223 #define ND6_HINT(tp) 224 #endif 225 226 /* 227 * Macro to compute ACK transmission behavior. Delay the ACK unless 228 * we have already delayed an ACK (must send an ACK every two segments). 229 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 230 * option is enabled. 231 */ 232 #define TCP_SETUP_ACK(tp, th) \ 233 do { \ 234 if ((tp)->t_flags & TF_DELACK || \ 235 (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \ 236 tp->t_flags |= TF_ACKNOW; \ 237 else \ 238 TCP_SET_DELACK(tp); \ 239 } while (0) 240 241 /* 242 * Convert TCP protocol fields to host order for easier processing. 243 */ 244 #define TCP_FIELDS_TO_HOST(th) \ 245 do { \ 246 NTOHL((th)->th_seq); \ 247 NTOHL((th)->th_ack); \ 248 NTOHS((th)->th_win); \ 249 NTOHS((th)->th_urp); \ 250 } while (0) 251 252 #ifdef TCP_CSUM_COUNTERS 253 #include <sys/device.h> 254 255 extern struct evcnt tcp_hwcsum_ok; 256 extern struct evcnt tcp_hwcsum_bad; 257 extern struct evcnt tcp_hwcsum_data; 258 extern struct evcnt tcp_swcsum; 259 260 #define TCP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++ 261 262 #else 263 264 #define TCP_CSUM_COUNTER_INCR(ev) /* nothing */ 265 266 #endif /* TCP_CSUM_COUNTERS */ 267 268 int 269 tcp_reass(tp, th, m, tlen) 270 struct tcpcb *tp; 271 struct tcphdr *th; 272 struct mbuf *m; 273 int *tlen; 274 { 275 struct ipqent *p, *q, *nq, *tiqe = NULL; 276 struct socket *so = NULL; 277 int pkt_flags; 278 tcp_seq pkt_seq; 279 unsigned pkt_len; 280 u_long rcvpartdupbyte = 0; 281 u_long rcvoobyte; 282 283 if (tp->t_inpcb) 284 so = tp->t_inpcb->inp_socket; 285 #ifdef INET6 286 else if (tp->t_in6pcb) 287 so = tp->t_in6pcb->in6p_socket; 288 #endif 289 290 TCP_REASS_LOCK_CHECK(tp); 291 292 /* 293 * Call with th==0 after become established to 294 * force pre-ESTABLISHED data up to user socket. 295 */ 296 if (th == 0) 297 goto present; 298 299 rcvoobyte = *tlen; 300 /* 301 * Copy these to local variables because the tcpiphdr 302 * gets munged while we are collapsing mbufs. 303 */ 304 pkt_seq = th->th_seq; 305 pkt_len = *tlen; 306 pkt_flags = th->th_flags; 307 /* 308 * Find a segment which begins after this one does. 309 */ 310 for (p = NULL, q = LIST_FIRST(&tp->segq); q != NULL; q = nq) { 311 nq = LIST_NEXT(q, ipqe_q); 312 /* 313 * If the received segment is just right after this 314 * fragment, merge the two together and then check 315 * for further overlaps. 316 */ 317 if (q->ipqe_seq + q->ipqe_len == pkt_seq) { 318 #ifdef TCPREASS_DEBUG 319 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n", 320 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 321 q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len); 322 #endif 323 pkt_len += q->ipqe_len; 324 pkt_flags |= q->ipqe_flags; 325 pkt_seq = q->ipqe_seq; 326 m_cat(q->ipqe_m, m); 327 m = q->ipqe_m; 328 goto free_ipqe; 329 } 330 /* 331 * If the received segment is completely past this 332 * fragment, we need to go the next fragment. 333 */ 334 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 335 p = q; 336 continue; 337 } 338 /* 339 * If the fragment is past the received segment, 340 * it (or any following) can't be concatenated. 341 */ 342 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) 343 break; 344 /* 345 * We've received all the data in this segment before. 346 * mark it as a duplicate and return. 347 */ 348 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) && 349 SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 350 tcpstat.tcps_rcvduppack++; 351 tcpstat.tcps_rcvdupbyte += pkt_len; 352 m_freem(m); 353 if (tiqe != NULL) 354 pool_put(&ipqent_pool, tiqe); 355 return (0); 356 } 357 /* 358 * Received segment completely overlaps this fragment 359 * so we drop the fragment (this keeps the temporal 360 * ordering of segments correct). 361 */ 362 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) && 363 SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 364 rcvpartdupbyte += q->ipqe_len; 365 m_freem(q->ipqe_m); 366 goto free_ipqe; 367 } 368 /* 369 * RX'ed segment extends past the end of the 370 * fragment. Drop the overlapping bytes. Then 371 * merge the fragment and segment then treat as 372 * a longer received packet. 373 */ 374 if (SEQ_LT(q->ipqe_seq, pkt_seq) 375 && SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 376 int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq; 377 #ifdef TCPREASS_DEBUG 378 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n", 379 tp, overlap, 380 pkt_seq, pkt_seq + pkt_len, pkt_len); 381 #endif 382 m_adj(m, overlap); 383 rcvpartdupbyte += overlap; 384 m_cat(q->ipqe_m, m); 385 m = q->ipqe_m; 386 pkt_seq = q->ipqe_seq; 387 pkt_len += q->ipqe_len - overlap; 388 rcvoobyte -= overlap; 389 goto free_ipqe; 390 } 391 /* 392 * RX'ed segment extends past the front of the 393 * fragment. Drop the overlapping bytes on the 394 * received packet. The packet will then be 395 * contatentated with this fragment a bit later. 396 */ 397 if (SEQ_GT(q->ipqe_seq, pkt_seq) 398 && SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) { 399 int overlap = pkt_seq + pkt_len - q->ipqe_seq; 400 #ifdef TCPREASS_DEBUG 401 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n", 402 tp, overlap, 403 pkt_seq, pkt_seq + pkt_len, pkt_len); 404 #endif 405 m_adj(m, -overlap); 406 pkt_len -= overlap; 407 rcvpartdupbyte += overlap; 408 rcvoobyte -= overlap; 409 } 410 /* 411 * If the received segment immediates precedes this 412 * fragment then tack the fragment onto this segment 413 * and reinsert the data. 414 */ 415 if (q->ipqe_seq == pkt_seq + pkt_len) { 416 #ifdef TCPREASS_DEBUG 417 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n", 418 tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len, 419 pkt_seq, pkt_seq + pkt_len, pkt_len); 420 #endif 421 pkt_len += q->ipqe_len; 422 pkt_flags |= q->ipqe_flags; 423 m_cat(m, q->ipqe_m); 424 LIST_REMOVE(q, ipqe_q); 425 LIST_REMOVE(q, ipqe_timeq); 426 if (tiqe == NULL) { 427 tiqe = q; 428 } else { 429 pool_put(&ipqent_pool, q); 430 } 431 break; 432 } 433 /* 434 * If the fragment is before the segment, remember it. 435 * When this loop is terminated, p will contain the 436 * pointer to fragment that is right before the received 437 * segment. 438 */ 439 if (SEQ_LEQ(q->ipqe_seq, pkt_seq)) 440 p = q; 441 442 continue; 443 444 /* 445 * This is a common operation. It also will allow 446 * to save doing a malloc/free in most instances. 447 */ 448 free_ipqe: 449 LIST_REMOVE(q, ipqe_q); 450 LIST_REMOVE(q, ipqe_timeq); 451 if (tiqe == NULL) { 452 tiqe = q; 453 } else { 454 pool_put(&ipqent_pool, q); 455 } 456 } 457 458 /* 459 * Allocate a new queue entry since the received segment did not 460 * collapse onto any other out-of-order block; thus we are allocating 461 * a new block. If it had collapsed, tiqe would not be NULL and 462 * we would be reusing it. 463 * XXX If we can't, just drop the packet. XXX 464 */ 465 if (tiqe == NULL) { 466 tiqe = pool_get(&ipqent_pool, PR_NOWAIT); 467 if (tiqe == NULL) { 468 tcpstat.tcps_rcvmemdrop++; 469 m_freem(m); 470 return (0); 471 } 472 } 473 474 /* 475 * Update the counters. 476 */ 477 tcpstat.tcps_rcvoopack++; 478 tcpstat.tcps_rcvoobyte += rcvoobyte; 479 if (rcvpartdupbyte) { 480 tcpstat.tcps_rcvpartduppack++; 481 tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte; 482 } 483 484 /* 485 * Insert the new fragment queue entry into both queues. 486 */ 487 tiqe->ipqe_m = m; 488 tiqe->ipqe_seq = pkt_seq; 489 tiqe->ipqe_len = pkt_len; 490 tiqe->ipqe_flags = pkt_flags; 491 if (p == NULL) { 492 LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q); 493 #ifdef TCPREASS_DEBUG 494 if (tiqe->ipqe_seq != tp->rcv_nxt) 495 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n", 496 tp, pkt_seq, pkt_seq + pkt_len, pkt_len); 497 #endif 498 } else { 499 LIST_INSERT_AFTER(p, tiqe, ipqe_q); 500 #ifdef TCPREASS_DEBUG 501 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n", 502 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 503 p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len); 504 #endif 505 } 506 507 LIST_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq); 508 509 present: 510 /* 511 * Present data to user, advancing rcv_nxt through 512 * completed sequence space. 513 */ 514 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 515 return (0); 516 q = LIST_FIRST(&tp->segq); 517 if (q == NULL || q->ipqe_seq != tp->rcv_nxt) 518 return (0); 519 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len) 520 return (0); 521 522 tp->rcv_nxt += q->ipqe_len; 523 pkt_flags = q->ipqe_flags & TH_FIN; 524 ND6_HINT(tp); 525 526 LIST_REMOVE(q, ipqe_q); 527 LIST_REMOVE(q, ipqe_timeq); 528 if (so->so_state & SS_CANTRCVMORE) 529 m_freem(q->ipqe_m); 530 else 531 sbappend(&so->so_rcv, q->ipqe_m); 532 pool_put(&ipqent_pool, q); 533 sorwakeup(so); 534 return (pkt_flags); 535 } 536 537 #ifdef INET6 538 int 539 tcp6_input(mp, offp, proto) 540 struct mbuf **mp; 541 int *offp, proto; 542 { 543 struct mbuf *m = *mp; 544 545 /* 546 * draft-itojun-ipv6-tcp-to-anycast 547 * better place to put this in? 548 */ 549 if (m->m_flags & M_ANYCAST6) { 550 struct ip6_hdr *ip6; 551 if (m->m_len < sizeof(struct ip6_hdr)) { 552 if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { 553 tcpstat.tcps_rcvshort++; 554 return IPPROTO_DONE; 555 } 556 } 557 ip6 = mtod(m, struct ip6_hdr *); 558 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 559 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 560 return IPPROTO_DONE; 561 } 562 563 tcp_input(m, *offp, proto); 564 return IPPROTO_DONE; 565 } 566 #endif 567 568 /* 569 * TCP input routine, follows pages 65-76 of the 570 * protocol specification dated September, 1981 very closely. 571 */ 572 void 573 #if __STDC__ 574 tcp_input(struct mbuf *m, ...) 575 #else 576 tcp_input(m, va_alist) 577 struct mbuf *m; 578 #endif 579 { 580 int proto; 581 struct tcphdr *th; 582 struct ip *ip; 583 struct inpcb *inp; 584 #ifdef INET6 585 struct ip6_hdr *ip6; 586 struct in6pcb *in6p; 587 #endif 588 caddr_t optp = NULL; 589 int optlen = 0; 590 int len, tlen, toff, hdroptlen = 0; 591 struct tcpcb *tp = 0; 592 int tiflags; 593 struct socket *so = NULL; 594 int todrop, acked, ourfinisacked, needoutput = 0; 595 short ostate = 0; 596 int iss = 0; 597 u_long tiwin; 598 struct tcp_opt_info opti; 599 int off, iphlen; 600 va_list ap; 601 int af; /* af on the wire */ 602 struct mbuf *tcp_saveti = NULL; 603 604 va_start(ap, m); 605 toff = va_arg(ap, int); 606 proto = va_arg(ap, int); 607 va_end(ap); 608 609 tcpstat.tcps_rcvtotal++; 610 611 bzero(&opti, sizeof(opti)); 612 opti.ts_present = 0; 613 opti.maxseg = 0; 614 615 /* 616 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN. 617 * 618 * TCP is, by definition, unicast, so we reject all 619 * multicast outright. 620 * 621 * Note, there are additional src/dst address checks in 622 * the AF-specific code below. 623 */ 624 if (m->m_flags & (M_BCAST|M_MCAST)) { 625 /* XXX stat */ 626 goto drop; 627 } 628 #ifdef INET6 629 if (m->m_flags & M_ANYCAST6) { 630 /* XXX stat */ 631 goto drop; 632 } 633 #endif 634 635 /* 636 * Get IP and TCP header together in first mbuf. 637 * Note: IP leaves IP header in first mbuf. 638 */ 639 ip = mtod(m, struct ip *); 640 #ifdef INET6 641 ip6 = NULL; 642 #endif 643 switch (ip->ip_v) { 644 #ifdef INET 645 case 4: 646 af = AF_INET; 647 iphlen = sizeof(struct ip); 648 #ifndef PULLDOWN_TEST 649 /* would like to get rid of this... */ 650 if (toff > sizeof (struct ip)) { 651 ip_stripoptions(m, (struct mbuf *)0); 652 toff = sizeof(struct ip); 653 } 654 if (m->m_len < toff + sizeof (struct tcphdr)) { 655 if ((m = m_pullup(m, toff + sizeof (struct tcphdr))) == 0) { 656 tcpstat.tcps_rcvshort++; 657 return; 658 } 659 } 660 ip = mtod(m, struct ip *); 661 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 662 #else 663 ip = mtod(m, struct ip *); 664 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 665 sizeof(struct tcphdr)); 666 if (th == NULL) { 667 tcpstat.tcps_rcvshort++; 668 return; 669 } 670 #endif 671 672 /* 673 * Make sure destination address is not multicast. 674 * Source address checked in ip_input(). 675 */ 676 if (IN_MULTICAST(ip->ip_dst.s_addr)) { 677 /* XXX stat */ 678 goto drop; 679 } 680 681 /* We do the checksum after PCB lookup... */ 682 len = ip->ip_len; 683 tlen = len - toff; 684 break; 685 #endif 686 #ifdef INET6 687 case 6: 688 ip = NULL; 689 iphlen = sizeof(struct ip6_hdr); 690 af = AF_INET6; 691 #ifndef PULLDOWN_TEST 692 if (m->m_len < toff + sizeof(struct tcphdr)) { 693 m = m_pullup(m, toff + sizeof(struct tcphdr)); /*XXX*/ 694 if (m == NULL) { 695 tcpstat.tcps_rcvshort++; 696 return; 697 } 698 } 699 ip6 = mtod(m, struct ip6_hdr *); 700 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 701 #else 702 ip6 = mtod(m, struct ip6_hdr *); 703 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 704 sizeof(struct tcphdr)); 705 if (th == NULL) { 706 tcpstat.tcps_rcvshort++; 707 return; 708 } 709 #endif 710 711 /* Be proactive about malicious use of IPv4 mapped address */ 712 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 713 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 714 /* XXX stat */ 715 goto drop; 716 } 717 718 /* 719 * Be proactive about unspecified IPv6 address in source. 720 * As we use all-zero to indicate unbounded/unconnected pcb, 721 * unspecified IPv6 address can be used to confuse us. 722 * 723 * Note that packets with unspecified IPv6 destination is 724 * already dropped in ip6_input. 725 */ 726 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 727 /* XXX stat */ 728 goto drop; 729 } 730 731 /* 732 * Make sure destination address is not multicast. 733 * Source address checked in ip6_input(). 734 */ 735 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 736 /* XXX stat */ 737 goto drop; 738 } 739 740 /* We do the checksum after PCB lookup... */ 741 len = m->m_pkthdr.len; 742 tlen = len - toff; 743 break; 744 #endif 745 default: 746 m_freem(m); 747 return; 748 } 749 750 /* 751 * Check that TCP offset makes sense, 752 * pull out TCP options and adjust length. XXX 753 */ 754 off = th->th_off << 2; 755 if (off < sizeof (struct tcphdr) || off > tlen) { 756 tcpstat.tcps_rcvbadoff++; 757 goto drop; 758 } 759 tlen -= off; 760 761 /* 762 * tcp_input() has been modified to use tlen to mean the TCP data 763 * length throughout the function. Other functions can use 764 * m->m_pkthdr.len as the basis for calculating the TCP data length. 765 * rja 766 */ 767 768 if (off > sizeof (struct tcphdr)) { 769 #ifndef PULLDOWN_TEST 770 if (m->m_len < toff + off) { 771 if ((m = m_pullup(m, toff + off)) == 0) { 772 tcpstat.tcps_rcvshort++; 773 return; 774 } 775 switch (af) { 776 #ifdef INET 777 case AF_INET: 778 ip = mtod(m, struct ip *); 779 break; 780 #endif 781 #ifdef INET6 782 case AF_INET6: 783 ip6 = mtod(m, struct ip6_hdr *); 784 break; 785 #endif 786 } 787 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 788 } 789 #else 790 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off); 791 if (th == NULL) { 792 tcpstat.tcps_rcvshort++; 793 return; 794 } 795 /* 796 * NOTE: ip/ip6 will not be affected by m_pulldown() 797 * (as they're before toff) and we don't need to update those. 798 */ 799 #endif 800 optlen = off - sizeof (struct tcphdr); 801 optp = ((caddr_t)th) + sizeof(struct tcphdr); 802 /* 803 * Do quick retrieval of timestamp options ("options 804 * prediction?"). If timestamp is the only option and it's 805 * formatted as recommended in RFC 1323 appendix A, we 806 * quickly get the values now and not bother calling 807 * tcp_dooptions(), etc. 808 */ 809 if ((optlen == TCPOLEN_TSTAMP_APPA || 810 (optlen > TCPOLEN_TSTAMP_APPA && 811 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 812 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 813 (th->th_flags & TH_SYN) == 0) { 814 opti.ts_present = 1; 815 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 816 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 817 optp = NULL; /* we've parsed the options */ 818 } 819 } 820 tiflags = th->th_flags; 821 822 /* 823 * Locate pcb for segment. 824 */ 825 findpcb: 826 inp = NULL; 827 #ifdef INET6 828 in6p = NULL; 829 #endif 830 switch (af) { 831 #ifdef INET 832 case AF_INET: 833 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport, 834 ip->ip_dst, th->th_dport); 835 if (inp == 0) { 836 ++tcpstat.tcps_pcbhashmiss; 837 inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport); 838 } 839 #ifdef INET6 840 if (inp == 0) { 841 struct in6_addr s, d; 842 843 /* mapped addr case */ 844 bzero(&s, sizeof(s)); 845 s.s6_addr16[5] = htons(0xffff); 846 bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src)); 847 bzero(&d, sizeof(d)); 848 d.s6_addr16[5] = htons(0xffff); 849 bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst)); 850 in6p = in6_pcblookup_connect(&tcb6, &s, th->th_sport, 851 &d, th->th_dport, 0); 852 if (in6p == 0) { 853 ++tcpstat.tcps_pcbhashmiss; 854 in6p = in6_pcblookup_bind(&tcb6, &d, 855 th->th_dport, 0); 856 } 857 } 858 #endif 859 #ifndef INET6 860 if (inp == 0) 861 #else 862 if (inp == 0 && in6p == 0) 863 #endif 864 { 865 ++tcpstat.tcps_noport; 866 if (tcp_log_refused && (tiflags & TH_SYN)) { 867 #ifndef INET6 868 char src[4*sizeof "123"]; 869 char dst[4*sizeof "123"]; 870 #else 871 char src[INET6_ADDRSTRLEN]; 872 char dst[INET6_ADDRSTRLEN]; 873 #endif 874 if (ip) { 875 strcpy(src, inet_ntoa(ip->ip_src)); 876 strcpy(dst, inet_ntoa(ip->ip_dst)); 877 } 878 #ifdef INET6 879 else if (ip6) { 880 strcpy(src, ip6_sprintf(&ip6->ip6_src)); 881 strcpy(dst, ip6_sprintf(&ip6->ip6_dst)); 882 } 883 #endif 884 else { 885 strcpy(src, "(unknown)"); 886 strcpy(dst, "(unknown)"); 887 } 888 log(LOG_INFO, 889 "Connection attempt to TCP %s:%d from %s:%d\n", 890 dst, ntohs(th->th_dport), 891 src, ntohs(th->th_sport)); 892 } 893 TCP_FIELDS_TO_HOST(th); 894 goto dropwithreset_ratelim; 895 } 896 #ifdef IPSEC 897 if (inp && ipsec4_in_reject(m, inp)) { 898 ipsecstat.in_polvio++; 899 goto drop; 900 } 901 #ifdef INET6 902 else if (in6p && ipsec4_in_reject_so(m, in6p->in6p_socket)) { 903 ipsecstat.in_polvio++; 904 goto drop; 905 } 906 #endif 907 #endif /*IPSEC*/ 908 break; 909 #endif /*INET*/ 910 #ifdef INET6 911 case AF_INET6: 912 { 913 int faith; 914 915 #if defined(NFAITH) && NFAITH > 0 916 faith = faithprefix(&ip6->ip6_dst); 917 #else 918 faith = 0; 919 #endif 920 in6p = in6_pcblookup_connect(&tcb6, &ip6->ip6_src, th->th_sport, 921 &ip6->ip6_dst, th->th_dport, faith); 922 if (in6p == NULL) { 923 ++tcpstat.tcps_pcbhashmiss; 924 in6p = in6_pcblookup_bind(&tcb6, &ip6->ip6_dst, 925 th->th_dport, faith); 926 } 927 if (in6p == NULL) { 928 ++tcpstat.tcps_noport; 929 TCP_FIELDS_TO_HOST(th); 930 goto dropwithreset_ratelim; 931 } 932 #ifdef IPSEC 933 if (ipsec6_in_reject(m, in6p)) { 934 ipsec6stat.in_polvio++; 935 goto drop; 936 } 937 #endif /*IPSEC*/ 938 break; 939 } 940 #endif 941 } 942 943 /* 944 * If the state is CLOSED (i.e., TCB does not exist) then 945 * all data in the incoming segment is discarded. 946 * If the TCB exists but is in CLOSED state, it is embryonic, 947 * but should either do a listen or a connect soon. 948 */ 949 tp = NULL; 950 so = NULL; 951 if (inp) { 952 tp = intotcpcb(inp); 953 so = inp->inp_socket; 954 } 955 #ifdef INET6 956 else if (in6p) { 957 tp = in6totcpcb(in6p); 958 so = in6p->in6p_socket; 959 } 960 #endif 961 if (tp == 0) { 962 TCP_FIELDS_TO_HOST(th); 963 goto dropwithreset_ratelim; 964 } 965 if (tp->t_state == TCPS_CLOSED) 966 goto drop; 967 968 /* 969 * Checksum extended TCP header and data. 970 */ 971 switch (af) { 972 #ifdef INET 973 case AF_INET: 974 switch (m->m_pkthdr.csum_flags & 975 ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv4) | 976 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) { 977 case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD: 978 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad); 979 goto badcsum; 980 981 case M_CSUM_TCPv4|M_CSUM_DATA: 982 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data); 983 if ((m->m_pkthdr.csum_data ^ 0xffff) != 0) 984 goto badcsum; 985 break; 986 987 case M_CSUM_TCPv4: 988 /* Checksum was okay. */ 989 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok); 990 break; 991 992 default: 993 /* Must compute it ourselves. */ 994 TCP_CSUM_COUNTER_INCR(&tcp_swcsum); 995 #ifndef PULLDOWN_TEST 996 { 997 struct ipovly *ipov; 998 ipov = (struct ipovly *)ip; 999 bzero(ipov->ih_x1, sizeof ipov->ih_x1); 1000 ipov->ih_len = htons(tlen + off); 1001 1002 if (in_cksum(m, len) != 0) 1003 goto badcsum; 1004 } 1005 #else 1006 if (in4_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) 1007 goto badcsum; 1008 #endif /* ! PULLDOWN_TEST */ 1009 break; 1010 } 1011 break; 1012 #endif /* INET4 */ 1013 1014 #ifdef INET6 1015 case AF_INET6: 1016 if (in6_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) 1017 goto badcsum; 1018 break; 1019 #endif /* INET6 */ 1020 } 1021 1022 TCP_FIELDS_TO_HOST(th); 1023 1024 /* Unscale the window into a 32-bit value. */ 1025 if ((tiflags & TH_SYN) == 0) 1026 tiwin = th->th_win << tp->snd_scale; 1027 else 1028 tiwin = th->th_win; 1029 1030 #ifdef INET6 1031 /* save packet options if user wanted */ 1032 if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) { 1033 if (in6p->in6p_options) { 1034 m_freem(in6p->in6p_options); 1035 in6p->in6p_options = 0; 1036 } 1037 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m); 1038 } 1039 #endif 1040 1041 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 1042 union syn_cache_sa src; 1043 union syn_cache_sa dst; 1044 1045 bzero(&src, sizeof(src)); 1046 bzero(&dst, sizeof(dst)); 1047 switch (af) { 1048 #ifdef INET 1049 case AF_INET: 1050 src.sin.sin_len = sizeof(struct sockaddr_in); 1051 src.sin.sin_family = AF_INET; 1052 src.sin.sin_addr = ip->ip_src; 1053 src.sin.sin_port = th->th_sport; 1054 1055 dst.sin.sin_len = sizeof(struct sockaddr_in); 1056 dst.sin.sin_family = AF_INET; 1057 dst.sin.sin_addr = ip->ip_dst; 1058 dst.sin.sin_port = th->th_dport; 1059 break; 1060 #endif 1061 #ifdef INET6 1062 case AF_INET6: 1063 src.sin6.sin6_len = sizeof(struct sockaddr_in6); 1064 src.sin6.sin6_family = AF_INET6; 1065 src.sin6.sin6_addr = ip6->ip6_src; 1066 src.sin6.sin6_port = th->th_sport; 1067 1068 dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 1069 dst.sin6.sin6_family = AF_INET6; 1070 dst.sin6.sin6_addr = ip6->ip6_dst; 1071 dst.sin6.sin6_port = th->th_dport; 1072 break; 1073 #endif /* INET6 */ 1074 default: 1075 goto badsyn; /*sanity*/ 1076 } 1077 1078 if (so->so_options & SO_DEBUG) { 1079 ostate = tp->t_state; 1080 1081 tcp_saveti = NULL; 1082 if (iphlen + sizeof(struct tcphdr) > MHLEN) 1083 goto nosave; 1084 1085 if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) { 1086 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT); 1087 if (!tcp_saveti) 1088 goto nosave; 1089 } else { 1090 MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER); 1091 if (!tcp_saveti) 1092 goto nosave; 1093 tcp_saveti->m_len = iphlen; 1094 m_copydata(m, 0, iphlen, 1095 mtod(tcp_saveti, caddr_t)); 1096 } 1097 1098 if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) { 1099 m_freem(tcp_saveti); 1100 tcp_saveti = NULL; 1101 } else { 1102 tcp_saveti->m_len += sizeof(struct tcphdr); 1103 bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen, 1104 sizeof(struct tcphdr)); 1105 } 1106 if (tcp_saveti) { 1107 /* 1108 * need to recover version # field, which was 1109 * overwritten on ip_cksum computation. 1110 */ 1111 struct ip *sip; 1112 sip = mtod(tcp_saveti, struct ip *); 1113 switch (af) { 1114 #ifdef INET 1115 case AF_INET: 1116 sip->ip_v = 4; 1117 break; 1118 #endif 1119 #ifdef INET6 1120 case AF_INET6: 1121 sip->ip_v = 6; 1122 break; 1123 #endif 1124 } 1125 } 1126 nosave:; 1127 } 1128 if (so->so_options & SO_ACCEPTCONN) { 1129 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 1130 if (tiflags & TH_RST) { 1131 syn_cache_reset(&src.sa, &dst.sa, th); 1132 } else if ((tiflags & (TH_ACK|TH_SYN)) == 1133 (TH_ACK|TH_SYN)) { 1134 /* 1135 * Received a SYN,ACK. This should 1136 * never happen while we are in 1137 * LISTEN. Send an RST. 1138 */ 1139 goto badsyn; 1140 } else if (tiflags & TH_ACK) { 1141 so = syn_cache_get(&src.sa, &dst.sa, 1142 th, toff, tlen, so, m); 1143 if (so == NULL) { 1144 /* 1145 * We don't have a SYN for 1146 * this ACK; send an RST. 1147 */ 1148 goto badsyn; 1149 } else if (so == 1150 (struct socket *)(-1)) { 1151 /* 1152 * We were unable to create 1153 * the connection. If the 1154 * 3-way handshake was 1155 * completed, and RST has 1156 * been sent to the peer. 1157 * Since the mbuf might be 1158 * in use for the reply, 1159 * do not free it. 1160 */ 1161 m = NULL; 1162 } else { 1163 /* 1164 * We have created a 1165 * full-blown connection. 1166 */ 1167 tp = NULL; 1168 inp = NULL; 1169 #ifdef INET6 1170 in6p = NULL; 1171 #endif 1172 switch (so->so_proto->pr_domain->dom_family) { 1173 #ifdef INET 1174 case AF_INET: 1175 inp = sotoinpcb(so); 1176 tp = intotcpcb(inp); 1177 break; 1178 #endif 1179 #ifdef INET6 1180 case AF_INET6: 1181 in6p = sotoin6pcb(so); 1182 tp = in6totcpcb(in6p); 1183 break; 1184 #endif 1185 } 1186 if (tp == NULL) 1187 goto badsyn; /*XXX*/ 1188 tiwin <<= tp->snd_scale; 1189 goto after_listen; 1190 } 1191 } else { 1192 /* 1193 * None of RST, SYN or ACK was set. 1194 * This is an invalid packet for a 1195 * TCB in LISTEN state. Send a RST. 1196 */ 1197 goto badsyn; 1198 } 1199 } else { 1200 /* 1201 * Received a SYN. 1202 */ 1203 1204 /* 1205 * LISTEN socket received a SYN 1206 * from itself? This can't possibly 1207 * be valid; drop the packet. 1208 */ 1209 if (th->th_sport == th->th_dport) { 1210 int i; 1211 1212 switch (af) { 1213 #ifdef INET 1214 case AF_INET: 1215 i = in_hosteq(ip->ip_src, ip->ip_dst); 1216 break; 1217 #endif 1218 #ifdef INET6 1219 case AF_INET6: 1220 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst); 1221 break; 1222 #endif 1223 default: 1224 i = 1; 1225 } 1226 if (i) { 1227 tcpstat.tcps_badsyn++; 1228 goto drop; 1229 } 1230 } 1231 1232 /* 1233 * SYN looks ok; create compressed TCP 1234 * state for it. 1235 */ 1236 if (so->so_qlen <= so->so_qlimit && 1237 syn_cache_add(&src.sa, &dst.sa, th, tlen, 1238 so, m, optp, optlen, &opti)) 1239 m = NULL; 1240 } 1241 goto drop; 1242 } 1243 } 1244 1245 after_listen: 1246 #ifdef DIAGNOSTIC 1247 /* 1248 * Should not happen now that all embryonic connections 1249 * are handled with compressed state. 1250 */ 1251 if (tp->t_state == TCPS_LISTEN) 1252 panic("tcp_input: TCPS_LISTEN"); 1253 #endif 1254 1255 /* 1256 * Segment received on connection. 1257 * Reset idle time and keep-alive timer. 1258 */ 1259 tp->t_rcvtime = tcp_now; 1260 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1261 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1262 1263 /* 1264 * Process options. 1265 */ 1266 if (optp) 1267 tcp_dooptions(tp, optp, optlen, th, &opti); 1268 1269 /* 1270 * Header prediction: check for the two common cases 1271 * of a uni-directional data xfer. If the packet has 1272 * no control flags, is in-sequence, the window didn't 1273 * change and we're not retransmitting, it's a 1274 * candidate. If the length is zero and the ack moved 1275 * forward, we're the sender side of the xfer. Just 1276 * free the data acked & wake any higher level process 1277 * that was blocked waiting for space. If the length 1278 * is non-zero and the ack didn't move, we're the 1279 * receiver side. If we're getting packets in-order 1280 * (the reassembly queue is empty), add the data to 1281 * the socket buffer and note that we need a delayed ack. 1282 */ 1283 if (tp->t_state == TCPS_ESTABLISHED && 1284 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1285 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1286 th->th_seq == tp->rcv_nxt && 1287 tiwin && tiwin == tp->snd_wnd && 1288 tp->snd_nxt == tp->snd_max) { 1289 1290 /* 1291 * If last ACK falls within this segment's sequence numbers, 1292 * record the timestamp. 1293 */ 1294 if (opti.ts_present && 1295 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1296 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen)) { 1297 tp->ts_recent_age = TCP_TIMESTAMP(tp); 1298 tp->ts_recent = opti.ts_val; 1299 } 1300 1301 if (tlen == 0) { 1302 if (SEQ_GT(th->th_ack, tp->snd_una) && 1303 SEQ_LEQ(th->th_ack, tp->snd_max) && 1304 tp->snd_cwnd >= tp->snd_wnd && 1305 tp->t_dupacks < tcprexmtthresh) { 1306 /* 1307 * this is a pure ack for outstanding data. 1308 */ 1309 ++tcpstat.tcps_predack; 1310 if (opti.ts_present && opti.ts_ecr) 1311 tcp_xmit_timer(tp, 1312 TCP_TIMESTAMP(tp) - opti.ts_ecr + 1); 1313 else if (tp->t_rtttime && 1314 SEQ_GT(th->th_ack, tp->t_rtseq)) 1315 tcp_xmit_timer(tp, 1316 tcp_now - tp->t_rtttime); 1317 acked = th->th_ack - tp->snd_una; 1318 tcpstat.tcps_rcvackpack++; 1319 tcpstat.tcps_rcvackbyte += acked; 1320 ND6_HINT(tp); 1321 sbdrop(&so->so_snd, acked); 1322 /* 1323 * We want snd_recover to track snd_una to 1324 * avoid sequence wraparound problems for 1325 * very large transfers. 1326 */ 1327 tp->snd_una = tp->snd_recover = th->th_ack; 1328 m_freem(m); 1329 1330 /* 1331 * If all outstanding data are acked, stop 1332 * retransmit timer, otherwise restart timer 1333 * using current (possibly backed-off) value. 1334 * If process is waiting for space, 1335 * wakeup/selwakeup/signal. If data 1336 * are ready to send, let tcp_output 1337 * decide between more output or persist. 1338 */ 1339 if (tp->snd_una == tp->snd_max) 1340 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1341 else if (TCP_TIMER_ISARMED(tp, 1342 TCPT_PERSIST) == 0) 1343 TCP_TIMER_ARM(tp, TCPT_REXMT, 1344 tp->t_rxtcur); 1345 1346 sowwakeup(so); 1347 if (so->so_snd.sb_cc) 1348 (void) tcp_output(tp); 1349 if (tcp_saveti) 1350 m_freem(tcp_saveti); 1351 return; 1352 } 1353 } else if (th->th_ack == tp->snd_una && 1354 LIST_FIRST(&tp->segq) == NULL && 1355 tlen <= sbspace(&so->so_rcv)) { 1356 /* 1357 * this is a pure, in-sequence data packet 1358 * with nothing on the reassembly queue and 1359 * we have enough buffer space to take it. 1360 */ 1361 ++tcpstat.tcps_preddat; 1362 tp->rcv_nxt += tlen; 1363 tcpstat.tcps_rcvpack++; 1364 tcpstat.tcps_rcvbyte += tlen; 1365 ND6_HINT(tp); 1366 /* 1367 * Drop TCP, IP headers and TCP options then add data 1368 * to socket buffer. 1369 */ 1370 m_adj(m, toff + off); 1371 sbappend(&so->so_rcv, m); 1372 sorwakeup(so); 1373 TCP_SETUP_ACK(tp, th); 1374 if (tp->t_flags & TF_ACKNOW) 1375 (void) tcp_output(tp); 1376 if (tcp_saveti) 1377 m_freem(tcp_saveti); 1378 return; 1379 } 1380 } 1381 1382 /* 1383 * Compute mbuf offset to TCP data segment. 1384 */ 1385 hdroptlen = toff + off; 1386 1387 /* 1388 * Calculate amount of space in receive window, 1389 * and then do TCP input processing. 1390 * Receive window is amount of space in rcv queue, 1391 * but not less than advertised window. 1392 */ 1393 { int win; 1394 1395 win = sbspace(&so->so_rcv); 1396 if (win < 0) 1397 win = 0; 1398 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1399 } 1400 1401 switch (tp->t_state) { 1402 1403 /* 1404 * If the state is SYN_SENT: 1405 * if seg contains an ACK, but not for our SYN, drop the input. 1406 * if seg contains a RST, then drop the connection. 1407 * if seg does not contain SYN, then drop it. 1408 * Otherwise this is an acceptable SYN segment 1409 * initialize tp->rcv_nxt and tp->irs 1410 * if seg contains ack then advance tp->snd_una 1411 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1412 * arrange for segment to be acked (eventually) 1413 * continue processing rest of data/controls, beginning with URG 1414 */ 1415 case TCPS_SYN_SENT: 1416 if ((tiflags & TH_ACK) && 1417 (SEQ_LEQ(th->th_ack, tp->iss) || 1418 SEQ_GT(th->th_ack, tp->snd_max))) 1419 goto dropwithreset; 1420 if (tiflags & TH_RST) { 1421 if (tiflags & TH_ACK) 1422 tp = tcp_drop(tp, ECONNREFUSED); 1423 goto drop; 1424 } 1425 if ((tiflags & TH_SYN) == 0) 1426 goto drop; 1427 if (tiflags & TH_ACK) { 1428 tp->snd_una = tp->snd_recover = th->th_ack; 1429 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1430 tp->snd_nxt = tp->snd_una; 1431 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1432 } 1433 tp->irs = th->th_seq; 1434 tcp_rcvseqinit(tp); 1435 tp->t_flags |= TF_ACKNOW; 1436 tcp_mss_from_peer(tp, opti.maxseg); 1437 1438 /* 1439 * Initialize the initial congestion window. If we 1440 * had to retransmit the SYN, we must initialize cwnd 1441 * to 1 segment (i.e. the Loss Window). 1442 */ 1443 if (tp->t_flags & TF_SYN_REXMT) 1444 tp->snd_cwnd = tp->t_peermss; 1445 else 1446 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, 1447 tp->t_peermss); 1448 1449 tcp_rmx_rtt(tp); 1450 if (tiflags & TH_ACK) { 1451 tcpstat.tcps_connects++; 1452 soisconnected(so); 1453 tcp_established(tp); 1454 /* Do window scaling on this connection? */ 1455 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1456 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1457 tp->snd_scale = tp->requested_s_scale; 1458 tp->rcv_scale = tp->request_r_scale; 1459 } 1460 TCP_REASS_LOCK(tp); 1461 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1462 TCP_REASS_UNLOCK(tp); 1463 /* 1464 * if we didn't have to retransmit the SYN, 1465 * use its rtt as our initial srtt & rtt var. 1466 */ 1467 if (tp->t_rtttime) 1468 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1469 } else 1470 tp->t_state = TCPS_SYN_RECEIVED; 1471 1472 /* 1473 * Advance th->th_seq to correspond to first data byte. 1474 * If data, trim to stay within window, 1475 * dropping FIN if necessary. 1476 */ 1477 th->th_seq++; 1478 if (tlen > tp->rcv_wnd) { 1479 todrop = tlen - tp->rcv_wnd; 1480 m_adj(m, -todrop); 1481 tlen = tp->rcv_wnd; 1482 tiflags &= ~TH_FIN; 1483 tcpstat.tcps_rcvpackafterwin++; 1484 tcpstat.tcps_rcvbyteafterwin += todrop; 1485 } 1486 tp->snd_wl1 = th->th_seq - 1; 1487 tp->rcv_up = th->th_seq; 1488 goto step6; 1489 1490 /* 1491 * If the state is SYN_RECEIVED: 1492 * If seg contains an ACK, but not for our SYN, drop the input 1493 * and generate an RST. See page 36, rfc793 1494 */ 1495 case TCPS_SYN_RECEIVED: 1496 if ((tiflags & TH_ACK) && 1497 (SEQ_LEQ(th->th_ack, tp->iss) || 1498 SEQ_GT(th->th_ack, tp->snd_max))) 1499 goto dropwithreset; 1500 break; 1501 } 1502 1503 /* 1504 * States other than LISTEN or SYN_SENT. 1505 * First check timestamp, if present. 1506 * Then check that at least some bytes of segment are within 1507 * receive window. If segment begins before rcv_nxt, 1508 * drop leading data (and SYN); if nothing left, just ack. 1509 * 1510 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1511 * and it's less than ts_recent, drop it. 1512 */ 1513 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1514 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1515 1516 /* Check to see if ts_recent is over 24 days old. */ 1517 if ((int)(TCP_TIMESTAMP(tp) - tp->ts_recent_age) > 1518 TCP_PAWS_IDLE) { 1519 /* 1520 * Invalidate ts_recent. If this segment updates 1521 * ts_recent, the age will be reset later and ts_recent 1522 * will get a valid value. If it does not, setting 1523 * ts_recent to zero will at least satisfy the 1524 * requirement that zero be placed in the timestamp 1525 * echo reply when ts_recent isn't valid. The 1526 * age isn't reset until we get a valid ts_recent 1527 * because we don't want out-of-order segments to be 1528 * dropped when ts_recent is old. 1529 */ 1530 tp->ts_recent = 0; 1531 } else { 1532 tcpstat.tcps_rcvduppack++; 1533 tcpstat.tcps_rcvdupbyte += tlen; 1534 tcpstat.tcps_pawsdrop++; 1535 goto dropafterack; 1536 } 1537 } 1538 1539 todrop = tp->rcv_nxt - th->th_seq; 1540 if (todrop > 0) { 1541 if (tiflags & TH_SYN) { 1542 tiflags &= ~TH_SYN; 1543 th->th_seq++; 1544 if (th->th_urp > 1) 1545 th->th_urp--; 1546 else { 1547 tiflags &= ~TH_URG; 1548 th->th_urp = 0; 1549 } 1550 todrop--; 1551 } 1552 if (todrop > tlen || 1553 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1554 /* 1555 * Any valid FIN must be to the left of the window. 1556 * At this point the FIN must be a duplicate or 1557 * out of sequence; drop it. 1558 */ 1559 tiflags &= ~TH_FIN; 1560 /* 1561 * Send an ACK to resynchronize and drop any data. 1562 * But keep on processing for RST or ACK. 1563 */ 1564 tp->t_flags |= TF_ACKNOW; 1565 todrop = tlen; 1566 tcpstat.tcps_rcvdupbyte += todrop; 1567 tcpstat.tcps_rcvduppack++; 1568 } else { 1569 tcpstat.tcps_rcvpartduppack++; 1570 tcpstat.tcps_rcvpartdupbyte += todrop; 1571 } 1572 hdroptlen += todrop; /*drop from head afterwards*/ 1573 th->th_seq += todrop; 1574 tlen -= todrop; 1575 if (th->th_urp > todrop) 1576 th->th_urp -= todrop; 1577 else { 1578 tiflags &= ~TH_URG; 1579 th->th_urp = 0; 1580 } 1581 } 1582 1583 /* 1584 * If new data are received on a connection after the 1585 * user processes are gone, then RST the other end. 1586 */ 1587 if ((so->so_state & SS_NOFDREF) && 1588 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1589 tp = tcp_close(tp); 1590 tcpstat.tcps_rcvafterclose++; 1591 goto dropwithreset; 1592 } 1593 1594 /* 1595 * If segment ends after window, drop trailing data 1596 * (and PUSH and FIN); if nothing left, just ACK. 1597 */ 1598 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1599 if (todrop > 0) { 1600 tcpstat.tcps_rcvpackafterwin++; 1601 if (todrop >= tlen) { 1602 tcpstat.tcps_rcvbyteafterwin += tlen; 1603 /* 1604 * If a new connection request is received 1605 * while in TIME_WAIT, drop the old connection 1606 * and start over if the sequence numbers 1607 * are above the previous ones. 1608 */ 1609 if (tiflags & TH_SYN && 1610 tp->t_state == TCPS_TIME_WAIT && 1611 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1612 iss = tcp_new_iss(tp, tp->snd_nxt); 1613 tp = tcp_close(tp); 1614 goto findpcb; 1615 } 1616 /* 1617 * If window is closed can only take segments at 1618 * window edge, and have to drop data and PUSH from 1619 * incoming segments. Continue processing, but 1620 * remember to ack. Otherwise, drop segment 1621 * and ack. 1622 */ 1623 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1624 tp->t_flags |= TF_ACKNOW; 1625 tcpstat.tcps_rcvwinprobe++; 1626 } else 1627 goto dropafterack; 1628 } else 1629 tcpstat.tcps_rcvbyteafterwin += todrop; 1630 m_adj(m, -todrop); 1631 tlen -= todrop; 1632 tiflags &= ~(TH_PUSH|TH_FIN); 1633 } 1634 1635 /* 1636 * If last ACK falls within this segment's sequence numbers, 1637 * and the timestamp is newer, record it. 1638 */ 1639 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1640 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1641 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen + 1642 ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 1643 tp->ts_recent_age = TCP_TIMESTAMP(tp); 1644 tp->ts_recent = opti.ts_val; 1645 } 1646 1647 /* 1648 * If the RST bit is set examine the state: 1649 * SYN_RECEIVED STATE: 1650 * If passive open, return to LISTEN state. 1651 * If active open, inform user that connection was refused. 1652 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1653 * Inform user that connection was reset, and close tcb. 1654 * CLOSING, LAST_ACK, TIME_WAIT STATES 1655 * Close the tcb. 1656 */ 1657 if (tiflags&TH_RST) switch (tp->t_state) { 1658 1659 case TCPS_SYN_RECEIVED: 1660 so->so_error = ECONNREFUSED; 1661 goto close; 1662 1663 case TCPS_ESTABLISHED: 1664 case TCPS_FIN_WAIT_1: 1665 case TCPS_FIN_WAIT_2: 1666 case TCPS_CLOSE_WAIT: 1667 so->so_error = ECONNRESET; 1668 close: 1669 tp->t_state = TCPS_CLOSED; 1670 tcpstat.tcps_drops++; 1671 tp = tcp_close(tp); 1672 goto drop; 1673 1674 case TCPS_CLOSING: 1675 case TCPS_LAST_ACK: 1676 case TCPS_TIME_WAIT: 1677 tp = tcp_close(tp); 1678 goto drop; 1679 } 1680 1681 /* 1682 * If a SYN is in the window, then this is an 1683 * error and we send an RST and drop the connection. 1684 */ 1685 if (tiflags & TH_SYN) { 1686 tp = tcp_drop(tp, ECONNRESET); 1687 goto dropwithreset; 1688 } 1689 1690 /* 1691 * If the ACK bit is off we drop the segment and return. 1692 */ 1693 if ((tiflags & TH_ACK) == 0) { 1694 if (tp->t_flags & TF_ACKNOW) 1695 goto dropafterack; 1696 else 1697 goto drop; 1698 } 1699 1700 /* 1701 * Ack processing. 1702 */ 1703 switch (tp->t_state) { 1704 1705 /* 1706 * In SYN_RECEIVED state if the ack ACKs our SYN then enter 1707 * ESTABLISHED state and continue processing, otherwise 1708 * send an RST. 1709 */ 1710 case TCPS_SYN_RECEIVED: 1711 if (SEQ_GT(tp->snd_una, th->th_ack) || 1712 SEQ_GT(th->th_ack, tp->snd_max)) 1713 goto dropwithreset; 1714 tcpstat.tcps_connects++; 1715 soisconnected(so); 1716 tcp_established(tp); 1717 /* Do window scaling? */ 1718 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1719 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1720 tp->snd_scale = tp->requested_s_scale; 1721 tp->rcv_scale = tp->request_r_scale; 1722 } 1723 TCP_REASS_LOCK(tp); 1724 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1725 TCP_REASS_UNLOCK(tp); 1726 tp->snd_wl1 = th->th_seq - 1; 1727 /* fall into ... */ 1728 1729 /* 1730 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1731 * ACKs. If the ack is in the range 1732 * tp->snd_una < th->th_ack <= tp->snd_max 1733 * then advance tp->snd_una to th->th_ack and drop 1734 * data from the retransmission queue. If this ACK reflects 1735 * more up to date window information we update our window information. 1736 */ 1737 case TCPS_ESTABLISHED: 1738 case TCPS_FIN_WAIT_1: 1739 case TCPS_FIN_WAIT_2: 1740 case TCPS_CLOSE_WAIT: 1741 case TCPS_CLOSING: 1742 case TCPS_LAST_ACK: 1743 case TCPS_TIME_WAIT: 1744 1745 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1746 if (tlen == 0 && tiwin == tp->snd_wnd) { 1747 tcpstat.tcps_rcvdupack++; 1748 /* 1749 * If we have outstanding data (other than 1750 * a window probe), this is a completely 1751 * duplicate ack (ie, window info didn't 1752 * change), the ack is the biggest we've 1753 * seen and we've seen exactly our rexmt 1754 * threshhold of them, assume a packet 1755 * has been dropped and retransmit it. 1756 * Kludge snd_nxt & the congestion 1757 * window so we send only this one 1758 * packet. 1759 * 1760 * We know we're losing at the current 1761 * window size so do congestion avoidance 1762 * (set ssthresh to half the current window 1763 * and pull our congestion window back to 1764 * the new ssthresh). 1765 * 1766 * Dup acks mean that packets have left the 1767 * network (they're now cached at the receiver) 1768 * so bump cwnd by the amount in the receiver 1769 * to keep a constant cwnd packets in the 1770 * network. 1771 */ 1772 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 || 1773 th->th_ack != tp->snd_una) 1774 tp->t_dupacks = 0; 1775 else if (++tp->t_dupacks == tcprexmtthresh) { 1776 tcp_seq onxt = tp->snd_nxt; 1777 u_int win = 1778 min(tp->snd_wnd, tp->snd_cwnd) / 1779 2 / tp->t_segsz; 1780 if (tcp_do_newreno && SEQ_LT(th->th_ack, 1781 tp->snd_recover)) { 1782 /* 1783 * False fast retransmit after 1784 * timeout. Do not cut window. 1785 */ 1786 tp->snd_cwnd += tp->t_segsz; 1787 tp->t_dupacks = 0; 1788 (void) tcp_output(tp); 1789 goto drop; 1790 } 1791 1792 if (win < 2) 1793 win = 2; 1794 tp->snd_ssthresh = win * tp->t_segsz; 1795 tp->snd_recover = tp->snd_max; 1796 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1797 tp->t_rtttime = 0; 1798 tp->snd_nxt = th->th_ack; 1799 tp->snd_cwnd = tp->t_segsz; 1800 (void) tcp_output(tp); 1801 tp->snd_cwnd = tp->snd_ssthresh + 1802 tp->t_segsz * tp->t_dupacks; 1803 if (SEQ_GT(onxt, tp->snd_nxt)) 1804 tp->snd_nxt = onxt; 1805 goto drop; 1806 } else if (tp->t_dupacks > tcprexmtthresh) { 1807 tp->snd_cwnd += tp->t_segsz; 1808 (void) tcp_output(tp); 1809 goto drop; 1810 } 1811 } else 1812 tp->t_dupacks = 0; 1813 break; 1814 } 1815 /* 1816 * If the congestion window was inflated to account 1817 * for the other side's cached packets, retract it. 1818 */ 1819 if (tcp_do_newreno == 0) { 1820 if (tp->t_dupacks >= tcprexmtthresh && 1821 tp->snd_cwnd > tp->snd_ssthresh) 1822 tp->snd_cwnd = tp->snd_ssthresh; 1823 tp->t_dupacks = 0; 1824 } else if (tp->t_dupacks >= tcprexmtthresh && 1825 tcp_newreno(tp, th) == 0) { 1826 tp->snd_cwnd = tp->snd_ssthresh; 1827 /* 1828 * Window inflation should have left us with approx. 1829 * snd_ssthresh outstanding data. But in case we 1830 * would be inclined to send a burst, better to do 1831 * it via the slow start mechanism. 1832 */ 1833 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 1834 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 1835 + tp->t_segsz; 1836 tp->t_dupacks = 0; 1837 } 1838 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1839 tcpstat.tcps_rcvacktoomuch++; 1840 goto dropafterack; 1841 } 1842 acked = th->th_ack - tp->snd_una; 1843 tcpstat.tcps_rcvackpack++; 1844 tcpstat.tcps_rcvackbyte += acked; 1845 1846 /* 1847 * If we have a timestamp reply, update smoothed 1848 * round trip time. If no timestamp is present but 1849 * transmit timer is running and timed sequence 1850 * number was acked, update smoothed round trip time. 1851 * Since we now have an rtt measurement, cancel the 1852 * timer backoff (cf., Phil Karn's retransmit alg.). 1853 * Recompute the initial retransmit timer. 1854 */ 1855 if (opti.ts_present && opti.ts_ecr) 1856 tcp_xmit_timer(tp, TCP_TIMESTAMP(tp) - opti.ts_ecr + 1); 1857 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1858 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1859 1860 /* 1861 * If all outstanding data is acked, stop retransmit 1862 * timer and remember to restart (more output or persist). 1863 * If there is more data to be acked, restart retransmit 1864 * timer, using current (possibly backed-off) value. 1865 */ 1866 if (th->th_ack == tp->snd_max) { 1867 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1868 needoutput = 1; 1869 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1870 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1871 /* 1872 * When new data is acked, open the congestion window. 1873 * If the window gives us less than ssthresh packets 1874 * in flight, open exponentially (segsz per packet). 1875 * Otherwise open linearly: segsz per window 1876 * (segsz^2 / cwnd per packet), plus a constant 1877 * fraction of a packet (segsz/8) to help larger windows 1878 * open quickly enough. 1879 */ 1880 { 1881 u_int cw = tp->snd_cwnd; 1882 u_int incr = tp->t_segsz; 1883 1884 if (cw > tp->snd_ssthresh) 1885 incr = incr * incr / cw; 1886 if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover)) 1887 tp->snd_cwnd = min(cw + incr, 1888 TCP_MAXWIN << tp->snd_scale); 1889 } 1890 ND6_HINT(tp); 1891 if (acked > so->so_snd.sb_cc) { 1892 tp->snd_wnd -= so->so_snd.sb_cc; 1893 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1894 ourfinisacked = 1; 1895 } else { 1896 sbdrop(&so->so_snd, acked); 1897 tp->snd_wnd -= acked; 1898 ourfinisacked = 0; 1899 } 1900 sowwakeup(so); 1901 /* 1902 * We want snd_recover to track snd_una to 1903 * avoid sequence wraparound problems for 1904 * very large transfers. 1905 */ 1906 tp->snd_una = tp->snd_recover = th->th_ack; 1907 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1908 tp->snd_nxt = tp->snd_una; 1909 1910 switch (tp->t_state) { 1911 1912 /* 1913 * In FIN_WAIT_1 STATE in addition to the processing 1914 * for the ESTABLISHED state if our FIN is now acknowledged 1915 * then enter FIN_WAIT_2. 1916 */ 1917 case TCPS_FIN_WAIT_1: 1918 if (ourfinisacked) { 1919 /* 1920 * If we can't receive any more 1921 * data, then closing user can proceed. 1922 * Starting the timer is contrary to the 1923 * specification, but if we don't get a FIN 1924 * we'll hang forever. 1925 */ 1926 if (so->so_state & SS_CANTRCVMORE) { 1927 soisdisconnected(so); 1928 if (tcp_maxidle > 0) 1929 TCP_TIMER_ARM(tp, TCPT_2MSL, 1930 tcp_maxidle); 1931 } 1932 tp->t_state = TCPS_FIN_WAIT_2; 1933 } 1934 break; 1935 1936 /* 1937 * In CLOSING STATE in addition to the processing for 1938 * the ESTABLISHED state if the ACK acknowledges our FIN 1939 * then enter the TIME-WAIT state, otherwise ignore 1940 * the segment. 1941 */ 1942 case TCPS_CLOSING: 1943 if (ourfinisacked) { 1944 tp->t_state = TCPS_TIME_WAIT; 1945 tcp_canceltimers(tp); 1946 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1947 soisdisconnected(so); 1948 } 1949 break; 1950 1951 /* 1952 * In LAST_ACK, we may still be waiting for data to drain 1953 * and/or to be acked, as well as for the ack of our FIN. 1954 * If our FIN is now acknowledged, delete the TCB, 1955 * enter the closed state and return. 1956 */ 1957 case TCPS_LAST_ACK: 1958 if (ourfinisacked) { 1959 tp = tcp_close(tp); 1960 goto drop; 1961 } 1962 break; 1963 1964 /* 1965 * In TIME_WAIT state the only thing that should arrive 1966 * is a retransmission of the remote FIN. Acknowledge 1967 * it and restart the finack timer. 1968 */ 1969 case TCPS_TIME_WAIT: 1970 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1971 goto dropafterack; 1972 } 1973 } 1974 1975 step6: 1976 /* 1977 * Update window information. 1978 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1979 */ 1980 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 1981 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) || 1982 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) { 1983 /* keep track of pure window updates */ 1984 if (tlen == 0 && 1985 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1986 tcpstat.tcps_rcvwinupd++; 1987 tp->snd_wnd = tiwin; 1988 tp->snd_wl1 = th->th_seq; 1989 tp->snd_wl2 = th->th_ack; 1990 if (tp->snd_wnd > tp->max_sndwnd) 1991 tp->max_sndwnd = tp->snd_wnd; 1992 needoutput = 1; 1993 } 1994 1995 /* 1996 * Process segments with URG. 1997 */ 1998 if ((tiflags & TH_URG) && th->th_urp && 1999 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2000 /* 2001 * This is a kludge, but if we receive and accept 2002 * random urgent pointers, we'll crash in 2003 * soreceive. It's hard to imagine someone 2004 * actually wanting to send this much urgent data. 2005 */ 2006 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2007 th->th_urp = 0; /* XXX */ 2008 tiflags &= ~TH_URG; /* XXX */ 2009 goto dodata; /* XXX */ 2010 } 2011 /* 2012 * If this segment advances the known urgent pointer, 2013 * then mark the data stream. This should not happen 2014 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2015 * a FIN has been received from the remote side. 2016 * In these states we ignore the URG. 2017 * 2018 * According to RFC961 (Assigned Protocols), 2019 * the urgent pointer points to the last octet 2020 * of urgent data. We continue, however, 2021 * to consider it to indicate the first octet 2022 * of data past the urgent section as the original 2023 * spec states (in one of two places). 2024 */ 2025 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2026 tp->rcv_up = th->th_seq + th->th_urp; 2027 so->so_oobmark = so->so_rcv.sb_cc + 2028 (tp->rcv_up - tp->rcv_nxt) - 1; 2029 if (so->so_oobmark == 0) 2030 so->so_state |= SS_RCVATMARK; 2031 sohasoutofband(so); 2032 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2033 } 2034 /* 2035 * Remove out of band data so doesn't get presented to user. 2036 * This can happen independent of advancing the URG pointer, 2037 * but if two URG's are pending at once, some out-of-band 2038 * data may creep in... ick. 2039 */ 2040 if (th->th_urp <= (u_int16_t) tlen 2041 #ifdef SO_OOBINLINE 2042 && (so->so_options & SO_OOBINLINE) == 0 2043 #endif 2044 ) 2045 tcp_pulloutofband(so, th, m, hdroptlen); 2046 } else 2047 /* 2048 * If no out of band data is expected, 2049 * pull receive urgent pointer along 2050 * with the receive window. 2051 */ 2052 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2053 tp->rcv_up = tp->rcv_nxt; 2054 dodata: /* XXX */ 2055 2056 /* 2057 * Process the segment text, merging it into the TCP sequencing queue, 2058 * and arranging for acknowledgement of receipt if necessary. 2059 * This process logically involves adjusting tp->rcv_wnd as data 2060 * is presented to the user (this happens in tcp_usrreq.c, 2061 * case PRU_RCVD). If a FIN has already been received on this 2062 * connection then we just ignore the text. 2063 */ 2064 if ((tlen || (tiflags & TH_FIN)) && 2065 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2066 /* 2067 * Insert segment ti into reassembly queue of tcp with 2068 * control block tp. Return TH_FIN if reassembly now includes 2069 * a segment with FIN. The macro form does the common case 2070 * inline (segment is the next to be received on an 2071 * established connection, and the queue is empty), 2072 * avoiding linkage into and removal from the queue and 2073 * repetition of various conversions. 2074 * Set DELACK for segments received in order, but ack 2075 * immediately when segments are out of order 2076 * (so fast retransmit can work). 2077 */ 2078 /* NOTE: this was TCP_REASS() macro, but used only once */ 2079 TCP_REASS_LOCK(tp); 2080 if (th->th_seq == tp->rcv_nxt && 2081 LIST_FIRST(&tp->segq) == NULL && 2082 tp->t_state == TCPS_ESTABLISHED) { 2083 TCP_SETUP_ACK(tp, th); 2084 tp->rcv_nxt += tlen; 2085 tiflags = th->th_flags & TH_FIN; 2086 tcpstat.tcps_rcvpack++; 2087 tcpstat.tcps_rcvbyte += tlen; 2088 ND6_HINT(tp); 2089 m_adj(m, hdroptlen); 2090 sbappend(&(so)->so_rcv, m); 2091 sorwakeup(so); 2092 } else { 2093 m_adj(m, hdroptlen); 2094 tiflags = tcp_reass(tp, th, m, &tlen); 2095 tp->t_flags |= TF_ACKNOW; 2096 } 2097 TCP_REASS_UNLOCK(tp); 2098 2099 /* 2100 * Note the amount of data that peer has sent into 2101 * our window, in order to estimate the sender's 2102 * buffer size. 2103 */ 2104 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2105 } else { 2106 m_freem(m); 2107 m = NULL; 2108 tiflags &= ~TH_FIN; 2109 } 2110 2111 /* 2112 * If FIN is received ACK the FIN and let the user know 2113 * that the connection is closing. Ignore a FIN received before 2114 * the connection is fully established. 2115 */ 2116 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2117 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2118 socantrcvmore(so); 2119 tp->t_flags |= TF_ACKNOW; 2120 tp->rcv_nxt++; 2121 } 2122 switch (tp->t_state) { 2123 2124 /* 2125 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2126 */ 2127 case TCPS_ESTABLISHED: 2128 tp->t_state = TCPS_CLOSE_WAIT; 2129 break; 2130 2131 /* 2132 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2133 * enter the CLOSING state. 2134 */ 2135 case TCPS_FIN_WAIT_1: 2136 tp->t_state = TCPS_CLOSING; 2137 break; 2138 2139 /* 2140 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2141 * starting the time-wait timer, turning off the other 2142 * standard timers. 2143 */ 2144 case TCPS_FIN_WAIT_2: 2145 tp->t_state = TCPS_TIME_WAIT; 2146 tcp_canceltimers(tp); 2147 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2148 soisdisconnected(so); 2149 break; 2150 2151 /* 2152 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2153 */ 2154 case TCPS_TIME_WAIT: 2155 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2156 break; 2157 } 2158 } 2159 #ifdef TCP_DEBUG 2160 if (so->so_options & SO_DEBUG) 2161 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); 2162 #endif 2163 2164 /* 2165 * Return any desired output. 2166 */ 2167 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2168 (void) tcp_output(tp); 2169 if (tcp_saveti) 2170 m_freem(tcp_saveti); 2171 return; 2172 2173 badsyn: 2174 /* 2175 * Received a bad SYN. Increment counters and dropwithreset. 2176 */ 2177 tcpstat.tcps_badsyn++; 2178 tp = NULL; 2179 goto dropwithreset; 2180 2181 dropafterack: 2182 /* 2183 * Generate an ACK dropping incoming segment if it occupies 2184 * sequence space, where the ACK reflects our state. 2185 */ 2186 if (tiflags & TH_RST) 2187 goto drop; 2188 m_freem(m); 2189 tp->t_flags |= TF_ACKNOW; 2190 (void) tcp_output(tp); 2191 if (tcp_saveti) 2192 m_freem(tcp_saveti); 2193 return; 2194 2195 dropwithreset_ratelim: 2196 /* 2197 * We may want to rate-limit RSTs in certain situations, 2198 * particularly if we are sending an RST in response to 2199 * an attempt to connect to or otherwise communicate with 2200 * a port for which we have no socket. 2201 */ 2202 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2203 tcp_rst_ppslim) == 0) { 2204 /* XXX stat */ 2205 goto drop; 2206 } 2207 /* ...fall into dropwithreset... */ 2208 2209 dropwithreset: 2210 /* 2211 * Generate a RST, dropping incoming segment. 2212 * Make ACK acceptable to originator of segment. 2213 */ 2214 if (tiflags & TH_RST) 2215 goto drop; 2216 { 2217 /* 2218 * need to recover version # field, which was overwritten on 2219 * ip_cksum computation. 2220 */ 2221 struct ip *sip; 2222 sip = mtod(m, struct ip *); 2223 switch (af) { 2224 #ifdef INET 2225 case AF_INET: 2226 sip->ip_v = 4; 2227 break; 2228 #endif 2229 #ifdef INET6 2230 case AF_INET6: 2231 sip->ip_v = 6; 2232 break; 2233 #endif 2234 } 2235 } 2236 if (tiflags & TH_ACK) 2237 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 2238 else { 2239 if (tiflags & TH_SYN) 2240 tlen++; 2241 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, 2242 TH_RST|TH_ACK); 2243 } 2244 if (tcp_saveti) 2245 m_freem(tcp_saveti); 2246 return; 2247 2248 badcsum: 2249 tcpstat.tcps_rcvbadsum++; 2250 drop: 2251 /* 2252 * Drop space held by incoming segment and return. 2253 */ 2254 if (tp) { 2255 if (tp->t_inpcb) 2256 so = tp->t_inpcb->inp_socket; 2257 #ifdef INET6 2258 else if (tp->t_in6pcb) 2259 so = tp->t_in6pcb->in6p_socket; 2260 #endif 2261 else 2262 so = NULL; 2263 #ifdef TCP_DEBUG 2264 if (so && (so->so_options & SO_DEBUG) != 0) 2265 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); 2266 #endif 2267 } 2268 if (tcp_saveti) 2269 m_freem(tcp_saveti); 2270 m_freem(m); 2271 return; 2272 } 2273 2274 void 2275 tcp_dooptions(tp, cp, cnt, th, oi) 2276 struct tcpcb *tp; 2277 u_char *cp; 2278 int cnt; 2279 struct tcphdr *th; 2280 struct tcp_opt_info *oi; 2281 { 2282 u_int16_t mss; 2283 int opt, optlen; 2284 2285 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2286 opt = cp[0]; 2287 if (opt == TCPOPT_EOL) 2288 break; 2289 if (opt == TCPOPT_NOP) 2290 optlen = 1; 2291 else { 2292 if (cnt < 2) 2293 break; 2294 optlen = cp[1]; 2295 if (optlen < 2 || optlen > cnt) 2296 break; 2297 } 2298 switch (opt) { 2299 2300 default: 2301 continue; 2302 2303 case TCPOPT_MAXSEG: 2304 if (optlen != TCPOLEN_MAXSEG) 2305 continue; 2306 if (!(th->th_flags & TH_SYN)) 2307 continue; 2308 bcopy(cp + 2, &mss, sizeof(mss)); 2309 oi->maxseg = ntohs(mss); 2310 break; 2311 2312 case TCPOPT_WINDOW: 2313 if (optlen != TCPOLEN_WINDOW) 2314 continue; 2315 if (!(th->th_flags & TH_SYN)) 2316 continue; 2317 tp->t_flags |= TF_RCVD_SCALE; 2318 tp->requested_s_scale = cp[2]; 2319 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { 2320 #if 0 /*XXX*/ 2321 char *p; 2322 2323 if (ip) 2324 p = ntohl(ip->ip_src); 2325 #ifdef INET6 2326 else if (ip6) 2327 p = ip6_sprintf(&ip6->ip6_src); 2328 #endif 2329 else 2330 p = "(unknown)"; 2331 log(LOG_ERR, "TCP: invalid wscale %d from %s, " 2332 "assuming %d\n", 2333 tp->requested_s_scale, p, 2334 TCP_MAX_WINSHIFT); 2335 #else 2336 log(LOG_ERR, "TCP: invalid wscale %d, " 2337 "assuming %d\n", 2338 tp->requested_s_scale, 2339 TCP_MAX_WINSHIFT); 2340 #endif 2341 tp->requested_s_scale = TCP_MAX_WINSHIFT; 2342 } 2343 break; 2344 2345 case TCPOPT_TIMESTAMP: 2346 if (optlen != TCPOLEN_TIMESTAMP) 2347 continue; 2348 oi->ts_present = 1; 2349 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2350 NTOHL(oi->ts_val); 2351 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2352 NTOHL(oi->ts_ecr); 2353 2354 /* 2355 * A timestamp received in a SYN makes 2356 * it ok to send timestamp requests and replies. 2357 */ 2358 if (th->th_flags & TH_SYN) { 2359 tp->t_flags |= TF_RCVD_TSTMP; 2360 tp->ts_recent = oi->ts_val; 2361 tp->ts_recent_age = TCP_TIMESTAMP(tp); 2362 } 2363 break; 2364 case TCPOPT_SACK_PERMITTED: 2365 if (optlen != TCPOLEN_SACK_PERMITTED) 2366 continue; 2367 if (!(th->th_flags & TH_SYN)) 2368 continue; 2369 tp->t_flags &= ~TF_CANT_TXSACK; 2370 break; 2371 2372 case TCPOPT_SACK: 2373 if (tp->t_flags & TF_IGNR_RXSACK) 2374 continue; 2375 if (optlen % 8 != 2 || optlen < 10) 2376 continue; 2377 cp += 2; 2378 optlen -= 2; 2379 for (; optlen > 0; cp -= 8, optlen -= 8) { 2380 tcp_seq lwe, rwe; 2381 bcopy((char *)cp, (char *) &lwe, sizeof(lwe)); 2382 NTOHL(lwe); 2383 bcopy((char *)cp, (char *) &rwe, sizeof(rwe)); 2384 NTOHL(rwe); 2385 /* tcp_mark_sacked(tp, lwe, rwe); */ 2386 } 2387 break; 2388 } 2389 } 2390 } 2391 2392 /* 2393 * Pull out of band byte out of a segment so 2394 * it doesn't appear in the user's data queue. 2395 * It is still reflected in the segment length for 2396 * sequencing purposes. 2397 */ 2398 void 2399 tcp_pulloutofband(so, th, m, off) 2400 struct socket *so; 2401 struct tcphdr *th; 2402 struct mbuf *m; 2403 int off; 2404 { 2405 int cnt = off + th->th_urp - 1; 2406 2407 while (cnt >= 0) { 2408 if (m->m_len > cnt) { 2409 char *cp = mtod(m, caddr_t) + cnt; 2410 struct tcpcb *tp = sototcpcb(so); 2411 2412 tp->t_iobc = *cp; 2413 tp->t_oobflags |= TCPOOB_HAVEDATA; 2414 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2415 m->m_len--; 2416 return; 2417 } 2418 cnt -= m->m_len; 2419 m = m->m_next; 2420 if (m == 0) 2421 break; 2422 } 2423 panic("tcp_pulloutofband"); 2424 } 2425 2426 /* 2427 * Collect new round-trip time estimate 2428 * and update averages and current timeout. 2429 */ 2430 void 2431 tcp_xmit_timer(tp, rtt) 2432 struct tcpcb *tp; 2433 uint32_t rtt; 2434 { 2435 int32_t delta; 2436 2437 tcpstat.tcps_rttupdated++; 2438 if (tp->t_srtt != 0) { 2439 /* 2440 * srtt is stored as fixed point with 3 bits after the 2441 * binary point (i.e., scaled by 8). The following magic 2442 * is equivalent to the smoothing algorithm in rfc793 with 2443 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2444 * point). Adjust rtt to origin 0. 2445 */ 2446 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 2447 if ((tp->t_srtt += delta) <= 0) 2448 tp->t_srtt = 1 << 2; 2449 /* 2450 * We accumulate a smoothed rtt variance (actually, a 2451 * smoothed mean difference), then set the retransmit 2452 * timer to smoothed rtt + 4 times the smoothed variance. 2453 * rttvar is stored as fixed point with 2 bits after the 2454 * binary point (scaled by 4). The following is 2455 * equivalent to rfc793 smoothing with an alpha of .75 2456 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2457 * rfc793's wired-in beta. 2458 */ 2459 if (delta < 0) 2460 delta = -delta; 2461 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2462 if ((tp->t_rttvar += delta) <= 0) 2463 tp->t_rttvar = 1 << 2; 2464 } else { 2465 /* 2466 * No rtt measurement yet - use the unsmoothed rtt. 2467 * Set the variance to half the rtt (so our first 2468 * retransmit happens at 3*rtt). 2469 */ 2470 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 2471 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 2472 } 2473 tp->t_rtttime = 0; 2474 tp->t_rxtshift = 0; 2475 2476 /* 2477 * the retransmit should happen at rtt + 4 * rttvar. 2478 * Because of the way we do the smoothing, srtt and rttvar 2479 * will each average +1/2 tick of bias. When we compute 2480 * the retransmit timer, we want 1/2 tick of rounding and 2481 * 1 extra tick because of +-1/2 tick uncertainty in the 2482 * firing of the timer. The bias will give us exactly the 2483 * 1.5 tick we need. But, because the bias is 2484 * statistical, we have to test that we don't drop below 2485 * the minimum feasible timer (which is 2 ticks). 2486 */ 2487 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 2488 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 2489 2490 /* 2491 * We received an ack for a packet that wasn't retransmitted; 2492 * it is probably safe to discard any error indications we've 2493 * received recently. This isn't quite right, but close enough 2494 * for now (a route might have failed after we sent a segment, 2495 * and the return path might not be symmetrical). 2496 */ 2497 tp->t_softerror = 0; 2498 } 2499 2500 /* 2501 * Checks for partial ack. If partial ack arrives, force the retransmission 2502 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 2503 * 1. By setting snd_nxt to th_ack, this forces retransmission timer to 2504 * be started again. If the ack advances at least to tp->snd_recover, return 0. 2505 */ 2506 int 2507 tcp_newreno(tp, th) 2508 struct tcpcb *tp; 2509 struct tcphdr *th; 2510 { 2511 tcp_seq onxt = tp->snd_nxt; 2512 u_long ocwnd = tp->snd_cwnd; 2513 2514 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2515 /* 2516 * snd_una has not yet been updated and the socket's send 2517 * buffer has not yet drained off the ACK'd data, so we 2518 * have to leave snd_una as it was to get the correct data 2519 * offset in tcp_output(). 2520 */ 2521 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2522 tp->t_rtttime = 0; 2523 tp->snd_nxt = th->th_ack; 2524 /* 2525 * Set snd_cwnd to one segment beyond ACK'd offset. snd_una 2526 * is not yet updated when we're called. 2527 */ 2528 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); 2529 (void) tcp_output(tp); 2530 tp->snd_cwnd = ocwnd; 2531 if (SEQ_GT(onxt, tp->snd_nxt)) 2532 tp->snd_nxt = onxt; 2533 /* 2534 * Partial window deflation. Relies on fact that tp->snd_una 2535 * not updated yet. 2536 */ 2537 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz); 2538 return 1; 2539 } 2540 return 0; 2541 } 2542 2543 2544 /* 2545 * TCP compressed state engine. Currently used to hold compressed 2546 * state for SYN_RECEIVED. 2547 */ 2548 2549 u_long syn_cache_count; 2550 u_int32_t syn_hash1, syn_hash2; 2551 2552 #define SYN_HASH(sa, sp, dp) \ 2553 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 2554 ((u_int32_t)(sp)))^syn_hash2))) 2555 #ifndef INET6 2556 #define SYN_HASHALL(hash, src, dst) \ 2557 do { \ 2558 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2559 ((struct sockaddr_in *)(src))->sin_port, \ 2560 ((struct sockaddr_in *)(dst))->sin_port); \ 2561 } while (0) 2562 #else 2563 #define SYN_HASH6(sa, sp, dp) \ 2564 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 2565 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 2566 & 0x7fffffff) 2567 2568 #define SYN_HASHALL(hash, src, dst) \ 2569 do { \ 2570 switch ((src)->sa_family) { \ 2571 case AF_INET: \ 2572 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2573 ((struct sockaddr_in *)(src))->sin_port, \ 2574 ((struct sockaddr_in *)(dst))->sin_port); \ 2575 break; \ 2576 case AF_INET6: \ 2577 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 2578 ((struct sockaddr_in6 *)(src))->sin6_port, \ 2579 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 2580 break; \ 2581 default: \ 2582 hash = 0; \ 2583 } \ 2584 } while (/*CONSTCOND*/0) 2585 #endif /* INET6 */ 2586 2587 #define SYN_CACHE_RM(sc) \ 2588 do { \ 2589 TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \ 2590 (sc), sc_bucketq); \ 2591 (sc)->sc_tp = NULL; \ 2592 LIST_REMOVE((sc), sc_tpq); \ 2593 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 2594 callout_stop(&(sc)->sc_timer); \ 2595 syn_cache_count--; \ 2596 } while (/*CONSTCOND*/0) 2597 2598 #define SYN_CACHE_PUT(sc) \ 2599 do { \ 2600 if ((sc)->sc_ipopts) \ 2601 (void) m_free((sc)->sc_ipopts); \ 2602 if ((sc)->sc_route4.ro_rt != NULL) \ 2603 RTFREE((sc)->sc_route4.ro_rt); \ 2604 pool_put(&syn_cache_pool, (sc)); \ 2605 } while (/*CONSTCOND*/0) 2606 2607 struct pool syn_cache_pool; 2608 2609 /* 2610 * We don't estimate RTT with SYNs, so each packet starts with the default 2611 * RTT and each timer step has a fixed timeout value. 2612 */ 2613 #define SYN_CACHE_TIMER_ARM(sc) \ 2614 do { \ 2615 TCPT_RANGESET((sc)->sc_rxtcur, \ 2616 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 2617 TCPTV_REXMTMAX); \ 2618 callout_reset(&(sc)->sc_timer, \ 2619 (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \ 2620 } while (/*CONSTCOND*/0) 2621 2622 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase) 2623 2624 void 2625 syn_cache_init() 2626 { 2627 int i; 2628 2629 /* Initialize the hash buckets. */ 2630 for (i = 0; i < tcp_syn_cache_size; i++) 2631 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 2632 2633 /* Initialize the syn cache pool. */ 2634 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 2635 "synpl", 0, NULL, NULL, M_PCB); 2636 } 2637 2638 void 2639 syn_cache_insert(sc, tp) 2640 struct syn_cache *sc; 2641 struct tcpcb *tp; 2642 { 2643 struct syn_cache_head *scp; 2644 struct syn_cache *sc2; 2645 int s; 2646 2647 /* 2648 * If there are no entries in the hash table, reinitialize 2649 * the hash secrets. 2650 */ 2651 if (syn_cache_count == 0) { 2652 struct timeval tv; 2653 microtime(&tv); 2654 syn_hash1 = random() ^ (u_long)≻ 2655 syn_hash2 = random() ^ tv.tv_usec; 2656 } 2657 2658 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 2659 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 2660 scp = &tcp_syn_cache[sc->sc_bucketidx]; 2661 2662 /* 2663 * Make sure that we don't overflow the per-bucket 2664 * limit or the total cache size limit. 2665 */ 2666 s = splsoftnet(); 2667 if (scp->sch_length >= tcp_syn_bucket_limit) { 2668 tcpstat.tcps_sc_bucketoverflow++; 2669 /* 2670 * The bucket is full. Toss the oldest element in the 2671 * bucket. This will be the first entry in the bucket. 2672 */ 2673 sc2 = TAILQ_FIRST(&scp->sch_bucket); 2674 #ifdef DIAGNOSTIC 2675 /* 2676 * This should never happen; we should always find an 2677 * entry in our bucket. 2678 */ 2679 if (sc2 == NULL) 2680 panic("syn_cache_insert: bucketoverflow: impossible"); 2681 #endif 2682 SYN_CACHE_RM(sc2); 2683 SYN_CACHE_PUT(sc2); 2684 } else if (syn_cache_count >= tcp_syn_cache_limit) { 2685 struct syn_cache_head *scp2, *sce; 2686 2687 tcpstat.tcps_sc_overflowed++; 2688 /* 2689 * The cache is full. Toss the oldest entry in the 2690 * first non-empty bucket we can find. 2691 * 2692 * XXX We would really like to toss the oldest 2693 * entry in the cache, but we hope that this 2694 * condition doesn't happen very often. 2695 */ 2696 scp2 = scp; 2697 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 2698 sce = &tcp_syn_cache[tcp_syn_cache_size]; 2699 for (++scp2; scp2 != scp; scp2++) { 2700 if (scp2 >= sce) 2701 scp2 = &tcp_syn_cache[0]; 2702 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 2703 break; 2704 } 2705 #ifdef DIAGNOSTIC 2706 /* 2707 * This should never happen; we should always find a 2708 * non-empty bucket. 2709 */ 2710 if (scp2 == scp) 2711 panic("syn_cache_insert: cacheoverflow: " 2712 "impossible"); 2713 #endif 2714 } 2715 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 2716 SYN_CACHE_RM(sc2); 2717 SYN_CACHE_PUT(sc2); 2718 } 2719 2720 /* 2721 * Initialize the entry's timer. 2722 */ 2723 sc->sc_rxttot = 0; 2724 sc->sc_rxtshift = 0; 2725 SYN_CACHE_TIMER_ARM(sc); 2726 2727 /* Link it from tcpcb entry */ 2728 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 2729 2730 /* Put it into the bucket. */ 2731 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 2732 scp->sch_length++; 2733 syn_cache_count++; 2734 2735 tcpstat.tcps_sc_added++; 2736 splx(s); 2737 } 2738 2739 /* 2740 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 2741 * If we have retransmitted an entry the maximum number of times, expire 2742 * that entry. 2743 */ 2744 void 2745 syn_cache_timer(void *arg) 2746 { 2747 struct syn_cache *sc = arg; 2748 int s; 2749 2750 s = splsoftnet(); 2751 2752 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 2753 /* Drop it -- too many retransmissions. */ 2754 goto dropit; 2755 } 2756 2757 /* 2758 * Compute the total amount of time this entry has 2759 * been on a queue. If this entry has been on longer 2760 * than the keep alive timer would allow, expire it. 2761 */ 2762 sc->sc_rxttot += sc->sc_rxtcur; 2763 if (sc->sc_rxttot >= TCPTV_KEEP_INIT) 2764 goto dropit; 2765 2766 tcpstat.tcps_sc_retransmitted++; 2767 (void) syn_cache_respond(sc, NULL); 2768 2769 /* Advance the timer back-off. */ 2770 sc->sc_rxtshift++; 2771 SYN_CACHE_TIMER_ARM(sc); 2772 2773 splx(s); 2774 return; 2775 2776 dropit: 2777 tcpstat.tcps_sc_timed_out++; 2778 SYN_CACHE_RM(sc); 2779 SYN_CACHE_PUT(sc); 2780 splx(s); 2781 } 2782 2783 /* 2784 * Remove syn cache created by the specified tcb entry, 2785 * because this does not make sense to keep them 2786 * (if there's no tcb entry, syn cache entry will never be used) 2787 */ 2788 void 2789 syn_cache_cleanup(tp) 2790 struct tcpcb *tp; 2791 { 2792 struct syn_cache *sc, *nsc; 2793 int s; 2794 2795 s = splsoftnet(); 2796 2797 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 2798 nsc = LIST_NEXT(sc, sc_tpq); 2799 2800 #ifdef DIAGNOSTIC 2801 if (sc->sc_tp != tp) 2802 panic("invalid sc_tp in syn_cache_cleanup"); 2803 #endif 2804 SYN_CACHE_RM(sc); 2805 SYN_CACHE_PUT(sc); 2806 } 2807 /* just for safety */ 2808 LIST_INIT(&tp->t_sc); 2809 2810 splx(s); 2811 } 2812 2813 /* 2814 * Find an entry in the syn cache. 2815 */ 2816 struct syn_cache * 2817 syn_cache_lookup(src, dst, headp) 2818 struct sockaddr *src; 2819 struct sockaddr *dst; 2820 struct syn_cache_head **headp; 2821 { 2822 struct syn_cache *sc; 2823 struct syn_cache_head *scp; 2824 u_int32_t hash; 2825 int s; 2826 2827 SYN_HASHALL(hash, src, dst); 2828 2829 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 2830 *headp = scp; 2831 s = splsoftnet(); 2832 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; 2833 sc = TAILQ_NEXT(sc, sc_bucketq)) { 2834 if (sc->sc_hash != hash) 2835 continue; 2836 if (!bcmp(&sc->sc_src, src, src->sa_len) && 2837 !bcmp(&sc->sc_dst, dst, dst->sa_len)) { 2838 splx(s); 2839 return (sc); 2840 } 2841 } 2842 splx(s); 2843 return (NULL); 2844 } 2845 2846 /* 2847 * This function gets called when we receive an ACK for a 2848 * socket in the LISTEN state. We look up the connection 2849 * in the syn cache, and if its there, we pull it out of 2850 * the cache and turn it into a full-blown connection in 2851 * the SYN-RECEIVED state. 2852 * 2853 * The return values may not be immediately obvious, and their effects 2854 * can be subtle, so here they are: 2855 * 2856 * NULL SYN was not found in cache; caller should drop the 2857 * packet and send an RST. 2858 * 2859 * -1 We were unable to create the new connection, and are 2860 * aborting it. An ACK,RST is being sent to the peer 2861 * (unless we got screwey sequence numbners; see below), 2862 * because the 3-way handshake has been completed. Caller 2863 * should not free the mbuf, since we may be using it. If 2864 * we are not, we will free it. 2865 * 2866 * Otherwise, the return value is a pointer to the new socket 2867 * associated with the connection. 2868 */ 2869 struct socket * 2870 syn_cache_get(src, dst, th, hlen, tlen, so, m) 2871 struct sockaddr *src; 2872 struct sockaddr *dst; 2873 struct tcphdr *th; 2874 unsigned int hlen, tlen; 2875 struct socket *so; 2876 struct mbuf *m; 2877 { 2878 struct syn_cache *sc; 2879 struct syn_cache_head *scp; 2880 struct inpcb *inp = NULL; 2881 #ifdef INET6 2882 struct in6pcb *in6p = NULL; 2883 #endif 2884 struct tcpcb *tp = 0; 2885 struct mbuf *am; 2886 int s; 2887 struct socket *oso; 2888 2889 s = splsoftnet(); 2890 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 2891 splx(s); 2892 return (NULL); 2893 } 2894 2895 /* 2896 * Verify the sequence and ack numbers. Try getting the correct 2897 * response again. 2898 */ 2899 if ((th->th_ack != sc->sc_iss + 1) || 2900 SEQ_LEQ(th->th_seq, sc->sc_irs) || 2901 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 2902 (void) syn_cache_respond(sc, m); 2903 splx(s); 2904 return ((struct socket *)(-1)); 2905 } 2906 2907 /* Remove this cache entry */ 2908 SYN_CACHE_RM(sc); 2909 splx(s); 2910 2911 /* 2912 * Ok, create the full blown connection, and set things up 2913 * as they would have been set up if we had created the 2914 * connection when the SYN arrived. If we can't create 2915 * the connection, abort it. 2916 */ 2917 /* 2918 * inp still has the OLD in_pcb stuff, set the 2919 * v6-related flags on the new guy, too. This is 2920 * done particularly for the case where an AF_INET6 2921 * socket is bound only to a port, and a v4 connection 2922 * comes in on that port. 2923 * we also copy the flowinfo from the original pcb 2924 * to the new one. 2925 */ 2926 { 2927 struct inpcb *parentinpcb; 2928 2929 parentinpcb = (struct inpcb *)so->so_pcb; 2930 2931 oso = so; 2932 so = sonewconn(so, SS_ISCONNECTED); 2933 if (so == NULL) 2934 goto resetandabort; 2935 2936 switch (so->so_proto->pr_domain->dom_family) { 2937 #ifdef INET 2938 case AF_INET: 2939 inp = sotoinpcb(so); 2940 break; 2941 #endif 2942 #ifdef INET6 2943 case AF_INET6: 2944 in6p = sotoin6pcb(so); 2945 break; 2946 #endif 2947 } 2948 } 2949 switch (src->sa_family) { 2950 #ifdef INET 2951 case AF_INET: 2952 if (inp) { 2953 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 2954 inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; 2955 inp->inp_options = ip_srcroute(); 2956 in_pcbstate(inp, INP_BOUND); 2957 if (inp->inp_options == NULL) { 2958 inp->inp_options = sc->sc_ipopts; 2959 sc->sc_ipopts = NULL; 2960 } 2961 } 2962 #ifdef INET6 2963 else if (in6p) { 2964 /* IPv4 packet to AF_INET6 socket */ 2965 bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr)); 2966 in6p->in6p_laddr.s6_addr16[5] = htons(0xffff); 2967 bcopy(&((struct sockaddr_in *)dst)->sin_addr, 2968 &in6p->in6p_laddr.s6_addr32[3], 2969 sizeof(((struct sockaddr_in *)dst)->sin_addr)); 2970 in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port; 2971 in6totcpcb(in6p)->t_family = AF_INET; 2972 } 2973 #endif 2974 break; 2975 #endif 2976 #ifdef INET6 2977 case AF_INET6: 2978 if (in6p) { 2979 in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr; 2980 in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port; 2981 #if 0 2982 in6p->in6p_flowinfo = ip6->ip6_flow & IPV6_FLOWINFO_MASK; 2983 /*inp->inp_options = ip6_srcroute();*/ /* soon. */ 2984 #endif 2985 } 2986 break; 2987 #endif 2988 } 2989 #ifdef INET6 2990 if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) { 2991 struct in6pcb *oin6p = sotoin6pcb(oso); 2992 /* inherit socket options from the listening socket */ 2993 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS); 2994 if (in6p->in6p_flags & IN6P_CONTROLOPTS) { 2995 m_freem(in6p->in6p_options); 2996 in6p->in6p_options = 0; 2997 } 2998 ip6_savecontrol(in6p, &in6p->in6p_options, 2999 mtod(m, struct ip6_hdr *), m); 3000 } 3001 #endif 3002 3003 #ifdef IPSEC 3004 /* 3005 * we make a copy of policy, instead of sharing the policy, 3006 * for better behavior in terms of SA lookup and dead SA removal. 3007 */ 3008 if (inp) { 3009 /* copy old policy into new socket's */ 3010 if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) 3011 printf("tcp_input: could not copy policy\n"); 3012 } 3013 #ifdef INET6 3014 else if (in6p) { 3015 /* copy old policy into new socket's */ 3016 if (ipsec_copy_policy(sotoin6pcb(oso)->in6p_sp, in6p->in6p_sp)) 3017 printf("tcp_input: could not copy policy\n"); 3018 } 3019 #endif 3020 #endif 3021 3022 /* 3023 * Give the new socket our cached route reference. 3024 */ 3025 if (inp) 3026 inp->inp_route = sc->sc_route4; /* struct assignment */ 3027 #ifdef INET6 3028 else 3029 in6p->in6p_route = sc->sc_route6; 3030 #endif 3031 sc->sc_route4.ro_rt = NULL; 3032 3033 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3034 if (am == NULL) 3035 goto resetandabort; 3036 am->m_len = src->sa_len; 3037 bcopy(src, mtod(am, caddr_t), src->sa_len); 3038 if (inp) { 3039 if (in_pcbconnect(inp, am)) { 3040 (void) m_free(am); 3041 goto resetandabort; 3042 } 3043 } 3044 #ifdef INET6 3045 else if (in6p) { 3046 if (src->sa_family == AF_INET) { 3047 /* IPv4 packet to AF_INET6 socket */ 3048 struct sockaddr_in6 *sin6; 3049 sin6 = mtod(am, struct sockaddr_in6 *); 3050 am->m_len = sizeof(*sin6); 3051 bzero(sin6, sizeof(*sin6)); 3052 sin6->sin6_family = AF_INET6; 3053 sin6->sin6_len = sizeof(*sin6); 3054 sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port; 3055 sin6->sin6_addr.s6_addr16[5] = htons(0xffff); 3056 bcopy(&((struct sockaddr_in *)src)->sin_addr, 3057 &sin6->sin6_addr.s6_addr32[3], 3058 sizeof(sin6->sin6_addr.s6_addr32[3])); 3059 } 3060 if (in6_pcbconnect(in6p, am)) { 3061 (void) m_free(am); 3062 goto resetandabort; 3063 } 3064 } 3065 #endif 3066 else { 3067 (void) m_free(am); 3068 goto resetandabort; 3069 } 3070 (void) m_free(am); 3071 3072 if (inp) 3073 tp = intotcpcb(inp); 3074 #ifdef INET6 3075 else if (in6p) 3076 tp = in6totcpcb(in6p); 3077 #endif 3078 else 3079 tp = NULL; 3080 if (sc->sc_request_r_scale != 15) { 3081 tp->requested_s_scale = sc->sc_requested_s_scale; 3082 tp->request_r_scale = sc->sc_request_r_scale; 3083 tp->snd_scale = sc->sc_requested_s_scale; 3084 tp->rcv_scale = sc->sc_request_r_scale; 3085 tp->t_flags |= TF_RCVD_SCALE; 3086 } 3087 if (sc->sc_flags & SCF_TIMESTAMP) 3088 tp->t_flags |= TF_RCVD_TSTMP; 3089 tp->ts_timebase = sc->sc_timebase; 3090 3091 tp->t_template = tcp_template(tp); 3092 if (tp->t_template == 0) { 3093 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3094 so = NULL; 3095 m_freem(m); 3096 goto abort; 3097 } 3098 3099 tp->iss = sc->sc_iss; 3100 tp->irs = sc->sc_irs; 3101 tcp_sendseqinit(tp); 3102 tcp_rcvseqinit(tp); 3103 tp->t_state = TCPS_SYN_RECEIVED; 3104 TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT); 3105 tcpstat.tcps_accepts++; 3106 3107 /* Initialize tp->t_ourmss before we deal with the peer's! */ 3108 tp->t_ourmss = sc->sc_ourmaxseg; 3109 tcp_mss_from_peer(tp, sc->sc_peermaxseg); 3110 3111 /* 3112 * Initialize the initial congestion window. If we 3113 * had to retransmit the SYN,ACK, we must initialize cwnd 3114 * to 1 segment (i.e. the Loss Window). 3115 */ 3116 if (sc->sc_rxtshift) 3117 tp->snd_cwnd = tp->t_peermss; 3118 else 3119 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, tp->t_peermss); 3120 3121 tcp_rmx_rtt(tp); 3122 tp->snd_wl1 = sc->sc_irs; 3123 tp->rcv_up = sc->sc_irs + 1; 3124 3125 /* 3126 * This is what whould have happened in tcp_ouput() when 3127 * the SYN,ACK was sent. 3128 */ 3129 tp->snd_up = tp->snd_una; 3130 tp->snd_max = tp->snd_nxt = tp->iss+1; 3131 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3132 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3133 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3134 tp->last_ack_sent = tp->rcv_nxt; 3135 3136 tcpstat.tcps_sc_completed++; 3137 SYN_CACHE_PUT(sc); 3138 return (so); 3139 3140 resetandabort: 3141 (void) tcp_respond(NULL, m, m, th, 3142 th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK); 3143 abort: 3144 if (so != NULL) 3145 (void) soabort(so); 3146 SYN_CACHE_PUT(sc); 3147 tcpstat.tcps_sc_aborted++; 3148 return ((struct socket *)(-1)); 3149 } 3150 3151 /* 3152 * This function is called when we get a RST for a 3153 * non-existent connection, so that we can see if the 3154 * connection is in the syn cache. If it is, zap it. 3155 */ 3156 3157 void 3158 syn_cache_reset(src, dst, th) 3159 struct sockaddr *src; 3160 struct sockaddr *dst; 3161 struct tcphdr *th; 3162 { 3163 struct syn_cache *sc; 3164 struct syn_cache_head *scp; 3165 int s = splsoftnet(); 3166 3167 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3168 splx(s); 3169 return; 3170 } 3171 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3172 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3173 splx(s); 3174 return; 3175 } 3176 SYN_CACHE_RM(sc); 3177 splx(s); 3178 tcpstat.tcps_sc_reset++; 3179 SYN_CACHE_PUT(sc); 3180 } 3181 3182 void 3183 syn_cache_unreach(src, dst, th) 3184 struct sockaddr *src; 3185 struct sockaddr *dst; 3186 struct tcphdr *th; 3187 { 3188 struct syn_cache *sc; 3189 struct syn_cache_head *scp; 3190 int s; 3191 3192 s = splsoftnet(); 3193 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3194 splx(s); 3195 return; 3196 } 3197 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3198 if (ntohl (th->th_seq) != sc->sc_iss) { 3199 splx(s); 3200 return; 3201 } 3202 3203 /* 3204 * If we've rertransmitted 3 times and this is our second error, 3205 * we remove the entry. Otherwise, we allow it to continue on. 3206 * This prevents us from incorrectly nuking an entry during a 3207 * spurious network outage. 3208 * 3209 * See tcp_notify(). 3210 */ 3211 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3212 sc->sc_flags |= SCF_UNREACH; 3213 splx(s); 3214 return; 3215 } 3216 3217 SYN_CACHE_RM(sc); 3218 splx(s); 3219 tcpstat.tcps_sc_unreach++; 3220 SYN_CACHE_PUT(sc); 3221 } 3222 3223 /* 3224 * Given a LISTEN socket and an inbound SYN request, add 3225 * this to the syn cache, and send back a segment: 3226 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3227 * to the source. 3228 * 3229 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3230 * Doing so would require that we hold onto the data and deliver it 3231 * to the application. However, if we are the target of a SYN-flood 3232 * DoS attack, an attacker could send data which would eventually 3233 * consume all available buffer space if it were ACKed. By not ACKing 3234 * the data, we avoid this DoS scenario. 3235 */ 3236 3237 int 3238 syn_cache_add(src, dst, th, hlen, so, m, optp, optlen, oi) 3239 struct sockaddr *src; 3240 struct sockaddr *dst; 3241 struct tcphdr *th; 3242 unsigned int hlen; 3243 struct socket *so; 3244 struct mbuf *m; 3245 u_char *optp; 3246 int optlen; 3247 struct tcp_opt_info *oi; 3248 { 3249 struct tcpcb tb, *tp; 3250 long win; 3251 struct syn_cache *sc; 3252 struct syn_cache_head *scp; 3253 struct mbuf *ipopts; 3254 3255 tp = sototcpcb(so); 3256 3257 /* 3258 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3259 * 3260 * Note this check is performed in tcp_input() very early on. 3261 */ 3262 3263 /* 3264 * Initialize some local state. 3265 */ 3266 win = sbspace(&so->so_rcv); 3267 if (win > TCP_MAXWIN) 3268 win = TCP_MAXWIN; 3269 3270 switch (src->sa_family) { 3271 #ifdef INET 3272 case AF_INET: 3273 /* 3274 * Remember the IP options, if any. 3275 */ 3276 ipopts = ip_srcroute(); 3277 break; 3278 #endif 3279 default: 3280 ipopts = NULL; 3281 } 3282 3283 if (optp) { 3284 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3285 tcp_dooptions(&tb, optp, optlen, th, oi); 3286 } else 3287 tb.t_flags = 0; 3288 3289 /* 3290 * See if we already have an entry for this connection. 3291 * If we do, resend the SYN,ACK. We do not count this 3292 * as a retransmission (XXX though maybe we should). 3293 */ 3294 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 3295 tcpstat.tcps_sc_dupesyn++; 3296 if (ipopts) { 3297 /* 3298 * If we were remembering a previous source route, 3299 * forget it and use the new one we've been given. 3300 */ 3301 if (sc->sc_ipopts) 3302 (void) m_free(sc->sc_ipopts); 3303 sc->sc_ipopts = ipopts; 3304 } 3305 sc->sc_timestamp = tb.ts_recent; 3306 if (syn_cache_respond(sc, m) == 0) { 3307 tcpstat.tcps_sndacks++; 3308 tcpstat.tcps_sndtotal++; 3309 } 3310 return (1); 3311 } 3312 3313 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 3314 if (sc == NULL) { 3315 if (ipopts) 3316 (void) m_free(ipopts); 3317 return (0); 3318 } 3319 3320 /* 3321 * Fill in the cache, and put the necessary IP and TCP 3322 * options into the reply. 3323 */ 3324 callout_init(&sc->sc_timer); 3325 bzero(sc, sizeof(struct syn_cache)); 3326 bcopy(src, &sc->sc_src, src->sa_len); 3327 bcopy(dst, &sc->sc_dst, dst->sa_len); 3328 sc->sc_flags = 0; 3329 sc->sc_ipopts = ipopts; 3330 sc->sc_irs = th->th_seq; 3331 switch (src->sa_family) { 3332 #ifdef INET 3333 case AF_INET: 3334 { 3335 struct sockaddr_in *srcin = (void *) src; 3336 struct sockaddr_in *dstin = (void *) dst; 3337 3338 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr, 3339 &srcin->sin_addr, dstin->sin_port, 3340 srcin->sin_port, sizeof(dstin->sin_addr), 0); 3341 break; 3342 } 3343 #endif /* INET */ 3344 #ifdef INET6 3345 case AF_INET6: 3346 { 3347 struct sockaddr_in6 *srcin6 = (void *) src; 3348 struct sockaddr_in6 *dstin6 = (void *) dst; 3349 3350 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr, 3351 &srcin6->sin6_addr, dstin6->sin6_port, 3352 srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0); 3353 break; 3354 } 3355 #endif /* INET6 */ 3356 } 3357 sc->sc_peermaxseg = oi->maxseg; 3358 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? 3359 m->m_pkthdr.rcvif : NULL, 3360 sc->sc_src.sa.sa_family); 3361 sc->sc_win = win; 3362 sc->sc_timebase = tcp_now; /* see tcp_newtcpcb() */ 3363 sc->sc_timestamp = tb.ts_recent; 3364 if (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP)) 3365 sc->sc_flags |= SCF_TIMESTAMP; 3366 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3367 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3368 sc->sc_requested_s_scale = tb.requested_s_scale; 3369 sc->sc_request_r_scale = 0; 3370 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3371 TCP_MAXWIN << sc->sc_request_r_scale < 3372 so->so_rcv.sb_hiwat) 3373 sc->sc_request_r_scale++; 3374 } else { 3375 sc->sc_requested_s_scale = 15; 3376 sc->sc_request_r_scale = 15; 3377 } 3378 sc->sc_tp = tp; 3379 if (syn_cache_respond(sc, m) == 0) { 3380 syn_cache_insert(sc, tp); 3381 tcpstat.tcps_sndacks++; 3382 tcpstat.tcps_sndtotal++; 3383 } else { 3384 SYN_CACHE_PUT(sc); 3385 tcpstat.tcps_sc_dropped++; 3386 } 3387 return (1); 3388 } 3389 3390 int 3391 syn_cache_respond(sc, m) 3392 struct syn_cache *sc; 3393 struct mbuf *m; 3394 { 3395 struct route *ro; 3396 u_int8_t *optp; 3397 int optlen, error; 3398 u_int16_t tlen; 3399 struct ip *ip = NULL; 3400 #ifdef INET6 3401 struct ip6_hdr *ip6 = NULL; 3402 #endif 3403 struct tcphdr *th; 3404 u_int hlen; 3405 3406 switch (sc->sc_src.sa.sa_family) { 3407 case AF_INET: 3408 hlen = sizeof(struct ip); 3409 ro = &sc->sc_route4; 3410 break; 3411 #ifdef INET6 3412 case AF_INET6: 3413 hlen = sizeof(struct ip6_hdr); 3414 ro = (struct route *)&sc->sc_route6; 3415 break; 3416 #endif 3417 default: 3418 if (m) 3419 m_freem(m); 3420 return EAFNOSUPPORT; 3421 } 3422 3423 /* Compute the size of the TCP options. */ 3424 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3425 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3426 3427 tlen = hlen + sizeof(struct tcphdr) + optlen; 3428 3429 /* 3430 * Create the IP+TCP header from scratch. 3431 */ 3432 if (m) 3433 m_freem(m); 3434 #ifdef DIAGNOSTIC 3435 if (max_linkhdr + tlen > MCLBYTES) 3436 return (ENOBUFS); 3437 #endif 3438 MGETHDR(m, M_DONTWAIT, MT_DATA); 3439 if (m && tlen > MHLEN) { 3440 MCLGET(m, M_DONTWAIT); 3441 if ((m->m_flags & M_EXT) == 0) { 3442 m_freem(m); 3443 m = NULL; 3444 } 3445 } 3446 if (m == NULL) 3447 return (ENOBUFS); 3448 3449 /* Fixup the mbuf. */ 3450 m->m_data += max_linkhdr; 3451 m->m_len = m->m_pkthdr.len = tlen; 3452 #ifdef IPSEC 3453 if (sc->sc_tp) { 3454 struct tcpcb *tp; 3455 struct socket *so; 3456 3457 tp = sc->sc_tp; 3458 if (tp->t_inpcb) 3459 so = tp->t_inpcb->inp_socket; 3460 #ifdef INET6 3461 else if (tp->t_in6pcb) 3462 so = tp->t_in6pcb->in6p_socket; 3463 #endif 3464 else 3465 so = NULL; 3466 /* use IPsec policy on listening socket, on SYN ACK */ 3467 if (ipsec_setsocket(m, so) != 0) { 3468 m_freem(m); 3469 return ENOBUFS; 3470 } 3471 } 3472 #endif 3473 m->m_pkthdr.rcvif = NULL; 3474 memset(mtod(m, u_char *), 0, tlen); 3475 3476 switch (sc->sc_src.sa.sa_family) { 3477 case AF_INET: 3478 ip = mtod(m, struct ip *); 3479 ip->ip_dst = sc->sc_src.sin.sin_addr; 3480 ip->ip_src = sc->sc_dst.sin.sin_addr; 3481 ip->ip_p = IPPROTO_TCP; 3482 th = (struct tcphdr *)(ip + 1); 3483 th->th_dport = sc->sc_src.sin.sin_port; 3484 th->th_sport = sc->sc_dst.sin.sin_port; 3485 break; 3486 #ifdef INET6 3487 case AF_INET6: 3488 ip6 = mtod(m, struct ip6_hdr *); 3489 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3490 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3491 ip6->ip6_nxt = IPPROTO_TCP; 3492 /* ip6_plen will be updated in ip6_output() */ 3493 th = (struct tcphdr *)(ip6 + 1); 3494 th->th_dport = sc->sc_src.sin6.sin6_port; 3495 th->th_sport = sc->sc_dst.sin6.sin6_port; 3496 break; 3497 #endif 3498 default: 3499 th = NULL; 3500 } 3501 3502 th->th_seq = htonl(sc->sc_iss); 3503 th->th_ack = htonl(sc->sc_irs + 1); 3504 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 3505 th->th_flags = TH_SYN|TH_ACK; 3506 th->th_win = htons(sc->sc_win); 3507 /* th_sum already 0 */ 3508 /* th_urp already 0 */ 3509 3510 /* Tack on the TCP options. */ 3511 optp = (u_int8_t *)(th + 1); 3512 *optp++ = TCPOPT_MAXSEG; 3513 *optp++ = 4; 3514 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 3515 *optp++ = sc->sc_ourmaxseg & 0xff; 3516 3517 if (sc->sc_request_r_scale != 15) { 3518 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 3519 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 3520 sc->sc_request_r_scale); 3521 optp += 4; 3522 } 3523 3524 if (sc->sc_flags & SCF_TIMESTAMP) { 3525 u_int32_t *lp = (u_int32_t *)(optp); 3526 /* Form timestamp option as shown in appendix A of RFC 1323. */ 3527 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 3528 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc)); 3529 *lp = htonl(sc->sc_timestamp); 3530 optp += TCPOLEN_TSTAMP_APPA; 3531 } 3532 3533 /* Compute the packet's checksum. */ 3534 switch (sc->sc_src.sa.sa_family) { 3535 case AF_INET: 3536 ip->ip_len = htons(tlen - hlen); 3537 th->th_sum = 0; 3538 th->th_sum = in_cksum(m, tlen); 3539 break; 3540 #ifdef INET6 3541 case AF_INET6: 3542 ip6->ip6_plen = htons(tlen - hlen); 3543 th->th_sum = 0; 3544 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 3545 break; 3546 #endif 3547 } 3548 3549 /* 3550 * Fill in some straggling IP bits. Note the stack expects 3551 * ip_len to be in host order, for convenience. 3552 */ 3553 switch (sc->sc_src.sa.sa_family) { 3554 #ifdef INET 3555 case AF_INET: 3556 ip->ip_len = tlen; 3557 ip->ip_ttl = ip_defttl; 3558 /* XXX tos? */ 3559 break; 3560 #endif 3561 #ifdef INET6 3562 case AF_INET6: 3563 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 3564 ip6->ip6_vfc |= IPV6_VERSION; 3565 ip6->ip6_plen = htons(tlen - hlen); 3566 /* ip6_hlim will be initialized afterwards */ 3567 /* XXX flowlabel? */ 3568 break; 3569 #endif 3570 } 3571 3572 switch (sc->sc_src.sa.sa_family) { 3573 #ifdef INET 3574 case AF_INET: 3575 error = ip_output(m, sc->sc_ipopts, ro, 3576 (ip_mtudisc ? IP_MTUDISC : 0), 3577 NULL); 3578 break; 3579 #endif 3580 #ifdef INET6 3581 case AF_INET6: 3582 ip6->ip6_hlim = in6_selecthlim(NULL, 3583 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL); 3584 3585 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 3586 0, NULL, NULL); 3587 break; 3588 #endif 3589 default: 3590 error = EAFNOSUPPORT; 3591 break; 3592 } 3593 return (error); 3594 } 3595